NN-512

Back

Index

Files

Top || Input graph file

Config Prefix=ResNeXt50 Platform=AVX512Float32 L1DataCachePerThread=32KiB L2CachePerThreadExL1=960KiB L3CachePerThreadExL1L2=1408KiB
Input ToTensor=image Channels=3 Height=224 Width=224
BatchNorm FromTensor=image ToTensor=bn1 Epsilon=0.00002
Conv FromTensor=bn1 ToTensor=sevenDS ToChannels=64 FilterH=7 FilterW=7 StrideH=2 StrideW=2 PaddingH=3 PaddingW=3 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=sevenDS ToTensor=bn2 Epsilon=0.00002
Activation FromTensor=bn2 ToTensor=relu1 Kind=ReLU Param=0
Pooling FromTensor=relu1 ToTensor=pool1 Kind=Max3x3Stride2 PaddingH=1 PaddingW=1
Conv FromTensor=pool1 ToTensor=one1 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one1 ToTensor=bn3 Epsilon=0.00002
Conv FromTensor=pool1 ToTensor=one2 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one2 ToTensor=bn4 Epsilon=0.00002
Activation FromTensor=bn4 ToTensor=relu2 Kind=ReLU Param=0
Conv FromTensor=relu2 ToTensor=three1 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three1 ToTensor=bn5 Epsilon=0.00002
Activation FromTensor=bn5 ToTensor=relu3 Kind=ReLU Param=0
Conv FromTensor=relu3 ToTensor=one3 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one3 ToTensor=bn6 Epsilon=0.00002
Add FromTensor1=bn3 FromTensor2=bn6 ToTensor=add1
Activation FromTensor=add1 ToTensor=relu4 Kind=ReLU Param=0
Conv FromTensor=relu4 ToTensor=one4 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one4 ToTensor=bn7 Epsilon=0.00002
Activation FromTensor=bn7 ToTensor=relu5 Kind=ReLU Param=0
Conv FromTensor=relu5 ToTensor=three2 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three2 ToTensor=bn8 Epsilon=0.00002
Activation FromTensor=bn8 ToTensor=relu6 Kind=ReLU Param=0
Conv FromTensor=relu6 ToTensor=one5 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one5 ToTensor=bn9 Epsilon=0.00002
Add FromTensor1=relu4 FromTensor2=bn9 ToTensor=add2
Activation FromTensor=add2 ToTensor=relu7 Kind=ReLU Param=0
Conv FromTensor=relu7 ToTensor=one6 ToChannels=128 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one6 ToTensor=bn10 Epsilon=0.00002
Activation FromTensor=bn10 ToTensor=relu8 Kind=ReLU Param=0
Conv FromTensor=relu8 ToTensor=three3 ToChannels=128 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three3 ToTensor=bn11 Epsilon=0.00002
Activation FromTensor=bn11 ToTensor=relu9 Kind=ReLU Param=0
Conv FromTensor=relu9 ToTensor=one7 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one7 ToTensor=bn12 Epsilon=0.00002
Add FromTensor1=relu7 FromTensor2=bn12 ToTensor=add3
Activation FromTensor=add3 ToTensor=relu10 Kind=ReLU Param=0
Conv FromTensor=relu10 ToTensor=oneDS1 ToChannels=512 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS1 ToTensor=bn13 Epsilon=0.00002
Conv FromTensor=relu10 ToTensor=one8 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one8 ToTensor=bn14 Epsilon=0.00002
Activation FromTensor=bn14 ToTensor=relu11 Kind=ReLU Param=0
Conv FromTensor=relu11 ToTensor=threeDS1 ToChannels=256 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS1 ToTensor=bn15 Epsilon=0.00002
Activation FromTensor=bn15 ToTensor=relu12 Kind=ReLU Param=0
Conv FromTensor=relu12 ToTensor=one9 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one9 ToTensor=bn16 Epsilon=0.00002
Add FromTensor1=bn13 FromTensor2=bn16 ToTensor=add4
Activation FromTensor=add4 ToTensor=relu13 Kind=ReLU Param=0
Conv FromTensor=relu13 ToTensor=one10 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one10 ToTensor=bn17 Epsilon=0.00002
Activation FromTensor=bn17 ToTensor=relu14 Kind=ReLU Param=0
Conv FromTensor=relu14 ToTensor=three4 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three4 ToTensor=bn18 Epsilon=0.00002
Activation FromTensor=bn18 ToTensor=relu15 Kind=ReLU Param=0
Conv FromTensor=relu15 ToTensor=one11 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one11 ToTensor=bn19 Epsilon=0.00002
Add FromTensor1=relu13 FromTensor2=bn19 ToTensor=add5
Activation FromTensor=add5 ToTensor=relu16 Kind=ReLU Param=0
Conv FromTensor=relu16 ToTensor=one12 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one12 ToTensor=bn20 Epsilon=0.00002
Activation FromTensor=bn20 ToTensor=relu17 Kind=ReLU Param=0
Conv FromTensor=relu17 ToTensor=three5 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three5 ToTensor=bn21 Epsilon=0.00002
Activation FromTensor=bn21 ToTensor=relu18 Kind=ReLU Param=0
Conv FromTensor=relu18 ToTensor=one13 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one13 ToTensor=bn22 Epsilon=0.00002
Add FromTensor1=relu16 FromTensor2=bn22 ToTensor=add6
Activation FromTensor=add6 ToTensor=relu19 Kind=ReLU Param=0
Conv FromTensor=relu19 ToTensor=one14 ToChannels=256 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one14 ToTensor=bn23 Epsilon=0.00002
Activation FromTensor=bn23 ToTensor=relu20 Kind=ReLU Param=0
Conv FromTensor=relu20 ToTensor=three6 ToChannels=256 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three6 ToTensor=bn24 Epsilon=0.00002
Activation FromTensor=bn24 ToTensor=relu21 Kind=ReLU Param=0
Conv FromTensor=relu21 ToTensor=one15 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one15 ToTensor=bn25 Epsilon=0.00002
Add FromTensor1=relu19 FromTensor2=bn25 ToTensor=add7
Activation FromTensor=add7 ToTensor=relu22 Kind=ReLU Param=0
Conv FromTensor=relu22 ToTensor=oneDS2 ToChannels=1024 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS2 ToTensor=bn26 Epsilon=0.00002
Conv FromTensor=relu22 ToTensor=one16 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one16 ToTensor=bn27 Epsilon=0.00002
Activation FromTensor=bn27 ToTensor=relu23 Kind=ReLU Param=0
Conv FromTensor=relu23 ToTensor=threeDS2 ToChannels=512 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS2 ToTensor=bn28 Epsilon=0.00002
Activation FromTensor=bn28 ToTensor=relu24 Kind=ReLU Param=0
Conv FromTensor=relu24 ToTensor=one17 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one17 ToTensor=bn29 Epsilon=0.00002
Add FromTensor1=bn26 FromTensor2=bn29 ToTensor=add8
Activation FromTensor=add8 ToTensor=relu25 Kind=ReLU Param=0
Conv FromTensor=relu25 ToTensor=one18 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one18 ToTensor=bn30 Epsilon=0.00002
Activation FromTensor=bn30 ToTensor=relu26 Kind=ReLU Param=0
Conv FromTensor=relu26 ToTensor=three7 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three7 ToTensor=bn31 Epsilon=0.00002
Activation FromTensor=bn31 ToTensor=relu27 Kind=ReLU Param=0
Conv FromTensor=relu27 ToTensor=one19 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one19 ToTensor=bn32 Epsilon=0.00002
Add FromTensor1=relu25 FromTensor2=bn32 ToTensor=add9
Activation FromTensor=add9 ToTensor=relu28 Kind=ReLU Param=0
Conv FromTensor=relu28 ToTensor=one20 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one20 ToTensor=bn33 Epsilon=0.00002
Activation FromTensor=bn33 ToTensor=relu29 Kind=ReLU Param=0
Conv FromTensor=relu29 ToTensor=three8 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three8 ToTensor=bn34 Epsilon=0.00002
Activation FromTensor=bn34 ToTensor=relu30 Kind=ReLU Param=0
Conv FromTensor=relu30 ToTensor=one21 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one21 ToTensor=bn35 Epsilon=0.00002
Add FromTensor1=relu28 FromTensor2=bn35 ToTensor=add10
Activation FromTensor=add10 ToTensor=relu31 Kind=ReLU Param=0
Conv FromTensor=relu31 ToTensor=one22 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one22 ToTensor=bn36 Epsilon=0.00002
Activation FromTensor=bn36 ToTensor=relu32 Kind=ReLU Param=0
Conv FromTensor=relu32 ToTensor=three9 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three9 ToTensor=bn37 Epsilon=0.00002
Activation FromTensor=bn37 ToTensor=relu33 Kind=ReLU Param=0
Conv FromTensor=relu33 ToTensor=one23 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one23 ToTensor=bn38 Epsilon=0.00002
Add FromTensor1=relu31 FromTensor2=bn38 ToTensor=add11
Activation FromTensor=add11 ToTensor=relu34 Kind=ReLU Param=0
Conv FromTensor=relu34 ToTensor=one24 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one24 ToTensor=bn39 Epsilon=0.00002
Activation FromTensor=bn39 ToTensor=relu35 Kind=ReLU Param=0
Conv FromTensor=relu35 ToTensor=three10 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three10 ToTensor=bn40 Epsilon=0.00002
Activation FromTensor=bn40 ToTensor=relu36 Kind=ReLU Param=0
Conv FromTensor=relu36 ToTensor=one25 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one25 ToTensor=bn41 Epsilon=0.00002
Add FromTensor1=relu34 FromTensor2=bn41 ToTensor=add12
Activation FromTensor=add12 ToTensor=relu37 Kind=ReLU Param=0
Conv FromTensor=relu37 ToTensor=one26 ToChannels=512 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one26 ToTensor=bn42 Epsilon=0.00002
Activation FromTensor=bn42 ToTensor=relu38 Kind=ReLU Param=0
Conv FromTensor=relu38 ToTensor=three11 ToChannels=512 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three11 ToTensor=bn43 Epsilon=0.00002
Activation FromTensor=bn43 ToTensor=relu39 Kind=ReLU Param=0
Conv FromTensor=relu39 ToTensor=one27 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one27 ToTensor=bn44 Epsilon=0.00002
Add FromTensor1=relu37 FromTensor2=bn44 ToTensor=add13
Activation FromTensor=add13 ToTensor=relu40 Kind=ReLU Param=0
Conv FromTensor=relu40 ToTensor=oneDS3 ToChannels=2048 FilterH=1 FilterW=1 StrideH=2 StrideW=2 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=oneDS3 ToTensor=bn45 Epsilon=0.00002
Conv FromTensor=relu40 ToTensor=one28 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one28 ToTensor=bn46 Epsilon=0.00002
Activation FromTensor=bn46 ToTensor=relu41 Kind=ReLU Param=0
Conv FromTensor=relu41 ToTensor=threeDS3 ToChannels=1024 FilterH=3 FilterW=3 StrideH=2 StrideW=2 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=threeDS3 ToTensor=bn47 Epsilon=0.00002
Activation FromTensor=bn47 ToTensor=relu42 Kind=ReLU Param=0
Conv FromTensor=relu42 ToTensor=one29 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one29 ToTensor=bn48 Epsilon=0.00002
Add FromTensor1=bn45 FromTensor2=bn48 ToTensor=add14
Activation FromTensor=add14 ToTensor=relu43 Kind=ReLU Param=0
Conv FromTensor=relu43 ToTensor=one30 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one30 ToTensor=bn49 Epsilon=0.00002
Activation FromTensor=bn49 ToTensor=relu44 Kind=ReLU Param=0
Conv FromTensor=relu44 ToTensor=three12 ToChannels=1024 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three12 ToTensor=bn50 Epsilon=0.00002
Activation FromTensor=bn50 ToTensor=relu45 Kind=ReLU Param=0
Conv FromTensor=relu45 ToTensor=one31 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one31 ToTensor=bn51 Epsilon=0.00002
Add FromTensor1=relu43 FromTensor2=bn51 ToTensor=add15
Activation FromTensor=add15 ToTensor=relu46 Kind=ReLU Param=0
Conv FromTensor=relu46 ToTensor=one32 ToChannels=1024 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one32 ToTensor=bn52 Epsilon=0.00002
Activation FromTensor=bn52 ToTensor=relu47 Kind=ReLU Param=0
Conv FromTensor=relu47 ToTensor=three13 ToChannels=1024 FilterH=3 FilterW=3 StrideH=1 StrideW=1 PaddingH=1 PaddingW=1 DilationH=1 DilationW=1 Groups=32
BatchNorm FromTensor=three13 ToTensor=bn53 Epsilon=0.00002
Activation FromTensor=bn53 ToTensor=relu48 Kind=ReLU Param=0
Conv FromTensor=relu48 ToTensor=one33 ToChannels=2048 FilterH=1 FilterW=1 StrideH=1 StrideW=1 PaddingH=0 PaddingW=0 DilationH=1 DilationW=1 Groups=1
BatchNorm FromTensor=one33 ToTensor=bn54 Epsilon=0.00002
Add FromTensor1=relu46 FromTensor2=bn54 ToTensor=add16
Activation FromTensor=add16 ToTensor=relu49 Kind=ReLU Param=0
Pooling FromTensor=relu49 ToTensor=pool2 Kind=AvgGlobal PaddingH=0 PaddingW=0
FullyConnected FromTensor=pool2 ToTensor=fc ToChannels=1000
Softmax FromTensor=fc ToTensor=prob
Output FromTensor=prob

Top || Output ResNeXt50.h file

#pragma once

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <pthread.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" { /**/
#endif

// All weights, biases, and other trained parameters are passed into
// the initialization code through the Params struct that is declared
// just below this comment. The corresponding struct definition can be
// found near the end of this header file.
//
// Each field of the Params struct is an array of float that holds a
// parameter tensor in NCHW format with no padding. The struct fields
// are ordered by name, lexically bytewise. If you concatenate all the
// trained parameter tensors to a file in this same format and order
// you can load the struct as follows (error checking omitted here):
//
//     size_t size = sizeof(ResNeXt50Params);
//     ResNeXt50Params* to = malloc(size);
//     FILE* from = fopen("ParamsFile", "rb");
//     fread(to, size, 1, from);
//     fclose(from);
//
// Be careful to match endianness (and floating point format).

// Forward declaration so the prototypes below can refer to the type
// before its definition, which appears later in this header file.
typedef struct ResNeXt50Params ResNeXt50Params;

// The Net contains weights, biases, and other trained parameters in a
// form that enables efficient inference. It is created from the input
// parameter struct without modifying that struct. The input parameter
// struct is no longer needed once the Net has been created. Threads
// that are used to create the Net are temporary (in particular, those
// threads are not used for inference).
//
//     ResNeXt50Params* params = malloc(sizeof(ResNeXt50Params));
//
//     ... Load params (read from a file, perhaps) ...
//
//     ResNeXt50Net* net; // For example, 4 threads:
//     char* err = ResNeXt50NetCreate(&net, params, 4);
//     free(params);
//
//     if (err) { // Nonzero err indicates failure; net is unmodified.
//         printf("%s\n", err); // Explain the failure, add a newline.
//         free(err); // Free the error string to avoid a memory leak.
//         exit(1); // Exit, or propagate the failure some other way.
//     }
//
//     ... Perform all inference that depends on net ...
//
//     ResNeXt50NetDestroy(net);
//
// The Net can be shared and reused without restriction because it is
// never modified (not even temporarily) after being created. The Net
// should be destroyed (to free memory) once all dependent inference
// is complete.

typedef struct ResNeXt50Net ResNeXt50Net;

// Creates a Net from the trained parameters, using the requested number
// of temporary threads. Returns a null pointer on success. On failure
// it returns an error string that the caller must free, and the Net
// pointer passed in is left unmodified. The params struct itself is
// not modified and is no longer needed once this call returns.
char* ResNeXt50NetCreate(
    ResNeXt50Net** net,
    ResNeXt50Params* params,
    ptrdiff_t threads
);

// Destroys a Net (freeing its memory). Call this only after all
// inference that depends on the Net is complete.
void ResNeXt50NetDestroy(ResNeXt50Net* net);

// An Engine performs inference. It contains inference threads, scratch
// memory, and a pointer to the Net. Any number of Engines can share the
// same Net (and perform inference in parallel) because the Net is never
// modified. For best performance the number of inference threads should
// not exceed the number of CPU cores.
//
//     ResNeXt50Net* net;
//
//     ... Create net ...
//
//     ResNeXt50Engine* engine; // For example, 4 inference threads:
//     char* err = ResNeXt50EngineCreate(&engine, net, 4);
//
//     if (err) { // Nonzero err means failure; engine is unmodified.
//         printf("%s\n", err); // Explain the failure, add a newline.
//         free(err); // Free the error string to avoid a memory leak.
//
//         ... Destroy net ...
//
//         exit(1); // Exit, or propagate the failure some other way.
//     }
//
//     ... Use the POSIX threads API to adjust engine's threads ...
//     ... Use engine to perform inference (dependent on net) ...
//
//     ResNeXt50EngineDestroy(engine); // Terminate threads, free memory.
//
//     ... Destroy net ...
//
// The POSIX threads API can be used to adjust an Engine's threads. If
// an Engine has N threads, those threads are indexed 0, 1, 2, ..., N-1
// and a pthread_t identifier is associated with each index. To set the
// CPU affinity mask for the first inference thread, for example:
//
//     pthread_t thread; // The first thread has index 0:
//     char* err = ResNeXt50EnginePthreadT(engine, 0, &thread);
//
//     assert(!err); // Can only fail if the thread index is invalid.
//
//     pthread_setaffinity_np(thread, ...); // Details omitted.
//
// The inference function reads floats from (one or more) input tensors
// and writes floats to (one or more) output tensors. All the input and
// output tensors are owned (allocated and freed) by the caller and are
// in CHW format, 32-bit floating point, fully packed (in other words,
// C has the largest pitch, W has the smallest pitch, and there is no
// padding anywhere).
//
//     float* imageData = malloc(sizeof(float)*3*224*224);
//     float* probData = malloc(sizeof(float)*1000*1*1);
//
//     for (...) { // Reuse the input and output tensors.
//
//         ... Write the input floats ...
//
//         ResNeXt50EngineInference( // This function cannot fail.
//             engine, // Pass an Engine as the first argument.
//             imageData, // The tensor arguments are sorted by name.
//             probData
//         );
//
//         ... Read the output floats ...
//
//     }
//
//     free(imageData);
//     free(probData);
//
// The tensor parameters of the inference function are ordered by name,
// lexically bytewise. In other words, the function parameters have been
// sorted by name using Go's "<" string comparison operator (a bytewise
// lexical string sort).

typedef struct ResNeXt50Engine ResNeXt50Engine;

// Creates an Engine with the requested number of inference threads
// that performs inference using the given Net. Returns a null pointer
// on success. On failure it returns an error string that the caller
// must free, and the Engine pointer passed in is left unmodified.
char* ResNeXt50EngineCreate(
    ResNeXt50Engine** engine,
    ResNeXt50Net* net,
    ptrdiff_t threads
);

// Writes to *to the pthread_t identifier of the inference thread whose
// index is threadIdx (threads are indexed 0, 1, ..., N-1). A nonzero
// return means failure, which can only happen if the thread index is
// invalid.
char* ResNeXt50EnginePthreadT(
    ResNeXt50Engine* engine,
    ptrdiff_t threadIdx,
    pthread_t* to
);

// Performs inference: reads the input floats from imageData and writes
// the output floats to probData. Both tensors are owned by the caller
// and are fully packed CHW arrays of 32-bit floats. This function
// cannot fail. Tensor parameters are ordered by name.
void ResNeXt50EngineInference(
    ResNeXt50Engine* engine,
    float* imageData,
    float* probData
);

// Destroys an Engine: terminates its threads and frees its memory.
// The Net it points to is destroyed separately.
void ResNeXt50EngineDestroy(ResNeXt50Engine* engine);

// The fields of the following struct have been sorted by name using
// Go's "<" string comparison operator (bytewise lexical string sort).
// Tensor dimensions are NxCxHxW where N is the outermost/slowest and
// W is the innermost/fastest. There is no padding anywhere.

struct ResNeXt50Params {
float bn10Means[128]; // 1x128x1x1
float bn10Scales[128]; // 1x128x1x1
float bn10Shifts[128]; // 1x128x1x1
float bn10Variances[128]; // 1x128x1x1
float bn11Means[128]; // 1x128x1x1
float bn11Scales[128]; // 1x128x1x1
float bn11Shifts[128]; // 1x128x1x1
float bn11Variances[128]; // 1x128x1x1
float bn12Means[256]; // 1x256x1x1
float bn12Scales[256]; // 1x256x1x1
float bn12Shifts[256]; // 1x256x1x1
float bn12Variances[256]; // 1x256x1x1
float bn13Means[512]; // 1x512x1x1
float bn13Scales[512]; // 1x512x1x1
float bn13Shifts[512]; // 1x512x1x1
float bn13Variances[512]; // 1x512x1x1
float bn14Means[256]; // 1x256x1x1
float bn14Scales[256]; // 1x256x1x1
float bn14Shifts[256]; // 1x256x1x1
float bn14Variances[256]; // 1x256x1x1
float bn15Means[256]; // 1x256x1x1
float bn15Scales[256]; // 1x256x1x1
float bn15Shifts[256]; // 1x256x1x1
float bn15Variances[256]; // 1x256x1x1
float bn16Means[512]; // 1x512x1x1
float bn16Scales[512]; // 1x512x1x1
float bn16Shifts[512]; // 1x512x1x1
float bn16Variances[512]; // 1x512x1x1
float bn17Means[256]; // 1x256x1x1
float bn17Scales[256]; // 1x256x1x1
float bn17Shifts[256]; // 1x256x1x1
float bn17Variances[256]; // 1x256x1x1
float bn18Means[256]; // 1x256x1x1
float bn18Scales[256]; // 1x256x1x1
float bn18Shifts[256]; // 1x256x1x1
float bn18Variances[256]; // 1x256x1x1
float bn19Means[512]; // 1x512x1x1
float bn19Scales[512]; // 1x512x1x1
float bn19Shifts[512]; // 1x512x1x1
float bn19Variances[512]; // 1x512x1x1
float bn1Means[3]; // 1x3x1x1
float bn1Scales[3]; // 1x3x1x1
float bn1Shifts[3]; // 1x3x1x1
float bn1Variances[3]; // 1x3x1x1
float bn20Means[256]; // 1x256x1x1
float bn20Scales[256]; // 1x256x1x1
float bn20Shifts[256]; // 1x256x1x1
float bn20Variances[256]; // 1x256x1x1
float bn21Means[256]; // 1x256x1x1
float bn21Scales[256]; // 1x256x1x1
float bn21Shifts[256]; // 1x256x1x1
float bn21Variances[256]; // 1x256x1x1
float bn22Means[512]; // 1x512x1x1
float bn22Scales[512]; // 1x512x1x1
float bn22Shifts[512]; // 1x512x1x1
float bn22Variances[512]; // 1x512x1x1
float bn23Means[256]; // 1x256x1x1
float bn23Scales[256]; // 1x256x1x1
float bn23Shifts[256]; // 1x256x1x1
float bn23Variances[256]; // 1x256x1x1
float bn24Means[256]; // 1x256x1x1
float bn24Scales[256]; // 1x256x1x1
float bn24Shifts[256]; // 1x256x1x1
float bn24Variances[256]; // 1x256x1x1
float bn25Means[512]; // 1x512x1x1
float bn25Scales[512]; // 1x512x1x1
float bn25Shifts[512]; // 1x512x1x1
float bn25Variances[512]; // 1x512x1x1
float bn26Means[1024]; // 1x1024x1x1
float bn26Scales[1024]; // 1x1024x1x1
float bn26Shifts[1024]; // 1x1024x1x1
float bn26Variances[1024]; // 1x1024x1x1
float bn27Means[512]; // 1x512x1x1
float bn27Scales[512]; // 1x512x1x1
float bn27Shifts[512]; // 1x512x1x1
float bn27Variances[512]; // 1x512x1x1
float bn28Means[512]; // 1x512x1x1
float bn28Scales[512]; // 1x512x1x1
float bn28Shifts[512]; // 1x512x1x1
float bn28Variances[512]; // 1x512x1x1
float bn29Means[1024]; // 1x1024x1x1
float bn29Scales[1024]; // 1x1024x1x1
float bn29Shifts[1024]; // 1x1024x1x1
float bn29Variances[1024]; // 1x1024x1x1
float bn2Means[64]; // 1x64x1x1
float bn2Scales[64]; // 1x64x1x1
float bn2Shifts[64]; // 1x64x1x1
float bn2Variances[64]; // 1x64x1x1
float bn30Means[512]; // 1x512x1x1
float bn30Scales[512]; // 1x512x1x1
float bn30Shifts[512]; // 1x512x1x1
float bn30Variances[512]; // 1x512x1x1
float bn31Means[512]; // 1x512x1x1
float bn31Scales[512]; // 1x512x1x1
float bn31Shifts[512]; // 1x512x1x1
float bn31Variances[512]; // 1x512x1x1
float bn32Means[1024]; // 1x1024x1x1
float bn32Scales[1024]; // 1x1024x1x1
float bn32Shifts[1024]; // 1x1024x1x1
float bn32Variances[1024]; // 1x1024x1x1
float bn33Means[512]; // 1x512x1x1
float bn33Scales[512]; // 1x512x1x1
float bn33Shifts[512]; // 1x512x1x1
float bn33Variances[512]; // 1x512x1x1
float bn34Means[512]; // 1x512x1x1
float bn34Scales[512]; // 1x512x1x1
float bn34Shifts[512]; // 1x512x1x1
float bn34Variances[512]; // 1x512x1x1
float bn35Means[1024]; // 1x1024x1x1
float bn35Scales[1024]; // 1x1024x1x1
float bn35Shifts[1024]; // 1x1024x1x1
float bn35Variances[1024]; // 1x1024x1x1
float bn36Means[512]; // 1x512x1x1
float bn36Scales[512]; // 1x512x1x1
float bn36Shifts[512]; // 1x512x1x1
float bn36Variances[512]; // 1x512x1x1
float bn37Means[512]; // 1x512x1x1
float bn37Scales[512]; // 1x512x1x1
float bn37Shifts[512]; // 1x512x1x1
float bn37Variances[512]; // 1x512x1x1
float bn38Means[1024]; // 1x1024x1x1
float bn38Scales[1024]; // 1x1024x1x1
float bn38Shifts[1024]; // 1x1024x1x1
float bn38Variances[1024]; // 1x1024x1x1
float bn39Means[512]; // 1x512x1x1
float bn39Scales[512]; // 1x512x1x1
float bn39Shifts[512]; // 1x512x1x1
float bn39Variances[512]; // 1x512x1x1
float bn3Means[256]; // 1x256x1x1
float bn3Scales[256]; // 1x256x1x1
float bn3Shifts[256]; // 1x256x1x1
float bn3Variances[256]; // 1x256x1x1
float bn40Means[512]; // 1x512x1x1
float bn40Scales[512]; // 1x512x1x1
float bn40Shifts[512]; // 1x512x1x1
float bn40Variances[512]; // 1x512x1x1
float bn41Means[1024]; // 1x1024x1x1
float bn41Scales[1024]; // 1x1024x1x1
float bn41Shifts[1024]; // 1x1024x1x1
float bn41Variances[1024]; // 1x1024x1x1
float bn42Means[512]; // 1x512x1x1
float bn42Scales[512]; // 1x512x1x1
float bn42Shifts[512]; // 1x512x1x1
float bn42Variances[512]; // 1x512x1x1
float bn43Means[512]; // 1x512x1x1
float bn43Scales[512]; // 1x512x1x1
float bn43Shifts[512]; // 1x512x1x1
float bn43Variances[512]; // 1x512x1x1
float bn44Means[1024]; // 1x1024x1x1
float bn44Scales[1024]; // 1x1024x1x1
float bn44Shifts[1024]; // 1x1024x1x1
float bn44Variances[1024]; // 1x1024x1x1
float bn45Means[2048]; // 1x2048x1x1
float bn45Scales[2048]; // 1x2048x1x1
float bn45Shifts[2048]; // 1x2048x1x1
float bn45Variances[2048]; // 1x2048x1x1
float bn46Means[1024]; // 1x1024x1x1
float bn46Scales[1024]; // 1x1024x1x1
float bn46Shifts[1024]; // 1x1024x1x1
float bn46Variances[1024]; // 1x1024x1x1
float bn47Means[1024]; // 1x1024x1x1
float bn47Scales[1024]; // 1x1024x1x1
float bn47Shifts[1024]; // 1x1024x1x1
float bn47Variances[1024]; // 1x1024x1x1
float bn48Means[2048]; // 1x2048x1x1
float bn48Scales[2048]; // 1x2048x1x1
float bn48Shifts[2048]; // 1x2048x1x1
float bn48Variances[2048]; // 1x2048x1x1
float bn49Means[1024]; // 1x1024x1x1
float bn49Scales[1024]; // 1x1024x1x1
float bn49Shifts[1024]; // 1x1024x1x1
float bn49Variances[1024]; // 1x1024x1x1
float bn4Means[128]; // 1x128x1x1
float bn4Scales[128]; // 1x128x1x1
float bn4Shifts[128]; // 1x128x1x1
float bn4Variances[128]; // 1x128x1x1
float bn50Means[1024]; // 1x1024x1x1
float bn50Scales[1024]; // 1x1024x1x1
float bn50Shifts[1024]; // 1x1024x1x1
float bn50Variances[1024]; // 1x1024x1x1
float bn51Means[2048]; // 1x2048x1x1
float bn51Scales[2048]; // 1x2048x1x1
float bn51Shifts[2048]; // 1x2048x1x1
float bn51Variances[2048]; // 1x2048x1x1
float bn52Means[1024]; // 1x1024x1x1
float bn52Scales[1024]; // 1x1024x1x1
float bn52Shifts[1024]; // 1x1024x1x1
float bn52Variances[1024]; // 1x1024x1x1
float bn53Means[1024]; // 1x1024x1x1
float bn53Scales[1024]; // 1x1024x1x1
float bn53Shifts[1024]; // 1x1024x1x1
float bn53Variances[1024]; // 1x1024x1x1
float bn54Means[2048]; // 1x2048x1x1
float bn54Scales[2048]; // 1x2048x1x1
float bn54Shifts[2048]; // 1x2048x1x1
float bn54Variances[2048]; // 1x2048x1x1
float bn5Means[128]; // 1x128x1x1
float bn5Scales[128]; // 1x128x1x1
float bn5Shifts[128]; // 1x128x1x1
float bn5Variances[128]; // 1x128x1x1
float bn6Means[256]; // 1x256x1x1
float bn6Scales[256]; // 1x256x1x1
float bn6Shifts[256]; // 1x256x1x1
float bn6Variances[256]; // 1x256x1x1
float bn7Means[128]; // 1x128x1x1
float bn7Scales[128]; // 1x128x1x1
float bn7Shifts[128]; // 1x128x1x1
float bn7Variances[128]; // 1x128x1x1
float bn8Means[128]; // 1x128x1x1
float bn8Scales[128]; // 1x128x1x1
float bn8Shifts[128]; // 1x128x1x1
float bn8Variances[128]; // 1x128x1x1
float bn9Means[256]; // 1x256x1x1
float bn9Scales[256]; // 1x256x1x1
float bn9Shifts[256]; // 1x256x1x1
float bn9Variances[256]; // 1x256x1x1
float fcBiases[1000]; // 1x1000x1x1
float fcWeights[2048000]; // 1000x2048x1x1
float one10Biases[256]; // 1x256x1x1
float one10Weights[131072]; // 256x512x1x1
float one11Biases[512]; // 1x512x1x1
float one11Weights[131072]; // 512x256x1x1
float one12Biases[256]; // 1x256x1x1
float one12Weights[131072]; // 256x512x1x1
float one13Biases[512]; // 1x512x1x1
float one13Weights[131072]; // 512x256x1x1
float one14Biases[256]; // 1x256x1x1
float one14Weights[131072]; // 256x512x1x1
float one15Biases[512]; // 1x512x1x1
float one15Weights[131072]; // 512x256x1x1
float one16Biases[512]; // 1x512x1x1
float one16Weights[262144]; // 512x512x1x1
float one17Biases[1024]; // 1x1024x1x1
float one17Weights[524288]; // 1024x512x1x1
float one18Biases[512]; // 1x512x1x1
float one18Weights[524288]; // 512x1024x1x1
float one19Biases[1024]; // 1x1024x1x1
float one19Weights[524288]; // 1024x512x1x1
float one1Biases[256]; // 1x256x1x1
float one1Weights[16384]; // 256x64x1x1
float one20Biases[512]; // 1x512x1x1
float one20Weights[524288]; // 512x1024x1x1
float one21Biases[1024]; // 1x1024x1x1
float one21Weights[524288]; // 1024x512x1x1
float one22Biases[512]; // 1x512x1x1
float one22Weights[524288]; // 512x1024x1x1
float one23Biases[1024]; // 1x1024x1x1
float one23Weights[524288]; // 1024x512x1x1
float one24Biases[512]; // 1x512x1x1
float one24Weights[524288]; // 512x1024x1x1
float one25Biases[1024]; // 1x1024x1x1
float one25Weights[524288]; // 1024x512x1x1
float one26Biases[512]; // 1x512x1x1
float one26Weights[524288]; // 512x1024x1x1
float one27Biases[1024]; // 1x1024x1x1
float one27Weights[524288]; // 1024x512x1x1
float one28Biases[1024]; // 1x1024x1x1
float one28Weights[1048576]; // 1024x1024x1x1
float one29Biases[2048]; // 1x2048x1x1
float one29Weights[2097152]; // 2048x1024x1x1
float one2Biases[128]; // 1x128x1x1
float one2Weights[8192]; // 128x64x1x1
float one30Biases[1024]; // 1x1024x1x1
float one30Weights[2097152]; // 1024x2048x1x1
float one31Biases[2048]; // 1x2048x1x1
float one31Weights[2097152]; // 2048x1024x1x1
float one32Biases[1024]; // 1x1024x1x1
float one32Weights[2097152]; // 1024x2048x1x1
float one33Biases[2048]; // 1x2048x1x1
float one33Weights[2097152]; // 2048x1024x1x1
float one3Biases[256]; // 1x256x1x1
float one3Weights[32768]; // 256x128x1x1
float one4Biases[128]; // 1x128x1x1
float one4Weights[32768]; // 128x256x1x1
float one5Biases[256]; // 1x256x1x1
float one5Weights[32768]; // 256x128x1x1
float one6Biases[128]; // 1x128x1x1
float one6Weights[32768]; // 128x256x1x1
float one7Biases[256]; // 1x256x1x1
float one7Weights[32768]; // 256x128x1x1
float one8Biases[256]; // 1x256x1x1
float one8Weights[65536]; // 256x256x1x1
float one9Biases[512]; // 1x512x1x1
float one9Weights[131072]; // 512x256x1x1
float oneDS1Biases[512]; // 1x512x1x1
float oneDS1Weights[131072]; // 512x256x1x1
float oneDS2Biases[1024]; // 1x1024x1x1
float oneDS2Weights[524288]; // 1024x512x1x1
float oneDS3Biases[2048]; // 1x2048x1x1
float oneDS3Weights[2097152]; // 2048x1024x1x1
float sevenDSBiases[64]; // 1x64x1x1
float sevenDSWeights[9408]; // 64x3x7x7
float three10Biases[512]; // 1x512x1x1
float three10Weights[73728]; // 512x16x3x3
float three11Biases[512]; // 1x512x1x1
float three11Weights[73728]; // 512x16x3x3
float three12Biases[1024]; // 1x1024x1x1
float three12Weights[294912]; // 1024x32x3x3
float three13Biases[1024]; // 1x1024x1x1
float three13Weights[294912]; // 1024x32x3x3
float three1Biases[128]; // 1x128x1x1
float three1Weights[4608]; // 128x4x3x3
float three2Biases[128]; // 1x128x1x1
float three2Weights[4608]; // 128x4x3x3
float three3Biases[128]; // 1x128x1x1
float three3Weights[4608]; // 128x4x3x3
float three4Biases[256]; // 1x256x1x1
float three4Weights[18432]; // 256x8x3x3
float three5Biases[256]; // 1x256x1x1
float three5Weights[18432]; // 256x8x3x3
float three6Biases[256]; // 1x256x1x1
float three6Weights[18432]; // 256x8x3x3
float three7Biases[512]; // 1x512x1x1
float three7Weights[73728]; // 512x16x3x3
float three8Biases[512]; // 1x512x1x1
float three8Weights[73728]; // 512x16x3x3
float three9Biases[512]; // 1x512x1x1
float three9Weights[73728]; // 512x16x3x3
float threeDS1Biases[256]; // 1x256x1x1
float threeDS1Weights[18432]; // 256x8x3x3
float threeDS2Biases[512]; // 1x512x1x1
float threeDS2Weights[73728]; // 512x16x3x3
float threeDS3Biases[1024]; // 1x1024x1x1
float threeDS3Weights[294912]; // 1024x32x3x3
} __attribute__((packed));

#ifdef __cplusplus
/**/ }
#endif

// End of file.

Top || Output ResNeXt50.c file

// To build an object file:
// gcc -c -w -std=c99 -pthread -Ofast -mavx512f ResNeXt50.c

// NN-512 (https://NN-512.com)
//
// Copyright (C) 2019 [
// 37ef ced3 3727 60b4
// 3c29 f9c6 dc30 d518
// f4f3 4106 6964 cab4
// a06f c1a3 83fd 090e
// ]
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <immintrin.h>

#include "ResNeXt50.h"

// Builds a heap-allocated error string of the form
// "ResNeXt50: line <lineNum1>: <formatted text>".
// lineNum1 is the source line to report; format1 and the variadic
// arguments are passed to vsnprintf. The buffer is 277 bytes, so the
// formatted text is truncated to whatever remains after the prefix.
// The result comes from malloc (the allocation is not checked here).
static char* ResNeXt50Errmsg1(ptrdiff_t lineNum1, char* format1, ...) {
	enum { capacity = 277 };
	char* const text = malloc(capacity);
	const int prefixLen = sprintf(text, "ResNeXt50: line %td: ", lineNum1);
	va_list args;
	va_start(args, format1);
	vsnprintf(text+prefixLen, capacity-prefixLen, format1, args);
	va_end(args);
	return text;
}

// Forward declarations for the threader types defined below.
typedef struct ResNeXt50ThreaderTask1 ResNeXt50ThreaderTask1;
// Work callback: receives the task and a pointer to an int64_t array
// holding the coordinates of the point to process.
typedef void (*ResNeXt50ThreaderCallee1)(ResNeXt50ThreaderTask1*, int64_t*);
typedef struct ResNeXt50ThreaderHub1 ResNeXt50ThreaderHub1;
typedef struct ResNeXt50ThreaderNode1 ResNeXt50ThreaderNode1;
typedef struct ResNeXt50ThreaderUnwind1 ResNeXt50ThreaderUnwind1;
typedef struct ResNeXt50ThreaderTeam1 ResNeXt50ThreaderTeam1;

// One parallel job: a callback plus the extents of the index space
// over which it is invoked (one call per point).
struct ResNeXt50ThreaderTask1 {
// Function called once for every point of the index space.
ResNeXt50ThreaderCallee1 callee1;
// Not read by the threader functions in this part of the file;
// presumably an opaque payload for the callee (TODO confirm).
void* any1;
// Number of entries of hull1 that are in use.
ptrdiff_t nd1;
// Extent of each dimension; the point count is the product of the
// first nd1 entries.
int64_t hull1[4];
};

// State shared by all worker threads of a team and by the thread that
// submits tasks.
struct ResNeXt50ThreaderHub1 {
// Guards every other field of this struct.
pthread_mutex_t mut1;
// Signalled by a worker when pending1 reaches zero.
pthread_cond_t cond1;
// Number of workers that have not yet finished the current task.
ptrdiff_t pending1;
// Word index into status1 where the next scan for unfinished nodes starts.
ptrdiff_t offset1;
// Bit mask applied to status1[offset1] when that scan starts.
long mask1;
// Bitset with one bit per node; a node's bit is cleared once that
// node has no points left. Sized at allocation time.
long status1[];
};

// Per-thread state. Aligned to 64 bytes (the team allocator also aligns
// the node array to 64 bytes).
struct ResNeXt50ThreaderNode1 {
// Guards np1, pt1 and task1.
pthread_mutex_t mut2;
// Number of points still to be run from this node's share; a negative
// value tells the worker thread to exit.
int64_t np1;
// Coordinates of the next point to run from this node's share.
int64_t pt1[4];
// Task waiting to be picked up by the worker, or 0 when there is none.
ResNeXt50ThreaderTask1* task1;
// Signalled after task1 has been set.
pthread_cond_t cond2;
// Owning team.
ResNeXt50ThreaderTeam1* team1;
// Handle of the worker thread created for this node.
pthread_t thr1;
} __attribute__((aligned(64)));

// Records how far team construction got, so that
// ResNeXt50ThreaderDestroy1 releases exactly what was acquired.
struct ResNeXt50ThreaderUnwind1 {
// Number of leading nodes whose threads were created (to be joined).
ptrdiff_t join1;
// Number of leading nodes whose condition variables were initialized.
ptrdiff_t nodeConds1;
// Number of leading nodes whose mutexes were initialized.
ptrdiff_t nodeMuts1;
// Nonzero once the hub condition variable has been initialized.
ptrdiff_t hubCond1;
// Nonzero once the hub mutex has been initialized.
ptrdiff_t hubMut1;
// Unaligned malloc result backing the node array (passed to free).
void* nodes1;
// Unaligned malloc result backing the hub (passed to free).
void* hub1;
};

// A team of worker threads together with their shared hub.
struct ResNeXt50ThreaderTeam1 {
// Number of worker threads (and nodes).
ptrdiff_t nt1;
// 64-byte-aligned pointer to the hub.
ResNeXt50ThreaderHub1* hub2;
// 64-byte-aligned pointer to the array of nt1 nodes.
ResNeXt50ThreaderNode1* nodes2;
// Bookkeeping used when tearing the team down.
ResNeXt50ThreaderUnwind1 unwind1;
};

// Advances the mixed-radix counter pt2 by one.
// nd2 is the number of digits, hull2[i] is the radix of digit i and
// digit 0 is the least significant. A digit that reaches its radix is
// reset to zero and the carry moves to the next digit; a carry out of
// the last digit is dropped.
static void ResNeXt50ThreaderInc1(
	ptrdiff_t nd2,
	int64_t*restrict hull2,
	int64_t*restrict pt2
) {
	ptrdiff_t digit = 0;
	while (digit < nd2) {
		const int64_t bumped = pt2[digit]+1;
		if (bumped != hull2[digit]) {
			// No wrap: store the new digit and stop carrying.
			pt2[digit] = bumped;
			return;
		}
		pt2[digit] = 0;
		++digit;
	}
}

// Writes val1 into pt3 as a mixed-radix number.
// nd3 is the number of digits, hull3[i] is the radix of digit i and
// digit 0 is the least significant. Digits beyond the point where the
// remaining value becomes zero are set to zero; any value left over
// after the last digit is discarded.
static void ResNeXt50ThreaderPut1(
	ptrdiff_t nd3,
	int64_t*restrict hull3,
	int64_t*restrict pt3,
	int64_t val1
) {
	ptrdiff_t digit = 0;
	while (digit < nd3 && val1 != 0) {
		const int64_t radix = hull3[digit];
		const int64_t quotient = val1/radix;
		pt3[digit] = val1-quotient*radix;
		val1 = quotient;
		++digit;
	}
	// Nothing left to distribute: zero-fill the higher digits.
	while (digit < nd3) {
		pt3[digit] = 0;
		++digit;
	}
}

// Adds the mixed-radix number plus1 (and an incoming carry2) to pt4 in
// place. nd4 is the number of digits and hull4[i] is the radix of
// digit i, with digit 0 least significant. Each digit sum is reduced
// by at most one radix, and a carry out of the last digit is dropped.
static void ResNeXt50ThreaderAdd1(
	ptrdiff_t nd4,
	int64_t*restrict hull4,
	int64_t*restrict pt4,
	int64_t*restrict plus1,
	int64_t carry2
) {
	for (ptrdiff_t digit = 0; digit < nd4; ++digit) {
		const int64_t radix = hull4[digit];
		const int64_t total = pt4[digit]+plus1[digit]+carry2;
		carry2 = total >= radix;
		pt4[digit] = carry2 ? total-radix : total;
	}
}

// Worker thread entry point; arg1 is this thread's ResNeXt50ThreaderNode1.
//
// The thread waits on its node's condition variable until a task is
// posted, runs the points of its own share, then scans the hub's status
// bitset for nodes that still have points and runs those too. When no
// unfinished node is left it decrements the hub's pending count and
// signals the hub if the count reached zero. The function returns 0
// when it wakes up with a negative np1 (see ResNeXt50ThreaderDestroy1).
//
// Every pthread call is wrapped in a loop that retries until the call
// returns 0.
static void* ResNeXt50ThreaderMain1(void* arg1) {
ResNeXt50ThreaderNode1* node1 = arg1;
ResNeXt50ThreaderTeam1* team2 = node1->team1;
ptrdiff_t nt2 = team2->nt1;
ResNeXt50ThreaderHub1* hub3 = team2->hub2;
ResNeXt50ThreaderNode1* nodes3 = team2->nodes2;
// Index of this node within the team's node array.
size_t role1 = node1-nodes3;
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
for (; ; ) {
// The node mutex is held at the top of every iteration.
ResNeXt50ThreaderTask1* task2 = node1->task1;
if (!task2) {
for (; __builtin_expect(pthread_cond_wait(&node1->cond2, &node1->mut2), 0); );
continue;
}
int64_t np2 = node1->np1;
if (np2 < 0) {
// Negative point count is the shutdown request.
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
return 0;
}
// Mark the task as taken so the next wake-up waits for a new one.
node1->task1 = 0;
ResNeXt50ThreaderCallee1 callee2 = task2->callee1;
ptrdiff_t nd5 = task2->nd1;
int64_t pt5[4];
// Run this node's own points. The point is copied and the node's
// cursor advanced under the mutex; the callee runs with it released.
for (; np2; np2 = node1->np1) {
memcpy(pt5, node1->pt1, sizeof(pt5));
node1->np1 = np2-1;
ResNeXt50ThreaderInc1(nd5, task2->hull1, node1->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node1->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// Own share exhausted: clear this node's bit in the status bitset.
hub3->status1[role1/(sizeof(long)*8)] &= ~((long)1<<role1%(sizeof(long)*8));
ptrdiff_t offset2 = hub3->offset1;
long mask2 = hub3->mask1;
ptrdiff_t wrapped1 = 0;
// Look for another node whose bit is still set, starting at the
// shared cursor (offset1/mask1). ResNeXt50ThreaderDo1 sets every bit
// of the bitset, including bit nt2, so reaching target1 == nt2 means
// the scan ran past the last node.
for (; ; ) {
long hand1 = hub3->status1[offset2]&mask2;
if (!hand1) {
++offset2;
mask2 = -1;
continue;
}
ptrdiff_t target1 = offset2*(sizeof(long)*8)+__builtin_ctzl(hand1);
if (target1 == nt2) {
// Hit the end marker: restart once from word 0, then give up.
if (wrapped1) break;
offset2 = 0;
mask2 = -1;
wrapped1 = 1;
continue;
}
// Isolate the lowest set bit and move the shared cursor past it.
hand1 &= -hand1;
hub3->offset1 = offset2;
hub3->mask1 = mask2-hand1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
ResNeXt50ThreaderNode1* node2 = nodes3+target1;
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
// Run the remaining points of the other node, same protocol as above.
for (np2 = node2->np1; np2; np2 = node2->np1) {
memcpy(pt5, node2->pt1, sizeof(pt5));
node2->np1 = np2-1;
ResNeXt50ThreaderInc1(nd5, task2->hull1, node2->pt1);
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
callee2(task2, pt5);
for (; __builtin_expect(pthread_mutex_lock(&node2->mut2), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&node2->mut2), 0); );
for (; __builtin_expect(pthread_mutex_lock(&hub3->mut1), 0); );
// That node is drained: clear its bit and rescan from the shared cursor.
hub3->status1[offset2] &= ~hand1;
offset2 = hub3->offset1;
mask2 = hub3->mask1;
wrapped1 = 0;
}
// No unfinished node found: report this worker as done.
ptrdiff_t pending2 = --hub3->pending1;
for (; __builtin_expect(pthread_mutex_unlock(&hub3->mut1), 0); );
if (!pending2) for (; __builtin_expect(pthread_cond_signal(&hub3->cond1), 0); );
for (; __builtin_expect(pthread_mutex_lock(&node1->mut2), 0); );
}
}

// Shuts down and frees a team. A null team3 is a no-op.
// Only the resources counted in team3->unwind1 are released, so this
// also works on a team whose construction failed part-way.
// Every pthread call is retried until it returns 0.
static void ResNeXt50ThreaderDestroy1(ResNeXt50ThreaderTeam1* team3) {
if (!team3) return;
ResNeXt50ThreaderNode1* nodes4 = team3->nodes2;
ResNeXt50ThreaderNode1* stop1 = nodes4+team3->unwind1.join1;
// Ask every started worker to exit: a negative np1 together with a
// non-null task1 makes ResNeXt50ThreaderMain1 return.
for (ResNeXt50ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_lock(&node3->mut2), 0); );
node3->np1 = -1;
node3->task1 = (ResNeXt50ThreaderTask1*)1;
for (; __builtin_expect(pthread_mutex_unlock(&node3->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node3->cond2), 0); );
}
// Wait for all of them to finish.
for (ResNeXt50ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_join(node3->thr1, 0), 0); );
}
// Destroy the per-node condition variables that were initialized.
stop1 = nodes4+team3->unwind1.nodeConds1;
for (ResNeXt50ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_cond_destroy(&node3->cond2), 0); );
}
// Destroy the per-node mutexes that were initialized.
stop1 = nodes4+team3->unwind1.nodeMuts1;
for (ResNeXt50ThreaderNode1* node3 = nodes4; node3 != stop1; ++node3) {
for (; __builtin_expect(pthread_mutex_destroy(&node3->mut2), 0); );
}
// Hub primitives are destroyed only if their init flags were set.
ResNeXt50ThreaderHub1* hub4 = team3->hub2;
if (team3->unwind1.hubCond1) {
for (; __builtin_expect(pthread_cond_destroy(&hub4->cond1), 0); );
}
if (team3->unwind1.hubMut1) {
for (; __builtin_expect(pthread_mutex_destroy(&hub4->mut1), 0); );
}
// Free the original (unaligned) allocations, then the team itself.
free(team3->unwind1.nodes1);
free(team3->unwind1.hub1);
free(team3);
}

// Final construction step: for each of the nt7 nodes, initializes its
// mutex and condition variable and starts its worker thread.
// Returns 0 on success or a malloc'd error message. In either case the
// unwind counters in team8 are set to the number of mutexes, condition
// variables and threads that were actually created.
static char* ResNeXt50ThreaderCreate1Up4(ResNeXt50ThreaderTeam1* team8, ptrdiff_t nt7) {
	ResNeXt50ThreaderNode1* const first = team8->nodes2;
	ResNeXt50ThreaderUnwind1* const undo = &team8->unwind1;
	for (ptrdiff_t done = 0; done != nt7; ++done) {
		ResNeXt50ThreaderNode1* const cur = first+done;
		const int mutErr = pthread_mutex_init(&cur->mut2, 0);
		if (__builtin_expect(mutErr, 0)) {
			char* const text = ResNeXt50Errmsg1(__LINE__, "errno %d", mutErr);
			undo->nodeMuts1 = done;
			undo->nodeConds1 = done;
			undo->join1 = done;
			return text;
		}
		// The worker must see "no task" when it first runs.
		cur->task1 = 0;
		const int condErr = pthread_cond_init(&cur->cond2, 0);
		if (__builtin_expect(condErr, 0)) {
			char* const text = ResNeXt50Errmsg1(__LINE__, "errno %d", condErr);
			// This node's mutex exists, its condition variable does not.
			undo->nodeMuts1 = done+1;
			undo->nodeConds1 = done;
			undo->join1 = done;
			return text;
		}
		cur->team1 = team8;
		const int thrErr = pthread_create(&cur->thr1, 0, ResNeXt50ThreaderMain1, cur);
		if (__builtin_expect(thrErr, 0)) {
			char* const text = ResNeXt50Errmsg1(__LINE__, "errno %d", thrErr);
			// Mutex and condition variable exist, the thread does not.
			undo->nodeMuts1 = done+1;
			undo->nodeConds1 = done+1;
			undo->join1 = done;
			return text;
		}
	}
	undo->nodeMuts1 = nt7;
	undo->nodeConds1 = nt7;
	undo->join1 = nt7;
	return 0;
}

// Construction step 3: initializes the hub's mutex and condition
// variable, setting the matching unwind flag after each success, then
// continues with ResNeXt50ThreaderCreate1Up4.
// Returns 0 on success or a malloc'd error message.
static char* ResNeXt50ThreaderCreate1Up3(ResNeXt50ThreaderTeam1* team7, ptrdiff_t nt6) {
	ResNeXt50ThreaderHub1* const hub = team7->hub2;
	ResNeXt50ThreaderUnwind1* const undo = &team7->unwind1;
	const int mutErr = pthread_mutex_init(&hub->mut1, 0);
	if (__builtin_expect(mutErr, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "errno %d", mutErr);
	}
	undo->hubMut1 = 1;
	const int condErr = pthread_cond_init(&hub->cond1, 0);
	if (__builtin_expect(condErr, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "errno %d", condErr);
	}
	undo->hubCond1 = 1;
	return ResNeXt50ThreaderCreate1Up4(team7, nt6);
}

// Construction step 2: allocates the node array for nt5 threads with
// 63 bytes of slack, records the raw pointer for later free(), stores
// the 64-byte-aligned pointer in the team, then continues with
// ResNeXt50ThreaderCreate1Up3.
// Returns 0 on success or a malloc'd error message.
static char* ResNeXt50ThreaderCreate1Up2(ResNeXt50ThreaderTeam1* team6, ptrdiff_t nt5) {
	const size_t nodeBytes = sizeof(ResNeXt50ThreaderNode1);
	const size_t total = nt5*nodeBytes;
	// Dividing back detects wrap-around in the multiplication.
	if (__builtin_expect(total/nodeBytes != (size_t)nt5, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "too many threads");
	}
	void* const raw = malloc(total+63);
	if (__builtin_expect(raw == 0, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
	}
	team6->unwind1.nodes1 = raw;
	const size_t aligned = ((size_t)raw+63)&-64;
	team6->nodes2 = (void*)aligned;
	return ResNeXt50ThreaderCreate1Up3(team6, nt5);
}

// Construction step 1: records the thread count and allocates the hub.
// The hub size is the struct plus nt4/(bits per long)+1 status words,
// rounded up to a multiple of 64 bytes, with 63 bytes of slack so the
// stored pointer can be 64-byte aligned. The raw pointer is kept for
// free(). Continues with ResNeXt50ThreaderCreate1Up2.
// Returns 0 on success or a malloc'd error message.
static char* ResNeXt50ThreaderCreate1Up1(ResNeXt50ThreaderTeam1* team5, ptrdiff_t nt4) {
	team5->nt1 = nt4;
	const size_t bitsPerWord = sizeof(long)*8;
	const size_t words = (size_t)nt4/bitsPerWord+1;
	size_t bytes = sizeof(ResNeXt50ThreaderHub1)+sizeof(long)*words;
	bytes = (bytes+63)&-64;
	void* const raw = malloc(bytes+63);
	if (__builtin_expect(raw == 0, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
	}
	team5->unwind1.hub1 = raw;
	const size_t aligned = ((size_t)raw+63)&-64;
	team5->hub2 = (void*)aligned;
	return ResNeXt50ThreaderCreate1Up2(team5, nt4);
}

// Creates a team of nt3 worker threads.
// On success stores the new team in *team4 and returns 0. On failure
// returns a malloc'd error message, leaves *team4 untouched and
// releases everything that had been set up. nt3 must be at least 1.
static char* ResNeXt50ThreaderCreate1(ResNeXt50ThreaderTeam1** team4, ptrdiff_t nt3) {
	if (__builtin_expect(nt3 < 1, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "too few threads");
	}
	// Zero-filled so the unwind counters start at "nothing to undo".
	ResNeXt50ThreaderTeam1* const fresh = calloc(1, sizeof(ResNeXt50ThreaderTeam1));
	if (__builtin_expect(fresh == 0, 0)) {
		return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
	}
	char* const failure = ResNeXt50ThreaderCreate1Up1(fresh, nt3);
	if (__builtin_expect(failure != 0, 0)) {
		ResNeXt50ThreaderDestroy1(fresh);
		return failure;
	}
	*team4 = fresh;
	return failure;
}

// Copies the pthread_t of worker idx1 of team9 into *thr2.
// Returns 0 on success, or a malloc'd error message when idx1 is
// outside [0, team9->nt1).
static char* ResNeXt50ThreaderPthreadT1(
	pthread_t* thr2,
	ResNeXt50ThreaderTeam1* team9,
	ptrdiff_t idx1
) {
	const int inRange = idx1 >= 0 && idx1 < team9->nt1;
	if (__builtin_expect(inRange, 1)) {
		*thr2 = team9->nodes2[idx1].thr1;
		return 0;
	}
	return ResNeXt50Errmsg1(__LINE__, "bad thread idx");
}

// Runs task3 on team10 and returns once every worker has reported
// completion. The index space (product of the first nd1 entries of
// hull1) is split into one contiguous share per worker; the callee is
// invoked once per point by the worker threads. Does nothing when
// task3->nd1 < 1. Every pthread call is retried until it returns 0.
static void ResNeXt50ThreaderDo1(ResNeXt50ThreaderTeam1* team10, ResNeXt50ThreaderTask1* task3) {
ptrdiff_t nd6 = task3->nd1;
if (nd6 < 1) return;
// Total number of points.
int64_t tot1 = task3->hull1[0];
for (ptrdiff_t i4 = 1; i4 < nd6; tot1 *= task3->hull1[i4++]);
ptrdiff_t nt8 = team10->nt1;
// Every worker gets each1 points; the first more1 workers get one extra.
int64_t each1 = tot1/nt8;
ptrdiff_t more1 = tot1%nt8;
// each1 expressed as coordinates, used to step from one share to the next.
int64_t plus2[4];
ResNeXt50ThreaderPut1(nd6, task3->hull1, plus2, each1);
// Starting coordinates of the current worker's share.
int64_t pt6[4] = {0};
ResNeXt50ThreaderHub1* hub6 = team10->hub2;
// The hub mutex is held until the wait below releases it.
for (; __builtin_expect(pthread_mutex_lock(&hub6->mut1), 0); );
ResNeXt50ThreaderNode1* node5 = team10->nodes2;
for (ptrdiff_t i4 = 0; ; ++node5) {
for (; __builtin_expect(pthread_mutex_lock(&node5->mut2), 0); );
int64_t carry3 = i4 < more1;
node5->np1 = each1+carry3;
memcpy(node5->pt1, pt6, sizeof(pt6));
node5->task1 = task3;
for (; __builtin_expect(pthread_mutex_unlock(&node5->mut2), 0); );
for (; __builtin_expect(pthread_cond_signal(&node5->cond2), 0); );
if (++i4 == nt8) break;
// Advance the start point by this worker's share (each1 plus the extra point, if any).
ResNeXt50ThreaderAdd1(nd6, task3->hull1, pt6, plus2, carry3);
}
// Reset the scan cursor and set every status bit, including the bits
// at and above index nt8 in the last word.
hub6->offset1 = 0;
hub6->mask1 = -1;
for (ptrdiff_t i4 = (size_t)nt8/(sizeof(long)*8); i4 >= 0; ) {
hub6->status1[i4--] = -1;
}
// Sleep until all nt8 workers have decremented pending1 to zero.
for (hub6->pending1 = nt8; hub6->pending1; ) {
for (; __builtin_expect(pthread_cond_wait(&hub6->cond1, &hub6->mut1), 0); );
}
for (; __builtin_expect(pthread_mutex_unlock(&hub6->mut1), 0); );
}

static __m512 ResNeXt50Exp1(__m512 x1) {
x1 = _mm512_max_ps(x1, _mm512_set1_ps(-8.733654e+01f));
x1 = _mm512_min_ps(x1, _mm512_set1_ps(8.872284e+01f));
__m512 t1 = _mm512_mul_ps(x1, _mm512_set1_ps(1.442695e+00f));
__m512 r1 = _mm512_roundscale_ps(t1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
__m512 f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-6.9314575e-01f), x1);
f1 = _mm512_fmadd_ps(r1, _mm512_set1_ps(-1.4286068e-06f), f1);
__m512 g1 = _mm512_set1_ps(4.194439e-02f);
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(1.6800667e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(4.9999994e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.999569e-01f));
g1 = _mm512_fmadd_ps(g1, f1, _mm512_set1_ps(9.9999964e-01f));
__m512i y1 = _mm512_slli_epi32(_mm512_cvtps_epi32(t1), 23);
return _mm512_castsi512_ps(_mm512_add_epi32(y1, _mm512_castps_si512(g1)));
}

// Softmax over 1000 floats.
// tensors173[0] is the input buffer and tensors173[1] the output
// buffer; team99 is not used. The data is handled as 62 full 16-float
// vectors (64 bytes each) followed by one 8-float partial vector at
// byte offset 64*62.
// Three passes:
//   1. find the maximum of all inputs;
//   2. write exp(x - max) to the output and accumulate the sum;
//   3. multiply every output element by 1/sum.
static void ResNeXt50Softmax1(ResNeXt50ThreaderTeam1* team99, char** tensors173) {
(void)team99;
char*restrict ptr5 = tensors173[0];
char*restrict ptr6 = tensors173[1];
// Pass 1: sixteen running-maximum vectors, seeded with vectors 0..15.
__m512 max1 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0);
__m512 max2 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1);
__m512 max3 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2);
__m512 max4 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3);
__m512 max5 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4);
__m512 max6 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5);
__m512 max7 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6);
__m512 max8 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7);
__m512 max9 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8);
__m512 max10 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9);
__m512 max11 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10);
__m512 max12 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11);
__m512 max13 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12);
__m512 max14 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13);
__m512 max15 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14);
__m512 max16 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15);
// Fold in vectors 16..47, sixteen per iteration.
for (ptrdiff_t i105 = 1; i105 <= 2; ++i105) {
__m512 dat2714 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i105);
__m512 dat2715 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i105);
__m512 dat2716 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i105);
__m512 dat2717 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i105);
__m512 dat2718 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i105);
__m512 dat2719 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i105);
__m512 dat2720 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i105);
__m512 dat2721 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i105);
__m512 dat2722 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i105);
__m512 dat2723 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i105);
__m512 dat2724 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i105);
__m512 dat2725 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i105);
__m512 dat2726 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i105);
__m512 dat2727 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i105);
__m512 dat2728 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i105);
__m512 dat2729 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i105);
max1 = _mm512_max_ps(max1, dat2714);
max2 = _mm512_max_ps(max2, dat2715);
max3 = _mm512_max_ps(max3, dat2716);
max4 = _mm512_max_ps(max4, dat2717);
max5 = _mm512_max_ps(max5, dat2718);
max6 = _mm512_max_ps(max6, dat2719);
max7 = _mm512_max_ps(max7, dat2720);
max8 = _mm512_max_ps(max8, dat2721);
max9 = _mm512_max_ps(max9, dat2722);
max10 = _mm512_max_ps(max10, dat2723);
max11 = _mm512_max_ps(max11, dat2724);
max12 = _mm512_max_ps(max12, dat2725);
max13 = _mm512_max_ps(max13, dat2726);
max14 = _mm512_max_ps(max14, dat2727);
max15 = _mm512_max_ps(max15, dat2728);
max16 = _mm512_max_ps(max16, dat2729);
}
// Fold in the remaining full vectors 48..61.
__m512 dat2730 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
__m512 dat2731 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2732 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2733 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2734 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2735 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2736 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2737 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2738 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2739 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2740 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2741 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2742 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2743 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
max1 = _mm512_max_ps(max1, dat2730);
max2 = _mm512_max_ps(max2, dat2731);
max3 = _mm512_max_ps(max3, dat2732);
max4 = _mm512_max_ps(max4, dat2733);
max5 = _mm512_max_ps(max5, dat2734);
max6 = _mm512_max_ps(max6, dat2735);
max7 = _mm512_max_ps(max7, dat2736);
max8 = _mm512_max_ps(max8, dat2737);
max9 = _mm512_max_ps(max9, dat2738);
max10 = _mm512_max_ps(max10, dat2739);
max11 = _mm512_max_ps(max11, dat2740);
max12 = _mm512_max_ps(max12, dat2741);
max13 = _mm512_max_ps(max13, dat2742);
max14 = _mm512_max_ps(max14, dat2743);
// Partial vector 62: only the low 8 lanes hold data, so the maximum is
// merged under mask 255 and the upper lanes of max16 are left as they were.
__m512 dat2744 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*62);
max16 = _mm512_mask_max_ps(max16, 255, max16, dat2744);
// Tree-reduce the sixteen accumulators into max1.
max1 = _mm512_max_ps(max1, max9);
max2 = _mm512_max_ps(max2, max10);
max3 = _mm512_max_ps(max3, max11);
max4 = _mm512_max_ps(max4, max12);
max5 = _mm512_max_ps(max5, max13);
max6 = _mm512_max_ps(max6, max14);
max7 = _mm512_max_ps(max7, max15);
max8 = _mm512_max_ps(max8, max16);
max1 = _mm512_max_ps(max1, max5);
max2 = _mm512_max_ps(max2, max6);
max3 = _mm512_max_ps(max3, max7);
max4 = _mm512_max_ps(max4, max8);
max1 = _mm512_max_ps(max1, max3);
max2 = _mm512_max_ps(max2, max4);
max1 = _mm512_max_ps(max1, max2);
// Horizontal reduction within max1: fold the upper half onto the lower
// half (16 -> 8 -> 4 -> 2 -> 1 lanes), then broadcast lane 0 to all lanes.
__m512i p5 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
max1 = _mm512_mask_max_ps(max1, 255, max1, _mm512_permutexvar_ps(p5, max1));
__m512i p6 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
max1 = _mm512_mask_max_ps(max1, 15, max1, _mm512_permutexvar_ps(p6, max1));
__m512i p7 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
max1 = _mm512_mask_max_ps(max1, 3, max1, _mm512_permutexvar_ps(p7, max1));
__m512i p8 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
max1 = _mm512_mask_max_ps(max1, 1, max1, _mm512_permutexvar_ps(p8, max1));
__m512i p9 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
max1 = _mm512_permutexvar_ps(p9, max1);
// Pass 2: neg1 = -max in every lane; each element becomes
// exp(x - max), is stored to the output and added to sum917.
__m512 sum917 = _mm512_setzero_ps();
__m512 neg1 = _mm512_sub_ps(sum917, max1);
// Tail first: partial vector 62 (mask 255), then full vectors 61 down to 48.
__m512 dat2775 = _mm512_maskz_loadu_ps(255, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3);
__m512 dat2774 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3);
__m512 dat2773 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3);
__m512 dat2772 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3);
__m512 dat2771 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3);
__m512 dat2770 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3);
__m512 dat2769 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3);
__m512 dat2768 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3);
__m512 dat2767 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3);
__m512 dat2766 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3);
__m512 dat2765 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3);
__m512 dat2764 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3);
__m512 dat2763 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3);
__m512 dat2762 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3);
__m512 dat2761 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3);
dat2775 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2775));
// Masked add: the upper 8 lanes of the partial vector are not real data.
sum917 = _mm512_mask_add_ps(sum917, 255, sum917, dat2775);
dat2774 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2774));
sum917 = _mm512_add_ps(sum917, dat2774);
dat2773 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2773));
sum917 = _mm512_add_ps(sum917, dat2773);
dat2772 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2772));
sum917 = _mm512_add_ps(sum917, dat2772);
dat2771 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2771));
sum917 = _mm512_add_ps(sum917, dat2771);
dat2770 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2770));
sum917 = _mm512_add_ps(sum917, dat2770);
dat2769 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2769));
sum917 = _mm512_add_ps(sum917, dat2769);
dat2768 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2768));
sum917 = _mm512_add_ps(sum917, dat2768);
dat2767 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2767));
sum917 = _mm512_add_ps(sum917, dat2767);
dat2766 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2766));
sum917 = _mm512_add_ps(sum917, dat2766);
dat2765 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2765));
sum917 = _mm512_add_ps(sum917, dat2765);
dat2764 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2764));
sum917 = _mm512_add_ps(sum917, dat2764);
dat2763 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2763));
sum917 = _mm512_add_ps(sum917, dat2763);
dat2762 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2762));
sum917 = _mm512_add_ps(sum917, dat2762);
dat2761 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2761));
sum917 = _mm512_add_ps(sum917, dat2761);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*3, 255, dat2775);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*3, 65535, dat2774);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*3, 65535, dat2773);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*3, 65535, dat2772);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*3, 65535, dat2771);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*3, 65535, dat2770);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*3, 65535, dat2769);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*3, 65535, dat2768);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*3, 65535, dat2767);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*3, 65535, dat2766);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*3, 65535, dat2765);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*3, 65535, dat2764);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*3, 65535, dat2763);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*3, 65535, dat2762);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*3, 65535, dat2761);
// Then vectors 47 down to 0, sixteen per iteration.
for (ptrdiff_t i106 = 2; i106 >= 0; --i106) {
__m512 dat2760 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i106);
__m512 dat2759 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i106);
__m512 dat2758 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i106);
__m512 dat2757 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i106);
__m512 dat2756 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i106);
__m512 dat2755 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i106);
__m512 dat2754 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i106);
__m512 dat2753 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i106);
__m512 dat2752 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i106);
__m512 dat2751 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i106);
__m512 dat2750 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i106);
__m512 dat2749 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i106);
__m512 dat2748 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i106);
__m512 dat2747 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i106);
__m512 dat2746 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i106);
__m512 dat2745 = _mm512_maskz_loadu_ps(65535, ptr5+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i106);
dat2760 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2760));
sum917 = _mm512_add_ps(sum917, dat2760);
dat2759 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2759));
sum917 = _mm512_add_ps(sum917, dat2759);
dat2758 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2758));
sum917 = _mm512_add_ps(sum917, dat2758);
dat2757 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2757));
sum917 = _mm512_add_ps(sum917, dat2757);
dat2756 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2756));
sum917 = _mm512_add_ps(sum917, dat2756);
dat2755 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2755));
sum917 = _mm512_add_ps(sum917, dat2755);
dat2754 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2754));
sum917 = _mm512_add_ps(sum917, dat2754);
dat2753 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2753));
sum917 = _mm512_add_ps(sum917, dat2753);
dat2752 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2752));
sum917 = _mm512_add_ps(sum917, dat2752);
dat2751 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2751));
sum917 = _mm512_add_ps(sum917, dat2751);
dat2750 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2750));
sum917 = _mm512_add_ps(sum917, dat2750);
dat2749 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2749));
sum917 = _mm512_add_ps(sum917, dat2749);
dat2748 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2748));
sum917 = _mm512_add_ps(sum917, dat2748);
dat2747 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2747));
sum917 = _mm512_add_ps(sum917, dat2747);
dat2746 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2746));
sum917 = _mm512_add_ps(sum917, dat2746);
dat2745 = ResNeXt50Exp1(_mm512_add_ps(neg1, dat2745));
sum917 = _mm512_add_ps(sum917, dat2745);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*15+(ptrdiff_t)64*16*i106, 65535, dat2760);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*14+(ptrdiff_t)64*16*i106, 65535, dat2759);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*13+(ptrdiff_t)64*16*i106, 65535, dat2758);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*12+(ptrdiff_t)64*16*i106, 65535, dat2757);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*11+(ptrdiff_t)64*16*i106, 65535, dat2756);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*10+(ptrdiff_t)64*16*i106, 65535, dat2755);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*9+(ptrdiff_t)64*16*i106, 65535, dat2754);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*8+(ptrdiff_t)64*16*i106, 65535, dat2753);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*7+(ptrdiff_t)64*16*i106, 65535, dat2752);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*6+(ptrdiff_t)64*16*i106, 65535, dat2751);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*5+(ptrdiff_t)64*16*i106, 65535, dat2750);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*4+(ptrdiff_t)64*16*i106, 65535, dat2749);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*3+(ptrdiff_t)64*16*i106, 65535, dat2748);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*2+(ptrdiff_t)64*16*i106, 65535, dat2747);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*1+(ptrdiff_t)64*16*i106, 65535, dat2746);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*0+(ptrdiff_t)64*16*i106, 65535, dat2745);
}
// Horizontal sum of sum917 (same folding pattern as the maximum), then
// broadcast the total to every lane.
__m512i p10 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8);
sum917 = _mm512_mask_add_ps(sum917, 255, sum917, _mm512_permutexvar_ps(p10, sum917));
__m512i p11 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4);
sum917 = _mm512_mask_add_ps(sum917, 15, sum917, _mm512_permutexvar_ps(p11, sum917));
__m512i p12 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2);
sum917 = _mm512_mask_add_ps(sum917, 3, sum917, _mm512_permutexvar_ps(p12, sum917));
__m512i p13 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
sum917 = _mm512_mask_add_ps(sum917, 1, sum917, _mm512_permutexvar_ps(p13, sum917));
__m512i p14 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
sum917 = _mm512_permutexvar_ps(p14, sum917);
// Pass 3: scale the stored exponentials by 1/sum, in place in the output.
__m512 rcp44 = _mm512_div_ps(_mm512_set1_ps(1e+00f), sum917);
for (ptrdiff_t i107 = 0; i107 < 62; ++i107) {
__m512 dat2776 = _mm512_maskz_loadu_ps(65535, ptr6+(ptrdiff_t)64*i107);
dat2776 = _mm512_mul_ps(rcp44, dat2776);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*i107, 65535, dat2776);
}
// Partial vector 62 (8 lanes).
__m512 dat2777 = _mm512_maskz_loadu_ps(255, ptr6+(ptrdiff_t)64*62);
dat2777 = _mm512_mul_ps(rcp44, dat2777);
_mm512_mask_storeu_ps(ptr6+(ptrdiff_t)64*62, 255, dat2777);
}

static __m512 ResNeXt50Rsqrt1(__m512 x2) {
__m512 y2 = _mm512_rsqrt14_ps(x2);
__m512 z1 = _mm512_mul_ps(x2, y2);
__m512 a1 = _mm512_mul_ps(y2, _mm512_set1_ps(5e-01f));
__m512 b1 = _mm512_fnmadd_ps(y2, z1, _mm512_set1_ps(3e+00f));
return _mm512_mul_ps(a1, b1);
}

static void ResNeXt50BnSimplify1(
float*restrict means1,
float*restrict variances1,
float*restrict scales1,
float*restrict shifts1,
char*restrict mas1
) {
__m512 eps1 = _mm512_set1_ps(2e-05f);
__m512i xlo1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512 va1 = _mm512_maskz_loadu_ps(7, variances1+(ptrdiff_t)16*0);
__m512 rcp1 = ResNeXt50Rsqrt1(_mm512_add_ps(eps1, va1));
__m512 sc1 = _mm512_maskz_loadu_ps(7, scales1+(ptrdiff_t)16*0);
__m512 mul1 = _mm512_mul_ps(rcp1, sc1);
__m512 me1 = _mm512_maskz_loadu_ps(7, means1+(ptrdiff_t)16*0);
__m512 sh1 = _mm512_maskz_loadu_ps(7, shifts1+(ptrdiff_t)16*0);
__m512 add1 = _mm512_fnmadd_ps(me1, mul1, sh1);
__m512 lo1 = _mm512_permutex2var_ps(mul1, xlo1, add1);
_mm512_mask_storeu_ps(mas1+(ptrdiff_t)64*0, 63, lo1);
}

static void ResNeXt50BnSimplify2(
float*restrict means2,
float*restrict variances2,
float*restrict scales2,
float*restrict shifts2,
char*restrict mas2
) {
__m512 eps2 = _mm512_set1_ps(2e-05f);
__m512i xlo2 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
__m512 va2 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*0);
__m512 va3 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*1);
__m512 va4 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*2);
__m512 va5 = _mm512_loadu_ps(variances2+(ptrdiff_t)16*3);
__m512 rcp2 = ResNeXt50Rsqrt1(_mm512_add_ps(eps2, va2));
__m512 rcp3 = ResNeXt50Rsqrt1(_mm512_add_ps(eps2, va3));
__m512 rcp4 = ResNeXt50Rsqrt1(_mm512_add_ps(eps2, va4));
__m512 rcp5 = ResNeXt50Rsqrt1(_mm512_add_ps(eps2, va5));
__m512 sc2 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*0);
__m512 sc3 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*1);
__m512 sc4 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*2);
__m512 sc5 = _mm512_loadu_ps(scales2+(ptrdiff_t)16*3);
__m512 mul2 = _mm512_mul_ps(rcp2, sc2);
__m512 mul3 = _mm512_mul_ps(rcp3, sc3);
__m512 mul4 = _mm512_mul_ps(rcp4, sc4);
__m512 mul5 = _mm512_mul_ps(rcp5, sc5);
__m512 me2 = _mm512_loadu_ps(means2+(ptrdiff_t)16*0);
__m512 me3 = _mm512_loadu_ps(means2+(ptrdiff_t)16*1);
__m512 me4 = _mm512_loadu_ps(means2+(ptrdiff_t)16*2);
__m512 me5 = _mm512_loadu_ps(means2+(ptrdiff_t)16*3);
__m512 sh2 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*0);
__m512 sh3 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*1);
__m512 sh4 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*2);
__m512 sh5 = _mm512_loadu_ps(shifts2+(ptrdiff_t)16*3);
__m512 add2 = _mm512_fnmadd_ps(me2, mul2, sh2);
__m512 add3 = _mm512_fnmadd_ps(me3, mul3, sh3);
__m512 add4 = _mm512_fnmadd_ps(me4, mul4, sh4);
__m512 add5 = _mm512_fnmadd_ps(me5, mul5, sh5);
__m512 lo2 = _mm512_permutex2var_ps(mul2, xlo2, add2);
__m512 lo3 = _mm512_permutex2var_ps(mul3, xlo2, add3);
__m512 lo4 = _mm512_permutex2var_ps(mul4, xlo2, add4);
__m512 lo5 = _mm512_permutex2var_ps(mul5, xlo2, add5);
__m512 hi1 = _mm512_permutex2var_ps(mul2, xhi1, add2);
__m512 hi2 = _mm512_permutex2var_ps(mul3, xhi1, add3);
__m512 hi3 = _mm512_permutex2var_ps(mul4, xhi1, add4);
__m512 hi4 = _mm512_permutex2var_ps(mul5, xhi1, add5);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*0, lo2);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*1, hi1);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*2, lo3);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*3, hi2);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*4, lo4);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*5, hi3);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*6, lo5);
_mm512_storeu_ps(mas2+(ptrdiff_t)64*7, hi4);
}

static void ResNeXt50BnSimplify3(
float*restrict means3,
float*restrict variances3,
float*restrict scales3,
float*restrict shifts3,
char*restrict mas4
) {
__m512 eps3 = _mm512_set1_ps(2e-05f);
__m512i xlo3 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i11 = 0; i11 < 3; ++i11) {
__m512 va6 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 va7 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 va8 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 va9 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 va10 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 rcp6 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va6));
__m512 rcp7 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va7));
__m512 rcp8 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va8));
__m512 rcp9 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va9));
__m512 rcp10 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va10));
__m512 sc6 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sc7 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sc8 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sc9 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sc10 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 mul6 = _mm512_mul_ps(rcp6, sc6);
__m512 mul7 = _mm512_mul_ps(rcp7, sc7);
__m512 mul8 = _mm512_mul_ps(rcp8, sc8);
__m512 mul9 = _mm512_mul_ps(rcp9, sc9);
__m512 mul10 = _mm512_mul_ps(rcp10, sc10);
__m512 me6 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 me7 = _mm512_loadu_ps(means3+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 me8 = _mm512_loadu_ps(means3+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 me9 = _mm512_loadu_ps(means3+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 me10 = _mm512_loadu_ps(means3+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 sh6 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*i11);
__m512 sh7 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*1+(ptrdiff_t)80*i11);
__m512 sh8 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*2+(ptrdiff_t)80*i11);
__m512 sh9 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*3+(ptrdiff_t)80*i11);
__m512 sh10 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*4+(ptrdiff_t)80*i11);
__m512 add6 = _mm512_fnmadd_ps(me6, mul6, sh6);
__m512 add7 = _mm512_fnmadd_ps(me7, mul7, sh7);
__m512 add8 = _mm512_fnmadd_ps(me8, mul8, sh8);
__m512 add9 = _mm512_fnmadd_ps(me9, mul9, sh9);
__m512 add10 = _mm512_fnmadd_ps(me10, mul10, sh10);
__m512 lo6 = _mm512_permutex2var_ps(mul6, xlo3, add6);
__m512 lo7 = _mm512_permutex2var_ps(mul7, xlo3, add7);
__m512 lo8 = _mm512_permutex2var_ps(mul8, xlo3, add8);
__m512 lo9 = _mm512_permutex2var_ps(mul9, xlo3, add9);
__m512 lo10 = _mm512_permutex2var_ps(mul10, xlo3, add10);
__m512 hi5 = _mm512_permutex2var_ps(mul6, xhi2, add6);
__m512 hi6 = _mm512_permutex2var_ps(mul7, xhi2, add7);
__m512 hi7 = _mm512_permutex2var_ps(mul8, xhi2, add8);
__m512 hi8 = _mm512_permutex2var_ps(mul9, xhi2, add9);
__m512 hi9 = _mm512_permutex2var_ps(mul10, xhi2, add10);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*0+(ptrdiff_t)640*i11, lo6);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*1+(ptrdiff_t)640*i11, hi5);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*2+(ptrdiff_t)640*i11, lo7);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*3+(ptrdiff_t)640*i11, hi6);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*4+(ptrdiff_t)640*i11, lo8);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*5+(ptrdiff_t)640*i11, hi7);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*6+(ptrdiff_t)640*i11, lo9);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*7+(ptrdiff_t)640*i11, hi8);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*8+(ptrdiff_t)640*i11, lo10);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*9+(ptrdiff_t)640*i11, hi9);
}
__m512 va11 = _mm512_loadu_ps(variances3+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 rcp11 = ResNeXt50Rsqrt1(_mm512_add_ps(eps3, va11));
__m512 sc11 = _mm512_loadu_ps(scales3+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 mul11 = _mm512_mul_ps(rcp11, sc11);
__m512 me11 = _mm512_loadu_ps(means3+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 sh11 = _mm512_loadu_ps(shifts3+(ptrdiff_t)16*0+(ptrdiff_t)80*3);
__m512 add11 = _mm512_fnmadd_ps(me11, mul11, sh11);
__m512 lo11 = _mm512_permutex2var_ps(mul11, xlo3, add11);
__m512 hi10 = _mm512_permutex2var_ps(mul11, xhi2, add11);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*0+(ptrdiff_t)640*3, lo11);
_mm512_storeu_ps(mas4+(ptrdiff_t)64*1+(ptrdiff_t)640*3, hi10);
}

static void ResNeXt50BnSimplify4(
float*restrict means4,
float*restrict variances4,
float*restrict scales4,
float*restrict shifts4,
char*restrict mas5
) {
__m512 eps4 = _mm512_set1_ps(2e-05f);
__m512i xlo4 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi3 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i12 = 0; i12 < 1; ++i12) {
__m512 va12 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*i12);
__m512 va13 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*i12);
__m512 va14 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*i12);
__m512 va15 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*3+(ptrdiff_t)80*i12);
__m512 va16 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*4+(ptrdiff_t)80*i12);
__m512 rcp12 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va12));
__m512 rcp13 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va13));
__m512 rcp14 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va14));
__m512 rcp15 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va15));
__m512 rcp16 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va16));
__m512 sc12 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*i12);
__m512 sc13 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*i12);
__m512 sc14 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*i12);
__m512 sc15 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*3+(ptrdiff_t)80*i12);
__m512 sc16 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*4+(ptrdiff_t)80*i12);
__m512 mul12 = _mm512_mul_ps(rcp12, sc12);
__m512 mul13 = _mm512_mul_ps(rcp13, sc13);
__m512 mul14 = _mm512_mul_ps(rcp14, sc14);
__m512 mul15 = _mm512_mul_ps(rcp15, sc15);
__m512 mul16 = _mm512_mul_ps(rcp16, sc16);
__m512 me12 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*i12);
__m512 me13 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*i12);
__m512 me14 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*i12);
__m512 me15 = _mm512_loadu_ps(means4+(ptrdiff_t)16*3+(ptrdiff_t)80*i12);
__m512 me16 = _mm512_loadu_ps(means4+(ptrdiff_t)16*4+(ptrdiff_t)80*i12);
__m512 sh12 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*i12);
__m512 sh13 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*i12);
__m512 sh14 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*i12);
__m512 sh15 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*3+(ptrdiff_t)80*i12);
__m512 sh16 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*4+(ptrdiff_t)80*i12);
__m512 add12 = _mm512_fnmadd_ps(me12, mul12, sh12);
__m512 add13 = _mm512_fnmadd_ps(me13, mul13, sh13);
__m512 add14 = _mm512_fnmadd_ps(me14, mul14, sh14);
__m512 add15 = _mm512_fnmadd_ps(me15, mul15, sh15);
__m512 add16 = _mm512_fnmadd_ps(me16, mul16, sh16);
__m512 lo12 = _mm512_permutex2var_ps(mul12, xlo4, add12);
__m512 lo13 = _mm512_permutex2var_ps(mul13, xlo4, add13);
__m512 lo14 = _mm512_permutex2var_ps(mul14, xlo4, add14);
__m512 lo15 = _mm512_permutex2var_ps(mul15, xlo4, add15);
__m512 lo16 = _mm512_permutex2var_ps(mul16, xlo4, add16);
__m512 hi11 = _mm512_permutex2var_ps(mul12, xhi3, add12);
__m512 hi12 = _mm512_permutex2var_ps(mul13, xhi3, add13);
__m512 hi13 = _mm512_permutex2var_ps(mul14, xhi3, add14);
__m512 hi14 = _mm512_permutex2var_ps(mul15, xhi3, add15);
__m512 hi15 = _mm512_permutex2var_ps(mul16, xhi3, add16);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*0+(ptrdiff_t)640*i12, lo12);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*1+(ptrdiff_t)640*i12, hi11);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*2+(ptrdiff_t)640*i12, lo13);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*3+(ptrdiff_t)640*i12, hi12);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*4+(ptrdiff_t)640*i12, lo14);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*5+(ptrdiff_t)640*i12, hi13);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*6+(ptrdiff_t)640*i12, lo15);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*7+(ptrdiff_t)640*i12, hi14);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*8+(ptrdiff_t)640*i12, lo16);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*9+(ptrdiff_t)640*i12, hi15);
}
__m512 va17 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 va18 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 va19 = _mm512_loadu_ps(variances4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 rcp17 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va17));
__m512 rcp18 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va18));
__m512 rcp19 = ResNeXt50Rsqrt1(_mm512_add_ps(eps4, va19));
__m512 sc17 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sc18 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sc19 = _mm512_loadu_ps(scales4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 mul17 = _mm512_mul_ps(rcp17, sc17);
__m512 mul18 = _mm512_mul_ps(rcp18, sc18);
__m512 mul19 = _mm512_mul_ps(rcp19, sc19);
__m512 me17 = _mm512_loadu_ps(means4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 me18 = _mm512_loadu_ps(means4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 me19 = _mm512_loadu_ps(means4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 sh17 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*0+(ptrdiff_t)80*1);
__m512 sh18 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*1+(ptrdiff_t)80*1);
__m512 sh19 = _mm512_loadu_ps(shifts4+(ptrdiff_t)16*2+(ptrdiff_t)80*1);
__m512 add17 = _mm512_fnmadd_ps(me17, mul17, sh17);
__m512 add18 = _mm512_fnmadd_ps(me18, mul18, sh18);
__m512 add19 = _mm512_fnmadd_ps(me19, mul19, sh19);
__m512 lo17 = _mm512_permutex2var_ps(mul17, xlo4, add17);
__m512 lo18 = _mm512_permutex2var_ps(mul18, xlo4, add18);
__m512 lo19 = _mm512_permutex2var_ps(mul19, xlo4, add19);
__m512 hi16 = _mm512_permutex2var_ps(mul17, xhi3, add17);
__m512 hi17 = _mm512_permutex2var_ps(mul18, xhi3, add18);
__m512 hi18 = _mm512_permutex2var_ps(mul19, xhi3, add19);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*0+(ptrdiff_t)640*1, lo17);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*1+(ptrdiff_t)640*1, hi16);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*2+(ptrdiff_t)640*1, lo18);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*3+(ptrdiff_t)640*1, hi17);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*4+(ptrdiff_t)640*1, lo19);
_mm512_storeu_ps(mas5+(ptrdiff_t)64*5+(ptrdiff_t)640*1, hi18);
}

static void ResNeXt50BnSimplify5(
float*restrict means5,
float*restrict variances5,
float*restrict scales5,
float*restrict shifts5,
char*restrict mas8
) {
__m512 eps5 = _mm512_set1_ps(2e-05f);
__m512i xlo5 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi4 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i30 = 0; i30 < 6; ++i30) {
__m512 va20 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 va21 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 va22 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 va23 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 va24 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 rcp20 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va20));
__m512 rcp21 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va21));
__m512 rcp22 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va22));
__m512 rcp23 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va23));
__m512 rcp24 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va24));
__m512 sc20 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sc21 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sc22 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sc23 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sc24 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 mul20 = _mm512_mul_ps(rcp20, sc20);
__m512 mul21 = _mm512_mul_ps(rcp21, sc21);
__m512 mul22 = _mm512_mul_ps(rcp22, sc22);
__m512 mul23 = _mm512_mul_ps(rcp23, sc23);
__m512 mul24 = _mm512_mul_ps(rcp24, sc24);
__m512 me20 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 me21 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 me22 = _mm512_loadu_ps(means5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 me23 = _mm512_loadu_ps(means5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 me24 = _mm512_loadu_ps(means5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 sh20 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*i30);
__m512 sh21 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*i30);
__m512 sh22 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*2+(ptrdiff_t)80*i30);
__m512 sh23 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*3+(ptrdiff_t)80*i30);
__m512 sh24 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*4+(ptrdiff_t)80*i30);
__m512 add20 = _mm512_fnmadd_ps(me20, mul20, sh20);
__m512 add21 = _mm512_fnmadd_ps(me21, mul21, sh21);
__m512 add22 = _mm512_fnmadd_ps(me22, mul22, sh22);
__m512 add23 = _mm512_fnmadd_ps(me23, mul23, sh23);
__m512 add24 = _mm512_fnmadd_ps(me24, mul24, sh24);
__m512 lo20 = _mm512_permutex2var_ps(mul20, xlo5, add20);
__m512 lo21 = _mm512_permutex2var_ps(mul21, xlo5, add21);
__m512 lo22 = _mm512_permutex2var_ps(mul22, xlo5, add22);
__m512 lo23 = _mm512_permutex2var_ps(mul23, xlo5, add23);
__m512 lo24 = _mm512_permutex2var_ps(mul24, xlo5, add24);
__m512 hi19 = _mm512_permutex2var_ps(mul20, xhi4, add20);
__m512 hi20 = _mm512_permutex2var_ps(mul21, xhi4, add21);
__m512 hi21 = _mm512_permutex2var_ps(mul22, xhi4, add22);
__m512 hi22 = _mm512_permutex2var_ps(mul23, xhi4, add23);
__m512 hi23 = _mm512_permutex2var_ps(mul24, xhi4, add24);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*0+(ptrdiff_t)640*i30, lo20);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*1+(ptrdiff_t)640*i30, hi19);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*2+(ptrdiff_t)640*i30, lo21);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*3+(ptrdiff_t)640*i30, hi20);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*4+(ptrdiff_t)640*i30, lo22);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*5+(ptrdiff_t)640*i30, hi21);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*6+(ptrdiff_t)640*i30, lo23);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*7+(ptrdiff_t)640*i30, hi22);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*8+(ptrdiff_t)640*i30, lo24);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*9+(ptrdiff_t)640*i30, hi23);
}
__m512 va25 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 va26 = _mm512_loadu_ps(variances5+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 rcp25 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va25));
__m512 rcp26 = ResNeXt50Rsqrt1(_mm512_add_ps(eps5, va26));
__m512 sc25 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sc26 = _mm512_loadu_ps(scales5+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 mul25 = _mm512_mul_ps(rcp25, sc25);
__m512 mul26 = _mm512_mul_ps(rcp26, sc26);
__m512 me25 = _mm512_loadu_ps(means5+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 me26 = _mm512_loadu_ps(means5+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 sh25 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*0+(ptrdiff_t)80*6);
__m512 sh26 = _mm512_loadu_ps(shifts5+(ptrdiff_t)16*1+(ptrdiff_t)80*6);
__m512 add25 = _mm512_fnmadd_ps(me25, mul25, sh25);
__m512 add26 = _mm512_fnmadd_ps(me26, mul26, sh26);
__m512 lo25 = _mm512_permutex2var_ps(mul25, xlo5, add25);
__m512 lo26 = _mm512_permutex2var_ps(mul26, xlo5, add26);
__m512 hi24 = _mm512_permutex2var_ps(mul25, xhi4, add25);
__m512 hi25 = _mm512_permutex2var_ps(mul26, xhi4, add26);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*0+(ptrdiff_t)640*6, lo25);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*1+(ptrdiff_t)640*6, hi24);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*2+(ptrdiff_t)640*6, lo26);
_mm512_storeu_ps(mas8+(ptrdiff_t)64*3+(ptrdiff_t)640*6, hi25);
}

static void ResNeXt50BnSimplify6(
float*restrict means6,
float*restrict variances6,
float*restrict scales6,
float*restrict shifts6,
char*restrict mas11
) {
__m512 eps6 = _mm512_set1_ps(2e-05f);
__m512i xlo6 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi5 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i52 = 0; i52 < 12; ++i52) {
__m512 va27 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*i52);
__m512 va28 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*i52);
__m512 va29 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*i52);
__m512 va30 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*i52);
__m512 va31 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*4+(ptrdiff_t)80*i52);
__m512 rcp27 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va27));
__m512 rcp28 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va28));
__m512 rcp29 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va29));
__m512 rcp30 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va30));
__m512 rcp31 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va31));
__m512 sc27 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*i52);
__m512 sc28 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*i52);
__m512 sc29 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*i52);
__m512 sc30 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*i52);
__m512 sc31 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*4+(ptrdiff_t)80*i52);
__m512 mul27 = _mm512_mul_ps(rcp27, sc27);
__m512 mul28 = _mm512_mul_ps(rcp28, sc28);
__m512 mul29 = _mm512_mul_ps(rcp29, sc29);
__m512 mul30 = _mm512_mul_ps(rcp30, sc30);
__m512 mul31 = _mm512_mul_ps(rcp31, sc31);
__m512 me27 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*i52);
__m512 me28 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*i52);
__m512 me29 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*i52);
__m512 me30 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*i52);
__m512 me31 = _mm512_loadu_ps(means6+(ptrdiff_t)16*4+(ptrdiff_t)80*i52);
__m512 sh27 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*i52);
__m512 sh28 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*i52);
__m512 sh29 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*i52);
__m512 sh30 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*i52);
__m512 sh31 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*4+(ptrdiff_t)80*i52);
__m512 add27 = _mm512_fnmadd_ps(me27, mul27, sh27);
__m512 add28 = _mm512_fnmadd_ps(me28, mul28, sh28);
__m512 add29 = _mm512_fnmadd_ps(me29, mul29, sh29);
__m512 add30 = _mm512_fnmadd_ps(me30, mul30, sh30);
__m512 add31 = _mm512_fnmadd_ps(me31, mul31, sh31);
__m512 lo27 = _mm512_permutex2var_ps(mul27, xlo6, add27);
__m512 lo28 = _mm512_permutex2var_ps(mul28, xlo6, add28);
__m512 lo29 = _mm512_permutex2var_ps(mul29, xlo6, add29);
__m512 lo30 = _mm512_permutex2var_ps(mul30, xlo6, add30);
__m512 lo31 = _mm512_permutex2var_ps(mul31, xlo6, add31);
__m512 hi26 = _mm512_permutex2var_ps(mul27, xhi5, add27);
__m512 hi27 = _mm512_permutex2var_ps(mul28, xhi5, add28);
__m512 hi28 = _mm512_permutex2var_ps(mul29, xhi5, add29);
__m512 hi29 = _mm512_permutex2var_ps(mul30, xhi5, add30);
__m512 hi30 = _mm512_permutex2var_ps(mul31, xhi5, add31);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*0+(ptrdiff_t)640*i52, lo27);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*1+(ptrdiff_t)640*i52, hi26);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*2+(ptrdiff_t)640*i52, lo28);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*3+(ptrdiff_t)640*i52, hi27);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*4+(ptrdiff_t)640*i52, lo29);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*5+(ptrdiff_t)640*i52, hi28);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*6+(ptrdiff_t)640*i52, lo30);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*7+(ptrdiff_t)640*i52, hi29);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*8+(ptrdiff_t)640*i52, lo31);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*9+(ptrdiff_t)640*i52, hi30);
}
__m512 va32 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 va33 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 va34 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 va35 = _mm512_loadu_ps(variances6+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 rcp32 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va32));
__m512 rcp33 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va33));
__m512 rcp34 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va34));
__m512 rcp35 = ResNeXt50Rsqrt1(_mm512_add_ps(eps6, va35));
__m512 sc32 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sc33 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sc34 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sc35 = _mm512_loadu_ps(scales6+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 mul32 = _mm512_mul_ps(rcp32, sc32);
__m512 mul33 = _mm512_mul_ps(rcp33, sc33);
__m512 mul34 = _mm512_mul_ps(rcp34, sc34);
__m512 mul35 = _mm512_mul_ps(rcp35, sc35);
__m512 me32 = _mm512_loadu_ps(means6+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 me33 = _mm512_loadu_ps(means6+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 me34 = _mm512_loadu_ps(means6+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 me35 = _mm512_loadu_ps(means6+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 sh32 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*0+(ptrdiff_t)80*12);
__m512 sh33 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*1+(ptrdiff_t)80*12);
__m512 sh34 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*2+(ptrdiff_t)80*12);
__m512 sh35 = _mm512_loadu_ps(shifts6+(ptrdiff_t)16*3+(ptrdiff_t)80*12);
__m512 add32 = _mm512_fnmadd_ps(me32, mul32, sh32);
__m512 add33 = _mm512_fnmadd_ps(me33, mul33, sh33);
__m512 add34 = _mm512_fnmadd_ps(me34, mul34, sh34);
__m512 add35 = _mm512_fnmadd_ps(me35, mul35, sh35);
__m512 lo32 = _mm512_permutex2var_ps(mul32, xlo6, add32);
__m512 lo33 = _mm512_permutex2var_ps(mul33, xlo6, add33);
__m512 lo34 = _mm512_permutex2var_ps(mul34, xlo6, add34);
__m512 lo35 = _mm512_permutex2var_ps(mul35, xlo6, add35);
__m512 hi31 = _mm512_permutex2var_ps(mul32, xhi5, add32);
__m512 hi32 = _mm512_permutex2var_ps(mul33, xhi5, add33);
__m512 hi33 = _mm512_permutex2var_ps(mul34, xhi5, add34);
__m512 hi34 = _mm512_permutex2var_ps(mul35, xhi5, add35);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*0+(ptrdiff_t)640*12, lo32);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*1+(ptrdiff_t)640*12, hi31);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*2+(ptrdiff_t)640*12, lo33);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*3+(ptrdiff_t)640*12, hi32);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*4+(ptrdiff_t)640*12, lo34);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*5+(ptrdiff_t)640*12, hi33);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*6+(ptrdiff_t)640*12, lo35);
_mm512_storeu_ps(mas11+(ptrdiff_t)64*7+(ptrdiff_t)640*12, hi34);
}

static void ResNeXt50BnSimplify7(
float*restrict means7,
float*restrict variances7,
float*restrict scales7,
float*restrict shifts7,
char*restrict mas14
) {
__m512 eps7 = _mm512_set1_ps(2e-05f);
__m512i xlo7 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i xhi6 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (ptrdiff_t i74 = 0; i74 < 25; ++i74) {
__m512 va36 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*0+(ptrdiff_t)80*i74);
__m512 va37 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*1+(ptrdiff_t)80*i74);
__m512 va38 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*2+(ptrdiff_t)80*i74);
__m512 va39 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*3+(ptrdiff_t)80*i74);
__m512 va40 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*4+(ptrdiff_t)80*i74);
__m512 rcp36 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va36));
__m512 rcp37 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va37));
__m512 rcp38 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va38));
__m512 rcp39 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va39));
__m512 rcp40 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va40));
__m512 sc36 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*0+(ptrdiff_t)80*i74);
__m512 sc37 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*1+(ptrdiff_t)80*i74);
__m512 sc38 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*2+(ptrdiff_t)80*i74);
__m512 sc39 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*3+(ptrdiff_t)80*i74);
__m512 sc40 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*4+(ptrdiff_t)80*i74);
__m512 mul36 = _mm512_mul_ps(rcp36, sc36);
__m512 mul37 = _mm512_mul_ps(rcp37, sc37);
__m512 mul38 = _mm512_mul_ps(rcp38, sc38);
__m512 mul39 = _mm512_mul_ps(rcp39, sc39);
__m512 mul40 = _mm512_mul_ps(rcp40, sc40);
__m512 me36 = _mm512_loadu_ps(means7+(ptrdiff_t)16*0+(ptrdiff_t)80*i74);
__m512 me37 = _mm512_loadu_ps(means7+(ptrdiff_t)16*1+(ptrdiff_t)80*i74);
__m512 me38 = _mm512_loadu_ps(means7+(ptrdiff_t)16*2+(ptrdiff_t)80*i74);
__m512 me39 = _mm512_loadu_ps(means7+(ptrdiff_t)16*3+(ptrdiff_t)80*i74);
__m512 me40 = _mm512_loadu_ps(means7+(ptrdiff_t)16*4+(ptrdiff_t)80*i74);
__m512 sh36 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*0+(ptrdiff_t)80*i74);
__m512 sh37 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*1+(ptrdiff_t)80*i74);
__m512 sh38 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*2+(ptrdiff_t)80*i74);
__m512 sh39 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*3+(ptrdiff_t)80*i74);
__m512 sh40 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*4+(ptrdiff_t)80*i74);
__m512 add36 = _mm512_fnmadd_ps(me36, mul36, sh36);
__m512 add37 = _mm512_fnmadd_ps(me37, mul37, sh37);
__m512 add38 = _mm512_fnmadd_ps(me38, mul38, sh38);
__m512 add39 = _mm512_fnmadd_ps(me39, mul39, sh39);
__m512 add40 = _mm512_fnmadd_ps(me40, mul40, sh40);
__m512 lo36 = _mm512_permutex2var_ps(mul36, xlo7, add36);
__m512 lo37 = _mm512_permutex2var_ps(mul37, xlo7, add37);
__m512 lo38 = _mm512_permutex2var_ps(mul38, xlo7, add38);
__m512 lo39 = _mm512_permutex2var_ps(mul39, xlo7, add39);
__m512 lo40 = _mm512_permutex2var_ps(mul40, xlo7, add40);
__m512 hi35 = _mm512_permutex2var_ps(mul36, xhi6, add36);
__m512 hi36 = _mm512_permutex2var_ps(mul37, xhi6, add37);
__m512 hi37 = _mm512_permutex2var_ps(mul38, xhi6, add38);
__m512 hi38 = _mm512_permutex2var_ps(mul39, xhi6, add39);
__m512 hi39 = _mm512_permutex2var_ps(mul40, xhi6, add40);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*0+(ptrdiff_t)640*i74, lo36);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*1+(ptrdiff_t)640*i74, hi35);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*2+(ptrdiff_t)640*i74, lo37);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*3+(ptrdiff_t)640*i74, hi36);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*4+(ptrdiff_t)640*i74, lo38);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*5+(ptrdiff_t)640*i74, hi37);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*6+(ptrdiff_t)640*i74, lo39);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*7+(ptrdiff_t)640*i74, hi38);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*8+(ptrdiff_t)640*i74, lo40);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*9+(ptrdiff_t)640*i74, hi39);
}
__m512 va41 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 va42 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 va43 = _mm512_loadu_ps(variances7+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 rcp41 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va41));
__m512 rcp42 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va42));
__m512 rcp43 = ResNeXt50Rsqrt1(_mm512_add_ps(eps7, va43));
__m512 sc41 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sc42 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sc43 = _mm512_loadu_ps(scales7+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 mul41 = _mm512_mul_ps(rcp41, sc41);
__m512 mul42 = _mm512_mul_ps(rcp42, sc42);
__m512 mul43 = _mm512_mul_ps(rcp43, sc43);
__m512 me41 = _mm512_loadu_ps(means7+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 me42 = _mm512_loadu_ps(means7+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 me43 = _mm512_loadu_ps(means7+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 sh41 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*0+(ptrdiff_t)80*25);
__m512 sh42 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*1+(ptrdiff_t)80*25);
__m512 sh43 = _mm512_loadu_ps(shifts7+(ptrdiff_t)16*2+(ptrdiff_t)80*25);
__m512 add41 = _mm512_fnmadd_ps(me41, mul41, sh41);
__m512 add42 = _mm512_fnmadd_ps(me42, mul42, sh42);
__m512 add43 = _mm512_fnmadd_ps(me43, mul43, sh43);
__m512 lo41 = _mm512_permutex2var_ps(mul41, xlo7, add41);
__m512 lo42 = _mm512_permutex2var_ps(mul42, xlo7, add42);
__m512 lo43 = _mm512_permutex2var_ps(mul43, xlo7, add43);
__m512 hi40 = _mm512_permutex2var_ps(mul41, xhi6, add41);
__m512 hi41 = _mm512_permutex2var_ps(mul42, xhi6, add42);
__m512 hi42 = _mm512_permutex2var_ps(mul43, xhi6, add43);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*0+(ptrdiff_t)640*25, lo41);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*1+(ptrdiff_t)640*25, hi40);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*2+(ptrdiff_t)640*25, lo42);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*3+(ptrdiff_t)640*25, hi41);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*4+(ptrdiff_t)640*25, lo43);
_mm512_storeu_ps(mas14+(ptrdiff_t)64*5+(ptrdiff_t)640*25, hi42);
}

// Task callee for ResNeXt50Glopl1.
// task172->any1 is an array of byte pointers: [0] is read, [1] is written.
// pt91[0] selects a slice: 40960 input bytes and 512 output bytes per slice.
// Each of the 64 steps reads two groups of 49 floats (16+16+16+1), 320 bytes
// apart, sums each group and queues the two sums. After every 8th step the
// 16 queued sums are scaled by 2.0408163e-02f (about 1/49) and stored.
static void ResNeXt50Glopl1Callee1(ResNeXt50ThreaderTask1* task172, int64_t* pt91) {
	char** bufs = task172->any1;
	ptrdiff_t slice = pt91[0];
	char*restrict src = bufs[0]+(ptrdiff_t)40960*slice;
	char*restrict dst = bufs[1]+(ptrdiff_t)512*slice;
	// Loop-invariant constants.
	// pickLane0: even lanes take lane 0 of the first operand, odd lanes lane 0 of the second.
	// pickLane1: same pattern for lane 1.
	const __m512i pickLane0 = _mm512_set_epi32(16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0);
	const __m512i pickLane1 = _mm512_set_epi32(17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1);
	const __m512 scale = _mm512_set1_ps(2.0408163e-02f);
	__m512 staged = _mm512_setzero_ps();
	// Lanes of `staged` still open for overwriting; shrinks by two lanes per step.
	__mmask16 open = 65535;
	for (ptrdiff_t step = 0; step < 64; ++step) {
		char* grp = src+(ptrdiff_t)640*step;
		// First group: 3 full vectors plus one trailing float.
		__m512 p0 = _mm512_loadu_ps(grp+(ptrdiff_t)0);
		__m512 p1 = _mm512_loadu_ps(grp+(ptrdiff_t)64);
		__m512 p2 = _mm512_loadu_ps(grp+(ptrdiff_t)128);
		__m512 p3 = _mm512_maskz_loadu_ps(1, grp+(ptrdiff_t)192);
		// Second group, 320 bytes further on.
		__m512 q0 = _mm512_loadu_ps(grp+(ptrdiff_t)320);
		__m512 q1 = _mm512_loadu_ps(grp+(ptrdiff_t)384);
		__m512 q2 = _mm512_loadu_ps(grp+(ptrdiff_t)448);
		__m512 q3 = _mm512_maskz_loadu_ps(1, grp+(ptrdiff_t)512);
		// Vertical adds: collapse each group's four vectors into one.
		p0 = _mm512_add_ps(p0, p2);
		q0 = _mm512_add_ps(q0, q2);
		p1 = _mm512_mask_add_ps(p1, 1, p1, p3);
		q1 = _mm512_mask_add_ps(q1, 1, q1, q3);
		p0 = _mm512_add_ps(p0, p1);
		q0 = _mm512_add_ps(q0, q1);
		// Horizontal fold 16 -> 8 lanes.
		__m512 pFold8 = _mm512_shuffle_f32x4(p0, p0, 238);
		__m512 qFold8 = _mm512_shuffle_f32x4(q0, q0, 238);
		p0 = _mm512_mask_add_ps(p0, 255, p0, pFold8);
		q0 = _mm512_mask_add_ps(q0, 255, q0, qFold8);
		// 8 -> 4 lanes.
		__m512 pFold4 = _mm512_shuffle_f32x4(p0, p0, 1);
		__m512 qFold4 = _mm512_shuffle_f32x4(q0, q0, 1);
		p0 = _mm512_mask_add_ps(p0, 15, p0, pFold4);
		q0 = _mm512_mask_add_ps(q0, 15, q0, qFold4);
		// 4 -> 2 lanes.
		__m512 pFold2 = _mm512_shuffle_ps(p0, p0, 238);
		__m512 qFold2 = _mm512_shuffle_ps(q0, q0, 238);
		p0 = _mm512_mask_add_ps(p0, 3, p0, pFold2);
		q0 = _mm512_mask_add_ps(q0, 3, q0, qFold2);
		// Final add of lanes 0 and 1; the result alternates (first sum, second sum) in every lane pair.
		__m512 lane1 = _mm512_permutex2var_ps(p0, pickLane1, q0);
		p0 = _mm512_permutex2var_ps(p0, pickLane0, q0);
		p0 = _mm512_add_ps(p0, lane1);
		// Overwrite every still-open lane; lanes closed by earlier steps keep their sums.
		staged = _mm512_mask_mov_ps(staged, open, p0);
		open = (__mmask16)(open & (open<<2));
		if (__builtin_expect(!open, 0)) {
			// Eight steps queued: scale and flush 16 results, then start over.
			open = 65535;
			staged = _mm512_mul_ps(staged, scale);
			_mm512_storeu_ps(dst+(ptrdiff_t)4*((ptrdiff_t)2*step-14), staged);
		}
	}
}

// Dispatches ResNeXt50Glopl1Callee1 through ResNeXt50ThreaderDo1 as a
// one-dimensional task with hull 16, handing tensors167 to the callee
// unchanged through the task's any1 field.
static void ResNeXt50Glopl1(ResNeXt50ThreaderTeam1* team96, char** tensors167) {
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 1;
	job.hull1[0] = 16;
	job.any1 = tensors167;
	job.callee1 = ResNeXt50Glopl1Callee1;
	ResNeXt50ThreaderDo1(team96, &job);
}

// Task callee for ResNeXt50Thrpl1.
// task12->any1 is an array of byte pointers: [0] is read, [1] is written.
// pt11[0..2] are the task coordinates (b43, e5, c4) used to offset both pointers.
// For each of 56 output rows the code takes the lane-wise max of input rows
// (448 bytes = 112 floats apart), then for output column c the max of input
// columns 2c-1, 2c and 2c+1, and writes 56 floats per output row (224 bytes).
// NOTE(review): presumably this is the Max3x3Stride2 pooling with padding 1 from
// the graph description -- confirm against the generator.
static void ResNeXt50Thrpl1Callee1(ResNeXt50ThreaderTask1* task12, int64_t* pt11) {
char** tensors10 = task12->any1;
ptrdiff_t b43 = pt11[0];
ptrdiff_t e5 = pt11[1];
ptrdiff_t c4 = pt11[2];
// ptr1 is biased back by 448 bytes (one input row) so that, for output row j6,
// offsets 0/448/896 from ptr1+896*j6 address three consecutive input rows.
char*restrict ptr1 = tensors10[0]-(ptrdiff_t)448+(ptrdiff_t)50176*b43+(ptrdiff_t)448*e5+(ptrdiff_t)50240*c4;
char*restrict ptr2 = tensors10[1]+(ptrdiff_t)12544*b43+(ptrdiff_t)224*e5+(ptrdiff_t)12608*c4;
// Single-trip loop: i10 is always 0.
for (ptrdiff_t i10 = 0; i10 < 1; ++i10) {
// Output row 0: only two input rows are combined (ptr1+448 and ptr1+896);
// the row at ptr1+0 is never read here.
__m512 in1 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 in2 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat894 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
__m512 dat895 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*0);
in1 = _mm512_max_ps(in1, dat894);
in2 = _mm512_max_ps(in2, dat895);
// Index tables over a 32-float window (two vectors):
// pm57 gathers the even elements 0,2,..,30;
// pm58 gathers the odd elements 1,3,..,31;
// pm59 gathers 31,1,3,..,29, i.e. lane c sees element 2c-1 (lane 0 wraps to 31).
__m512i pm57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm59 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out1 = _mm512_permutex2var_ps(in1, pm57, in2);
__m512 pack263 = _mm512_permutex2var_ps(in1, pm58, in2);
__m512 pack264 = _mm512_permutex2var_ps(in1, pm59, in2);
out1 = _mm512_mask_max_ps(out1, 65535, out1, pack263);
// Mask 65534 skips lane 0: the first window has no element to the left of column 0.
out1 = _mm512_mask_max_ps(out1, 65534, out1, pack264);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*0, 65535, out1);
// Windows 1 and 2 of row 0. in2 carries the previous window's upper vector.
for (ptrdiff_t k44 = 1; k44 < 3; ++k44) {
__m512 in3 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 in4 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat896 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
__m512 dat897 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*k44);
in3 = _mm512_max_ps(in3, dat896);
in4 = _mm512_max_ps(in4, dat897);
// Replace lane 15 of in4 with lane 15 of the previous window, so that pm59's
// index 31 fetches the element immediately left of this window.
__m512 blend1 = _mm512_mask_mov_ps(in4, 32768, in2);
__m512 out2 = _mm512_permutex2var_ps(in3, pm57, in4);
__m512 pack265 = _mm512_permutex2var_ps(in3, pm58, in4);
__m512 pack266 = _mm512_permutex2var_ps(in3, pm59, blend1);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack265);
out2 = _mm512_mask_max_ps(out2, 65535, out2, pack266);
// Carry this window's upper vector into the next window.
in2 = in4;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*k44, 65535, out2);
}
// Final window of row 0: 16 input floats produce 8 outputs (mask 255).
// With the single-source permute only the low 4 index bits select a lane, so
// index 31 in pm59 reads lane 15 of blend2, which is the carried element.
__m512 in5 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
__m512 dat898 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*0+(ptrdiff_t)128*3);
in5 = _mm512_max_ps(in5, dat898);
__m512 blend2 = _mm512_mask_mov_ps(in5, 32768, in2);
__m512 out3 = _mm512_permutexvar_ps(pm57, in5);
__m512 pack267 = _mm512_permutexvar_ps(pm58, in5);
__m512 pack268 = _mm512_permutexvar_ps(pm59, blend2);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack267);
out3 = _mm512_mask_max_ps(out3, 255, out3, pack268);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*0+(ptrdiff_t)64*3, 255, out3);
// Output rows 1..55: same horizontal scheme, but three input rows
// (offsets 0, 448, 896 from ptr1+896*j6) are combined vertically first.
for (ptrdiff_t j6 = 1; j6 < 56; ++j6) {
__m512 in6 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 in7 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat899 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat901 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat900 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
__m512 dat902 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*0);
in6 = _mm512_max_ps(in6, dat899);
in7 = _mm512_max_ps(in7, dat901);
in6 = _mm512_max_ps(in6, dat900);
in7 = _mm512_max_ps(in7, dat902);
// Same three index tables as pm57/pm58/pm59 above (even, odd, odd shifted by one lane).
__m512i pm60 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pm61 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm62 = _mm512_set_epi32(29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1, 31);
__m512 out4 = _mm512_permutex2var_ps(in6, pm60, in7);
__m512 pack269 = _mm512_permutex2var_ps(in6, pm61, in7);
__m512 pack270 = _mm512_permutex2var_ps(in6, pm62, in7);
out4 = _mm512_mask_max_ps(out4, 65535, out4, pack269);
// Lane 0 is skipped again: no left neighbour for column 0.
out4 = _mm512_mask_max_ps(out4, 65534, out4, pack270);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*0, 65535, out4);
// Windows 1 and 2 of this row; in7 carries the previous window's upper vector.
for (ptrdiff_t k45 = 1; k45 < 3; ++k45) {
__m512 in8 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 in9 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)64+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat903 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat905 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)512+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat904 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
__m512 dat906 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)960+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*k45);
in8 = _mm512_max_ps(in8, dat903);
in9 = _mm512_max_ps(in9, dat905);
in8 = _mm512_max_ps(in8, dat904);
in9 = _mm512_max_ps(in9, dat906);
// Inject the previous window's last element into lane 15 for pm62's index 31.
__m512 blend3 = _mm512_mask_mov_ps(in9, 32768, in7);
__m512 out5 = _mm512_permutex2var_ps(in8, pm60, in9);
__m512 pack271 = _mm512_permutex2var_ps(in8, pm61, in9);
__m512 pack272 = _mm512_permutex2var_ps(in8, pm62, blend3);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack271);
out5 = _mm512_mask_max_ps(out5, 65535, out5, pack272);
in7 = in9;
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*k45, 65535, out5);
}
// Final window of this row: 16 input floats, 8 outputs.
__m512 in10 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)0+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat907 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)448+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
__m512 dat908 = _mm512_maskz_loadu_ps(65535, ptr1+(ptrdiff_t)896+(ptrdiff_t)50240*i10+(ptrdiff_t)896*j6+(ptrdiff_t)128*3);
in10 = _mm512_max_ps(in10, dat907);
in10 = _mm512_max_ps(in10, dat908);
__m512 blend4 = _mm512_mask_mov_ps(in10, 32768, in7);
__m512 out6 = _mm512_permutexvar_ps(pm60, in10);
__m512 pack273 = _mm512_permutexvar_ps(pm61, in10);
__m512 pack274 = _mm512_permutexvar_ps(pm62, blend4);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack273);
out6 = _mm512_mask_max_ps(out6, 255, out6, pack274);
_mm512_mask_storeu_ps(ptr2+(ptrdiff_t)12608*i10+(ptrdiff_t)224*j6+(ptrdiff_t)64*3, 255, out6);
}
}
}

// Dispatches ResNeXt50Thrpl1Callee1 through ResNeXt50ThreaderDo1 as a
// three-dimensional task with hull 1 x 1 x 64, handing tensors9 to the
// callee unchanged through the task's any1 field.
static void ResNeXt50Thrpl1(ResNeXt50ThreaderTeam1* team18, char** tensors9) {
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[2] = 64;
	job.hull1[1] = 1;
	job.hull1[0] = 1;
	job.any1 = tensors9;
	job.callee1 = ResNeXt50Thrpl1Callee1;
	ResNeXt50ThreaderDo1(team18, &job);
}

// Task callee for ResNeXt50FcArrange1.
// task174->any1 is an array of byte pointers: [0] float weights (read),
// [1] float biases (read), [2] destination for the repacked weights, with the
// biases copied to byte offset 4096000 of that same buffer.
// pt92[0] selects a tile. Tiles below 62 repack 16 weight rows (8192 bytes per
// row) and copy 16 biases; any other tile repacks 8 rows and copies 8 biases.
// Each 512-bit output word holds 16 half-precision values of one row in its low
// 256 bits and the matching 16 values of the next row in its high 256 bits.
static void ResNeXt50FcArrange1Callee1(ResNeXt50ThreaderTask1* task174, int64_t* pt92) {
	char** bufs = task174->any1;
	ptrdiff_t tile = pt92[0];
	char*restrict wtSrc = bufs[0]+(ptrdiff_t)131072*tile;
	char*restrict biasSrc = bufs[1]+(ptrdiff_t)64*tile;
	char*restrict wtDst = bufs[2]+(ptrdiff_t)65536*tile;
	char*restrict biasDst = bufs[2]+(ptrdiff_t)4096000+(ptrdiff_t)64*tile;
	// Scratch for one step: all eight words are built before any is stored.
	__m512i packed[8];
	if (tile >= 62) {
		// 8-row path: each step covers 32 columns; slots 0..3 are the four row
		// pairs for the first 16 columns, slots 4..7 the same pairs for the next 16.
		for (ptrdiff_t step = 0; step < 64; ++step) {
			for (ptrdiff_t slot = 0; slot < 8; ++slot) {
				ptrdiff_t rowPair = slot%4;
				ptrdiff_t half = slot/4;
				char* evenRow = wtSrc+(ptrdiff_t)16384*rowPair+(ptrdiff_t)64*half+(ptrdiff_t)128*step;
				__m512 lo = _mm512_maskz_loadu_ps(65535, evenRow);
				__m512 hi = _mm512_maskz_loadu_ps(65535, evenRow+(ptrdiff_t)8192);
				__m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
				__m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
				packed[slot] = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
			}
			for (ptrdiff_t slot = 0; slot < 8; ++slot) {
				_mm512_mask_storeu_epi32(wtDst+(ptrdiff_t)64*slot+(ptrdiff_t)512*step, 65535, packed[slot]);
			}
		}
		// Only 8 biases belong to this tile.
		__m512 tailBias = _mm512_maskz_loadu_ps(255, biasSrc);
		_mm512_mask_storeu_ps(biasDst, 255, tailBias);
		return;
	}
	// 16-row path: each step covers 16 columns; slot s packs rows 2s and 2s+1.
	for (ptrdiff_t step = 0; step < 128; ++step) {
		for (ptrdiff_t slot = 0; slot < 8; ++slot) {
			char* evenRow = wtSrc+(ptrdiff_t)16384*slot+(ptrdiff_t)64*step;
			__m512 lo = _mm512_maskz_loadu_ps(65535, evenRow);
			__m512 hi = _mm512_maskz_loadu_ps(65535, evenRow+(ptrdiff_t)8192);
			__m256i loHalf = _mm512_cvtps_ph(lo, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
			__m256i hiHalf = _mm512_cvtps_ph(hi, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
			packed[slot] = _mm512_inserti64x4(_mm512_castsi256_si512(loHalf), hiHalf, 1);
		}
		for (ptrdiff_t slot = 0; slot < 8; ++slot) {
			_mm512_mask_storeu_epi32(wtDst+(ptrdiff_t)64*slot+(ptrdiff_t)512*step, 65535, packed[slot]);
		}
	}
	// Biases stay in single precision and are copied unchanged.
	__m512 fullBias = _mm512_maskz_loadu_ps(65535, biasSrc);
	_mm512_mask_storeu_ps(biasDst, 65535, fullBias);
}

// Dispatches ResNeXt50FcArrange1Callee1 through ResNeXt50ThreaderDo1 as a
// one-dimensional task with hull 63, handing tensors169 to the callee
// unchanged through the task's any1 field.
static void ResNeXt50FcArrange1(ResNeXt50ThreaderTeam1* team97, char** tensors169) {
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 1;
	job.hull1[0] = 63;
	job.any1 = tensors169;
	job.callee1 = ResNeXt50FcArrange1Callee1;
	ResNeXt50ThreaderDo1(team97, &job);
}

// Task callee for ResNeXt50FcApply1.
// task176->any1 is an array of byte pointers: [0] packed half-precision weights
// with float biases at byte offset 4096000, [1] input floats (read),
// [2] output floats (written).
// pt93[0] selects a tile. Tiles below 62 compute 16 outputs; any other tile
// computes 8. Each output is a dot product over 128 steps of 16 input floats
// (2048 floats) plus a bias. Weights are widened from half to single precision
// on the fly; accumulation is in single precision.
// NOTE(review): the weight layout read here matches what
// ResNeXt50FcArrange1Callee1 writes -- presumably tensors172[0] is that buffer.
static void ResNeXt50FcApply1Callee1(ResNeXt50ThreaderTask1* task176, int64_t* pt93) {
char** tensors172 = task176->any1;
ptrdiff_t t47 = pt93[0];
char*restrict wtPtr27 = tensors172[0]+(ptrdiff_t)65536*t47;
char*restrict biasPtr26 = tensors172[0]+(ptrdiff_t)4096000+(ptrdiff_t)64*t47;
char*restrict datPtr56 = tensors172[1];
char*restrict datPtr57 = tensors172[2]+(ptrdiff_t)64*t47;
// 16-output path.
if (t47 < 62) {
// Single-trip loop: i103 is always 0.
for (ptrdiff_t i103 = 0; i103 < 1; ++i103) {
// One vector accumulator per output: sum893 is output 0, ..., sum908 is output 15.
__m512 sum893 = _mm512_setzero_ps();
__m512 sum894 = _mm512_setzero_ps();
__m512 sum895 = _mm512_setzero_ps();
__m512 sum896 = _mm512_setzero_ps();
__m512 sum897 = _mm512_setzero_ps();
__m512 sum898 = _mm512_setzero_ps();
__m512 sum899 = _mm512_setzero_ps();
__m512 sum900 = _mm512_setzero_ps();
__m512 sum901 = _mm512_setzero_ps();
__m512 sum902 = _mm512_setzero_ps();
__m512 sum903 = _mm512_setzero_ps();
__m512 sum904 = _mm512_setzero_ps();
__m512 sum905 = _mm512_setzero_ps();
__m512 sum906 = _mm512_setzero_ps();
__m512 sum907 = _mm512_setzero_ps();
__m512 sum908 = _mm512_setzero_ps();
// Each step consumes 16 input floats and 512 bytes of packed weights.
for (ptrdiff_t j93 = 0; j93 < 128; ++j93) {
__m512i wts1 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512 dat2712 = _mm512_maskz_loadu_ps(65535, datPtr56+(ptrdiff_t)0+(ptrdiff_t)64*j93);
__m512i wts2 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts3 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts4 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
// Each 512-bit word holds two outputs' weights: the low 256 bits widen to 16
// floats for one accumulator, the high 256 bits for the next.
__m512 wtLo17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts1));
__m512 wtHi17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts1, 1));
__m512 wtLo18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts2));
__m512 wtHi18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts2, 1));
__m512 wtLo19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts3));
__m512 wtHi19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts3, 1));
__m512 wtLo20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts4));
__m512 wtHi20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts4, 1));
sum893 = _mm512_fmadd_ps(wtLo17, dat2712, sum893);
sum894 = _mm512_fmadd_ps(wtHi17, dat2712, sum894);
sum895 = _mm512_fmadd_ps(wtLo18, dat2712, sum895);
sum896 = _mm512_fmadd_ps(wtHi18, dat2712, sum896);
sum897 = _mm512_fmadd_ps(wtLo19, dat2712, sum897);
sum898 = _mm512_fmadd_ps(wtHi19, dat2712, sum898);
sum899 = _mm512_fmadd_ps(wtLo20, dat2712, sum899);
sum900 = _mm512_fmadd_ps(wtHi20, dat2712, sum900);
// Second half of the step: outputs 8..15, same input vector.
__m512i wts5 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)256+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts6 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)320+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts7 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)384+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512i wts8 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)448+(ptrdiff_t)65536*i103+(ptrdiff_t)512*j93);
__m512 wtLo21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts5));
__m512 wtHi21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts5, 1));
__m512 wtLo22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts6));
__m512 wtHi22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts6, 1));
__m512 wtLo23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts7));
__m512 wtHi23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts7, 1));
__m512 wtLo24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts8));
__m512 wtHi24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts8, 1));
sum901 = _mm512_fmadd_ps(wtLo21, dat2712, sum901);
sum902 = _mm512_fmadd_ps(wtHi21, dat2712, sum902);
sum903 = _mm512_fmadd_ps(wtLo22, dat2712, sum903);
sum904 = _mm512_fmadd_ps(wtHi22, dat2712, sum904);
sum905 = _mm512_fmadd_ps(wtLo23, dat2712, sum905);
sum906 = _mm512_fmadd_ps(wtHi23, dat2712, sum906);
sum907 = _mm512_fmadd_ps(wtLo24, dat2712, sum907);
sum908 = _mm512_fmadd_ps(wtHi24, dat2712, sum908);
}
__m512 bias12 = _mm512_maskz_loadu_ps(65535, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)64*i103);
// Horizontal reduction of the 16 accumulators into one vector whose lane r is
// the full sum of accumulator r. Each stage halves the lanes per accumulator
// while merging pairs of accumulators into one register:
//   f32x4 shuffle (16 -> 8 lanes), pm4 permute (8 -> 4), shuffle_ps (4 -> 2),
//   pm1 permute (2 -> 1).
// pm1Lo1/pm1Hi1 interleave lanes 0,2,4,.. (resp. 1,3,5,..) of two registers;
// pm4Lo1/pm4Hi1 interleave 4-lane groups of two registers.
__m512i pm1Lo1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
__m512i pm1Hi1 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
__m512i pm4Lo1 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi1 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// Even-numbered outputs (0,2,..,14) are folded into sum893 and sum895.
__m512 upper4 = _mm512_shuffle_f32x4(sum893, sum901, 238);
__m512 upper5 = _mm512_shuffle_f32x4(sum897, sum905, 238);
sum893 = _mm512_shuffle_f32x4(sum893, sum901, 68);
sum897 = _mm512_shuffle_f32x4(sum897, sum905, 68);
sum893 = _mm512_add_ps(sum893, upper4);
sum897 = _mm512_add_ps(sum897, upper5);
__m512 upper7 = _mm512_shuffle_f32x4(sum895, sum903, 238);
__m512 upper8 = _mm512_shuffle_f32x4(sum899, sum907, 238);
sum895 = _mm512_shuffle_f32x4(sum895, sum903, 68);
sum899 = _mm512_shuffle_f32x4(sum899, sum907, 68);
sum895 = _mm512_add_ps(sum895, upper7);
sum899 = _mm512_add_ps(sum899, upper8);
__m512 upper3 = _mm512_permutex2var_ps(sum893, pm4Hi1, sum897);
__m512 upper6 = _mm512_permutex2var_ps(sum895, pm4Hi1, sum899);
sum893 = _mm512_permutex2var_ps(sum893, pm4Lo1, sum897);
sum895 = _mm512_permutex2var_ps(sum895, pm4Lo1, sum899);
sum893 = _mm512_add_ps(sum893, upper3);
sum895 = _mm512_add_ps(sum895, upper6);
// Odd-numbered outputs (1,3,..,15) are folded into sum894 and sum896 the same way.
__m512 upper11 = _mm512_shuffle_f32x4(sum894, sum902, 238);
__m512 upper12 = _mm512_shuffle_f32x4(sum898, sum906, 238);
sum894 = _mm512_shuffle_f32x4(sum894, sum902, 68);
sum898 = _mm512_shuffle_f32x4(sum898, sum906, 68);
sum894 = _mm512_add_ps(sum894, upper11);
sum898 = _mm512_add_ps(sum898, upper12);
__m512 upper14 = _mm512_shuffle_f32x4(sum896, sum904, 238);
__m512 upper15 = _mm512_shuffle_f32x4(sum900, sum908, 238);
sum896 = _mm512_shuffle_f32x4(sum896, sum904, 68);
sum900 = _mm512_shuffle_f32x4(sum900, sum908, 68);
sum896 = _mm512_add_ps(sum896, upper14);
sum900 = _mm512_add_ps(sum900, upper15);
__m512 upper10 = _mm512_permutex2var_ps(sum894, pm4Hi1, sum898);
__m512 upper13 = _mm512_permutex2var_ps(sum896, pm4Hi1, sum900);
sum894 = _mm512_permutex2var_ps(sum894, pm4Lo1, sum898);
sum896 = _mm512_permutex2var_ps(sum896, pm4Lo1, sum900);
sum894 = _mm512_add_ps(sum894, upper10);
sum896 = _mm512_add_ps(sum896, upper13);
// 4 -> 2 lanes per output; sum893 now holds the even outputs, sum894 the odd ones.
__m512 upper2 = _mm512_shuffle_ps(sum893, sum895, 238);
__m512 upper9 = _mm512_shuffle_ps(sum894, sum896, 238);
sum893 = _mm512_shuffle_ps(sum893, sum895, 68);
sum894 = _mm512_shuffle_ps(sum894, sum896, 68);
sum893 = _mm512_add_ps(sum893, upper2);
sum894 = _mm512_add_ps(sum894, upper9);
// Final 2 -> 1 fold interleaves even and odd outputs into lane order 0..15.
__m512 upper1 = _mm512_permutex2var_ps(sum893, pm1Hi1, sum894);
sum893 = _mm512_permutex2var_ps(sum893, pm1Lo1, sum894);
sum893 = _mm512_add_ps(sum893, upper1);
sum893 = _mm512_add_ps(sum893, bias12);
_mm512_mask_storeu_ps(datPtr57+(ptrdiff_t)0+(ptrdiff_t)64*i103, 65535, sum893);
}
return;
}
// 8-output path: same scheme with 8 accumulators and 256 bytes of weights per step.
for (ptrdiff_t i104 = 0; i104 < 1; ++i104) {
// sum909 is output 0, ..., sum916 is output 7.
__m512 sum909 = _mm512_setzero_ps();
__m512 sum910 = _mm512_setzero_ps();
__m512 sum911 = _mm512_setzero_ps();
__m512 sum912 = _mm512_setzero_ps();
__m512 sum913 = _mm512_setzero_ps();
__m512 sum914 = _mm512_setzero_ps();
__m512 sum915 = _mm512_setzero_ps();
__m512 sum916 = _mm512_setzero_ps();
for (ptrdiff_t j94 = 0; j94 < 128; ++j94) {
__m512i wts9 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)0+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512 dat2713 = _mm512_maskz_loadu_ps(65535, datPtr56+(ptrdiff_t)0+(ptrdiff_t)64*j94);
__m512i wts10 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)64+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512i wts11 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)128+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512i wts12 = _mm512_maskz_loadu_epi32(65535, wtPtr27+(ptrdiff_t)192+(ptrdiff_t)65536*i104+(ptrdiff_t)256*j94);
__m512 wtLo25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts9));
__m512 wtHi25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts9, 1));
__m512 wtLo26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts10));
__m512 wtHi26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts10, 1));
__m512 wtLo27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts11));
__m512 wtHi27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts11, 1));
__m512 wtLo28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wts12));
__m512 wtHi28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wts12, 1));
sum909 = _mm512_fmadd_ps(wtLo25, dat2713, sum909);
sum910 = _mm512_fmadd_ps(wtHi25, dat2713, sum910);
sum911 = _mm512_fmadd_ps(wtLo26, dat2713, sum911);
sum912 = _mm512_fmadd_ps(wtHi26, dat2713, sum912);
sum913 = _mm512_fmadd_ps(wtLo27, dat2713, sum913);
sum914 = _mm512_fmadd_ps(wtHi27, dat2713, sum914);
sum915 = _mm512_fmadd_ps(wtLo28, dat2713, sum915);
sum916 = _mm512_fmadd_ps(wtHi28, dat2713, sum916);
}
// Only 8 biases are read and 8 results stored (mask 255).
__m512 bias13 = _mm512_maskz_loadu_ps(255, biasPtr26+(ptrdiff_t)0+(ptrdiff_t)32*i104);
// The last fold uses single-register permutes: pmEven1/pmOdd1 pick lanes
// 0,2,4,.. and 1,3,5,.. (only the low 4 index bits select a lane), so the 8
// results land in lanes 0..7.
__m512i pmEven1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmOdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512i pm4Lo2 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i pm4Hi2 = _mm512_set_epi32(31, 30, 29, 28, 15, 14, 13, 12, 23, 22, 21, 20, 7, 6, 5, 4);
// 16 -> 8 lanes, merging outputs (0,4), (2,6), (1,5), (3,7).
__m512 upper18 = _mm512_shuffle_f32x4(sum909, sum913, 238);
__m512 upper19 = _mm512_shuffle_f32x4(sum911, sum915, 238);
sum909 = _mm512_shuffle_f32x4(sum909, sum913, 68);
sum911 = _mm512_shuffle_f32x4(sum911, sum915, 68);
sum909 = _mm512_add_ps(sum909, upper18);
sum911 = _mm512_add_ps(sum911, upper19);
__m512 upper21 = _mm512_shuffle_f32x4(sum910, sum914, 238);
__m512 upper22 = _mm512_shuffle_f32x4(sum912, sum916, 238);
sum910 = _mm512_shuffle_f32x4(sum910, sum914, 68);
sum912 = _mm512_shuffle_f32x4(sum912, sum916, 68);
sum910 = _mm512_add_ps(sum910, upper21);
sum912 = _mm512_add_ps(sum912, upper22);
// 8 -> 4 lanes: sum909 holds the even outputs, sum910 the odd ones.
__m512 upper17 = _mm512_permutex2var_ps(sum909, pm4Hi2, sum911);
__m512 upper20 = _mm512_permutex2var_ps(sum910, pm4Hi2, sum912);
sum909 = _mm512_permutex2var_ps(sum909, pm4Lo2, sum911);
sum910 = _mm512_permutex2var_ps(sum910, pm4Lo2, sum912);
sum909 = _mm512_add_ps(sum909, upper17);
sum910 = _mm512_add_ps(sum910, upper20);
// 4 -> 2 lanes, interleaving even and odd outputs into output order.
__m512 upper16 = _mm512_shuffle_ps(sum909, sum910, 238);
sum909 = _mm512_shuffle_ps(sum909, sum910, 68);
sum909 = _mm512_add_ps(sum909, upper16);
// 2 -> 1 lane.
__m512 upper23 = _mm512_permutexvar_ps(pmOdd1, sum909);
sum909 = _mm512_permutexvar_ps(pmEven1, sum909);
sum909 = _mm512_add_ps(sum909, upper23);
sum909 = _mm512_add_ps(sum909, bias13);
_mm512_mask_storeu_ps(datPtr57+(ptrdiff_t)0+(ptrdiff_t)32*i104, 255, sum909);
}
}

// Dispatches ResNeXt50FcApply1Callee1 through ResNeXt50ThreaderDo1 as a
// one-dimensional task with hull 63, handing tensors171 to the callee
// unchanged through the task's any1 field.
static void ResNeXt50FcApply1(ResNeXt50ThreaderTeam1* team98, char** tensors171) {
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 1;
	job.hull1[0] = 63;
	job.any1 = tensors171;
	job.callee1 = ResNeXt50FcApply1Callee1;
	ResNeXt50ThreaderDo1(team98, &job);
}

static void ResNeXt50OneArrangeWts1Callee1(ResNeXt50ThreaderTask1* task14, int64_t* pt12) {
char** tensors12 = task14->any1;
ptrdiff_t b44 = pt12[0];
char*restrict wtPtr2 = tensors12[0]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr2 = tensors12[1]+(ptrdiff_t)1536*0;
char*restrict bnPtr3 = tensors12[2]+(ptrdiff_t)8*384*0;
char*restrict wtPtr3 = tensors12[3]+(ptrdiff_t)3340*0+(ptrdiff_t)98304*0;
char*restrict biasPtr3 = tensors12[4]+(ptrdiff_t)1536*0;
char*restrict bnPtr4 = tensors12[5]+(ptrdiff_t)8*384*0;
char*restrict arranged1 = tensors12[6]+(ptrdiff_t)1284096*0+(ptrdiff_t)99840*0;
ptrdiff_t ii1 = 1;
for (ptrdiff_t i13 = 0; i13 < ii1; ++i13) {
ptrdiff_t j7 = 8*b44;
ptrdiff_t jj19 = j7+8;
for (; j7 < jj19; ++j7) {
if (j7 < 16) {
ptrdiff_t k46 = 0+16*(j7-0);
ptrdiff_t l9 = (size_t)(0+k46)/6;
ptrdiff_t cut1 = (size_t)(0+k46)%6;
switch (cut1) {
case 0:;
case 2: {
__m512 sum2 = _mm512_maskz_loadu_ps(65535, biasPtr2+1536*i13+4*k46);
__m512i pmMul2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo1 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k46+384*i13));
__m512 masHi1 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k46+384*i13)+(ptrdiff_t)64);
__m512 postMul4 = _mm512_permutex2var_ps(masLo1, pmMul2, masHi1);
__m512 postAdd2 = _mm512_permutex2var_ps(masLo1, pmAdd2, masHi1);
sum2 = _mm512_fmadd_ps(sum2, postMul4, postAdd2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum2);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 65535-(4095>>cut1), sum2);
ptrdiff_t c5 = 0;
for (; c5 != 4; ++c5) {
__m512 wt15 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)0);
__m512 wt16 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)256);
__m512 wt17 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)512);
__m512 wt18 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)768);
__m512 wt19 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1024);
__m512 wt20 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1280);
__m512 wt21 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1536);
__m512 wt22 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)1792);
__m512 wt23 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2048);
__m512 wt24 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2304);
__m512 wt25 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2560);
__m512 wt26 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)2816);
__m512 wt27 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3072);
__m512 wt28 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3328);
__m512 wt29 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3584);
__m512 wt30 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c5+(ptrdiff_t)3840);
__m512 tmp1 = _mm512_unpacklo_ps(wt15, wt16);
__m512 tmp2 = _mm512_unpackhi_ps(wt15, wt16);
__m512 tmp3 = _mm512_unpacklo_ps(wt17, wt18);
__m512 tmp4 = _mm512_unpackhi_ps(wt17, wt18);
__m512 tmp5 = _mm512_unpacklo_ps(wt19, wt20);
__m512 tmp6 = _mm512_unpackhi_ps(wt19, wt20);
__m512 tmp7 = _mm512_unpacklo_ps(wt21, wt22);
__m512 tmp8 = _mm512_unpackhi_ps(wt21, wt22);
__m512 tmp9 = _mm512_unpacklo_ps(wt23, wt24);
__m512 tmp10 = _mm512_unpackhi_ps(wt23, wt24);
__m512 tmp11 = _mm512_unpacklo_ps(wt25, wt26);
__m512 tmp12 = _mm512_unpackhi_ps(wt25, wt26);
__m512 tmp13 = _mm512_unpacklo_ps(wt27, wt28);
__m512 tmp14 = _mm512_unpackhi_ps(wt27, wt28);
__m512 tmp15 = _mm512_unpacklo_ps(wt29, wt30);
__m512 tmp16 = _mm512_unpackhi_ps(wt29, wt30);
__m512 tmp17 = _mm512_shuffle_ps(tmp1, tmp3, 68);
__m512 tmp18 = _mm512_shuffle_ps(tmp1, tmp3, 238);
__m512 tmp19 = _mm512_shuffle_ps(tmp2, tmp4, 68);
__m512 tmp20 = _mm512_shuffle_ps(tmp2, tmp4, 238);
__m512 tmp21 = _mm512_shuffle_ps(tmp5, tmp7, 68);
__m512 tmp22 = _mm512_shuffle_ps(tmp5, tmp7, 238);
__m512 tmp23 = _mm512_shuffle_ps(tmp6, tmp8, 68);
__m512 tmp24 = _mm512_shuffle_ps(tmp6, tmp8, 238);
__m512 tmp25 = _mm512_shuffle_ps(tmp9, tmp11, 68);
__m512 tmp26 = _mm512_shuffle_ps(tmp9, tmp11, 238);
__m512 tmp27 = _mm512_shuffle_ps(tmp10, tmp12, 68);
__m512 tmp28 = _mm512_shuffle_ps(tmp10, tmp12, 238);
__m512 tmp29 = _mm512_shuffle_ps(tmp13, tmp15, 68);
__m512 tmp30 = _mm512_shuffle_ps(tmp13, tmp15, 238);
__m512 tmp31 = _mm512_shuffle_ps(tmp14, tmp16, 68);
__m512 tmp32 = _mm512_shuffle_ps(tmp14, tmp16, 238);
__m512 tmp33 = _mm512_shuffle_f32x4(tmp17, tmp21, 136);
__m512 tmp34 = _mm512_shuffle_f32x4(tmp17, tmp21, 221);
__m512 tmp35 = _mm512_shuffle_f32x4(tmp18, tmp22, 136);
__m512 tmp36 = _mm512_shuffle_f32x4(tmp18, tmp22, 221);
__m512 tmp37 = _mm512_shuffle_f32x4(tmp19, tmp23, 136);
__m512 tmp38 = _mm512_shuffle_f32x4(tmp19, tmp23, 221);
__m512 tmp39 = _mm512_shuffle_f32x4(tmp20, tmp24, 136);
__m512 tmp40 = _mm512_shuffle_f32x4(tmp20, tmp24, 221);
__m512 tmp41 = _mm512_shuffle_f32x4(tmp25, tmp29, 136);
__m512 tmp42 = _mm512_shuffle_f32x4(tmp25, tmp29, 221);
__m512 tmp43 = _mm512_shuffle_f32x4(tmp26, tmp30, 136);
__m512 tmp44 = _mm512_shuffle_f32x4(tmp26, tmp30, 221);
__m512 tmp45 = _mm512_shuffle_f32x4(tmp27, tmp31, 136);
__m512 tmp46 = _mm512_shuffle_f32x4(tmp27, tmp31, 221);
__m512 tmp47 = _mm512_shuffle_f32x4(tmp28, tmp32, 136);
__m512 tmp48 = _mm512_shuffle_f32x4(tmp28, tmp32, 221);
wt15 = _mm512_shuffle_f32x4(tmp33, tmp41, 136);
wt23 = _mm512_shuffle_f32x4(tmp33, tmp41, 221);
wt16 = _mm512_shuffle_f32x4(tmp35, tmp43, 136);
wt24 = _mm512_shuffle_f32x4(tmp35, tmp43, 221);
wt17 = _mm512_shuffle_f32x4(tmp37, tmp45, 136);
wt25 = _mm512_shuffle_f32x4(tmp37, tmp45, 221);
wt18 = _mm512_shuffle_f32x4(tmp39, tmp47, 136);
wt26 = _mm512_shuffle_f32x4(tmp39, tmp47, 221);
wt19 = _mm512_shuffle_f32x4(tmp34, tmp42, 136);
wt27 = _mm512_shuffle_f32x4(tmp34, tmp42, 221);
wt20 = _mm512_shuffle_f32x4(tmp36, tmp44, 136);
wt28 = _mm512_shuffle_f32x4(tmp36, tmp44, 221);
wt21 = _mm512_shuffle_f32x4(tmp38, tmp46, 136);
wt29 = _mm512_shuffle_f32x4(tmp38, tmp46, 221);
wt22 = _mm512_shuffle_f32x4(tmp40, tmp48, 136);
wt30 = _mm512_shuffle_f32x4(tmp40, tmp48, 221);
wt15 = _mm512_mul_ps(wt15, postMul4);
wt16 = _mm512_mul_ps(wt16, postMul4);
wt17 = _mm512_mul_ps(wt17, postMul4);
wt18 = _mm512_mul_ps(wt18, postMul4);
wt19 = _mm512_mul_ps(wt19, postMul4);
wt20 = _mm512_mul_ps(wt20, postMul4);
wt21 = _mm512_mul_ps(wt21, postMul4);
wt22 = _mm512_mul_ps(wt22, postMul4);
wt23 = _mm512_mul_ps(wt23, postMul4);
wt24 = _mm512_mul_ps(wt24, postMul4);
wt25 = _mm512_mul_ps(wt25, postMul4);
wt26 = _mm512_mul_ps(wt26, postMul4);
wt27 = _mm512_mul_ps(wt27, postMul4);
wt28 = _mm512_mul_ps(wt28, postMul4);
wt29 = _mm512_mul_ps(wt29, postMul4);
wt30 = _mm512_mul_ps(wt30, postMul4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)0, 63>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)0, 63>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)0, 63>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)0, 63>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)0, 63>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)0, 63>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)0, 63>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)0, 63>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)0, 63>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)0, 63>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)0, 63>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)0, 63>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)0, 63>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)0, 63>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)0, 63>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)0, 63>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)1536, 4032>>cut1, wt30);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt15);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt16);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt17);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt18);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt19);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt20);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt21);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt22);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt23);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt24);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt25);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt26);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt27);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt28);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt29);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c5)+(ptrdiff_t)3072, 65535-(4095>>cut1), wt30);
}
break;
}
default: {
cut1 = 4;
__m512 sum3 = _mm512_maskz_loadu_ps(65535, biasPtr2+1536*i13+4*k46);
__m512i pmMul3 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd3 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo2 = _mm512_loadu_ps(bnPtr3+(ptrdiff_t)8*(k46+384*i13));
__m512 masHi2 = _mm512_maskz_loadu_ps(65535, bnPtr3+(ptrdiff_t)8*(k46+384*i13)+(ptrdiff_t)64);
__m512 postMul5 = _mm512_permutex2var_ps(masLo2, pmMul3, masHi2);
__m512 postAdd3 = _mm512_permutex2var_ps(masLo2, pmAdd3, masHi2);
sum3 = _mm512_fmadd_ps(sum3, postMul5, postAdd3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)0, 63>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)1536, 4032>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)3072, 258048>>cut1, sum3);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*0+(ptrdiff_t)4608, 65535-(262143>>cut1), sum3);
ptrdiff_t c6 = 0;
for (; c6 != 4; ++c6) {
__m512 wt31 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)0);
__m512 wt32 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)256);
__m512 wt33 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)512);
__m512 wt34 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)768);
__m512 wt35 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1024);
__m512 wt36 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1280);
__m512 wt37 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1536);
__m512 wt38 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)1792);
__m512 wt39 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2048);
__m512 wt40 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2304);
__m512 wt41 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2560);
__m512 wt42 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)2816);
__m512 wt43 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3072);
__m512 wt44 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3328);
__m512 wt45 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3584);
__m512 wt46 = _mm512_maskz_loadu_ps(65535, wtPtr2+98304*i13+256*k46+64*c6+(ptrdiff_t)3840);
__m512 tmp49 = _mm512_unpacklo_ps(wt31, wt32);
__m512 tmp50 = _mm512_unpackhi_ps(wt31, wt32);
__m512 tmp51 = _mm512_unpacklo_ps(wt33, wt34);
__m512 tmp52 = _mm512_unpackhi_ps(wt33, wt34);
__m512 tmp53 = _mm512_unpacklo_ps(wt35, wt36);
__m512 tmp54 = _mm512_unpackhi_ps(wt35, wt36);
__m512 tmp55 = _mm512_unpacklo_ps(wt37, wt38);
__m512 tmp56 = _mm512_unpackhi_ps(wt37, wt38);
__m512 tmp57 = _mm512_unpacklo_ps(wt39, wt40);
__m512 tmp58 = _mm512_unpackhi_ps(wt39, wt40);
__m512 tmp59 = _mm512_unpacklo_ps(wt41, wt42);
__m512 tmp60 = _mm512_unpackhi_ps(wt41, wt42);
__m512 tmp61 = _mm512_unpacklo_ps(wt43, wt44);
__m512 tmp62 = _mm512_unpackhi_ps(wt43, wt44);
__m512 tmp63 = _mm512_unpacklo_ps(wt45, wt46);
__m512 tmp64 = _mm512_unpackhi_ps(wt45, wt46);
__m512 tmp65 = _mm512_shuffle_ps(tmp49, tmp51, 68);
__m512 tmp66 = _mm512_shuffle_ps(tmp49, tmp51, 238);
__m512 tmp67 = _mm512_shuffle_ps(tmp50, tmp52, 68);
__m512 tmp68 = _mm512_shuffle_ps(tmp50, tmp52, 238);
__m512 tmp69 = _mm512_shuffle_ps(tmp53, tmp55, 68);
__m512 tmp70 = _mm512_shuffle_ps(tmp53, tmp55, 238);
__m512 tmp71 = _mm512_shuffle_ps(tmp54, tmp56, 68);
__m512 tmp72 = _mm512_shuffle_ps(tmp54, tmp56, 238);
__m512 tmp73 = _mm512_shuffle_ps(tmp57, tmp59, 68);
__m512 tmp74 = _mm512_shuffle_ps(tmp57, tmp59, 238);
__m512 tmp75 = _mm512_shuffle_ps(tmp58, tmp60, 68);
__m512 tmp76 = _mm512_shuffle_ps(tmp58, tmp60, 238);
__m512 tmp77 = _mm512_shuffle_ps(tmp61, tmp63, 68);
__m512 tmp78 = _mm512_shuffle_ps(tmp61, tmp63, 238);
__m512 tmp79 = _mm512_shuffle_ps(tmp62, tmp64, 68);
__m512 tmp80 = _mm512_shuffle_ps(tmp62, tmp64, 238);
__m512 tmp81 = _mm512_shuffle_f32x4(tmp65, tmp69, 136);
__m512 tmp82 = _mm512_shuffle_f32x4(tmp65, tmp69, 221);
__m512 tmp83 = _mm512_shuffle_f32x4(tmp66, tmp70, 136);
__m512 tmp84 = _mm512_shuffle_f32x4(tmp66, tmp70, 221);
__m512 tmp85 = _mm512_shuffle_f32x4(tmp67, tmp71, 136);
__m512 tmp86 = _mm512_shuffle_f32x4(tmp67, tmp71, 221);
__m512 tmp87 = _mm512_shuffle_f32x4(tmp68, tmp72, 136);
__m512 tmp88 = _mm512_shuffle_f32x4(tmp68, tmp72, 221);
__m512 tmp89 = _mm512_shuffle_f32x4(tmp73, tmp77, 136);
__m512 tmp90 = _mm512_shuffle_f32x4(tmp73, tmp77, 221);
__m512 tmp91 = _mm512_shuffle_f32x4(tmp74, tmp78, 136);
__m512 tmp92 = _mm512_shuffle_f32x4(tmp74, tmp78, 221);
__m512 tmp93 = _mm512_shuffle_f32x4(tmp75, tmp79, 136);
__m512 tmp94 = _mm512_shuffle_f32x4(tmp75, tmp79, 221);
__m512 tmp95 = _mm512_shuffle_f32x4(tmp76, tmp80, 136);
__m512 tmp96 = _mm512_shuffle_f32x4(tmp76, tmp80, 221);
wt31 = _mm512_shuffle_f32x4(tmp81, tmp89, 136);
wt39 = _mm512_shuffle_f32x4(tmp81, tmp89, 221);
wt32 = _mm512_shuffle_f32x4(tmp83, tmp91, 136);
wt40 = _mm512_shuffle_f32x4(tmp83, tmp91, 221);
wt33 = _mm512_shuffle_f32x4(tmp85, tmp93, 136);
wt41 = _mm512_shuffle_f32x4(tmp85, tmp93, 221);
wt34 = _mm512_shuffle_f32x4(tmp87, tmp95, 136);
wt42 = _mm512_shuffle_f32x4(tmp87, tmp95, 221);
wt35 = _mm512_shuffle_f32x4(tmp82, tmp90, 136);
wt43 = _mm512_shuffle_f32x4(tmp82, tmp90, 221);
wt36 = _mm512_shuffle_f32x4(tmp84, tmp92, 136);
wt44 = _mm512_shuffle_f32x4(tmp84, tmp92, 221);
wt37 = _mm512_shuffle_f32x4(tmp86, tmp94, 136);
wt45 = _mm512_shuffle_f32x4(tmp86, tmp94, 221);
wt38 = _mm512_shuffle_f32x4(tmp88, tmp96, 136);
wt46 = _mm512_shuffle_f32x4(tmp88, tmp96, 221);
wt31 = _mm512_mul_ps(wt31, postMul5);
wt32 = _mm512_mul_ps(wt32, postMul5);
wt33 = _mm512_mul_ps(wt33, postMul5);
wt34 = _mm512_mul_ps(wt34, postMul5);
wt35 = _mm512_mul_ps(wt35, postMul5);
wt36 = _mm512_mul_ps(wt36, postMul5);
wt37 = _mm512_mul_ps(wt37, postMul5);
wt38 = _mm512_mul_ps(wt38, postMul5);
wt39 = _mm512_mul_ps(wt39, postMul5);
wt40 = _mm512_mul_ps(wt40, postMul5);
wt41 = _mm512_mul_ps(wt41, postMul5);
wt42 = _mm512_mul_ps(wt42, postMul5);
wt43 = _mm512_mul_ps(wt43, postMul5);
wt44 = _mm512_mul_ps(wt44, postMul5);
wt45 = _mm512_mul_ps(wt45, postMul5);
wt46 = _mm512_mul_ps(wt46, postMul5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)0, 63>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)0, 63>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)0, 63>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)0, 63>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)0, 63>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)0, 63>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)0, 63>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)0, 63>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)0, 63>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)0, 63>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)0, 63>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)0, 63>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)0, 63>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)0, 63>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)0, 63>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)0, 63>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)1536, 4032>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)3072, 258048>>cut1, wt46);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(1+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt31);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(2+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt32);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(3+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt33);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(4+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt34);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(5+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt35);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(6+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt36);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(7+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt37);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(8+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt38);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(9+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt39);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(10+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt40);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(11+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt41);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(12+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt42);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(13+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt43);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(14+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt44);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(15+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt45);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l9+4*cut1+24*(16+16*c6)+(ptrdiff_t)4608, 65535-(262143>>cut1), wt46);
}
}
}
} else {
ptrdiff_t k47 = 0+16*(j7-16);
ptrdiff_t l10 = (size_t)(256+k47)/6;
ptrdiff_t cut2 = (size_t)(256+k47)%6;
switch (cut2) {
case 0:;
case 2: {
__m512 sum4 = _mm512_maskz_loadu_ps(65535, biasPtr3+1536*i13+4*k47);
__m512i pmMul4 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd4 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo3 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k47+384*i13));
__m512 masHi3 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k47+384*i13)+(ptrdiff_t)64);
__m512 postMul6 = _mm512_permutex2var_ps(masLo3, pmMul4, masHi3);
__m512 postAdd4 = _mm512_permutex2var_ps(masLo3, pmAdd4, masHi3);
sum4 = _mm512_fmadd_ps(sum4, postMul6, postAdd4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum4);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 65535-(4095>>cut2), sum4);
ptrdiff_t c7 = 0;
for (; c7 != 4; ++c7) {
__m512 wt47 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)0);
__m512 wt48 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)256);
__m512 wt49 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)512);
__m512 wt50 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)768);
__m512 wt51 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1024);
__m512 wt52 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1280);
__m512 wt53 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1536);
__m512 wt54 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)1792);
__m512 wt55 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2048);
__m512 wt56 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2304);
__m512 wt57 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2560);
__m512 wt58 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)2816);
__m512 wt59 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3072);
__m512 wt60 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3328);
__m512 wt61 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3584);
__m512 wt62 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c7+(ptrdiff_t)3840);
__m512 tmp97 = _mm512_unpacklo_ps(wt47, wt48);
__m512 tmp98 = _mm512_unpackhi_ps(wt47, wt48);
__m512 tmp99 = _mm512_unpacklo_ps(wt49, wt50);
__m512 tmp100 = _mm512_unpackhi_ps(wt49, wt50);
__m512 tmp101 = _mm512_unpacklo_ps(wt51, wt52);
__m512 tmp102 = _mm512_unpackhi_ps(wt51, wt52);
__m512 tmp103 = _mm512_unpacklo_ps(wt53, wt54);
__m512 tmp104 = _mm512_unpackhi_ps(wt53, wt54);
__m512 tmp105 = _mm512_unpacklo_ps(wt55, wt56);
__m512 tmp106 = _mm512_unpackhi_ps(wt55, wt56);
__m512 tmp107 = _mm512_unpacklo_ps(wt57, wt58);
__m512 tmp108 = _mm512_unpackhi_ps(wt57, wt58);
__m512 tmp109 = _mm512_unpacklo_ps(wt59, wt60);
__m512 tmp110 = _mm512_unpackhi_ps(wt59, wt60);
__m512 tmp111 = _mm512_unpacklo_ps(wt61, wt62);
__m512 tmp112 = _mm512_unpackhi_ps(wt61, wt62);
__m512 tmp113 = _mm512_shuffle_ps(tmp97, tmp99, 68);
__m512 tmp114 = _mm512_shuffle_ps(tmp97, tmp99, 238);
__m512 tmp115 = _mm512_shuffle_ps(tmp98, tmp100, 68);
__m512 tmp116 = _mm512_shuffle_ps(tmp98, tmp100, 238);
__m512 tmp117 = _mm512_shuffle_ps(tmp101, tmp103, 68);
__m512 tmp118 = _mm512_shuffle_ps(tmp101, tmp103, 238);
__m512 tmp119 = _mm512_shuffle_ps(tmp102, tmp104, 68);
__m512 tmp120 = _mm512_shuffle_ps(tmp102, tmp104, 238);
__m512 tmp121 = _mm512_shuffle_ps(tmp105, tmp107, 68);
__m512 tmp122 = _mm512_shuffle_ps(tmp105, tmp107, 238);
__m512 tmp123 = _mm512_shuffle_ps(tmp106, tmp108, 68);
__m512 tmp124 = _mm512_shuffle_ps(tmp106, tmp108, 238);
__m512 tmp125 = _mm512_shuffle_ps(tmp109, tmp111, 68);
__m512 tmp126 = _mm512_shuffle_ps(tmp109, tmp111, 238);
__m512 tmp127 = _mm512_shuffle_ps(tmp110, tmp112, 68);
__m512 tmp128 = _mm512_shuffle_ps(tmp110, tmp112, 238);
__m512 tmp129 = _mm512_shuffle_f32x4(tmp113, tmp117, 136);
__m512 tmp130 = _mm512_shuffle_f32x4(tmp113, tmp117, 221);
__m512 tmp131 = _mm512_shuffle_f32x4(tmp114, tmp118, 136);
__m512 tmp132 = _mm512_shuffle_f32x4(tmp114, tmp118, 221);
__m512 tmp133 = _mm512_shuffle_f32x4(tmp115, tmp119, 136);
__m512 tmp134 = _mm512_shuffle_f32x4(tmp115, tmp119, 221);
__m512 tmp135 = _mm512_shuffle_f32x4(tmp116, tmp120, 136);
__m512 tmp136 = _mm512_shuffle_f32x4(tmp116, tmp120, 221);
__m512 tmp137 = _mm512_shuffle_f32x4(tmp121, tmp125, 136);
__m512 tmp138 = _mm512_shuffle_f32x4(tmp121, tmp125, 221);
__m512 tmp139 = _mm512_shuffle_f32x4(tmp122, tmp126, 136);
__m512 tmp140 = _mm512_shuffle_f32x4(tmp122, tmp126, 221);
__m512 tmp141 = _mm512_shuffle_f32x4(tmp123, tmp127, 136);
__m512 tmp142 = _mm512_shuffle_f32x4(tmp123, tmp127, 221);
__m512 tmp143 = _mm512_shuffle_f32x4(tmp124, tmp128, 136);
__m512 tmp144 = _mm512_shuffle_f32x4(tmp124, tmp128, 221);
wt47 = _mm512_shuffle_f32x4(tmp129, tmp137, 136);
wt55 = _mm512_shuffle_f32x4(tmp129, tmp137, 221);
wt48 = _mm512_shuffle_f32x4(tmp131, tmp139, 136);
wt56 = _mm512_shuffle_f32x4(tmp131, tmp139, 221);
wt49 = _mm512_shuffle_f32x4(tmp133, tmp141, 136);
wt57 = _mm512_shuffle_f32x4(tmp133, tmp141, 221);
wt50 = _mm512_shuffle_f32x4(tmp135, tmp143, 136);
wt58 = _mm512_shuffle_f32x4(tmp135, tmp143, 221);
wt51 = _mm512_shuffle_f32x4(tmp130, tmp138, 136);
wt59 = _mm512_shuffle_f32x4(tmp130, tmp138, 221);
wt52 = _mm512_shuffle_f32x4(tmp132, tmp140, 136);
wt60 = _mm512_shuffle_f32x4(tmp132, tmp140, 221);
wt53 = _mm512_shuffle_f32x4(tmp134, tmp142, 136);
wt61 = _mm512_shuffle_f32x4(tmp134, tmp142, 221);
wt54 = _mm512_shuffle_f32x4(tmp136, tmp144, 136);
wt62 = _mm512_shuffle_f32x4(tmp136, tmp144, 221);
wt47 = _mm512_mul_ps(wt47, postMul6);
wt48 = _mm512_mul_ps(wt48, postMul6);
wt49 = _mm512_mul_ps(wt49, postMul6);
wt50 = _mm512_mul_ps(wt50, postMul6);
wt51 = _mm512_mul_ps(wt51, postMul6);
wt52 = _mm512_mul_ps(wt52, postMul6);
wt53 = _mm512_mul_ps(wt53, postMul6);
wt54 = _mm512_mul_ps(wt54, postMul6);
wt55 = _mm512_mul_ps(wt55, postMul6);
wt56 = _mm512_mul_ps(wt56, postMul6);
wt57 = _mm512_mul_ps(wt57, postMul6);
wt58 = _mm512_mul_ps(wt58, postMul6);
wt59 = _mm512_mul_ps(wt59, postMul6);
wt60 = _mm512_mul_ps(wt60, postMul6);
wt61 = _mm512_mul_ps(wt61, postMul6);
wt62 = _mm512_mul_ps(wt62, postMul6);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)0, 63>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)0, 63>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)0, 63>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)0, 63>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)0, 63>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)0, 63>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)0, 63>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)0, 63>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)0, 63>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)0, 63>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)0, 63>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)0, 63>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)0, 63>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)0, 63>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)0, 63>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)0, 63>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)1536, 4032>>cut2, wt62);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt47);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt48);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt49);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt50);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt51);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt52);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt53);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt54);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt55);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt56);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt57);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt58);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt59);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt60);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt61);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c7)+(ptrdiff_t)3072, 65535-(4095>>cut2), wt62);
}
break;
}
default: {
cut2 = 4;
__m512 sum5 = _mm512_maskz_loadu_ps(65535, biasPtr3+1536*i13+4*k47);
__m512i pmMul5 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd5 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo4 = _mm512_loadu_ps(bnPtr4+(ptrdiff_t)8*(k47+384*i13));
__m512 masHi4 = _mm512_maskz_loadu_ps(65535, bnPtr4+(ptrdiff_t)8*(k47+384*i13)+(ptrdiff_t)64);
__m512 postMul7 = _mm512_permutex2var_ps(masLo4, pmMul5, masHi4);
__m512 postAdd5 = _mm512_permutex2var_ps(masLo4, pmAdd5, masHi4);
sum5 = _mm512_fmadd_ps(sum5, postMul7, postAdd5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)0, 63>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)1536, 4032>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)3072, 258048>>cut2, sum5);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*0+(ptrdiff_t)4608, 65535-(262143>>cut2), sum5);
ptrdiff_t c8 = 0;
for (; c8 != 4; ++c8) {
__m512 wt63 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)0);
__m512 wt64 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)256);
__m512 wt65 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)512);
__m512 wt66 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)768);
__m512 wt67 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1024);
__m512 wt68 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1280);
__m512 wt69 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1536);
__m512 wt70 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)1792);
__m512 wt71 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2048);
__m512 wt72 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2304);
__m512 wt73 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2560);
__m512 wt74 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)2816);
__m512 wt75 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3072);
__m512 wt76 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3328);
__m512 wt77 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3584);
__m512 wt78 = _mm512_maskz_loadu_ps(65535, wtPtr3+98304*i13+256*k47+64*c8+(ptrdiff_t)3840);
__m512 tmp145 = _mm512_unpacklo_ps(wt63, wt64);
__m512 tmp146 = _mm512_unpackhi_ps(wt63, wt64);
__m512 tmp147 = _mm512_unpacklo_ps(wt65, wt66);
__m512 tmp148 = _mm512_unpackhi_ps(wt65, wt66);
__m512 tmp149 = _mm512_unpacklo_ps(wt67, wt68);
__m512 tmp150 = _mm512_unpackhi_ps(wt67, wt68);
__m512 tmp151 = _mm512_unpacklo_ps(wt69, wt70);
__m512 tmp152 = _mm512_unpackhi_ps(wt69, wt70);
__m512 tmp153 = _mm512_unpacklo_ps(wt71, wt72);
__m512 tmp154 = _mm512_unpackhi_ps(wt71, wt72);
__m512 tmp155 = _mm512_unpacklo_ps(wt73, wt74);
__m512 tmp156 = _mm512_unpackhi_ps(wt73, wt74);
__m512 tmp157 = _mm512_unpacklo_ps(wt75, wt76);
__m512 tmp158 = _mm512_unpackhi_ps(wt75, wt76);
__m512 tmp159 = _mm512_unpacklo_ps(wt77, wt78);
__m512 tmp160 = _mm512_unpackhi_ps(wt77, wt78);
__m512 tmp161 = _mm512_shuffle_ps(tmp145, tmp147, 68);
__m512 tmp162 = _mm512_shuffle_ps(tmp145, tmp147, 238);
__m512 tmp163 = _mm512_shuffle_ps(tmp146, tmp148, 68);
__m512 tmp164 = _mm512_shuffle_ps(tmp146, tmp148, 238);
__m512 tmp165 = _mm512_shuffle_ps(tmp149, tmp151, 68);
__m512 tmp166 = _mm512_shuffle_ps(tmp149, tmp151, 238);
__m512 tmp167 = _mm512_shuffle_ps(tmp150, tmp152, 68);
__m512 tmp168 = _mm512_shuffle_ps(tmp150, tmp152, 238);
__m512 tmp169 = _mm512_shuffle_ps(tmp153, tmp155, 68);
__m512 tmp170 = _mm512_shuffle_ps(tmp153, tmp155, 238);
__m512 tmp171 = _mm512_shuffle_ps(tmp154, tmp156, 68);
__m512 tmp172 = _mm512_shuffle_ps(tmp154, tmp156, 238);
__m512 tmp173 = _mm512_shuffle_ps(tmp157, tmp159, 68);
__m512 tmp174 = _mm512_shuffle_ps(tmp157, tmp159, 238);
__m512 tmp175 = _mm512_shuffle_ps(tmp158, tmp160, 68);
__m512 tmp176 = _mm512_shuffle_ps(tmp158, tmp160, 238);
__m512 tmp177 = _mm512_shuffle_f32x4(tmp161, tmp165, 136);
__m512 tmp178 = _mm512_shuffle_f32x4(tmp161, tmp165, 221);
__m512 tmp179 = _mm512_shuffle_f32x4(tmp162, tmp166, 136);
__m512 tmp180 = _mm512_shuffle_f32x4(tmp162, tmp166, 221);
__m512 tmp181 = _mm512_shuffle_f32x4(tmp163, tmp167, 136);
__m512 tmp182 = _mm512_shuffle_f32x4(tmp163, tmp167, 221);
__m512 tmp183 = _mm512_shuffle_f32x4(tmp164, tmp168, 136);
__m512 tmp184 = _mm512_shuffle_f32x4(tmp164, tmp168, 221);
__m512 tmp185 = _mm512_shuffle_f32x4(tmp169, tmp173, 136);
__m512 tmp186 = _mm512_shuffle_f32x4(tmp169, tmp173, 221);
__m512 tmp187 = _mm512_shuffle_f32x4(tmp170, tmp174, 136);
__m512 tmp188 = _mm512_shuffle_f32x4(tmp170, tmp174, 221);
__m512 tmp189 = _mm512_shuffle_f32x4(tmp171, tmp175, 136);
__m512 tmp190 = _mm512_shuffle_f32x4(tmp171, tmp175, 221);
__m512 tmp191 = _mm512_shuffle_f32x4(tmp172, tmp176, 136);
__m512 tmp192 = _mm512_shuffle_f32x4(tmp172, tmp176, 221);
wt63 = _mm512_shuffle_f32x4(tmp177, tmp185, 136);
wt71 = _mm512_shuffle_f32x4(tmp177, tmp185, 221);
wt64 = _mm512_shuffle_f32x4(tmp179, tmp187, 136);
wt72 = _mm512_shuffle_f32x4(tmp179, tmp187, 221);
wt65 = _mm512_shuffle_f32x4(tmp181, tmp189, 136);
wt73 = _mm512_shuffle_f32x4(tmp181, tmp189, 221);
wt66 = _mm512_shuffle_f32x4(tmp183, tmp191, 136);
wt74 = _mm512_shuffle_f32x4(tmp183, tmp191, 221);
wt67 = _mm512_shuffle_f32x4(tmp178, tmp186, 136);
wt75 = _mm512_shuffle_f32x4(tmp178, tmp186, 221);
wt68 = _mm512_shuffle_f32x4(tmp180, tmp188, 136);
wt76 = _mm512_shuffle_f32x4(tmp180, tmp188, 221);
wt69 = _mm512_shuffle_f32x4(tmp182, tmp190, 136);
wt77 = _mm512_shuffle_f32x4(tmp182, tmp190, 221);
wt70 = _mm512_shuffle_f32x4(tmp184, tmp192, 136);
wt78 = _mm512_shuffle_f32x4(tmp184, tmp192, 221);
wt63 = _mm512_mul_ps(wt63, postMul7);
wt64 = _mm512_mul_ps(wt64, postMul7);
wt65 = _mm512_mul_ps(wt65, postMul7);
wt66 = _mm512_mul_ps(wt66, postMul7);
wt67 = _mm512_mul_ps(wt67, postMul7);
wt68 = _mm512_mul_ps(wt68, postMul7);
wt69 = _mm512_mul_ps(wt69, postMul7);
wt70 = _mm512_mul_ps(wt70, postMul7);
wt71 = _mm512_mul_ps(wt71, postMul7);
wt72 = _mm512_mul_ps(wt72, postMul7);
wt73 = _mm512_mul_ps(wt73, postMul7);
wt74 = _mm512_mul_ps(wt74, postMul7);
wt75 = _mm512_mul_ps(wt75, postMul7);
wt76 = _mm512_mul_ps(wt76, postMul7);
wt77 = _mm512_mul_ps(wt77, postMul7);
wt78 = _mm512_mul_ps(wt78, postMul7);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)0, 63>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)0, 63>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)0, 63>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)0, 63>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)0, 63>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)0, 63>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)0, 63>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)0, 63>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)0, 63>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)0, 63>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)0, 63>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)0, 63>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)0, 63>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)0, 63>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)0, 63>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)0, 63>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)1536, 4032>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)3072, 258048>>cut2, wt78);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(1+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt63);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(2+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt64);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(3+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt65);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(4+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt66);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(5+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt67);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(6+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt68);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(7+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt69);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(8+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt70);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(9+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt71);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(10+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt72);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(11+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt73);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(12+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt74);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(13+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt75);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(14+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt76);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(15+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt77);
_mm512_mask_storeu_ps(arranged1+99840*i13+1560*l10+4*cut2+24*(16+16*c8)+(ptrdiff_t)4608, 65535-(262143>>cut2), wt78);
}
}
}
}
}
}
}

/*
 * Launches ResNeXt50OneArrangeWts1Callee1 through the threader.
 *
 * team19:    threader team, forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors11: pointer array stored in the task's any1 field.
 *
 * The task describes a 3-dimensional hull with extents 3 x 1 x 1.
 * NOTE(review): presumably ResNeXt50ThreaderDo1 invokes the callee once per
 * hull point and returns only after all invocations finish -- confirm
 * against the threader implementation.
 */
static void ResNeXt50OneArrangeWts1(ResNeXt50ThreaderTeam1* team19, char** tensors11) {
    ResNeXt50ThreaderTask1 job;
    /* Iteration space first, then the payload and the entry point. */
    job.nd1 = 3;
    job.hull1[0] = 3;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors11;
    job.callee1 = ResNeXt50OneArrangeWts1Callee1;
    ResNeXt50ThreaderDo1(team19, &job);
}

/*
 * Threader callee: copies a slice of the buffer at any1[0] into the buffer
 * at any1[1], changing the byte layout on the way.
 *
 * task16: task whose any1 field holds a char* array; entry 0 is the source,
 *         entry 1 the destination.
 * pt13:   hull coordinates; only pt13[1] is read and it selects the tiles.
 *
 * For every tile j handled by this call and every k in 0..63, 256 bytes
 * (four 16-float vectors) are read from   src + 256*j + 12608*k
 * and written to                           dst + 16384*j + 256*k.
 *
 * Tile range: starts at 2*pt13[1]; covers two tiles when pt13[1] < 23 and
 * three tiles otherwise, and never reaches tile index 49.
 * NOTE(review): the launcher sets hull1[1] = 24, so pt13[1] is presumably
 * 0..23, making the last slot cover tiles 46..48 -- confirm.
 */
static void ResNeXt50OneArrangeDats1Callee1(ResNeXt50ThreaderTask1* task16, int64_t* pt13) {
    char** bufs = task16->any1;
    const ptrdiff_t slot = pt13[1];
    /* All additional base offsets in the generated code are multiplied by 0. */
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    ptrdiff_t tile = 2*slot;
    const ptrdiff_t lastTile = tile+(slot < 23 ? 1 : 2);

    while (tile != 49) {
        for (ptrdiff_t row = 0; row < 64; ++row) {
            const char* from = src+256*tile+12608*row;
            char* to = dst+16384*tile+256*row;
            __m512 v[4];
            /* Read the whole 256-byte group before writing any of it. */
            for (ptrdiff_t q = 0; q < 4; ++q) {
                v[q] = _mm512_maskz_loadu_ps(65535, from+64*q);
            }
            for (ptrdiff_t q = 0; q < 4; ++q) {
                _mm512_mask_storeu_ps(to+64*q, 65535, v[q]);
            }
        }
        /* The tile with index lastTile is still processed, then we stop. */
        if (tile >= lastTile) break;
        ++tile;
    }
}

/*
 * Launches ResNeXt50OneArrangeDats1Callee1 through the threader.
 *
 * team20:    threader team, forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors13: pointer array stored in the task's any1 field; the callee uses
 *            entry 0 as source and entry 1 as destination.
 *
 * The task describes a 4-dimensional hull with extents 1 x 24 x 1 x 1.
 * NOTE(review): presumably the second extent (24) is the range of the
 * coordinate the callee reads as pt[1] -- confirm against the threader.
 */
static void ResNeXt50OneArrangeDats1(ResNeXt50ThreaderTeam1* team20, char** tensors13) {
    ResNeXt50ThreaderTask1 job;
    /* Iteration space first, then the payload and the entry point. */
    job.nd1 = 4;
    job.hull1[0] = 1;
    job.hull1[1] = 24;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors13;
    job.callee1 = ResNeXt50OneArrangeDats1Callee1;
    ResNeXt50ThreaderDo1(team20, &job);
}

/*
 * Threader callee: multiply-accumulate kernel over pre-arranged buffers.
 *
 * task18: task whose any1 field points at a void* array; element 0 of that
 *         array is a char* array holding
 *           [0] arranged weights (read),
 *           [1] arranged data    (read),
 *           [2] output           (written).
 * pt14:   hull coordinates; pt14[1] selects the data tile and pt14[0]
 *         selects a block of eight consecutive weight groups.
 *
 * Weight layout as addressed here: each group occupies 1560 bytes, made of
 * 65 records of 6 floats (24 bytes). Record 0 provides the initial value of
 * the six accumulator rows; records 1..64 provide the multipliers for the
 * 64 reduction steps.
 *
 * Per group the kernel keeps a 6 x 4 grid of 16-float accumulators: row c is
 * seeded with float c of record 0, and in step s every row c accumulates
 * (float c of record s+1) * (the four data vectors of step s). Row c is then
 * stored at  out + 256*tile + 75648*group + 12608*c  as 4 x 64 bytes.
 *
 * Nothing is done when pt14[1] == 49. Otherwise groups 8*pt14[0] through
 * 8*pt14[0]+7 are processed, stopping early if the group index reaches 64.
 *
 * NOTE(review): the c/q loops have small fixed trip counts and are expected
 * to be fully unrolled so the accumulators stay in registers -- check the
 * generated code if throughput matters.
 */
static void ResNeXt50OneApply1Callee1(ResNeXt50ThreaderTask1* task18, int64_t* pt14) {
    void** pair2 = task18->any1;
    char** bufs = pair2[0];
    const ptrdiff_t tile = pt14[1];
    const ptrdiff_t block = pt14[0];
    /* All additional base offsets in the generated code are multiplied by 0. */
    char*restrict wts = bufs[0];
    char*restrict dats = bufs[1];
    char*restrict out = bufs[2];

    /* Exactly one tile is handled per call; index 49 means "nothing to do". */
    if (tile == 49) return;

    ptrdiff_t grp = 8*block;
    const ptrdiff_t lastGrp = grp+7;
    for (; grp != 64; ++grp) {
        const char* wrow = wts+1560*grp;
        __m512 acc[6][4];

        /* Seed every accumulator row from record 0 of this weight group. */
        for (ptrdiff_t c = 0; c < 6; ++c) {
            const __m512 seed = _mm512_set1_ps(*(const float*)(wrow+4*c));
            for (ptrdiff_t q = 0; q < 4; ++q) {
                acc[c][q] = seed;
            }
        }

        /* 64 reduction steps: four data vectors times six broadcast weights. */
        for (ptrdiff_t s = 0; s < 64; ++s) {
            const char* dsrc = dats+16384*tile+256*s;
            const __m512 d0 = _mm512_loadu_ps(dsrc);
            const __m512 d1 = _mm512_loadu_ps(dsrc+64);
            const __m512 d2 = _mm512_loadu_ps(dsrc+128);
            const __m512 d3 = _mm512_loadu_ps(dsrc+192);
            /* Record s+1: skip the 24-byte seed record at the group start. */
            const char* wsrc = wrow+24+24*s;
            for (ptrdiff_t c = 0; c < 6; ++c) {
                const __m512 w = _mm512_set1_ps(*(const float*)(wsrc+4*c));
                acc[c][0] = _mm512_fmadd_ps(w, d0, acc[c][0]);
                acc[c][1] = _mm512_fmadd_ps(w, d1, acc[c][1]);
                acc[c][2] = _mm512_fmadd_ps(w, d2, acc[c][2]);
                acc[c][3] = _mm512_fmadd_ps(w, d3, acc[c][3]);
            }
        }

        /* Write the six rows, 12608 bytes apart, four vectors each. */
        char* dstBase = out+256*tile+75648*grp;
        for (ptrdiff_t c = 0; c < 6; ++c) {
            for (ptrdiff_t q = 0; q < 4; ++q) {
                _mm512_mask_storeu_ps(dstBase+12608*c+64*q, 65535, acc[c][q]);
            }
        }

        if (grp >= lastGrp) return;
    }
}

/*
 * Launches ResNeXt50OneApply1Callee1 through the threader.
 *
 * team21:    threader team, forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors15: pointer array placed in slot 0 of a two-element void* array
 *            whose address becomes the task's any1; the callee reads only
 *            slot 0. Slot 1 is set to 0.
 *
 * The task describes a 3-dimensional hull with extents 8 x 49 x 1.
 * NOTE(review): the argument array lives on this function's stack, so
 * ResNeXt50ThreaderDo1 presumably does not return until every callee
 * invocation has finished -- confirm against the threader implementation.
 */
static void ResNeXt50OneApply1(ResNeXt50ThreaderTeam1* team21, char** tensors15) {
    void* args[] = {tensors15, 0};
    ResNeXt50ThreaderTask1 job;
    /* Iteration space first, then the payload and the entry point. */
    job.nd1 = 3;
    job.hull1[0] = 8;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply1Callee1;
    ResNeXt50ThreaderDo1(team21, &job);
}

static void ResNeXt50OneArrangeWts2Callee1(ResNeXt50ThreaderTask1* task28, int64_t* pt19) {
char** tensors26 = task28->any1;
ptrdiff_t b48 = pt19[0];
char*restrict wtPtr5 = tensors26[0]+(ptrdiff_t)3340*0+(ptrdiff_t)131072*0;
char*restrict biasPtr5 = tensors26[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr6 = tensors26[2]+(ptrdiff_t)8*256*0;
char*restrict arranged3 = tensors26[3]+(ptrdiff_t)856064*0+(ptrdiff_t)132096*0;
ptrdiff_t ii8 = 1;
for (ptrdiff_t i20 = 0; i20 < ii8; ++i20) {
ptrdiff_t j14 = 4*b48;
ptrdiff_t jj23 = j14+4;
for (; j14 < jj23; ++j14) {
if (j14 < 15) {
ptrdiff_t k71 = 0+16*(j14-0);
ptrdiff_t l23 = (size_t)(0+k71)/6;
ptrdiff_t cut5 = (size_t)(0+k71)%6;
switch (cut5) {
case 0:;
case 2: {
__m512 sum71 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k71);
__m512i pmMul7 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd7 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo5 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k71+256*i20));
__m512 masHi5 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k71+256*i20)+(ptrdiff_t)64);
__m512 postMul14 = _mm512_permutex2var_ps(masLo5, pmMul7, masHi5);
__m512 postAdd8 = _mm512_permutex2var_ps(masLo5, pmAdd7, masHi5);
sum71 = _mm512_fmadd_ps(sum71, postMul14, postAdd8);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum71);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)3072, 4032>>cut5, sum71);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)6144, 65535-(4095>>cut5), sum71);
ptrdiff_t c12 = 0;
for (; c12 != 8; ++c12) {
__m512 wt105 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)0);
__m512 wt106 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)512);
__m512 wt107 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)1024);
__m512 wt108 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)1536);
__m512 wt109 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)2048);
__m512 wt110 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)2560);
__m512 wt111 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)3072);
__m512 wt112 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)3584);
__m512 wt113 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)4096);
__m512 wt114 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)4608);
__m512 wt115 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)5120);
__m512 wt116 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)5632);
__m512 wt117 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)6144);
__m512 wt118 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)6656);
__m512 wt119 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)7168);
__m512 wt120 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c12+(ptrdiff_t)7680);
__m512 tmp5205 = _mm512_unpacklo_ps(wt105, wt106);
__m512 tmp5206 = _mm512_unpackhi_ps(wt105, wt106);
__m512 tmp5207 = _mm512_unpacklo_ps(wt107, wt108);
__m512 tmp5208 = _mm512_unpackhi_ps(wt107, wt108);
__m512 tmp5209 = _mm512_unpacklo_ps(wt109, wt110);
__m512 tmp5210 = _mm512_unpackhi_ps(wt109, wt110);
__m512 tmp5211 = _mm512_unpacklo_ps(wt111, wt112);
__m512 tmp5212 = _mm512_unpackhi_ps(wt111, wt112);
__m512 tmp5213 = _mm512_unpacklo_ps(wt113, wt114);
__m512 tmp5214 = _mm512_unpackhi_ps(wt113, wt114);
__m512 tmp5215 = _mm512_unpacklo_ps(wt115, wt116);
__m512 tmp5216 = _mm512_unpackhi_ps(wt115, wt116);
__m512 tmp5217 = _mm512_unpacklo_ps(wt117, wt118);
__m512 tmp5218 = _mm512_unpackhi_ps(wt117, wt118);
__m512 tmp5219 = _mm512_unpacklo_ps(wt119, wt120);
__m512 tmp5220 = _mm512_unpackhi_ps(wt119, wt120);
__m512 tmp5221 = _mm512_shuffle_ps(tmp5205, tmp5207, 68);
__m512 tmp5222 = _mm512_shuffle_ps(tmp5205, tmp5207, 238);
__m512 tmp5223 = _mm512_shuffle_ps(tmp5206, tmp5208, 68);
__m512 tmp5224 = _mm512_shuffle_ps(tmp5206, tmp5208, 238);
__m512 tmp5225 = _mm512_shuffle_ps(tmp5209, tmp5211, 68);
__m512 tmp5226 = _mm512_shuffle_ps(tmp5209, tmp5211, 238);
__m512 tmp5227 = _mm512_shuffle_ps(tmp5210, tmp5212, 68);
__m512 tmp5228 = _mm512_shuffle_ps(tmp5210, tmp5212, 238);
__m512 tmp5229 = _mm512_shuffle_ps(tmp5213, tmp5215, 68);
__m512 tmp5230 = _mm512_shuffle_ps(tmp5213, tmp5215, 238);
__m512 tmp5231 = _mm512_shuffle_ps(tmp5214, tmp5216, 68);
__m512 tmp5232 = _mm512_shuffle_ps(tmp5214, tmp5216, 238);
__m512 tmp5233 = _mm512_shuffle_ps(tmp5217, tmp5219, 68);
__m512 tmp5234 = _mm512_shuffle_ps(tmp5217, tmp5219, 238);
__m512 tmp5235 = _mm512_shuffle_ps(tmp5218, tmp5220, 68);
__m512 tmp5236 = _mm512_shuffle_ps(tmp5218, tmp5220, 238);
__m512 tmp5237 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 136);
__m512 tmp5238 = _mm512_shuffle_f32x4(tmp5221, tmp5225, 221);
__m512 tmp5239 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 136);
__m512 tmp5240 = _mm512_shuffle_f32x4(tmp5222, tmp5226, 221);
__m512 tmp5241 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 136);
__m512 tmp5242 = _mm512_shuffle_f32x4(tmp5223, tmp5227, 221);
__m512 tmp5243 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 136);
__m512 tmp5244 = _mm512_shuffle_f32x4(tmp5224, tmp5228, 221);
__m512 tmp5245 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 136);
__m512 tmp5246 = _mm512_shuffle_f32x4(tmp5229, tmp5233, 221);
__m512 tmp5247 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 136);
__m512 tmp5248 = _mm512_shuffle_f32x4(tmp5230, tmp5234, 221);
__m512 tmp5249 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 136);
__m512 tmp5250 = _mm512_shuffle_f32x4(tmp5231, tmp5235, 221);
__m512 tmp5251 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 136);
__m512 tmp5252 = _mm512_shuffle_f32x4(tmp5232, tmp5236, 221);
wt105 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 136);
wt113 = _mm512_shuffle_f32x4(tmp5237, tmp5245, 221);
wt106 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 136);
wt114 = _mm512_shuffle_f32x4(tmp5239, tmp5247, 221);
wt107 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 136);
wt115 = _mm512_shuffle_f32x4(tmp5241, tmp5249, 221);
wt108 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 136);
wt116 = _mm512_shuffle_f32x4(tmp5243, tmp5251, 221);
wt109 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 136);
wt117 = _mm512_shuffle_f32x4(tmp5238, tmp5246, 221);
wt110 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 136);
wt118 = _mm512_shuffle_f32x4(tmp5240, tmp5248, 221);
wt111 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 136);
wt119 = _mm512_shuffle_f32x4(tmp5242, tmp5250, 221);
wt112 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 136);
wt120 = _mm512_shuffle_f32x4(tmp5244, tmp5252, 221);
wt105 = _mm512_mul_ps(wt105, postMul14);
wt106 = _mm512_mul_ps(wt106, postMul14);
wt107 = _mm512_mul_ps(wt107, postMul14);
wt108 = _mm512_mul_ps(wt108, postMul14);
wt109 = _mm512_mul_ps(wt109, postMul14);
wt110 = _mm512_mul_ps(wt110, postMul14);
wt111 = _mm512_mul_ps(wt111, postMul14);
wt112 = _mm512_mul_ps(wt112, postMul14);
wt113 = _mm512_mul_ps(wt113, postMul14);
wt114 = _mm512_mul_ps(wt114, postMul14);
wt115 = _mm512_mul_ps(wt115, postMul14);
wt116 = _mm512_mul_ps(wt116, postMul14);
wt117 = _mm512_mul_ps(wt117, postMul14);
wt118 = _mm512_mul_ps(wt118, postMul14);
wt119 = _mm512_mul_ps(wt119, postMul14);
wt120 = _mm512_mul_ps(wt120, postMul14);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)0, 63>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)0, 63>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)0, 63>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)0, 63>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)0, 63>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)0, 63>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)0, 63>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)0, 63>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)0, 63>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)0, 63>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)0, 63>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)0, 63>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)0, 63>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)0, 63>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)0, 63>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)0, 63>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)3072, 4032>>cut5, wt120);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt105);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt106);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt107);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt108);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt109);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt110);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt111);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt112);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt113);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt114);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt115);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt116);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt117);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt118);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt119);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c12)+(ptrdiff_t)6144, 65535-(4095>>cut5), wt120);
}
break;
}
default: {
cut5 = 4;
__m512 sum72 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k71);
__m512i pmMul8 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd8 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo6 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k71+256*i20));
__m512 masHi6 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k71+256*i20)+(ptrdiff_t)64);
__m512 postMul15 = _mm512_permutex2var_ps(masLo6, pmMul8, masHi6);
__m512 postAdd9 = _mm512_permutex2var_ps(masLo6, pmAdd8, masHi6);
sum72 = _mm512_fmadd_ps(sum72, postMul15, postAdd9);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)0, 63>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)3072, 4032>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)6144, 258048>>cut5, sum72);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*0+(ptrdiff_t)9216, 65535-(262143>>cut5), sum72);
ptrdiff_t c13 = 0;
for (; c13 != 8; ++c13) {
__m512 wt121 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)0);
__m512 wt122 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)512);
__m512 wt123 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)1024);
__m512 wt124 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)1536);
__m512 wt125 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)2048);
__m512 wt126 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)2560);
__m512 wt127 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)3072);
__m512 wt128 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)3584);
__m512 wt129 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)4096);
__m512 wt130 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)4608);
__m512 wt131 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)5120);
__m512 wt132 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)5632);
__m512 wt133 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)6144);
__m512 wt134 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)6656);
__m512 wt135 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)7168);
__m512 wt136 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k71+64*c13+(ptrdiff_t)7680);
__m512 tmp5253 = _mm512_unpacklo_ps(wt121, wt122);
__m512 tmp5254 = _mm512_unpackhi_ps(wt121, wt122);
__m512 tmp5255 = _mm512_unpacklo_ps(wt123, wt124);
__m512 tmp5256 = _mm512_unpackhi_ps(wt123, wt124);
__m512 tmp5257 = _mm512_unpacklo_ps(wt125, wt126);
__m512 tmp5258 = _mm512_unpackhi_ps(wt125, wt126);
__m512 tmp5259 = _mm512_unpacklo_ps(wt127, wt128);
__m512 tmp5260 = _mm512_unpackhi_ps(wt127, wt128);
__m512 tmp5261 = _mm512_unpacklo_ps(wt129, wt130);
__m512 tmp5262 = _mm512_unpackhi_ps(wt129, wt130);
__m512 tmp5263 = _mm512_unpacklo_ps(wt131, wt132);
__m512 tmp5264 = _mm512_unpackhi_ps(wt131, wt132);
__m512 tmp5265 = _mm512_unpacklo_ps(wt133, wt134);
__m512 tmp5266 = _mm512_unpackhi_ps(wt133, wt134);
__m512 tmp5267 = _mm512_unpacklo_ps(wt135, wt136);
__m512 tmp5268 = _mm512_unpackhi_ps(wt135, wt136);
__m512 tmp5269 = _mm512_shuffle_ps(tmp5253, tmp5255, 68);
__m512 tmp5270 = _mm512_shuffle_ps(tmp5253, tmp5255, 238);
__m512 tmp5271 = _mm512_shuffle_ps(tmp5254, tmp5256, 68);
__m512 tmp5272 = _mm512_shuffle_ps(tmp5254, tmp5256, 238);
__m512 tmp5273 = _mm512_shuffle_ps(tmp5257, tmp5259, 68);
__m512 tmp5274 = _mm512_shuffle_ps(tmp5257, tmp5259, 238);
__m512 tmp5275 = _mm512_shuffle_ps(tmp5258, tmp5260, 68);
__m512 tmp5276 = _mm512_shuffle_ps(tmp5258, tmp5260, 238);
__m512 tmp5277 = _mm512_shuffle_ps(tmp5261, tmp5263, 68);
__m512 tmp5278 = _mm512_shuffle_ps(tmp5261, tmp5263, 238);
__m512 tmp5279 = _mm512_shuffle_ps(tmp5262, tmp5264, 68);
__m512 tmp5280 = _mm512_shuffle_ps(tmp5262, tmp5264, 238);
__m512 tmp5281 = _mm512_shuffle_ps(tmp5265, tmp5267, 68);
__m512 tmp5282 = _mm512_shuffle_ps(tmp5265, tmp5267, 238);
__m512 tmp5283 = _mm512_shuffle_ps(tmp5266, tmp5268, 68);
__m512 tmp5284 = _mm512_shuffle_ps(tmp5266, tmp5268, 238);
__m512 tmp5285 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 136);
__m512 tmp5286 = _mm512_shuffle_f32x4(tmp5269, tmp5273, 221);
__m512 tmp5287 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 136);
__m512 tmp5288 = _mm512_shuffle_f32x4(tmp5270, tmp5274, 221);
__m512 tmp5289 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 136);
__m512 tmp5290 = _mm512_shuffle_f32x4(tmp5271, tmp5275, 221);
__m512 tmp5291 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 136);
__m512 tmp5292 = _mm512_shuffle_f32x4(tmp5272, tmp5276, 221);
__m512 tmp5293 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 136);
__m512 tmp5294 = _mm512_shuffle_f32x4(tmp5277, tmp5281, 221);
__m512 tmp5295 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 136);
__m512 tmp5296 = _mm512_shuffle_f32x4(tmp5278, tmp5282, 221);
__m512 tmp5297 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 136);
__m512 tmp5298 = _mm512_shuffle_f32x4(tmp5279, tmp5283, 221);
__m512 tmp5299 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 136);
__m512 tmp5300 = _mm512_shuffle_f32x4(tmp5280, tmp5284, 221);
wt121 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 136);
wt129 = _mm512_shuffle_f32x4(tmp5285, tmp5293, 221);
wt122 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 136);
wt130 = _mm512_shuffle_f32x4(tmp5287, tmp5295, 221);
wt123 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 136);
wt131 = _mm512_shuffle_f32x4(tmp5289, tmp5297, 221);
wt124 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 136);
wt132 = _mm512_shuffle_f32x4(tmp5291, tmp5299, 221);
wt125 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 136);
wt133 = _mm512_shuffle_f32x4(tmp5286, tmp5294, 221);
wt126 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 136);
wt134 = _mm512_shuffle_f32x4(tmp5288, tmp5296, 221);
wt127 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 136);
wt135 = _mm512_shuffle_f32x4(tmp5290, tmp5298, 221);
wt128 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 136);
wt136 = _mm512_shuffle_f32x4(tmp5292, tmp5300, 221);
wt121 = _mm512_mul_ps(wt121, postMul15);
wt122 = _mm512_mul_ps(wt122, postMul15);
wt123 = _mm512_mul_ps(wt123, postMul15);
wt124 = _mm512_mul_ps(wt124, postMul15);
wt125 = _mm512_mul_ps(wt125, postMul15);
wt126 = _mm512_mul_ps(wt126, postMul15);
wt127 = _mm512_mul_ps(wt127, postMul15);
wt128 = _mm512_mul_ps(wt128, postMul15);
wt129 = _mm512_mul_ps(wt129, postMul15);
wt130 = _mm512_mul_ps(wt130, postMul15);
wt131 = _mm512_mul_ps(wt131, postMul15);
wt132 = _mm512_mul_ps(wt132, postMul15);
wt133 = _mm512_mul_ps(wt133, postMul15);
wt134 = _mm512_mul_ps(wt134, postMul15);
wt135 = _mm512_mul_ps(wt135, postMul15);
wt136 = _mm512_mul_ps(wt136, postMul15);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)0, 63>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)0, 63>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)0, 63>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)0, 63>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)0, 63>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)0, 63>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)0, 63>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)0, 63>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)0, 63>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)0, 63>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)0, 63>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)0, 63>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)0, 63>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)0, 63>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)0, 63>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)0, 63>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)3072, 4032>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)6144, 258048>>cut5, wt136);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(1+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt121);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(2+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt122);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(3+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt123);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(4+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt124);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(5+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt125);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(6+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt126);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(7+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt127);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(8+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt128);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(9+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt129);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(10+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt130);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(11+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt131);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(12+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt132);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(13+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt133);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(14+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt134);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(15+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt135);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l23+4*cut5+24*(16+16*c13)+(ptrdiff_t)9216, 65535-(262143>>cut5), wt136);
}
}
}
} else {
ptrdiff_t k70 = 240;
ptrdiff_t l22 = (size_t)(0+k70)/6;
ptrdiff_t cut4 = (size_t)(0+k70)%6;
__m512 sum70 = _mm512_maskz_loadu_ps(65535, biasPtr5+1024*i20+4*k70);
__m512i pmMul9 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd9 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo7 = _mm512_loadu_ps(bnPtr6+(ptrdiff_t)8*(k70+256*i20));
__m512 masHi7 = _mm512_maskz_loadu_ps(65535, bnPtr6+(ptrdiff_t)8*(k70+256*i20)+(ptrdiff_t)64);
__m512 postMul13 = _mm512_permutex2var_ps(masLo7, pmMul9, masHi7);
__m512 postAdd7 = _mm512_permutex2var_ps(masLo7, pmAdd9, masHi7);
sum70 = _mm512_fmadd_ps(sum70, postMul13, postAdd7);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*0+(ptrdiff_t)0, 63>>cut4, sum70);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*0+(ptrdiff_t)3072, 4032>>cut4, sum70);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*0+(ptrdiff_t)6144, 65535-(4095>>cut4), sum70);
ptrdiff_t c11 = 0;
for (; c11 != 8; ++c11) {
__m512 wt89 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)0);
__m512 wt90 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)512);
__m512 wt91 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)1024);
__m512 wt92 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)1536);
__m512 wt93 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)2048);
__m512 wt94 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)2560);
__m512 wt95 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)3072);
__m512 wt96 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)3584);
__m512 wt97 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)4096);
__m512 wt98 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)4608);
__m512 wt99 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)5120);
__m512 wt100 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)5632);
__m512 wt101 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)6144);
__m512 wt102 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)6656);
__m512 wt103 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)7168);
__m512 wt104 = _mm512_maskz_loadu_ps(65535, wtPtr5+131072*i20+512*k70+64*c11+(ptrdiff_t)7680);
__m512 tmp5301 = _mm512_unpacklo_ps(wt89, wt90);
__m512 tmp5302 = _mm512_unpackhi_ps(wt89, wt90);
__m512 tmp5303 = _mm512_unpacklo_ps(wt91, wt92);
__m512 tmp5304 = _mm512_unpackhi_ps(wt91, wt92);
__m512 tmp5305 = _mm512_unpacklo_ps(wt93, wt94);
__m512 tmp5306 = _mm512_unpackhi_ps(wt93, wt94);
__m512 tmp5307 = _mm512_unpacklo_ps(wt95, wt96);
__m512 tmp5308 = _mm512_unpackhi_ps(wt95, wt96);
__m512 tmp5309 = _mm512_unpacklo_ps(wt97, wt98);
__m512 tmp5310 = _mm512_unpackhi_ps(wt97, wt98);
__m512 tmp5311 = _mm512_unpacklo_ps(wt99, wt100);
__m512 tmp5312 = _mm512_unpackhi_ps(wt99, wt100);
__m512 tmp5313 = _mm512_unpacklo_ps(wt101, wt102);
__m512 tmp5314 = _mm512_unpackhi_ps(wt101, wt102);
__m512 tmp5315 = _mm512_unpacklo_ps(wt103, wt104);
__m512 tmp5316 = _mm512_unpackhi_ps(wt103, wt104);
__m512 tmp5317 = _mm512_shuffle_ps(tmp5301, tmp5303, 68);
__m512 tmp5318 = _mm512_shuffle_ps(tmp5301, tmp5303, 238);
__m512 tmp5319 = _mm512_shuffle_ps(tmp5302, tmp5304, 68);
__m512 tmp5320 = _mm512_shuffle_ps(tmp5302, tmp5304, 238);
__m512 tmp5321 = _mm512_shuffle_ps(tmp5305, tmp5307, 68);
__m512 tmp5322 = _mm512_shuffle_ps(tmp5305, tmp5307, 238);
__m512 tmp5323 = _mm512_shuffle_ps(tmp5306, tmp5308, 68);
__m512 tmp5324 = _mm512_shuffle_ps(tmp5306, tmp5308, 238);
__m512 tmp5325 = _mm512_shuffle_ps(tmp5309, tmp5311, 68);
__m512 tmp5326 = _mm512_shuffle_ps(tmp5309, tmp5311, 238);
__m512 tmp5327 = _mm512_shuffle_ps(tmp5310, tmp5312, 68);
__m512 tmp5328 = _mm512_shuffle_ps(tmp5310, tmp5312, 238);
__m512 tmp5329 = _mm512_shuffle_ps(tmp5313, tmp5315, 68);
__m512 tmp5330 = _mm512_shuffle_ps(tmp5313, tmp5315, 238);
__m512 tmp5331 = _mm512_shuffle_ps(tmp5314, tmp5316, 68);
__m512 tmp5332 = _mm512_shuffle_ps(tmp5314, tmp5316, 238);
__m512 tmp5333 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 136);
__m512 tmp5334 = _mm512_shuffle_f32x4(tmp5317, tmp5321, 221);
__m512 tmp5335 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 136);
__m512 tmp5336 = _mm512_shuffle_f32x4(tmp5318, tmp5322, 221);
__m512 tmp5337 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 136);
__m512 tmp5338 = _mm512_shuffle_f32x4(tmp5319, tmp5323, 221);
__m512 tmp5339 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 136);
__m512 tmp5340 = _mm512_shuffle_f32x4(tmp5320, tmp5324, 221);
__m512 tmp5341 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 136);
__m512 tmp5342 = _mm512_shuffle_f32x4(tmp5325, tmp5329, 221);
__m512 tmp5343 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 136);
__m512 tmp5344 = _mm512_shuffle_f32x4(tmp5326, tmp5330, 221);
__m512 tmp5345 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 136);
__m512 tmp5346 = _mm512_shuffle_f32x4(tmp5327, tmp5331, 221);
__m512 tmp5347 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 136);
__m512 tmp5348 = _mm512_shuffle_f32x4(tmp5328, tmp5332, 221);
wt89 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 136);
wt97 = _mm512_shuffle_f32x4(tmp5333, tmp5341, 221);
wt90 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 136);
wt98 = _mm512_shuffle_f32x4(tmp5335, tmp5343, 221);
wt91 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 136);
wt99 = _mm512_shuffle_f32x4(tmp5337, tmp5345, 221);
wt92 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 136);
wt100 = _mm512_shuffle_f32x4(tmp5339, tmp5347, 221);
wt93 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 136);
wt101 = _mm512_shuffle_f32x4(tmp5334, tmp5342, 221);
wt94 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 136);
wt102 = _mm512_shuffle_f32x4(tmp5336, tmp5344, 221);
wt95 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 136);
wt103 = _mm512_shuffle_f32x4(tmp5338, tmp5346, 221);
wt96 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 136);
wt104 = _mm512_shuffle_f32x4(tmp5340, tmp5348, 221);
wt89 = _mm512_mul_ps(wt89, postMul13);
wt90 = _mm512_mul_ps(wt90, postMul13);
wt91 = _mm512_mul_ps(wt91, postMul13);
wt92 = _mm512_mul_ps(wt92, postMul13);
wt93 = _mm512_mul_ps(wt93, postMul13);
wt94 = _mm512_mul_ps(wt94, postMul13);
wt95 = _mm512_mul_ps(wt95, postMul13);
wt96 = _mm512_mul_ps(wt96, postMul13);
wt97 = _mm512_mul_ps(wt97, postMul13);
wt98 = _mm512_mul_ps(wt98, postMul13);
wt99 = _mm512_mul_ps(wt99, postMul13);
wt100 = _mm512_mul_ps(wt100, postMul13);
wt101 = _mm512_mul_ps(wt101, postMul13);
wt102 = _mm512_mul_ps(wt102, postMul13);
wt103 = _mm512_mul_ps(wt103, postMul13);
wt104 = _mm512_mul_ps(wt104, postMul13);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(1+16*c11)+(ptrdiff_t)0, 63>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(2+16*c11)+(ptrdiff_t)0, 63>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(3+16*c11)+(ptrdiff_t)0, 63>>cut4, wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(4+16*c11)+(ptrdiff_t)0, 63>>cut4, wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(5+16*c11)+(ptrdiff_t)0, 63>>cut4, wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(6+16*c11)+(ptrdiff_t)0, 63>>cut4, wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(7+16*c11)+(ptrdiff_t)0, 63>>cut4, wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(8+16*c11)+(ptrdiff_t)0, 63>>cut4, wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(9+16*c11)+(ptrdiff_t)0, 63>>cut4, wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(10+16*c11)+(ptrdiff_t)0, 63>>cut4, wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(11+16*c11)+(ptrdiff_t)0, 63>>cut4, wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(12+16*c11)+(ptrdiff_t)0, 63>>cut4, wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(13+16*c11)+(ptrdiff_t)0, 63>>cut4, wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(14+16*c11)+(ptrdiff_t)0, 63>>cut4, wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(15+16*c11)+(ptrdiff_t)0, 63>>cut4, wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(16+16*c11)+(ptrdiff_t)0, 63>>cut4, wt104);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(1+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(2+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(3+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(4+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(5+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(6+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(7+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(8+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(9+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(10+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(11+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(12+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(13+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(14+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(15+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+24*(16+16*c11)+(ptrdiff_t)3072, 4032>>cut4, wt104);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(1+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt89);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(2+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt90);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(3+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt91);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(4+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt92);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(5+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt93);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(6+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt94);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(7+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt95);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(8+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt96);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(9+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt97);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(10+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt98);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(11+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt99);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(12+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt100);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(13+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt101);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(14+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt102);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(15+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt103);
_mm512_mask_storeu_ps(arranged3+132096*i20+3096*l22+4*cut4+16*(16+16*c11)+(ptrdiff_t)6144, 65535-(4095>>cut4), wt104);
}
}
}
}
}

static void ResNeXt50OneArrangeWts2(ResNeXt50ThreaderTeam1* team26, char** tensors25) {
	/*
	 * Hands ResNeXt50OneArrangeWts2Callee1 to the threader team.
	 * The task carries the tensor pointer table unchanged in any1 and
	 * declares a 3-dimensional hull of extents 4 x 1 x 1.
	 */
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[0] = 4;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	job.any1 = tensors25;
	job.callee1 = ResNeXt50OneArrangeWts2Callee1;
	ResNeXt50ThreaderDo1(team26, &job);
}

static void ResNeXt50OneArrangeDats2Callee1(ResNeXt50ThreaderTask1* task30, int64_t* pt20) {
	/*
	 * Repacks one 256-byte-wide slice of the source tensor.
	 *
	 * task30->any1 is the pointer table: entry 0 is read from, entry 1 is
	 * written to. pt20[1] selects the slice. For each of 128 steps, four
	 * 16-float vectors are read from the source (step stride 12608 bytes,
	 * slice stride 256 bytes) and written to the destination (step stride
	 * 256 bytes, slice stride 32768 bytes).
	 *
	 * NOTE(review): the 128 steps presumably correspond to input channels
	 * and the slice index to a group of 64 spatial positions -- confirm
	 * against the generator.
	 */
	char** bufs = task30->any1;
	const ptrdiff_t slice = pt20[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const __mmask16 allLanes = 65535; /* every one of the 16 lanes enabled */

	/* A slice index of exactly 49 performs no work. */
	if (slice == 49) return;

	const char* srcSlice = src+256*slice;
	char* dstSlice = dst+32768*slice;
	for (ptrdiff_t step = 0; step < 128; ++step) {
		const char* from = srcSlice+12608*step;
		char* to = dstSlice+256*step;
		/* All four loads are issued before the first store. */
		__m512 v0 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)0);
		__m512 v1 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)64);
		__m512 v2 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)128);
		__m512 v3 = _mm512_maskz_loadu_ps(allLanes, from+(ptrdiff_t)192);
		_mm512_mask_storeu_ps(to+(ptrdiff_t)0, allLanes, v0);
		_mm512_mask_storeu_ps(to+(ptrdiff_t)64, allLanes, v1);
		_mm512_mask_storeu_ps(to+(ptrdiff_t)128, allLanes, v2);
		_mm512_mask_storeu_ps(to+(ptrdiff_t)192, allLanes, v3);
	}
}

static void ResNeXt50OneArrangeDats2(ResNeXt50ThreaderTeam1* team27, char** tensors27) {
	/*
	 * Hands ResNeXt50OneArrangeDats2Callee1 to the threader team.
	 * The task carries the tensor pointer table unchanged in any1 and
	 * declares a 4-dimensional hull of extents 1 x 49 x 1 x 1.
	 */
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 4;
	job.hull1[0] = 1;
	job.hull1[1] = 49;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	job.any1 = tensors27;
	job.callee1 = ResNeXt50OneArrangeDats2Callee1;
	ResNeXt50ThreaderDo1(team27, &job);
}

static void ResNeXt50OneApply2Callee1(ResNeXt50ThreaderTask1* task32, int64_t* pt21) {
	/*
	 * Computes one tile of a weights-times-data product, adds a second
	 * tensor to it, clamps the result at zero and stores it.
	 *
	 * task32->any1 points at a pair whose first entry is the pointer table:
	 *   [0] arranged weights   (read)
	 *   [1] arranged data      (read)
	 *   [2] tensor that is added to the accumulated sums (read)
	 *   [3] output tensor      (written)
	 * pt21[0] selects a run of weight blocks, pt21[1] selects the 256-byte
	 * data slice (four 16-float vectors per step).
	 *
	 * Weight blocks 0..41 hold 6 output rows each: 24 bytes of initial
	 * values followed by 128 groups of 6 floats. Block 42 holds 4 rows:
	 * 16 bytes of initial values followed by 128 groups of 4 floats.
	 * Every block is addressed at 3096*block bytes; outputs at 75648*block
	 * plus 12608 bytes per row.
	 *
	 * NOTE(review): this looks like a fused 1x1 convolution + residual add
	 * + ReLU with 128 inputs and 256 outputs -- confirm against the graph.
	 */
	void** pair6 = task32->any1;
	char** bufs = pair6[0];
	const ptrdiff_t slice = pt21[1];
	const ptrdiff_t run = pt21[0];
	char*restrict arrangedWts2 = bufs[0];
	char*restrict arrangedDats2 = bufs[1];
	char*restrict datPtr8 = bufs[2];
	char*restrict datPtr9 = bufs[3];

	/* A slice index of exactly 49 performs no work. */
	if (slice == 49) return;

	const char* dats = arrangedDats2+32768*slice;
	const char* addend = datPtr8+256*slice;
	char* out = datPtr9+256*slice;

	/* Runs 0..8 cover four 6-row blocks each; run 9 walks up to block 42. */
	ptrdiff_t blk = 4*run;
	const ptrdiff_t lastBlk = blk+(run < 9 ? 3 : 6);

	for (; blk != 42; ++blk) {
		const char* wts = arrangedWts2+3096*blk;
		__m512 acc[6][4];

		/* Seed each row's four accumulators with that row's initial value. */
		for (ptrdiff_t r = 0; r < 6; ++r) {
			__m512 seed = _mm512_set1_ps(*(const float*)(wts+4*r));
			acc[r][0] = seed;
			acc[r][1] = seed;
			acc[r][2] = seed;
			acc[r][3] = seed;
		}

		/* 128 accumulation steps: one broadcast weight per row per step. */
		for (ptrdiff_t s = 0; s < 128; ++s) {
			const char* d = dats+256*s;
			__m512 d0 = _mm512_loadu_ps(d+(ptrdiff_t)0);
			__m512 d1 = _mm512_loadu_ps(d+(ptrdiff_t)64);
			__m512 d2 = _mm512_loadu_ps(d+(ptrdiff_t)128);
			__m512 d3 = _mm512_loadu_ps(d+(ptrdiff_t)192);
			const char* w = wts+24+24*s;
			for (ptrdiff_t r = 0; r < 6; ++r) {
				__m512 wt = _mm512_set1_ps(*(const float*)(w+4*r));
				acc[r][0] = _mm512_fmadd_ps(wt, d0, acc[r][0]);
				acc[r][1] = _mm512_fmadd_ps(wt, d1, acc[r][1]);
				acc[r][2] = _mm512_fmadd_ps(wt, d2, acc[r][2]);
				acc[r][3] = _mm512_fmadd_ps(wt, d3, acc[r][3]);
			}
		}

		/* Per row: add the second tensor, clamp at zero, then store. */
		for (ptrdiff_t r = 0; r < 6; ++r) {
			const ptrdiff_t off = 75648*blk+12608*r;
			for (ptrdiff_t c = 0; c < 4; ++c) {
				acc[r][c] = _mm512_add_ps(acc[r][c], _mm512_maskz_loadu_ps(65535, addend+off+64*c));
			}
			for (ptrdiff_t c = 0; c < 4; ++c) {
				acc[r][c] = _mm512_max_ps(_mm512_setzero_ps(), acc[r][c]);
			}
			for (ptrdiff_t c = 0; c < 4; ++c) {
				_mm512_mask_storeu_ps(out+off+64*c, 65535, acc[r][c]);
			}
		}

		/* Runs that do not reach block 42 finish here. */
		if (blk >= lastBlk) return;
	}

	/* Final block (blk == 42 here): only 4 rows, 16-byte weight groups. */
	const char* wtsTail = arrangedWts2+3096*blk;
	__m512 accTail[4][4];

	for (ptrdiff_t r = 0; r < 4; ++r) {
		__m512 seed = _mm512_set1_ps(*(const float*)(wtsTail+4*r));
		accTail[r][0] = seed;
		accTail[r][1] = seed;
		accTail[r][2] = seed;
		accTail[r][3] = seed;
	}

	for (ptrdiff_t s = 0; s < 128; ++s) {
		const char* d = dats+256*s;
		__m512 d0 = _mm512_loadu_ps(d+(ptrdiff_t)0);
		__m512 d1 = _mm512_loadu_ps(d+(ptrdiff_t)64);
		__m512 d2 = _mm512_loadu_ps(d+(ptrdiff_t)128);
		__m512 d3 = _mm512_loadu_ps(d+(ptrdiff_t)192);
		const char* w = wtsTail+16+16*s;
		for (ptrdiff_t r = 0; r < 4; ++r) {
			__m512 wt = _mm512_set1_ps(*(const float*)(w+4*r));
			accTail[r][0] = _mm512_fmadd_ps(wt, d0, accTail[r][0]);
			accTail[r][1] = _mm512_fmadd_ps(wt, d1, accTail[r][1]);
			accTail[r][2] = _mm512_fmadd_ps(wt, d2, accTail[r][2]);
			accTail[r][3] = _mm512_fmadd_ps(wt, d3, accTail[r][3]);
		}
	}

	for (ptrdiff_t r = 0; r < 4; ++r) {
		const ptrdiff_t off = 75648*blk+12608*r;
		for (ptrdiff_t c = 0; c < 4; ++c) {
			accTail[r][c] = _mm512_add_ps(accTail[r][c], _mm512_maskz_loadu_ps(65535, addend+off+64*c));
		}
		for (ptrdiff_t c = 0; c < 4; ++c) {
			accTail[r][c] = _mm512_max_ps(_mm512_setzero_ps(), accTail[r][c]);
		}
		for (ptrdiff_t c = 0; c < 4; ++c) {
			_mm512_mask_storeu_ps(out+off+64*c, 65535, accTail[r][c]);
		}
	}
}

static void ResNeXt50OneApply2(ResNeXt50ThreaderTeam1* team28, char** tensors29) {
	/*
	 * Hands ResNeXt50OneApply2Callee1 to the threader team.
	 * any1 points at a two-entry pair: the tensor pointer table followed
	 * by a null entry. The hull is 3-dimensional with extents 10 x 49 x 1.
	 */
	void* args[2];
	args[0] = tensors29;
	args[1] = 0;
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[0] = 10;
	job.hull1[1] = 49;
	job.hull1[2] = 1;
	job.any1 = args;
	job.callee1 = ResNeXt50OneApply2Callee1;
	ResNeXt50ThreaderDo1(team28, &job);
}

static void ResNeXt50OneArrangeWts3Callee1(ResNeXt50ThreaderTask1* task34, int64_t* pt22) {
char** tensors32 = task34->any1;
ptrdiff_t b49 = pt22[0];
char*restrict wtPtr6 = tensors32[0]+(ptrdiff_t)3340*0+(ptrdiff_t)131072*0;
char*restrict biasPtr6 = tensors32[1]+(ptrdiff_t)512*0;
char*restrict bnPtr7 = tensors32[2]+(ptrdiff_t)8*128*0;
char*restrict arranged5 = tensors32[3]+(ptrdiff_t)428032*0+(ptrdiff_t)131584*0;
ptrdiff_t ii11 = 1;
for (ptrdiff_t i23 = 0; i23 < ii11; ++i23) {
ptrdiff_t j17 = 2*b49;
ptrdiff_t jj26 = j17+2;
for (; j17 < jj26; ++j17) {
if (j17 < 7) {
ptrdiff_t k75 = 0+16*(j17-0);
ptrdiff_t l25 = (size_t)(0+k75)/6;
ptrdiff_t cut7 = (size_t)(0+k75)%6;
switch (cut7) {
case 0:;
case 2: {
__m512 sum114 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k75);
__m512i pmMul10 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd10 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo8 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi8 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul17 = _mm512_permutex2var_ps(masLo8, pmMul10, masHi8);
__m512 postAdd11 = _mm512_permutex2var_ps(masLo8, pmAdd10, masHi8);
sum114 = _mm512_fmadd_ps(sum114, postMul17, postAdd11);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum114);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)12288, 65535-(4095>>cut7), sum114);
ptrdiff_t c16 = 0;
for (; c16 != 16; ++c16) {
__m512 wt163 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)0);
__m512 wt164 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)1024);
__m512 wt165 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)2048);
__m512 wt166 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)3072);
__m512 wt167 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)4096);
__m512 wt168 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)5120);
__m512 wt169 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)6144);
__m512 wt170 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)7168);
__m512 wt171 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)8192);
__m512 wt172 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)9216);
__m512 wt173 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)10240);
__m512 wt174 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)11264);
__m512 wt175 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)12288);
__m512 wt176 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)13312);
__m512 wt177 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)14336);
__m512 wt178 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c16+(ptrdiff_t)15360);
__m512 tmp5349 = _mm512_unpacklo_ps(wt163, wt164);
__m512 tmp5350 = _mm512_unpackhi_ps(wt163, wt164);
__m512 tmp5351 = _mm512_unpacklo_ps(wt165, wt166);
__m512 tmp5352 = _mm512_unpackhi_ps(wt165, wt166);
__m512 tmp5353 = _mm512_unpacklo_ps(wt167, wt168);
__m512 tmp5354 = _mm512_unpackhi_ps(wt167, wt168);
__m512 tmp5355 = _mm512_unpacklo_ps(wt169, wt170);
__m512 tmp5356 = _mm512_unpackhi_ps(wt169, wt170);
__m512 tmp5357 = _mm512_unpacklo_ps(wt171, wt172);
__m512 tmp5358 = _mm512_unpackhi_ps(wt171, wt172);
__m512 tmp5359 = _mm512_unpacklo_ps(wt173, wt174);
__m512 tmp5360 = _mm512_unpackhi_ps(wt173, wt174);
__m512 tmp5361 = _mm512_unpacklo_ps(wt175, wt176);
__m512 tmp5362 = _mm512_unpackhi_ps(wt175, wt176);
__m512 tmp5363 = _mm512_unpacklo_ps(wt177, wt178);
__m512 tmp5364 = _mm512_unpackhi_ps(wt177, wt178);
__m512 tmp5365 = _mm512_shuffle_ps(tmp5349, tmp5351, 68);
__m512 tmp5366 = _mm512_shuffle_ps(tmp5349, tmp5351, 238);
__m512 tmp5367 = _mm512_shuffle_ps(tmp5350, tmp5352, 68);
__m512 tmp5368 = _mm512_shuffle_ps(tmp5350, tmp5352, 238);
__m512 tmp5369 = _mm512_shuffle_ps(tmp5353, tmp5355, 68);
__m512 tmp5370 = _mm512_shuffle_ps(tmp5353, tmp5355, 238);
__m512 tmp5371 = _mm512_shuffle_ps(tmp5354, tmp5356, 68);
__m512 tmp5372 = _mm512_shuffle_ps(tmp5354, tmp5356, 238);
__m512 tmp5373 = _mm512_shuffle_ps(tmp5357, tmp5359, 68);
__m512 tmp5374 = _mm512_shuffle_ps(tmp5357, tmp5359, 238);
__m512 tmp5375 = _mm512_shuffle_ps(tmp5358, tmp5360, 68);
__m512 tmp5376 = _mm512_shuffle_ps(tmp5358, tmp5360, 238);
__m512 tmp5377 = _mm512_shuffle_ps(tmp5361, tmp5363, 68);
__m512 tmp5378 = _mm512_shuffle_ps(tmp5361, tmp5363, 238);
__m512 tmp5379 = _mm512_shuffle_ps(tmp5362, tmp5364, 68);
__m512 tmp5380 = _mm512_shuffle_ps(tmp5362, tmp5364, 238);
__m512 tmp5381 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 136);
__m512 tmp5382 = _mm512_shuffle_f32x4(tmp5365, tmp5369, 221);
__m512 tmp5383 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 136);
__m512 tmp5384 = _mm512_shuffle_f32x4(tmp5366, tmp5370, 221);
__m512 tmp5385 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 136);
__m512 tmp5386 = _mm512_shuffle_f32x4(tmp5367, tmp5371, 221);
__m512 tmp5387 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 136);
__m512 tmp5388 = _mm512_shuffle_f32x4(tmp5368, tmp5372, 221);
__m512 tmp5389 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 136);
__m512 tmp5390 = _mm512_shuffle_f32x4(tmp5373, tmp5377, 221);
__m512 tmp5391 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 136);
__m512 tmp5392 = _mm512_shuffle_f32x4(tmp5374, tmp5378, 221);
__m512 tmp5393 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 136);
__m512 tmp5394 = _mm512_shuffle_f32x4(tmp5375, tmp5379, 221);
__m512 tmp5395 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 136);
__m512 tmp5396 = _mm512_shuffle_f32x4(tmp5376, tmp5380, 221);
wt163 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 136);
wt171 = _mm512_shuffle_f32x4(tmp5381, tmp5389, 221);
wt164 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 136);
wt172 = _mm512_shuffle_f32x4(tmp5383, tmp5391, 221);
wt165 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 136);
wt173 = _mm512_shuffle_f32x4(tmp5385, tmp5393, 221);
wt166 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 136);
wt174 = _mm512_shuffle_f32x4(tmp5387, tmp5395, 221);
wt167 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 136);
wt175 = _mm512_shuffle_f32x4(tmp5382, tmp5390, 221);
wt168 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 136);
wt176 = _mm512_shuffle_f32x4(tmp5384, tmp5392, 221);
wt169 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 136);
wt177 = _mm512_shuffle_f32x4(tmp5386, tmp5394, 221);
wt170 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 136);
wt178 = _mm512_shuffle_f32x4(tmp5388, tmp5396, 221);
wt163 = _mm512_mul_ps(wt163, postMul17);
wt164 = _mm512_mul_ps(wt164, postMul17);
wt165 = _mm512_mul_ps(wt165, postMul17);
wt166 = _mm512_mul_ps(wt166, postMul17);
wt167 = _mm512_mul_ps(wt167, postMul17);
wt168 = _mm512_mul_ps(wt168, postMul17);
wt169 = _mm512_mul_ps(wt169, postMul17);
wt170 = _mm512_mul_ps(wt170, postMul17);
wt171 = _mm512_mul_ps(wt171, postMul17);
wt172 = _mm512_mul_ps(wt172, postMul17);
wt173 = _mm512_mul_ps(wt173, postMul17);
wt174 = _mm512_mul_ps(wt174, postMul17);
wt175 = _mm512_mul_ps(wt175, postMul17);
wt176 = _mm512_mul_ps(wt176, postMul17);
wt177 = _mm512_mul_ps(wt177, postMul17);
wt178 = _mm512_mul_ps(wt178, postMul17);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)0, 63>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)0, 63>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)0, 63>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)0, 63>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)0, 63>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)0, 63>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)0, 63>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)0, 63>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)0, 63>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)0, 63>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)0, 63>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)0, 63>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)0, 63>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)0, 63>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)0, 63>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)0, 63>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)6144, 4032>>cut7, wt178);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt163);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt164);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt165);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt166);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt167);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt168);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt169);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt170);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt171);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt172);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt173);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt174);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt175);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt176);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt177);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c16)+(ptrdiff_t)12288, 65535-(4095>>cut7), wt178);
}
break;
}
default: {
cut7 = 4;
__m512 sum115 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k75);
__m512i pmMul11 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd11 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo9 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k75+128*i23));
__m512 masHi9 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k75+128*i23)+(ptrdiff_t)64);
__m512 postMul18 = _mm512_permutex2var_ps(masLo9, pmMul11, masHi9);
__m512 postAdd12 = _mm512_permutex2var_ps(masLo9, pmAdd11, masHi9);
sum115 = _mm512_fmadd_ps(sum115, postMul18, postAdd12);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)0, 63>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)6144, 4032>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)12288, 258048>>cut7, sum115);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*0+(ptrdiff_t)18432, 65535-(262143>>cut7), sum115);
ptrdiff_t c17 = 0;
for (; c17 != 16; ++c17) {
__m512 wt179 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)0);
__m512 wt180 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)1024);
__m512 wt181 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)2048);
__m512 wt182 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)3072);
__m512 wt183 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)4096);
__m512 wt184 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)5120);
__m512 wt185 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)6144);
__m512 wt186 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)7168);
__m512 wt187 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)8192);
__m512 wt188 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)9216);
__m512 wt189 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)10240);
__m512 wt190 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)11264);
__m512 wt191 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)12288);
__m512 wt192 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)13312);
__m512 wt193 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)14336);
__m512 wt194 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k75+64*c17+(ptrdiff_t)15360);
__m512 tmp5397 = _mm512_unpacklo_ps(wt179, wt180);
__m512 tmp5398 = _mm512_unpackhi_ps(wt179, wt180);
__m512 tmp5399 = _mm512_unpacklo_ps(wt181, wt182);
__m512 tmp5400 = _mm512_unpackhi_ps(wt181, wt182);
__m512 tmp5401 = _mm512_unpacklo_ps(wt183, wt184);
__m512 tmp5402 = _mm512_unpackhi_ps(wt183, wt184);
__m512 tmp5403 = _mm512_unpacklo_ps(wt185, wt186);
__m512 tmp5404 = _mm512_unpackhi_ps(wt185, wt186);
__m512 tmp5405 = _mm512_unpacklo_ps(wt187, wt188);
__m512 tmp5406 = _mm512_unpackhi_ps(wt187, wt188);
__m512 tmp5407 = _mm512_unpacklo_ps(wt189, wt190);
__m512 tmp5408 = _mm512_unpackhi_ps(wt189, wt190);
__m512 tmp5409 = _mm512_unpacklo_ps(wt191, wt192);
__m512 tmp5410 = _mm512_unpackhi_ps(wt191, wt192);
__m512 tmp5411 = _mm512_unpacklo_ps(wt193, wt194);
__m512 tmp5412 = _mm512_unpackhi_ps(wt193, wt194);
__m512 tmp5413 = _mm512_shuffle_ps(tmp5397, tmp5399, 68);
__m512 tmp5414 = _mm512_shuffle_ps(tmp5397, tmp5399, 238);
__m512 tmp5415 = _mm512_shuffle_ps(tmp5398, tmp5400, 68);
__m512 tmp5416 = _mm512_shuffle_ps(tmp5398, tmp5400, 238);
__m512 tmp5417 = _mm512_shuffle_ps(tmp5401, tmp5403, 68);
__m512 tmp5418 = _mm512_shuffle_ps(tmp5401, tmp5403, 238);
__m512 tmp5419 = _mm512_shuffle_ps(tmp5402, tmp5404, 68);
__m512 tmp5420 = _mm512_shuffle_ps(tmp5402, tmp5404, 238);
__m512 tmp5421 = _mm512_shuffle_ps(tmp5405, tmp5407, 68);
__m512 tmp5422 = _mm512_shuffle_ps(tmp5405, tmp5407, 238);
__m512 tmp5423 = _mm512_shuffle_ps(tmp5406, tmp5408, 68);
__m512 tmp5424 = _mm512_shuffle_ps(tmp5406, tmp5408, 238);
__m512 tmp5425 = _mm512_shuffle_ps(tmp5409, tmp5411, 68);
__m512 tmp5426 = _mm512_shuffle_ps(tmp5409, tmp5411, 238);
__m512 tmp5427 = _mm512_shuffle_ps(tmp5410, tmp5412, 68);
__m512 tmp5428 = _mm512_shuffle_ps(tmp5410, tmp5412, 238);
__m512 tmp5429 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 136);
__m512 tmp5430 = _mm512_shuffle_f32x4(tmp5413, tmp5417, 221);
__m512 tmp5431 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 136);
__m512 tmp5432 = _mm512_shuffle_f32x4(tmp5414, tmp5418, 221);
__m512 tmp5433 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 136);
__m512 tmp5434 = _mm512_shuffle_f32x4(tmp5415, tmp5419, 221);
__m512 tmp5435 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 136);
__m512 tmp5436 = _mm512_shuffle_f32x4(tmp5416, tmp5420, 221);
__m512 tmp5437 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 136);
__m512 tmp5438 = _mm512_shuffle_f32x4(tmp5421, tmp5425, 221);
__m512 tmp5439 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 136);
__m512 tmp5440 = _mm512_shuffle_f32x4(tmp5422, tmp5426, 221);
__m512 tmp5441 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 136);
__m512 tmp5442 = _mm512_shuffle_f32x4(tmp5423, tmp5427, 221);
__m512 tmp5443 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 136);
__m512 tmp5444 = _mm512_shuffle_f32x4(tmp5424, tmp5428, 221);
wt179 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 136);
wt187 = _mm512_shuffle_f32x4(tmp5429, tmp5437, 221);
wt180 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 136);
wt188 = _mm512_shuffle_f32x4(tmp5431, tmp5439, 221);
wt181 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 136);
wt189 = _mm512_shuffle_f32x4(tmp5433, tmp5441, 221);
wt182 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 136);
wt190 = _mm512_shuffle_f32x4(tmp5435, tmp5443, 221);
wt183 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 136);
wt191 = _mm512_shuffle_f32x4(tmp5430, tmp5438, 221);
wt184 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 136);
wt192 = _mm512_shuffle_f32x4(tmp5432, tmp5440, 221);
wt185 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 136);
wt193 = _mm512_shuffle_f32x4(tmp5434, tmp5442, 221);
wt186 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 136);
wt194 = _mm512_shuffle_f32x4(tmp5436, tmp5444, 221);
wt179 = _mm512_mul_ps(wt179, postMul18);
wt180 = _mm512_mul_ps(wt180, postMul18);
wt181 = _mm512_mul_ps(wt181, postMul18);
wt182 = _mm512_mul_ps(wt182, postMul18);
wt183 = _mm512_mul_ps(wt183, postMul18);
wt184 = _mm512_mul_ps(wt184, postMul18);
wt185 = _mm512_mul_ps(wt185, postMul18);
wt186 = _mm512_mul_ps(wt186, postMul18);
wt187 = _mm512_mul_ps(wt187, postMul18);
wt188 = _mm512_mul_ps(wt188, postMul18);
wt189 = _mm512_mul_ps(wt189, postMul18);
wt190 = _mm512_mul_ps(wt190, postMul18);
wt191 = _mm512_mul_ps(wt191, postMul18);
wt192 = _mm512_mul_ps(wt192, postMul18);
wt193 = _mm512_mul_ps(wt193, postMul18);
wt194 = _mm512_mul_ps(wt194, postMul18);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)0, 63>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)0, 63>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)0, 63>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)0, 63>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)0, 63>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)0, 63>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)0, 63>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)0, 63>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)0, 63>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)0, 63>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)0, 63>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)0, 63>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)0, 63>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)0, 63>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)0, 63>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)0, 63>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)6144, 4032>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)12288, 258048>>cut7, wt194);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(1+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt179);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(2+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt180);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(3+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt181);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(4+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt182);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(5+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt183);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(6+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt184);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(7+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt185);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(8+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt186);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(9+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt187);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(10+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt188);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(11+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt189);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(12+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt190);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(13+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt191);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(14+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt192);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(15+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt193);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l25+4*cut7+24*(16+16*c17)+(ptrdiff_t)18432, 65535-(262143>>cut7), wt194);
}
}
}
} else {
ptrdiff_t k74 = 112;
ptrdiff_t l24 = (size_t)(0+k74)/6;
ptrdiff_t cut6 = (size_t)(0+k74)%6;
__m512 sum113 = _mm512_maskz_loadu_ps(65535, biasPtr6+512*i23+4*k74);
__m512i pmMul12 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd12 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo10 = _mm512_loadu_ps(bnPtr7+(ptrdiff_t)8*(k74+128*i23));
__m512 masHi10 = _mm512_maskz_loadu_ps(65535, bnPtr7+(ptrdiff_t)8*(k74+128*i23)+(ptrdiff_t)64);
__m512 postMul16 = _mm512_permutex2var_ps(masLo10, pmMul12, masHi10);
__m512 postAdd10 = _mm512_permutex2var_ps(masLo10, pmAdd12, masHi10);
sum113 = _mm512_fmadd_ps(sum113, postMul16, postAdd10);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)0, 63>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)6144, 4032>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*0+(ptrdiff_t)12288, 258048>>cut6, sum113);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*0+(ptrdiff_t)18432, 65535-(262143>>cut6), sum113);
ptrdiff_t c15 = 0;
for (; c15 != 16; ++c15) {
__m512 wt147 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)0);
__m512 wt148 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)1024);
__m512 wt149 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)2048);
__m512 wt150 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)3072);
__m512 wt151 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)4096);
__m512 wt152 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)5120);
__m512 wt153 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)6144);
__m512 wt154 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)7168);
__m512 wt155 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)8192);
__m512 wt156 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)9216);
__m512 wt157 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)10240);
__m512 wt158 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)11264);
__m512 wt159 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)12288);
__m512 wt160 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)13312);
__m512 wt161 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)14336);
__m512 wt162 = _mm512_maskz_loadu_ps(65535, wtPtr6+131072*i23+1024*k74+64*c15+(ptrdiff_t)15360);
__m512 tmp5445 = _mm512_unpacklo_ps(wt147, wt148);
__m512 tmp5446 = _mm512_unpackhi_ps(wt147, wt148);
__m512 tmp5447 = _mm512_unpacklo_ps(wt149, wt150);
__m512 tmp5448 = _mm512_unpackhi_ps(wt149, wt150);
__m512 tmp5449 = _mm512_unpacklo_ps(wt151, wt152);
__m512 tmp5450 = _mm512_unpackhi_ps(wt151, wt152);
__m512 tmp5451 = _mm512_unpacklo_ps(wt153, wt154);
__m512 tmp5452 = _mm512_unpackhi_ps(wt153, wt154);
__m512 tmp5453 = _mm512_unpacklo_ps(wt155, wt156);
__m512 tmp5454 = _mm512_unpackhi_ps(wt155, wt156);
__m512 tmp5455 = _mm512_unpacklo_ps(wt157, wt158);
__m512 tmp5456 = _mm512_unpackhi_ps(wt157, wt158);
__m512 tmp5457 = _mm512_unpacklo_ps(wt159, wt160);
__m512 tmp5458 = _mm512_unpackhi_ps(wt159, wt160);
__m512 tmp5459 = _mm512_unpacklo_ps(wt161, wt162);
__m512 tmp5460 = _mm512_unpackhi_ps(wt161, wt162);
__m512 tmp5461 = _mm512_shuffle_ps(tmp5445, tmp5447, 68);
__m512 tmp5462 = _mm512_shuffle_ps(tmp5445, tmp5447, 238);
__m512 tmp5463 = _mm512_shuffle_ps(tmp5446, tmp5448, 68);
__m512 tmp5464 = _mm512_shuffle_ps(tmp5446, tmp5448, 238);
__m512 tmp5465 = _mm512_shuffle_ps(tmp5449, tmp5451, 68);
__m512 tmp5466 = _mm512_shuffle_ps(tmp5449, tmp5451, 238);
__m512 tmp5467 = _mm512_shuffle_ps(tmp5450, tmp5452, 68);
__m512 tmp5468 = _mm512_shuffle_ps(tmp5450, tmp5452, 238);
__m512 tmp5469 = _mm512_shuffle_ps(tmp5453, tmp5455, 68);
__m512 tmp5470 = _mm512_shuffle_ps(tmp5453, tmp5455, 238);
__m512 tmp5471 = _mm512_shuffle_ps(tmp5454, tmp5456, 68);
__m512 tmp5472 = _mm512_shuffle_ps(tmp5454, tmp5456, 238);
__m512 tmp5473 = _mm512_shuffle_ps(tmp5457, tmp5459, 68);
__m512 tmp5474 = _mm512_shuffle_ps(tmp5457, tmp5459, 238);
__m512 tmp5475 = _mm512_shuffle_ps(tmp5458, tmp5460, 68);
__m512 tmp5476 = _mm512_shuffle_ps(tmp5458, tmp5460, 238);
__m512 tmp5477 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 136);
__m512 tmp5478 = _mm512_shuffle_f32x4(tmp5461, tmp5465, 221);
__m512 tmp5479 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 136);
__m512 tmp5480 = _mm512_shuffle_f32x4(tmp5462, tmp5466, 221);
__m512 tmp5481 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 136);
__m512 tmp5482 = _mm512_shuffle_f32x4(tmp5463, tmp5467, 221);
__m512 tmp5483 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 136);
__m512 tmp5484 = _mm512_shuffle_f32x4(tmp5464, tmp5468, 221);
__m512 tmp5485 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 136);
__m512 tmp5486 = _mm512_shuffle_f32x4(tmp5469, tmp5473, 221);
__m512 tmp5487 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 136);
__m512 tmp5488 = _mm512_shuffle_f32x4(tmp5470, tmp5474, 221);
__m512 tmp5489 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 136);
__m512 tmp5490 = _mm512_shuffle_f32x4(tmp5471, tmp5475, 221);
__m512 tmp5491 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 136);
__m512 tmp5492 = _mm512_shuffle_f32x4(tmp5472, tmp5476, 221);
wt147 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 136);
wt155 = _mm512_shuffle_f32x4(tmp5477, tmp5485, 221);
wt148 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 136);
wt156 = _mm512_shuffle_f32x4(tmp5479, tmp5487, 221);
wt149 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 136);
wt157 = _mm512_shuffle_f32x4(tmp5481, tmp5489, 221);
wt150 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 136);
wt158 = _mm512_shuffle_f32x4(tmp5483, tmp5491, 221);
wt151 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 136);
wt159 = _mm512_shuffle_f32x4(tmp5478, tmp5486, 221);
wt152 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 136);
wt160 = _mm512_shuffle_f32x4(tmp5480, tmp5488, 221);
wt153 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 136);
wt161 = _mm512_shuffle_f32x4(tmp5482, tmp5490, 221);
wt154 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 136);
wt162 = _mm512_shuffle_f32x4(tmp5484, tmp5492, 221);
wt147 = _mm512_mul_ps(wt147, postMul16);
wt148 = _mm512_mul_ps(wt148, postMul16);
wt149 = _mm512_mul_ps(wt149, postMul16);
wt150 = _mm512_mul_ps(wt150, postMul16);
wt151 = _mm512_mul_ps(wt151, postMul16);
wt152 = _mm512_mul_ps(wt152, postMul16);
wt153 = _mm512_mul_ps(wt153, postMul16);
wt154 = _mm512_mul_ps(wt154, postMul16);
wt155 = _mm512_mul_ps(wt155, postMul16);
wt156 = _mm512_mul_ps(wt156, postMul16);
wt157 = _mm512_mul_ps(wt157, postMul16);
wt158 = _mm512_mul_ps(wt158, postMul16);
wt159 = _mm512_mul_ps(wt159, postMul16);
wt160 = _mm512_mul_ps(wt160, postMul16);
wt161 = _mm512_mul_ps(wt161, postMul16);
wt162 = _mm512_mul_ps(wt162, postMul16);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)0, 63>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)0, 63>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)0, 63>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)0, 63>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)0, 63>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)0, 63>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)0, 63>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)0, 63>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)0, 63>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)0, 63>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)0, 63>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)0, 63>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)0, 63>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)0, 63>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)0, 63>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)0, 63>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)6144, 4032>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(1+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(2+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(3+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(4+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(5+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(6+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(7+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(8+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(9+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(10+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(11+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(12+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(13+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(14+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(15+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+24*(16+16*c15)+(ptrdiff_t)12288, 258048>>cut6, wt162);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(1+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt147);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(2+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt148);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(3+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt149);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(4+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt150);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(5+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt151);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(6+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt152);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(7+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt153);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(8+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt154);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(9+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt155);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(10+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt156);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(11+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt157);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(12+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt158);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(13+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt159);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(14+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt160);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(15+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt161);
_mm512_mask_storeu_ps(arranged5+131584*i23+6168*l24+4*cut6+8*(16+16*c15)+(ptrdiff_t)18432, 65535-(262143>>cut6), wt162);
}
}
}
}
}

// Hands the weight-arrangement callee to the threader.
//
// team29:    thread team passed straight through to ResNeXt50ThreaderDo1.
// tensors31: tensor pointer table; stored in the task's any1 field so the
//            callee can retrieve it.
//
// The task describes a 3-dimensional hull with extents 4 x 1 x 1.
// NOTE(review): presumably ResNeXt50ThreaderDo1 invokes the callee once per
// hull point -- confirm against the threader implementation.
static void ResNeXt50OneArrangeWts3(ResNeXt50ThreaderTeam1* team29, char** tensors31) {
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 4;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors31;
    job.callee1 = ResNeXt50OneArrangeWts3Callee1;
    ResNeXt50ThreaderDo1(team29, &job);
}

// Copies one 256-byte-wide column block of the source tensor into the
// "arranged" buffer, for a run of 128 consecutive rows.
//
// task36: task whose any1 field holds a char** table; entry 0 is the source
//         buffer and entry 1 is the destination buffer.
// pt23:   hull coordinates; pt23[0] selects which run of 128 rows to copy
//         (rows 128*pt23[0] .. 128*pt23[0]+127) and pt23[1] selects the
//         column block.
//
// Source rows are 12608 bytes apart and column blocks 256 bytes apart; in the
// destination, column blocks are 65536 bytes apart and rows 256 bytes apart,
// so rows of the same column block become contiguous.
// Nothing is copied when pt23[1] equals 49.
static void ResNeXt50OneArrangeDats3Callee1(ResNeXt50ThreaderTask1* task36, int64_t* pt23) {
    char** bufs = task36->any1;
    const ptrdiff_t half = pt23[0];
    const ptrdiff_t col = pt23[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    if (col == 49) return;
    const ptrdiff_t rowBegin = 128*half;
    const ptrdiff_t rowEnd = rowBegin+128;
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        const char* from = src+256*col+12608*row;
        char* to = dst+65536*col+256*row;
        // Load all four 16-float vectors first, then store them, as a full-mask
        // (65535) masked load/store pair.
        __m512 v0 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)0);
        __m512 v1 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)64);
        __m512 v2 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)128);
        __m512 v3 = _mm512_maskz_loadu_ps(65535, from+(ptrdiff_t)192);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)0, 65535, v0);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)64, 65535, v1);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)128, 65535, v2);
        _mm512_mask_storeu_ps(to+(ptrdiff_t)192, 65535, v3);
    }
}

// Hands the data-arrangement callee to the threader.
//
// team30:    thread team passed straight through to ResNeXt50ThreaderDo1.
// tensors33: tensor pointer table; stored in the task's any1 field so the
//            callee can retrieve it.
//
// The task describes a 4-dimensional hull with extents 2 x 49 x 1 x 1; the
// callee reads only the first two coordinates.
// NOTE(review): presumably ResNeXt50ThreaderDo1 invokes the callee once per
// hull point -- confirm against the threader implementation.
static void ResNeXt50OneArrangeDats3(ResNeXt50ThreaderTeam1* team30, char** tensors33) {
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors33;
    job.callee1 = ResNeXt50OneArrangeDats3Callee1;
    ResNeXt50ThreaderDo1(team30, &job);
}

// Multiply-accumulate kernel: combines arranged weights with arranged data
// for one column block and writes max(0, result) to the output buffer.
//
// task38: task whose any1 field holds a void** pair; element 0 of that pair is
//         a char** table with entry 0 = arranged weights, entry 1 = arranged
//         data, entry 2 = output buffer.
// pt24:   hull coordinates; pt24[0] selects a pair of weight groups
//         (groups 2*pt24[0] and 2*pt24[0]+1) and pt24[1] selects the
//         64-float column block.
//
// Each regular weight group (6168 bytes apart) yields six output rows; group
// index 21 is a narrower tail that yields two output rows. Every accumulator
// starts from the group's leading per-row value and then adds 256
// weight*data products. Output rows are 12608 bytes apart and each row
// receives four 16-float vectors.
// Nothing is computed when pt24[1] equals 49.
static void ResNeXt50OneApply3Callee1(ResNeXt50ThreaderTask1* task38, int64_t* pt24) {
    void** bundle = task38->any1;
    char** bufs = bundle[0];
    const ptrdiff_t col = pt24[1];
    const ptrdiff_t pairIdx = pt24[0];
    char*restrict wts = bufs[0];
    char*restrict dats = bufs[1];
    char*restrict out = bufs[2];
    if (col == 49) return;
    const char* datCol = dats+65536*col;
    const __m512 zero = _mm512_setzero_ps();
    ptrdiff_t grp = 2*pairIdx;
    const ptrdiff_t lastGrp = grp+1;
    for (; grp != 21; ++grp) {
        // Regular group: 6 output rows x 4 vectors.
        const char* wGrp = wts+6168*grp;
        char* outGrp = out+256*col+75648*grp;
        const float* seed = (const float*)wGrp;
        __m512 acc00 = _mm512_set1_ps(seed[0]);
        __m512 acc10 = _mm512_set1_ps(seed[1]);
        __m512 acc20 = _mm512_set1_ps(seed[2]);
        __m512 acc30 = _mm512_set1_ps(seed[3]);
        __m512 acc40 = _mm512_set1_ps(seed[4]);
        __m512 acc50 = _mm512_set1_ps(seed[5]);
        __m512 acc01 = acc00, acc02 = acc00, acc03 = acc00;
        __m512 acc11 = acc10, acc12 = acc10, acc13 = acc10;
        __m512 acc21 = acc20, acc22 = acc20, acc23 = acc20;
        __m512 acc31 = acc30, acc32 = acc30, acc33 = acc30;
        __m512 acc41 = acc40, acc42 = acc40, acc43 = acc40;
        __m512 acc51 = acc50, acc52 = acc50, acc53 = acc50;
        for (ptrdiff_t s = 0; s < 256; ++s) {
            const char* d = datCol+256*s;
            // Six weights for step s sit 24 bytes past the seed values, 24 bytes per step.
            const float* w = (const float*)(wGrp+24*s+(ptrdiff_t)24);
            const __m512 d0 = _mm512_loadu_ps(d+(ptrdiff_t)0);
            const __m512 d1 = _mm512_loadu_ps(d+(ptrdiff_t)64);
            const __m512 d2 = _mm512_loadu_ps(d+(ptrdiff_t)128);
            const __m512 d3 = _mm512_loadu_ps(d+(ptrdiff_t)192);
            __m512 b = _mm512_set1_ps(w[0]);
            acc00 = _mm512_fmadd_ps(b, d0, acc00);
            acc01 = _mm512_fmadd_ps(b, d1, acc01);
            acc02 = _mm512_fmadd_ps(b, d2, acc02);
            acc03 = _mm512_fmadd_ps(b, d3, acc03);
            b = _mm512_set1_ps(w[1]);
            acc10 = _mm512_fmadd_ps(b, d0, acc10);
            acc11 = _mm512_fmadd_ps(b, d1, acc11);
            acc12 = _mm512_fmadd_ps(b, d2, acc12);
            acc13 = _mm512_fmadd_ps(b, d3, acc13);
            b = _mm512_set1_ps(w[2]);
            acc20 = _mm512_fmadd_ps(b, d0, acc20);
            acc21 = _mm512_fmadd_ps(b, d1, acc21);
            acc22 = _mm512_fmadd_ps(b, d2, acc22);
            acc23 = _mm512_fmadd_ps(b, d3, acc23);
            b = _mm512_set1_ps(w[3]);
            acc30 = _mm512_fmadd_ps(b, d0, acc30);
            acc31 = _mm512_fmadd_ps(b, d1, acc31);
            acc32 = _mm512_fmadd_ps(b, d2, acc32);
            acc33 = _mm512_fmadd_ps(b, d3, acc33);
            b = _mm512_set1_ps(w[4]);
            acc40 = _mm512_fmadd_ps(b, d0, acc40);
            acc41 = _mm512_fmadd_ps(b, d1, acc41);
            acc42 = _mm512_fmadd_ps(b, d2, acc42);
            acc43 = _mm512_fmadd_ps(b, d3, acc43);
            b = _mm512_set1_ps(w[5]);
            acc50 = _mm512_fmadd_ps(b, d0, acc50);
            acc51 = _mm512_fmadd_ps(b, d1, acc51);
            acc52 = _mm512_fmadd_ps(b, d2, acc52);
            acc53 = _mm512_fmadd_ps(b, d3, acc53);
        }
        // Clamp below at zero and write each row (rows are 12608 bytes apart).
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)0, 65535, _mm512_max_ps(zero, acc00));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)64, 65535, _mm512_max_ps(zero, acc01));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)128, 65535, _mm512_max_ps(zero, acc02));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)192, 65535, _mm512_max_ps(zero, acc03));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)12608, 65535, _mm512_max_ps(zero, acc10));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)12672, 65535, _mm512_max_ps(zero, acc11));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)12736, 65535, _mm512_max_ps(zero, acc12));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)12800, 65535, _mm512_max_ps(zero, acc13));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)25216, 65535, _mm512_max_ps(zero, acc20));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)25280, 65535, _mm512_max_ps(zero, acc21));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)25344, 65535, _mm512_max_ps(zero, acc22));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)25408, 65535, _mm512_max_ps(zero, acc23));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)37824, 65535, _mm512_max_ps(zero, acc30));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)37888, 65535, _mm512_max_ps(zero, acc31));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)37952, 65535, _mm512_max_ps(zero, acc32));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)38016, 65535, _mm512_max_ps(zero, acc33));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)50432, 65535, _mm512_max_ps(zero, acc40));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)50496, 65535, _mm512_max_ps(zero, acc41));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)50560, 65535, _mm512_max_ps(zero, acc42));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)50624, 65535, _mm512_max_ps(zero, acc43));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)63040, 65535, _mm512_max_ps(zero, acc50));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)63104, 65535, _mm512_max_ps(zero, acc51));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)63168, 65535, _mm512_max_ps(zero, acc52));
        _mm512_mask_storeu_ps(outGrp+(ptrdiff_t)63232, 65535, _mm512_max_ps(zero, acc53));
        // Stop once the second group of the pair has been written.
        if (grp >= lastGrp) return;
    }
    // Reached only when grp == 21: tail group with 2 output rows and
    // weights packed 8 bytes per step instead of 24.
    const char* wTail = wts+6168*grp;
    char* outTail = out+256*col+75648*grp;
    const float* seedTail = (const float*)wTail;
    __m512 t00 = _mm512_set1_ps(seedTail[0]);
    __m512 t10 = _mm512_set1_ps(seedTail[1]);
    __m512 t01 = t00, t02 = t00, t03 = t00;
    __m512 t11 = t10, t12 = t10, t13 = t10;
    for (ptrdiff_t s = 0; s < 256; ++s) {
        const char* d = datCol+256*s;
        const float* w = (const float*)(wTail+8*s+(ptrdiff_t)8);
        const __m512 d0 = _mm512_loadu_ps(d+(ptrdiff_t)0);
        const __m512 d1 = _mm512_loadu_ps(d+(ptrdiff_t)64);
        const __m512 d2 = _mm512_loadu_ps(d+(ptrdiff_t)128);
        const __m512 d3 = _mm512_loadu_ps(d+(ptrdiff_t)192);
        __m512 b = _mm512_set1_ps(w[0]);
        t00 = _mm512_fmadd_ps(b, d0, t00);
        t01 = _mm512_fmadd_ps(b, d1, t01);
        t02 = _mm512_fmadd_ps(b, d2, t02);
        t03 = _mm512_fmadd_ps(b, d3, t03);
        b = _mm512_set1_ps(w[1]);
        t10 = _mm512_fmadd_ps(b, d0, t10);
        t11 = _mm512_fmadd_ps(b, d1, t11);
        t12 = _mm512_fmadd_ps(b, d2, t12);
        t13 = _mm512_fmadd_ps(b, d3, t13);
    }
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)0, 65535, _mm512_max_ps(zero, t00));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)64, 65535, _mm512_max_ps(zero, t01));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)128, 65535, _mm512_max_ps(zero, t02));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)192, 65535, _mm512_max_ps(zero, t03));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)12608, 65535, _mm512_max_ps(zero, t10));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)12672, 65535, _mm512_max_ps(zero, t11));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)12736, 65535, _mm512_max_ps(zero, t12));
    _mm512_mask_storeu_ps(outTail+(ptrdiff_t)12800, 65535, _mm512_max_ps(zero, t13));
}

// Hands the multiply-accumulate callee to the threader.
//
// team31:    thread team passed straight through to ResNeXt50ThreaderDo1.
// tensors35: tensor pointer table; wrapped as element 0 of a two-element
//            pointer array (element 1 is null) whose address goes into the
//            task's any1 field.
//
// The task describes a 3-dimensional hull with extents 11 x 49 x 1.
// NOTE(review): the wrapper array lives on this function's stack, so this
// relies on ResNeXt50ThreaderDo1 not returning until the callee is done with
// it -- confirm against the threader implementation.
static void ResNeXt50OneApply3(ResNeXt50ThreaderTeam1* team31, char** tensors35) {
    void* bundle[2];
    bundle[0] = tensors35;
    bundle[1] = 0;
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 11;
    job.hull1[1] = 49;
    job.hull1[2] = 1;
    job.any1 = bundle;
    job.callee1 = ResNeXt50OneApply3Callee1;
    ResNeXt50ThreaderDo1(team31, &job);
}

static void ResNeXt50OneArrangeWts4Callee1(ResNeXt50ThreaderTask1* task48, int64_t* pt29) {
char** tensors46 = task48->any1;
ptrdiff_t b53 = pt29[0];
char*restrict wtPtr8 = tensors46[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr8 = tensors46[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr9 = tensors46[2]+(ptrdiff_t)8*512*0;
char*restrict arranged7 = tensors46[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)526336*0;
ptrdiff_t ii18 = 1;
for (ptrdiff_t i31 = 0; i31 < ii18; ++i31) {
ptrdiff_t j24 = 2*b53;
ptrdiff_t jj30 = j24+2;
for (; j24 < jj30; ++j24) {
if (j24 < 31) {
ptrdiff_t k99 = 0+16*(j24-0);
ptrdiff_t l38 = (size_t)(0+k99)/6;
ptrdiff_t cut10 = (size_t)(0+k99)%6;
switch (cut10) {
case 0:;
case 2: {
__m512 sum189 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k99);
__m512i pmMul14 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd14 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo11 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k99+512*i31));
__m512 masHi11 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k99+512*i31)+(ptrdiff_t)64);
__m512 postMul25 = _mm512_permutex2var_ps(masLo11, pmMul14, masHi11);
__m512 postAdd15 = _mm512_permutex2var_ps(masLo11, pmAdd14, masHi11);
sum189 = _mm512_fmadd_ps(sum189, postMul25, postAdd15);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum189);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum189);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 65535-(4095>>cut10), sum189);
ptrdiff_t c21 = 0;
for (; c21 != 16; ++c21) {
__m512 wt223 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)0);
__m512 wt224 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)1024);
__m512 wt225 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)2048);
__m512 wt226 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)3072);
__m512 wt227 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)4096);
__m512 wt228 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)5120);
__m512 wt229 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)6144);
__m512 wt230 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)7168);
__m512 wt231 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)8192);
__m512 wt232 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)9216);
__m512 wt233 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)10240);
__m512 wt234 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)11264);
__m512 wt235 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)12288);
__m512 wt236 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)13312);
__m512 wt237 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)14336);
__m512 wt238 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c21+(ptrdiff_t)15360);
__m512 tmp10505 = _mm512_unpacklo_ps(wt223, wt224);
__m512 tmp10506 = _mm512_unpackhi_ps(wt223, wt224);
__m512 tmp10507 = _mm512_unpacklo_ps(wt225, wt226);
__m512 tmp10508 = _mm512_unpackhi_ps(wt225, wt226);
__m512 tmp10509 = _mm512_unpacklo_ps(wt227, wt228);
__m512 tmp10510 = _mm512_unpackhi_ps(wt227, wt228);
__m512 tmp10511 = _mm512_unpacklo_ps(wt229, wt230);
__m512 tmp10512 = _mm512_unpackhi_ps(wt229, wt230);
__m512 tmp10513 = _mm512_unpacklo_ps(wt231, wt232);
__m512 tmp10514 = _mm512_unpackhi_ps(wt231, wt232);
__m512 tmp10515 = _mm512_unpacklo_ps(wt233, wt234);
__m512 tmp10516 = _mm512_unpackhi_ps(wt233, wt234);
__m512 tmp10517 = _mm512_unpacklo_ps(wt235, wt236);
__m512 tmp10518 = _mm512_unpackhi_ps(wt235, wt236);
__m512 tmp10519 = _mm512_unpacklo_ps(wt237, wt238);
__m512 tmp10520 = _mm512_unpackhi_ps(wt237, wt238);
__m512 tmp10521 = _mm512_shuffle_ps(tmp10505, tmp10507, 68);
__m512 tmp10522 = _mm512_shuffle_ps(tmp10505, tmp10507, 238);
__m512 tmp10523 = _mm512_shuffle_ps(tmp10506, tmp10508, 68);
__m512 tmp10524 = _mm512_shuffle_ps(tmp10506, tmp10508, 238);
__m512 tmp10525 = _mm512_shuffle_ps(tmp10509, tmp10511, 68);
__m512 tmp10526 = _mm512_shuffle_ps(tmp10509, tmp10511, 238);
__m512 tmp10527 = _mm512_shuffle_ps(tmp10510, tmp10512, 68);
__m512 tmp10528 = _mm512_shuffle_ps(tmp10510, tmp10512, 238);
__m512 tmp10529 = _mm512_shuffle_ps(tmp10513, tmp10515, 68);
__m512 tmp10530 = _mm512_shuffle_ps(tmp10513, tmp10515, 238);
__m512 tmp10531 = _mm512_shuffle_ps(tmp10514, tmp10516, 68);
__m512 tmp10532 = _mm512_shuffle_ps(tmp10514, tmp10516, 238);
__m512 tmp10533 = _mm512_shuffle_ps(tmp10517, tmp10519, 68);
__m512 tmp10534 = _mm512_shuffle_ps(tmp10517, tmp10519, 238);
__m512 tmp10535 = _mm512_shuffle_ps(tmp10518, tmp10520, 68);
__m512 tmp10536 = _mm512_shuffle_ps(tmp10518, tmp10520, 238);
__m512 tmp10537 = _mm512_shuffle_f32x4(tmp10521, tmp10525, 136);
__m512 tmp10538 = _mm512_shuffle_f32x4(tmp10521, tmp10525, 221);
__m512 tmp10539 = _mm512_shuffle_f32x4(tmp10522, tmp10526, 136);
__m512 tmp10540 = _mm512_shuffle_f32x4(tmp10522, tmp10526, 221);
__m512 tmp10541 = _mm512_shuffle_f32x4(tmp10523, tmp10527, 136);
__m512 tmp10542 = _mm512_shuffle_f32x4(tmp10523, tmp10527, 221);
__m512 tmp10543 = _mm512_shuffle_f32x4(tmp10524, tmp10528, 136);
__m512 tmp10544 = _mm512_shuffle_f32x4(tmp10524, tmp10528, 221);
__m512 tmp10545 = _mm512_shuffle_f32x4(tmp10529, tmp10533, 136);
__m512 tmp10546 = _mm512_shuffle_f32x4(tmp10529, tmp10533, 221);
__m512 tmp10547 = _mm512_shuffle_f32x4(tmp10530, tmp10534, 136);
__m512 tmp10548 = _mm512_shuffle_f32x4(tmp10530, tmp10534, 221);
__m512 tmp10549 = _mm512_shuffle_f32x4(tmp10531, tmp10535, 136);
__m512 tmp10550 = _mm512_shuffle_f32x4(tmp10531, tmp10535, 221);
__m512 tmp10551 = _mm512_shuffle_f32x4(tmp10532, tmp10536, 136);
__m512 tmp10552 = _mm512_shuffle_f32x4(tmp10532, tmp10536, 221);
wt223 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 136);
wt231 = _mm512_shuffle_f32x4(tmp10537, tmp10545, 221);
wt224 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 136);
wt232 = _mm512_shuffle_f32x4(tmp10539, tmp10547, 221);
wt225 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 136);
wt233 = _mm512_shuffle_f32x4(tmp10541, tmp10549, 221);
wt226 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 136);
wt234 = _mm512_shuffle_f32x4(tmp10543, tmp10551, 221);
wt227 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 136);
wt235 = _mm512_shuffle_f32x4(tmp10538, tmp10546, 221);
wt228 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 136);
wt236 = _mm512_shuffle_f32x4(tmp10540, tmp10548, 221);
wt229 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 136);
wt237 = _mm512_shuffle_f32x4(tmp10542, tmp10550, 221);
wt230 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 136);
wt238 = _mm512_shuffle_f32x4(tmp10544, tmp10552, 221);
wt223 = _mm512_mul_ps(wt223, postMul25);
wt224 = _mm512_mul_ps(wt224, postMul25);
wt225 = _mm512_mul_ps(wt225, postMul25);
wt226 = _mm512_mul_ps(wt226, postMul25);
wt227 = _mm512_mul_ps(wt227, postMul25);
wt228 = _mm512_mul_ps(wt228, postMul25);
wt229 = _mm512_mul_ps(wt229, postMul25);
wt230 = _mm512_mul_ps(wt230, postMul25);
wt231 = _mm512_mul_ps(wt231, postMul25);
wt232 = _mm512_mul_ps(wt232, postMul25);
wt233 = _mm512_mul_ps(wt233, postMul25);
wt234 = _mm512_mul_ps(wt234, postMul25);
wt235 = _mm512_mul_ps(wt235, postMul25);
wt236 = _mm512_mul_ps(wt236, postMul25);
wt237 = _mm512_mul_ps(wt237, postMul25);
wt238 = _mm512_mul_ps(wt238, postMul25);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)0, 63>>cut10, wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)0, 63>>cut10, wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)0, 63>>cut10, wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)0, 63>>cut10, wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)0, 63>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)0, 63>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)0, 63>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)0, 63>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)0, 63>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)0, 63>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)0, 63>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)0, 63>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)0, 63>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)0, 63>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)0, 63>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)0, 63>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)6144, 4032>>cut10, wt238);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt223);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt224);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt225);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt226);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt227);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt228);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt229);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt230);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt231);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt232);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt233);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt234);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt235);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt236);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt237);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c21)+(ptrdiff_t)12288, 65535-(4095>>cut10), wt238);
}
break;
}
default: {
cut10 = 4;
__m512 sum190 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k99);
__m512i pmMul15 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd15 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo12 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k99+512*i31));
__m512 masHi12 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k99+512*i31)+(ptrdiff_t)64);
__m512 postMul26 = _mm512_permutex2var_ps(masLo12, pmMul15, masHi12);
__m512 postAdd16 = _mm512_permutex2var_ps(masLo12, pmAdd15, masHi12);
sum190 = _mm512_fmadd_ps(sum190, postMul26, postAdd16);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)0, 63>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)6144, 4032>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)12288, 258048>>cut10, sum190);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*0+(ptrdiff_t)18432, 65535-(262143>>cut10), sum190);
ptrdiff_t c22 = 0;
for (; c22 != 16; ++c22) {
__m512 wt239 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)0);
__m512 wt240 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)1024);
__m512 wt241 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)2048);
__m512 wt242 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)3072);
__m512 wt243 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)4096);
__m512 wt244 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)5120);
__m512 wt245 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)6144);
__m512 wt246 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)7168);
__m512 wt247 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)8192);
__m512 wt248 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)9216);
__m512 wt249 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)10240);
__m512 wt250 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)11264);
__m512 wt251 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)12288);
__m512 wt252 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)13312);
__m512 wt253 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)14336);
__m512 wt254 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k99+64*c22+(ptrdiff_t)15360);
__m512 tmp10553 = _mm512_unpacklo_ps(wt239, wt240);
__m512 tmp10554 = _mm512_unpackhi_ps(wt239, wt240);
__m512 tmp10555 = _mm512_unpacklo_ps(wt241, wt242);
__m512 tmp10556 = _mm512_unpackhi_ps(wt241, wt242);
__m512 tmp10557 = _mm512_unpacklo_ps(wt243, wt244);
__m512 tmp10558 = _mm512_unpackhi_ps(wt243, wt244);
__m512 tmp10559 = _mm512_unpacklo_ps(wt245, wt246);
__m512 tmp10560 = _mm512_unpackhi_ps(wt245, wt246);
__m512 tmp10561 = _mm512_unpacklo_ps(wt247, wt248);
__m512 tmp10562 = _mm512_unpackhi_ps(wt247, wt248);
__m512 tmp10563 = _mm512_unpacklo_ps(wt249, wt250);
__m512 tmp10564 = _mm512_unpackhi_ps(wt249, wt250);
__m512 tmp10565 = _mm512_unpacklo_ps(wt251, wt252);
__m512 tmp10566 = _mm512_unpackhi_ps(wt251, wt252);
__m512 tmp10567 = _mm512_unpacklo_ps(wt253, wt254);
__m512 tmp10568 = _mm512_unpackhi_ps(wt253, wt254);
__m512 tmp10569 = _mm512_shuffle_ps(tmp10553, tmp10555, 68);
__m512 tmp10570 = _mm512_shuffle_ps(tmp10553, tmp10555, 238);
__m512 tmp10571 = _mm512_shuffle_ps(tmp10554, tmp10556, 68);
__m512 tmp10572 = _mm512_shuffle_ps(tmp10554, tmp10556, 238);
__m512 tmp10573 = _mm512_shuffle_ps(tmp10557, tmp10559, 68);
__m512 tmp10574 = _mm512_shuffle_ps(tmp10557, tmp10559, 238);
__m512 tmp10575 = _mm512_shuffle_ps(tmp10558, tmp10560, 68);
__m512 tmp10576 = _mm512_shuffle_ps(tmp10558, tmp10560, 238);
__m512 tmp10577 = _mm512_shuffle_ps(tmp10561, tmp10563, 68);
__m512 tmp10578 = _mm512_shuffle_ps(tmp10561, tmp10563, 238);
__m512 tmp10579 = _mm512_shuffle_ps(tmp10562, tmp10564, 68);
__m512 tmp10580 = _mm512_shuffle_ps(tmp10562, tmp10564, 238);
__m512 tmp10581 = _mm512_shuffle_ps(tmp10565, tmp10567, 68);
__m512 tmp10582 = _mm512_shuffle_ps(tmp10565, tmp10567, 238);
__m512 tmp10583 = _mm512_shuffle_ps(tmp10566, tmp10568, 68);
__m512 tmp10584 = _mm512_shuffle_ps(tmp10566, tmp10568, 238);
__m512 tmp10585 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 136);
__m512 tmp10586 = _mm512_shuffle_f32x4(tmp10569, tmp10573, 221);
__m512 tmp10587 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 136);
__m512 tmp10588 = _mm512_shuffle_f32x4(tmp10570, tmp10574, 221);
__m512 tmp10589 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 136);
__m512 tmp10590 = _mm512_shuffle_f32x4(tmp10571, tmp10575, 221);
__m512 tmp10591 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 136);
__m512 tmp10592 = _mm512_shuffle_f32x4(tmp10572, tmp10576, 221);
__m512 tmp10593 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 136);
__m512 tmp10594 = _mm512_shuffle_f32x4(tmp10577, tmp10581, 221);
__m512 tmp10595 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 136);
__m512 tmp10596 = _mm512_shuffle_f32x4(tmp10578, tmp10582, 221);
__m512 tmp10597 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 136);
__m512 tmp10598 = _mm512_shuffle_f32x4(tmp10579, tmp10583, 221);
__m512 tmp10599 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 136);
__m512 tmp10600 = _mm512_shuffle_f32x4(tmp10580, tmp10584, 221);
wt239 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 136);
wt247 = _mm512_shuffle_f32x4(tmp10585, tmp10593, 221);
wt240 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 136);
wt248 = _mm512_shuffle_f32x4(tmp10587, tmp10595, 221);
wt241 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 136);
wt249 = _mm512_shuffle_f32x4(tmp10589, tmp10597, 221);
wt242 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 136);
wt250 = _mm512_shuffle_f32x4(tmp10591, tmp10599, 221);
wt243 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 136);
wt251 = _mm512_shuffle_f32x4(tmp10586, tmp10594, 221);
wt244 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 136);
wt252 = _mm512_shuffle_f32x4(tmp10588, tmp10596, 221);
wt245 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 136);
wt253 = _mm512_shuffle_f32x4(tmp10590, tmp10598, 221);
wt246 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 136);
wt254 = _mm512_shuffle_f32x4(tmp10592, tmp10600, 221);
wt239 = _mm512_mul_ps(wt239, postMul26);
wt240 = _mm512_mul_ps(wt240, postMul26);
wt241 = _mm512_mul_ps(wt241, postMul26);
wt242 = _mm512_mul_ps(wt242, postMul26);
wt243 = _mm512_mul_ps(wt243, postMul26);
wt244 = _mm512_mul_ps(wt244, postMul26);
wt245 = _mm512_mul_ps(wt245, postMul26);
wt246 = _mm512_mul_ps(wt246, postMul26);
wt247 = _mm512_mul_ps(wt247, postMul26);
wt248 = _mm512_mul_ps(wt248, postMul26);
wt249 = _mm512_mul_ps(wt249, postMul26);
wt250 = _mm512_mul_ps(wt250, postMul26);
wt251 = _mm512_mul_ps(wt251, postMul26);
wt252 = _mm512_mul_ps(wt252, postMul26);
wt253 = _mm512_mul_ps(wt253, postMul26);
wt254 = _mm512_mul_ps(wt254, postMul26);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)0, 63>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)0, 63>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)0, 63>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)0, 63>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)0, 63>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)0, 63>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)0, 63>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)0, 63>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)0, 63>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)0, 63>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)0, 63>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)0, 63>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)0, 63>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)0, 63>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)0, 63>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)0, 63>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)6144, 4032>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)12288, 258048>>cut10, wt254);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(1+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt239);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(2+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt240);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(3+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt241);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(4+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt242);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(5+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt243);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(6+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt244);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(7+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt245);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(8+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt246);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(9+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt247);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(10+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt248);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(11+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt249);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(12+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt250);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(13+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt251);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(14+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt252);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(15+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt253);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l38+4*cut10+24*(16+16*c22)+(ptrdiff_t)18432, 65535-(262143>>cut10), wt254);
}
}
}
} else {
ptrdiff_t k98 = 496;
ptrdiff_t l37 = (size_t)(0+k98)/6;
ptrdiff_t cut9 = (size_t)(0+k98)%6;
__m512 sum188 = _mm512_maskz_loadu_ps(65535, biasPtr8+2048*i31+4*k98);
__m512i pmMul16 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd16 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo13 = _mm512_loadu_ps(bnPtr9+(ptrdiff_t)8*(k98+512*i31));
__m512 masHi13 = _mm512_maskz_loadu_ps(65535, bnPtr9+(ptrdiff_t)8*(k98+512*i31)+(ptrdiff_t)64);
__m512 postMul24 = _mm512_permutex2var_ps(masLo13, pmMul16, masHi13);
__m512 postAdd14 = _mm512_permutex2var_ps(masLo13, pmAdd16, masHi13);
sum188 = _mm512_fmadd_ps(sum188, postMul24, postAdd14);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)0, 63>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)6144, 4032>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*0+(ptrdiff_t)12288, 258048>>cut9, sum188);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*0+(ptrdiff_t)18432, 65535-(262143>>cut9), sum188);
ptrdiff_t c20 = 0;
for (; c20 != 16; ++c20) {
__m512 wt207 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)0);
__m512 wt208 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)1024);
__m512 wt209 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)2048);
__m512 wt210 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)3072);
__m512 wt211 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)4096);
__m512 wt212 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)5120);
__m512 wt213 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)6144);
__m512 wt214 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)7168);
__m512 wt215 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)8192);
__m512 wt216 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)9216);
__m512 wt217 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)10240);
__m512 wt218 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)11264);
__m512 wt219 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)12288);
__m512 wt220 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)13312);
__m512 wt221 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)14336);
__m512 wt222 = _mm512_maskz_loadu_ps(65535, wtPtr8+524288*i31+1024*k98+64*c20+(ptrdiff_t)15360);
__m512 tmp10601 = _mm512_unpacklo_ps(wt207, wt208);
__m512 tmp10602 = _mm512_unpackhi_ps(wt207, wt208);
__m512 tmp10603 = _mm512_unpacklo_ps(wt209, wt210);
__m512 tmp10604 = _mm512_unpackhi_ps(wt209, wt210);
__m512 tmp10605 = _mm512_unpacklo_ps(wt211, wt212);
__m512 tmp10606 = _mm512_unpackhi_ps(wt211, wt212);
__m512 tmp10607 = _mm512_unpacklo_ps(wt213, wt214);
__m512 tmp10608 = _mm512_unpackhi_ps(wt213, wt214);
__m512 tmp10609 = _mm512_unpacklo_ps(wt215, wt216);
__m512 tmp10610 = _mm512_unpackhi_ps(wt215, wt216);
__m512 tmp10611 = _mm512_unpacklo_ps(wt217, wt218);
__m512 tmp10612 = _mm512_unpackhi_ps(wt217, wt218);
__m512 tmp10613 = _mm512_unpacklo_ps(wt219, wt220);
__m512 tmp10614 = _mm512_unpackhi_ps(wt219, wt220);
__m512 tmp10615 = _mm512_unpacklo_ps(wt221, wt222);
__m512 tmp10616 = _mm512_unpackhi_ps(wt221, wt222);
__m512 tmp10617 = _mm512_shuffle_ps(tmp10601, tmp10603, 68);
__m512 tmp10618 = _mm512_shuffle_ps(tmp10601, tmp10603, 238);
__m512 tmp10619 = _mm512_shuffle_ps(tmp10602, tmp10604, 68);
__m512 tmp10620 = _mm512_shuffle_ps(tmp10602, tmp10604, 238);
__m512 tmp10621 = _mm512_shuffle_ps(tmp10605, tmp10607, 68);
__m512 tmp10622 = _mm512_shuffle_ps(tmp10605, tmp10607, 238);
__m512 tmp10623 = _mm512_shuffle_ps(tmp10606, tmp10608, 68);
__m512 tmp10624 = _mm512_shuffle_ps(tmp10606, tmp10608, 238);
__m512 tmp10625 = _mm512_shuffle_ps(tmp10609, tmp10611, 68);
__m512 tmp10626 = _mm512_shuffle_ps(tmp10609, tmp10611, 238);
__m512 tmp10627 = _mm512_shuffle_ps(tmp10610, tmp10612, 68);
__m512 tmp10628 = _mm512_shuffle_ps(tmp10610, tmp10612, 238);
__m512 tmp10629 = _mm512_shuffle_ps(tmp10613, tmp10615, 68);
__m512 tmp10630 = _mm512_shuffle_ps(tmp10613, tmp10615, 238);
__m512 tmp10631 = _mm512_shuffle_ps(tmp10614, tmp10616, 68);
__m512 tmp10632 = _mm512_shuffle_ps(tmp10614, tmp10616, 238);
__m512 tmp10633 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 136);
__m512 tmp10634 = _mm512_shuffle_f32x4(tmp10617, tmp10621, 221);
__m512 tmp10635 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 136);
__m512 tmp10636 = _mm512_shuffle_f32x4(tmp10618, tmp10622, 221);
__m512 tmp10637 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 136);
__m512 tmp10638 = _mm512_shuffle_f32x4(tmp10619, tmp10623, 221);
__m512 tmp10639 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 136);
__m512 tmp10640 = _mm512_shuffle_f32x4(tmp10620, tmp10624, 221);
__m512 tmp10641 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 136);
__m512 tmp10642 = _mm512_shuffle_f32x4(tmp10625, tmp10629, 221);
__m512 tmp10643 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 136);
__m512 tmp10644 = _mm512_shuffle_f32x4(tmp10626, tmp10630, 221);
__m512 tmp10645 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 136);
__m512 tmp10646 = _mm512_shuffle_f32x4(tmp10627, tmp10631, 221);
__m512 tmp10647 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 136);
__m512 tmp10648 = _mm512_shuffle_f32x4(tmp10628, tmp10632, 221);
wt207 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 136);
wt215 = _mm512_shuffle_f32x4(tmp10633, tmp10641, 221);
wt208 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 136);
wt216 = _mm512_shuffle_f32x4(tmp10635, tmp10643, 221);
wt209 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 136);
wt217 = _mm512_shuffle_f32x4(tmp10637, tmp10645, 221);
wt210 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 136);
wt218 = _mm512_shuffle_f32x4(tmp10639, tmp10647, 221);
wt211 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 136);
wt219 = _mm512_shuffle_f32x4(tmp10634, tmp10642, 221);
wt212 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 136);
wt220 = _mm512_shuffle_f32x4(tmp10636, tmp10644, 221);
wt213 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 136);
wt221 = _mm512_shuffle_f32x4(tmp10638, tmp10646, 221);
wt214 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 136);
wt222 = _mm512_shuffle_f32x4(tmp10640, tmp10648, 221);
wt207 = _mm512_mul_ps(wt207, postMul24);
wt208 = _mm512_mul_ps(wt208, postMul24);
wt209 = _mm512_mul_ps(wt209, postMul24);
wt210 = _mm512_mul_ps(wt210, postMul24);
wt211 = _mm512_mul_ps(wt211, postMul24);
wt212 = _mm512_mul_ps(wt212, postMul24);
wt213 = _mm512_mul_ps(wt213, postMul24);
wt214 = _mm512_mul_ps(wt214, postMul24);
wt215 = _mm512_mul_ps(wt215, postMul24);
wt216 = _mm512_mul_ps(wt216, postMul24);
wt217 = _mm512_mul_ps(wt217, postMul24);
wt218 = _mm512_mul_ps(wt218, postMul24);
wt219 = _mm512_mul_ps(wt219, postMul24);
wt220 = _mm512_mul_ps(wt220, postMul24);
wt221 = _mm512_mul_ps(wt221, postMul24);
wt222 = _mm512_mul_ps(wt222, postMul24);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)0, 63>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)0, 63>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)0, 63>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)0, 63>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)0, 63>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)0, 63>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)0, 63>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)0, 63>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)0, 63>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)0, 63>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)0, 63>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)0, 63>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)0, 63>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)0, 63>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)0, 63>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)0, 63>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)6144, 4032>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(1+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(2+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(3+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(4+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(5+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(6+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(7+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(8+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(9+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(10+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(11+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(12+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(13+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(14+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(15+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+24*(16+16*c20)+(ptrdiff_t)12288, 258048>>cut9, wt222);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(1+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt207);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(2+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt208);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(3+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt209);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(4+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt210);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(5+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt211);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(6+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt212);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(7+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt213);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(8+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt214);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(9+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt215);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(10+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt216);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(11+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt217);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(12+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt218);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(13+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt219);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(14+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt220);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(15+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt221);
_mm512_mask_storeu_ps(arranged7+526336*i31+6168*l37+4*cut9+8*(16+16*c20)+(ptrdiff_t)18432, 65535-(262143>>cut9), wt222);
}
}
}
}
}

// Hands ResNeXt50OneArrangeWts4Callee1 to the thread team.
//
// team36:    thread team passed straight through to ResNeXt50ThreaderDo1.
// tensors45: array of tensor base pointers; stored untouched in the task's
//            any1 field for the callee to pick up.
//
// The task describes a 3-entry hull of 16 x 1 x 1.
// NOTE(review): presumably the threader invokes the callee once per point
// of that hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts4(ResNeXt50ThreaderTeam1* team36, char** tensors45) {
ResNeXt50ThreaderTask1 job;
// Extent of the work hull: three dimensions, 16 x 1 x 1.
job.nd1 = 3;
job.hull1[0] = 16;
job.hull1[1] = 1;
job.hull1[2] = 1;
// Function to run and the opaque payload it receives via the task.
job.callee1 = ResNeXt50OneArrangeWts4Callee1;
job.any1 = tensors45;
ResNeXt50ThreaderDo1(team36, &job);
}

/*
 * Data-arrangement worker for a single task point.
 *
 * task50 - task descriptor; any1 holds a char* array where [0] is the
 *          source buffer and [1] is the arranged destination buffer.
 * pt30   - task coordinates: pt30[0] selects a run of 128 consecutive
 *          channel indices (128*pt30[0] .. 128*pt30[0]+127), pt30[1]
 *          selects a band of source rows starting at row 4*pt30[1].
 *
 * For each channel in the run, two 224-byte source rows are read (the
 * band's first row and the one 448 bytes further on). From each row only
 * the even-indexed floats are kept, giving two vectors per row (16 values,
 * then 12 values followed by 4 zero lanes). The resulting four vectors are
 * written as 256 contiguous bytes at dst + 65536*band + 256*channel.
 *
 * NOTE(review): the 12608-byte channel pitch and 224-byte row pitch are
 * taken verbatim from the generated code -- confirm against the tensor
 * layout if this is ever edited by hand.
 */
static void ResNeXt50OneArrangeDats4Callee1(ResNeXt50ThreaderTask1* task50, int64_t* pt30) {
	char** bufs = task50->any1;
	const ptrdiff_t chunk = pt30[0];
	const ptrdiff_t band = pt30[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	/* First source row of the band, computed in unsigned arithmetic as the generator does. */
	const ptrdiff_t firstRow = (size_t)band*4;
	/* Lane i of the permute result takes element 2*i of the 32-float pair {lo, hi}. */
	const __m512i evenLanes = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
	const ptrdiff_t chBegin = 128*chunk;
	const ptrdiff_t chEnd = chBegin+128;
	for (ptrdiff_t ch = chBegin; ch < chEnd; ++ch) {
		const char* in = src+224*firstRow+12608*ch;
		char* out = dst+65536*band+256*ch;
		__m512 packed[4];
		/* Gather everything first, store afterwards (same load/store order as before). */
		for (int half = 0; half < 2; ++half) {
			const char* row = in+448*half;
			const __m512 lo0 = _mm512_maskz_loadu_ps(32767, row);
			const __m512 hi0 = _mm512_maskz_loadu_ps(32767, row+64);
			packed[2*half] = _mm512_permutex2var_ps(lo0, evenLanes, hi0);
			const __m512 lo1 = _mm512_maskz_loadu_ps(32767, row+128);
			/* Only 7 floats are read here; the masked-off lanes load as zero. */
			const __m512 hi1 = _mm512_maskz_loadu_ps(127, row+192);
			packed[2*half+1] = _mm512_permutex2var_ps(lo1, evenLanes, hi1);
		}
		for (int v = 0; v < 4; ++v) {
			_mm512_storeu_ps(out+64*v, packed[v]);
		}
	}
}

/*
 * Submits the data-arrangement callee to the given threader team.
 *
 * team37    - team handed through to ResNeXt50ThreaderDo1.
 * tensors47 - tensor pointer array, passed to the callee via task.any1.
 *
 * The task is described as 4-dimensional with hull {2, 14, 1, 1}; the
 * callee reads the first two coordinates (channel run and row band).
 * NOTE(review): hull1 presumably gives per-dimension extents of the task
 * grid -- confirm against the threader.
 */
static void ResNeXt50OneArrangeDats4(ResNeXt50ThreaderTeam1* team37, char** tensors47) {
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 4;
	job.hull1[0] = 2;
	job.hull1[1] = 14;
	/* The trailing dimensions are degenerate. */
	for (int dim = 2; dim < 4; ++dim) {
		job.hull1[dim] = 1;
	}
	job.any1 = tensors47;
	job.callee1 = ResNeXt50OneArrangeDats4Callee1;
	ResNeXt50ThreaderDo1(team37, &job);
}

/*
 * Multiply-accumulate worker for a single task point.
 *
 * task52 - task descriptor; any1 points at a void* pair whose first entry
 *          is a char* array: [0] arranged weights, [1] arranged data,
 *          [2] output buffer.
 * pt31   - task coordinates: pt31[0] selects a pair of weight blocks
 *          (blocks 2*pt31[0] and 2*pt31[0]+1), pt31[1] selects the data
 *          band (65536 bytes each) and the output row pair.
 *
 * Weight blocks are 6168 bytes apart. Blocks 0..84 each hold 6 output
 * channels: 6 leading floats used to seed the accumulators, then 256
 * groups of 6 weights. Block 85 holds only 2 output channels in the same
 * arrangement (2 seed floats, then 256 groups of 2 weights); it is
 * processed only by the task point whose second block index would be 85.
 *
 * For every output channel, four accumulators (one per 64-byte data
 * vector) are seeded, updated by 256 fused multiply-adds, and written as
 * two output rows of 112 bytes each (16 lanes + 12 lanes per row).
 *
 * NOTE(review): the seed values are presumably the (batch-norm folded)
 * bias -- confirm against the weight-arrangement pass.
 * The accumulator loops have constant bounds so the compiler can fully
 * unroll them and keep the sums in registers.
 */
static void ResNeXt50OneApply4Callee1(ResNeXt50ThreaderTask1* task52, int64_t* pt31) {
	void** args = task52->any1;
	char** bufs = args[0];
	const ptrdiff_t band = pt31[1];
	const ptrdiff_t pairIdx = pt31[0];
	char*restrict wts = bufs[0];
	char*restrict dats = bufs[1];
	char*restrict outs = bufs[2];
	/* First output row of the band, computed in unsigned arithmetic as the generator does. */
	const ptrdiff_t outRow = (size_t)band*2;
	const char* in = dats+65536*band;
	ptrdiff_t blk = 2*pairIdx;
	const ptrdiff_t blkStop = blk+1;
	for (; blk != 85; ++blk) {
		const char* w = wts+6168*blk;
		__m512 acc[6][4];
		/* Seed all four accumulators of a channel with that channel's leading value. */
		for (int ch = 0; ch < 6; ++ch) {
			const __m512 seed = _mm512_set1_ps(*(const float*)(w+4*ch));
			for (int v = 0; v < 4; ++v) {
				acc[ch][v] = seed;
			}
		}
		for (ptrdiff_t s = 0; s < 256; ++s) {
			const char* d = in+256*s;
			const __m512 x0 = _mm512_loadu_ps(d);
			const __m512 x1 = _mm512_loadu_ps(d+64);
			const __m512 x2 = _mm512_loadu_ps(d+128);
			const __m512 x3 = _mm512_loadu_ps(d+192);
			/* Weights for step s follow the 24 bytes of seed values. */
			const char* ws = w+24*s+24;
			for (int ch = 0; ch < 6; ++ch) {
				const __m512 wt = _mm512_set1_ps(*(const float*)(ws+4*ch));
				acc[ch][0] = _mm512_fmadd_ps(wt, x0, acc[ch][0]);
				acc[ch][1] = _mm512_fmadd_ps(wt, x1, acc[ch][1]);
				acc[ch][2] = _mm512_fmadd_ps(wt, x2, acc[ch][2]);
				acc[ch][3] = _mm512_fmadd_ps(wt, x3, acc[ch][3]);
			}
		}
		char* o = outs+112*outRow+18816*blk;
		for (int ch = 0; ch < 6; ++ch) {
			char* oc = o+3136*ch;
			/* Two rows per channel: 16 lanes then 12 lanes each. */
			_mm512_mask_storeu_ps(oc, 65535, acc[ch][0]);
			_mm512_mask_storeu_ps(oc+64, 4095, acc[ch][1]);
			_mm512_mask_storeu_ps(oc+112, 65535, acc[ch][2]);
			_mm512_mask_storeu_ps(oc+176, 4095, acc[ch][3]);
		}
		if (blk >= blkStop) return;
	}
	/* Reached only with blk == 85: the final block carries two channels. */
	{
		const char* w = wts+6168*blk;
		__m512 acc[2][4];
		for (int ch = 0; ch < 2; ++ch) {
			const __m512 seed = _mm512_set1_ps(*(const float*)(w+4*ch));
			for (int v = 0; v < 4; ++v) {
				acc[ch][v] = seed;
			}
		}
		for (ptrdiff_t s = 0; s < 256; ++s) {
			const char* d = in+256*s;
			const __m512 x0 = _mm512_loadu_ps(d);
			const __m512 x1 = _mm512_loadu_ps(d+64);
			const __m512 x2 = _mm512_loadu_ps(d+128);
			const __m512 x3 = _mm512_loadu_ps(d+192);
			/* Weights for step s follow the 8 bytes of seed values. */
			const char* ws = w+8*s+8;
			for (int ch = 0; ch < 2; ++ch) {
				const __m512 wt = _mm512_set1_ps(*(const float*)(ws+4*ch));
				acc[ch][0] = _mm512_fmadd_ps(wt, x0, acc[ch][0]);
				acc[ch][1] = _mm512_fmadd_ps(wt, x1, acc[ch][1]);
				acc[ch][2] = _mm512_fmadd_ps(wt, x2, acc[ch][2]);
				acc[ch][3] = _mm512_fmadd_ps(wt, x3, acc[ch][3]);
			}
		}
		char* o = outs+112*outRow+18816*blk;
		for (int ch = 0; ch < 2; ++ch) {
			char* oc = o+3136*ch;
			_mm512_mask_storeu_ps(oc, 65535, acc[ch][0]);
			_mm512_mask_storeu_ps(oc+64, 4095, acc[ch][1]);
			_mm512_mask_storeu_ps(oc+112, 65535, acc[ch][2]);
			_mm512_mask_storeu_ps(oc+176, 4095, acc[ch][3]);
		}
	}
}

/*
 * Submits the multiply-accumulate callee to the given threader team.
 *
 * team38    - team handed through to ResNeXt50ThreaderDo1.
 * tensors49 - tensor pointer array; wrapped as the first entry of a
 *             two-element pointer pair (second entry is null) that the
 *             callee receives via task.any1.
 *
 * The task is described as 3-dimensional with hull {43, 14, 1}; the callee
 * reads the first two coordinates (weight-block pair and data band).
 * NOTE(review): the pair lives on this stack frame, so this relies on
 * ResNeXt50ThreaderDo1 not returning before the callees finish -- confirm.
 */
static void ResNeXt50OneApply4(ResNeXt50ThreaderTeam1* team38, char** tensors49) {
	void* bundle[] = {tensors49, 0};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	job.hull1[0] = 43;
	job.hull1[1] = 14;
	job.hull1[2] = 1;
	job.any1 = bundle;
	job.callee1 = ResNeXt50OneApply4Callee1;
	ResNeXt50ThreaderDo1(team38, &job);
}

static void ResNeXt50OneArrangeWts5Callee1(ResNeXt50ThreaderTask1* task54, int64_t* pt32) {
char** tensors52 = task54->any1;
ptrdiff_t b54 = pt32[0];
char*restrict wtPtr9 = tensors52[0]+(ptrdiff_t)3340*0+(ptrdiff_t)262144*0;
char*restrict biasPtr9 = tensors52[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr10 = tensors52[2]+(ptrdiff_t)8*256*0;
char*restrict arranged9 = tensors52[3]+(ptrdiff_t)856064*0+(ptrdiff_t)263168*0;
ptrdiff_t ii21 = 1;
for (ptrdiff_t i34 = 0; i34 < ii21; ++i34) {
ptrdiff_t j27 = 2*b54;
ptrdiff_t jj33 = j27+2;
for (; j27 < jj33; ++j27) {
if (j27 < 15) {
ptrdiff_t k103 = 0+16*(j27-0);
ptrdiff_t l40 = (size_t)(0+k103)/6;
ptrdiff_t cut12 = (size_t)(0+k103)%6;
switch (cut12) {
case 0:;
case 2: {
__m512 sum224 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k103);
__m512i pmMul17 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd17 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo14 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k103+256*i34));
__m512 masHi14 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k103+256*i34)+(ptrdiff_t)64);
__m512 postMul28 = _mm512_permutex2var_ps(masLo14, pmMul17, masHi14);
__m512 postAdd18 = _mm512_permutex2var_ps(masLo14, pmAdd17, masHi14);
sum224 = _mm512_fmadd_ps(sum224, postMul28, postAdd18);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum224);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum224);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 65535-(4095>>cut12), sum224);
ptrdiff_t c25 = 0;
for (; c25 != 16; ++c25) {
__m512 wt279 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)0);
__m512 wt280 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)1024);
__m512 wt281 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)2048);
__m512 wt282 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)3072);
__m512 wt283 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)4096);
__m512 wt284 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)5120);
__m512 wt285 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)6144);
__m512 wt286 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)7168);
__m512 wt287 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)8192);
__m512 wt288 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)9216);
__m512 wt289 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)10240);
__m512 wt290 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)11264);
__m512 wt291 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)12288);
__m512 wt292 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)13312);
__m512 wt293 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)14336);
__m512 wt294 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c25+(ptrdiff_t)15360);
__m512 tmp10649 = _mm512_unpacklo_ps(wt279, wt280);
__m512 tmp10650 = _mm512_unpackhi_ps(wt279, wt280);
__m512 tmp10651 = _mm512_unpacklo_ps(wt281, wt282);
__m512 tmp10652 = _mm512_unpackhi_ps(wt281, wt282);
__m512 tmp10653 = _mm512_unpacklo_ps(wt283, wt284);
__m512 tmp10654 = _mm512_unpackhi_ps(wt283, wt284);
__m512 tmp10655 = _mm512_unpacklo_ps(wt285, wt286);
__m512 tmp10656 = _mm512_unpackhi_ps(wt285, wt286);
__m512 tmp10657 = _mm512_unpacklo_ps(wt287, wt288);
__m512 tmp10658 = _mm512_unpackhi_ps(wt287, wt288);
__m512 tmp10659 = _mm512_unpacklo_ps(wt289, wt290);
__m512 tmp10660 = _mm512_unpackhi_ps(wt289, wt290);
__m512 tmp10661 = _mm512_unpacklo_ps(wt291, wt292);
__m512 tmp10662 = _mm512_unpackhi_ps(wt291, wt292);
__m512 tmp10663 = _mm512_unpacklo_ps(wt293, wt294);
__m512 tmp10664 = _mm512_unpackhi_ps(wt293, wt294);
__m512 tmp10665 = _mm512_shuffle_ps(tmp10649, tmp10651, 68);
__m512 tmp10666 = _mm512_shuffle_ps(tmp10649, tmp10651, 238);
__m512 tmp10667 = _mm512_shuffle_ps(tmp10650, tmp10652, 68);
__m512 tmp10668 = _mm512_shuffle_ps(tmp10650, tmp10652, 238);
__m512 tmp10669 = _mm512_shuffle_ps(tmp10653, tmp10655, 68);
__m512 tmp10670 = _mm512_shuffle_ps(tmp10653, tmp10655, 238);
__m512 tmp10671 = _mm512_shuffle_ps(tmp10654, tmp10656, 68);
__m512 tmp10672 = _mm512_shuffle_ps(tmp10654, tmp10656, 238);
__m512 tmp10673 = _mm512_shuffle_ps(tmp10657, tmp10659, 68);
__m512 tmp10674 = _mm512_shuffle_ps(tmp10657, tmp10659, 238);
__m512 tmp10675 = _mm512_shuffle_ps(tmp10658, tmp10660, 68);
__m512 tmp10676 = _mm512_shuffle_ps(tmp10658, tmp10660, 238);
__m512 tmp10677 = _mm512_shuffle_ps(tmp10661, tmp10663, 68);
__m512 tmp10678 = _mm512_shuffle_ps(tmp10661, tmp10663, 238);
__m512 tmp10679 = _mm512_shuffle_ps(tmp10662, tmp10664, 68);
__m512 tmp10680 = _mm512_shuffle_ps(tmp10662, tmp10664, 238);
__m512 tmp10681 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 136);
__m512 tmp10682 = _mm512_shuffle_f32x4(tmp10665, tmp10669, 221);
__m512 tmp10683 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 136);
__m512 tmp10684 = _mm512_shuffle_f32x4(tmp10666, tmp10670, 221);
__m512 tmp10685 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 136);
__m512 tmp10686 = _mm512_shuffle_f32x4(tmp10667, tmp10671, 221);
__m512 tmp10687 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 136);
__m512 tmp10688 = _mm512_shuffle_f32x4(tmp10668, tmp10672, 221);
__m512 tmp10689 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 136);
__m512 tmp10690 = _mm512_shuffle_f32x4(tmp10673, tmp10677, 221);
__m512 tmp10691 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 136);
__m512 tmp10692 = _mm512_shuffle_f32x4(tmp10674, tmp10678, 221);
__m512 tmp10693 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 136);
__m512 tmp10694 = _mm512_shuffle_f32x4(tmp10675, tmp10679, 221);
__m512 tmp10695 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 136);
__m512 tmp10696 = _mm512_shuffle_f32x4(tmp10676, tmp10680, 221);
wt279 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 136);
wt287 = _mm512_shuffle_f32x4(tmp10681, tmp10689, 221);
wt280 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 136);
wt288 = _mm512_shuffle_f32x4(tmp10683, tmp10691, 221);
wt281 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 136);
wt289 = _mm512_shuffle_f32x4(tmp10685, tmp10693, 221);
wt282 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 136);
wt290 = _mm512_shuffle_f32x4(tmp10687, tmp10695, 221);
wt283 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 136);
wt291 = _mm512_shuffle_f32x4(tmp10682, tmp10690, 221);
wt284 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 136);
wt292 = _mm512_shuffle_f32x4(tmp10684, tmp10692, 221);
wt285 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 136);
wt293 = _mm512_shuffle_f32x4(tmp10686, tmp10694, 221);
wt286 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 136);
wt294 = _mm512_shuffle_f32x4(tmp10688, tmp10696, 221);
wt279 = _mm512_mul_ps(wt279, postMul28);
wt280 = _mm512_mul_ps(wt280, postMul28);
wt281 = _mm512_mul_ps(wt281, postMul28);
wt282 = _mm512_mul_ps(wt282, postMul28);
wt283 = _mm512_mul_ps(wt283, postMul28);
wt284 = _mm512_mul_ps(wt284, postMul28);
wt285 = _mm512_mul_ps(wt285, postMul28);
wt286 = _mm512_mul_ps(wt286, postMul28);
wt287 = _mm512_mul_ps(wt287, postMul28);
wt288 = _mm512_mul_ps(wt288, postMul28);
wt289 = _mm512_mul_ps(wt289, postMul28);
wt290 = _mm512_mul_ps(wt290, postMul28);
wt291 = _mm512_mul_ps(wt291, postMul28);
wt292 = _mm512_mul_ps(wt292, postMul28);
wt293 = _mm512_mul_ps(wt293, postMul28);
wt294 = _mm512_mul_ps(wt294, postMul28);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)0, 63>>cut12, wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)0, 63>>cut12, wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)0, 63>>cut12, wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)0, 63>>cut12, wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)0, 63>>cut12, wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)0, 63>>cut12, wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)0, 63>>cut12, wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)0, 63>>cut12, wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)0, 63>>cut12, wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)0, 63>>cut12, wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)0, 63>>cut12, wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)0, 63>>cut12, wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)0, 63>>cut12, wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)0, 63>>cut12, wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)0, 63>>cut12, wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)0, 63>>cut12, wt294);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)6144, 4032>>cut12, wt294);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt279);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt280);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt281);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt282);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt283);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt284);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt285);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt286);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt287);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt288);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt289);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt290);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt291);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt292);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt293);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c25)+(ptrdiff_t)12288, 65535-(4095>>cut12), wt294);
}
break;
}
default: {
cut12 = 4;
__m512 sum225 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k103);
__m512i pmMul18 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd18 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo15 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k103+256*i34));
__m512 masHi15 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k103+256*i34)+(ptrdiff_t)64);
__m512 postMul29 = _mm512_permutex2var_ps(masLo15, pmMul18, masHi15);
__m512 postAdd19 = _mm512_permutex2var_ps(masLo15, pmAdd18, masHi15);
sum225 = _mm512_fmadd_ps(sum225, postMul29, postAdd19);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)0, 63>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)6144, 4032>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)12288, 258048>>cut12, sum225);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*0+(ptrdiff_t)18432, 65535-(262143>>cut12), sum225);
ptrdiff_t c26 = 0;
for (; c26 != 16; ++c26) {
__m512 wt295 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)0);
__m512 wt296 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)1024);
__m512 wt297 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)2048);
__m512 wt298 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)3072);
__m512 wt299 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)4096);
__m512 wt300 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)5120);
__m512 wt301 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)6144);
__m512 wt302 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)7168);
__m512 wt303 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)8192);
__m512 wt304 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)9216);
__m512 wt305 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)10240);
__m512 wt306 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)11264);
__m512 wt307 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)12288);
__m512 wt308 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)13312);
__m512 wt309 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)14336);
__m512 wt310 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k103+64*c26+(ptrdiff_t)15360);
__m512 tmp10697 = _mm512_unpacklo_ps(wt295, wt296);
__m512 tmp10698 = _mm512_unpackhi_ps(wt295, wt296);
__m512 tmp10699 = _mm512_unpacklo_ps(wt297, wt298);
__m512 tmp10700 = _mm512_unpackhi_ps(wt297, wt298);
__m512 tmp10701 = _mm512_unpacklo_ps(wt299, wt300);
__m512 tmp10702 = _mm512_unpackhi_ps(wt299, wt300);
__m512 tmp10703 = _mm512_unpacklo_ps(wt301, wt302);
__m512 tmp10704 = _mm512_unpackhi_ps(wt301, wt302);
__m512 tmp10705 = _mm512_unpacklo_ps(wt303, wt304);
__m512 tmp10706 = _mm512_unpackhi_ps(wt303, wt304);
__m512 tmp10707 = _mm512_unpacklo_ps(wt305, wt306);
__m512 tmp10708 = _mm512_unpackhi_ps(wt305, wt306);
__m512 tmp10709 = _mm512_unpacklo_ps(wt307, wt308);
__m512 tmp10710 = _mm512_unpackhi_ps(wt307, wt308);
__m512 tmp10711 = _mm512_unpacklo_ps(wt309, wt310);
__m512 tmp10712 = _mm512_unpackhi_ps(wt309, wt310);
__m512 tmp10713 = _mm512_shuffle_ps(tmp10697, tmp10699, 68);
__m512 tmp10714 = _mm512_shuffle_ps(tmp10697, tmp10699, 238);
__m512 tmp10715 = _mm512_shuffle_ps(tmp10698, tmp10700, 68);
__m512 tmp10716 = _mm512_shuffle_ps(tmp10698, tmp10700, 238);
__m512 tmp10717 = _mm512_shuffle_ps(tmp10701, tmp10703, 68);
__m512 tmp10718 = _mm512_shuffle_ps(tmp10701, tmp10703, 238);
__m512 tmp10719 = _mm512_shuffle_ps(tmp10702, tmp10704, 68);
__m512 tmp10720 = _mm512_shuffle_ps(tmp10702, tmp10704, 238);
__m512 tmp10721 = _mm512_shuffle_ps(tmp10705, tmp10707, 68);
__m512 tmp10722 = _mm512_shuffle_ps(tmp10705, tmp10707, 238);
__m512 tmp10723 = _mm512_shuffle_ps(tmp10706, tmp10708, 68);
__m512 tmp10724 = _mm512_shuffle_ps(tmp10706, tmp10708, 238);
__m512 tmp10725 = _mm512_shuffle_ps(tmp10709, tmp10711, 68);
__m512 tmp10726 = _mm512_shuffle_ps(tmp10709, tmp10711, 238);
__m512 tmp10727 = _mm512_shuffle_ps(tmp10710, tmp10712, 68);
__m512 tmp10728 = _mm512_shuffle_ps(tmp10710, tmp10712, 238);
__m512 tmp10729 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 136);
__m512 tmp10730 = _mm512_shuffle_f32x4(tmp10713, tmp10717, 221);
__m512 tmp10731 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 136);
__m512 tmp10732 = _mm512_shuffle_f32x4(tmp10714, tmp10718, 221);
__m512 tmp10733 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 136);
__m512 tmp10734 = _mm512_shuffle_f32x4(tmp10715, tmp10719, 221);
__m512 tmp10735 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 136);
__m512 tmp10736 = _mm512_shuffle_f32x4(tmp10716, tmp10720, 221);
__m512 tmp10737 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 136);
__m512 tmp10738 = _mm512_shuffle_f32x4(tmp10721, tmp10725, 221);
__m512 tmp10739 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 136);
__m512 tmp10740 = _mm512_shuffle_f32x4(tmp10722, tmp10726, 221);
__m512 tmp10741 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 136);
__m512 tmp10742 = _mm512_shuffle_f32x4(tmp10723, tmp10727, 221);
__m512 tmp10743 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 136);
__m512 tmp10744 = _mm512_shuffle_f32x4(tmp10724, tmp10728, 221);
wt295 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 136);
wt303 = _mm512_shuffle_f32x4(tmp10729, tmp10737, 221);
wt296 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 136);
wt304 = _mm512_shuffle_f32x4(tmp10731, tmp10739, 221);
wt297 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 136);
wt305 = _mm512_shuffle_f32x4(tmp10733, tmp10741, 221);
wt298 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 136);
wt306 = _mm512_shuffle_f32x4(tmp10735, tmp10743, 221);
wt299 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 136);
wt307 = _mm512_shuffle_f32x4(tmp10730, tmp10738, 221);
wt300 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 136);
wt308 = _mm512_shuffle_f32x4(tmp10732, tmp10740, 221);
wt301 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 136);
wt309 = _mm512_shuffle_f32x4(tmp10734, tmp10742, 221);
wt302 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 136);
wt310 = _mm512_shuffle_f32x4(tmp10736, tmp10744, 221);
wt295 = _mm512_mul_ps(wt295, postMul29);
wt296 = _mm512_mul_ps(wt296, postMul29);
wt297 = _mm512_mul_ps(wt297, postMul29);
wt298 = _mm512_mul_ps(wt298, postMul29);
wt299 = _mm512_mul_ps(wt299, postMul29);
wt300 = _mm512_mul_ps(wt300, postMul29);
wt301 = _mm512_mul_ps(wt301, postMul29);
wt302 = _mm512_mul_ps(wt302, postMul29);
wt303 = _mm512_mul_ps(wt303, postMul29);
wt304 = _mm512_mul_ps(wt304, postMul29);
wt305 = _mm512_mul_ps(wt305, postMul29);
wt306 = _mm512_mul_ps(wt306, postMul29);
wt307 = _mm512_mul_ps(wt307, postMul29);
wt308 = _mm512_mul_ps(wt308, postMul29);
wt309 = _mm512_mul_ps(wt309, postMul29);
wt310 = _mm512_mul_ps(wt310, postMul29);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)0, 63>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)0, 63>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)0, 63>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)0, 63>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)0, 63>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)0, 63>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)0, 63>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)0, 63>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)0, 63>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)0, 63>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)0, 63>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)0, 63>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)0, 63>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)0, 63>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)0, 63>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)0, 63>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)6144, 4032>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)12288, 258048>>cut12, wt310);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(1+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt295);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(2+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt296);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(3+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt297);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(4+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt298);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(5+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt299);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(6+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt300);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(7+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt301);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(8+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt302);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(9+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt303);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(10+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt304);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(11+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt305);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(12+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt306);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(13+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt307);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(14+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt308);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(15+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt309);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l40+4*cut12+24*(16+16*c26)+(ptrdiff_t)18432, 65535-(262143>>cut12), wt310);
}
}
}
} else {
ptrdiff_t k102 = 240;
ptrdiff_t l39 = (size_t)(0+k102)/6;
ptrdiff_t cut11 = (size_t)(0+k102)%6;
__m512 sum223 = _mm512_maskz_loadu_ps(65535, biasPtr9+1024*i34+4*k102);
__m512i pmMul19 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd19 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo16 = _mm512_loadu_ps(bnPtr10+(ptrdiff_t)8*(k102+256*i34));
__m512 masHi16 = _mm512_maskz_loadu_ps(65535, bnPtr10+(ptrdiff_t)8*(k102+256*i34)+(ptrdiff_t)64);
__m512 postMul27 = _mm512_permutex2var_ps(masLo16, pmMul19, masHi16);
__m512 postAdd17 = _mm512_permutex2var_ps(masLo16, pmAdd19, masHi16);
sum223 = _mm512_fmadd_ps(sum223, postMul27, postAdd17);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*0+(ptrdiff_t)0, 63>>cut11, sum223);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*0+(ptrdiff_t)6144, 4032>>cut11, sum223);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*0+(ptrdiff_t)12288, 65535-(4095>>cut11), sum223);
ptrdiff_t c24 = 0;
for (; c24 != 16; ++c24) {
__m512 wt263 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)0);
__m512 wt264 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)1024);
__m512 wt265 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)2048);
__m512 wt266 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)3072);
__m512 wt267 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)4096);
__m512 wt268 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)5120);
__m512 wt269 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)6144);
__m512 wt270 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)7168);
__m512 wt271 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)8192);
__m512 wt272 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)9216);
__m512 wt273 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)10240);
__m512 wt274 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)11264);
__m512 wt275 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)12288);
__m512 wt276 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)13312);
__m512 wt277 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)14336);
__m512 wt278 = _mm512_maskz_loadu_ps(65535, wtPtr9+262144*i34+1024*k102+64*c24+(ptrdiff_t)15360);
__m512 tmp10745 = _mm512_unpacklo_ps(wt263, wt264);
__m512 tmp10746 = _mm512_unpackhi_ps(wt263, wt264);
__m512 tmp10747 = _mm512_unpacklo_ps(wt265, wt266);
__m512 tmp10748 = _mm512_unpackhi_ps(wt265, wt266);
__m512 tmp10749 = _mm512_unpacklo_ps(wt267, wt268);
__m512 tmp10750 = _mm512_unpackhi_ps(wt267, wt268);
__m512 tmp10751 = _mm512_unpacklo_ps(wt269, wt270);
__m512 tmp10752 = _mm512_unpackhi_ps(wt269, wt270);
__m512 tmp10753 = _mm512_unpacklo_ps(wt271, wt272);
__m512 tmp10754 = _mm512_unpackhi_ps(wt271, wt272);
__m512 tmp10755 = _mm512_unpacklo_ps(wt273, wt274);
__m512 tmp10756 = _mm512_unpackhi_ps(wt273, wt274);
__m512 tmp10757 = _mm512_unpacklo_ps(wt275, wt276);
__m512 tmp10758 = _mm512_unpackhi_ps(wt275, wt276);
__m512 tmp10759 = _mm512_unpacklo_ps(wt277, wt278);
__m512 tmp10760 = _mm512_unpackhi_ps(wt277, wt278);
__m512 tmp10761 = _mm512_shuffle_ps(tmp10745, tmp10747, 68);
__m512 tmp10762 = _mm512_shuffle_ps(tmp10745, tmp10747, 238);
__m512 tmp10763 = _mm512_shuffle_ps(tmp10746, tmp10748, 68);
__m512 tmp10764 = _mm512_shuffle_ps(tmp10746, tmp10748, 238);
__m512 tmp10765 = _mm512_shuffle_ps(tmp10749, tmp10751, 68);
__m512 tmp10766 = _mm512_shuffle_ps(tmp10749, tmp10751, 238);
__m512 tmp10767 = _mm512_shuffle_ps(tmp10750, tmp10752, 68);
__m512 tmp10768 = _mm512_shuffle_ps(tmp10750, tmp10752, 238);
__m512 tmp10769 = _mm512_shuffle_ps(tmp10753, tmp10755, 68);
__m512 tmp10770 = _mm512_shuffle_ps(tmp10753, tmp10755, 238);
__m512 tmp10771 = _mm512_shuffle_ps(tmp10754, tmp10756, 68);
__m512 tmp10772 = _mm512_shuffle_ps(tmp10754, tmp10756, 238);
__m512 tmp10773 = _mm512_shuffle_ps(tmp10757, tmp10759, 68);
__m512 tmp10774 = _mm512_shuffle_ps(tmp10757, tmp10759, 238);
__m512 tmp10775 = _mm512_shuffle_ps(tmp10758, tmp10760, 68);
__m512 tmp10776 = _mm512_shuffle_ps(tmp10758, tmp10760, 238);
__m512 tmp10777 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 136);
__m512 tmp10778 = _mm512_shuffle_f32x4(tmp10761, tmp10765, 221);
__m512 tmp10779 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 136);
__m512 tmp10780 = _mm512_shuffle_f32x4(tmp10762, tmp10766, 221);
__m512 tmp10781 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 136);
__m512 tmp10782 = _mm512_shuffle_f32x4(tmp10763, tmp10767, 221);
__m512 tmp10783 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 136);
__m512 tmp10784 = _mm512_shuffle_f32x4(tmp10764, tmp10768, 221);
__m512 tmp10785 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 136);
__m512 tmp10786 = _mm512_shuffle_f32x4(tmp10769, tmp10773, 221);
__m512 tmp10787 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 136);
__m512 tmp10788 = _mm512_shuffle_f32x4(tmp10770, tmp10774, 221);
__m512 tmp10789 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 136);
__m512 tmp10790 = _mm512_shuffle_f32x4(tmp10771, tmp10775, 221);
__m512 tmp10791 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 136);
__m512 tmp10792 = _mm512_shuffle_f32x4(tmp10772, tmp10776, 221);
wt263 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 136);
wt271 = _mm512_shuffle_f32x4(tmp10777, tmp10785, 221);
wt264 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 136);
wt272 = _mm512_shuffle_f32x4(tmp10779, tmp10787, 221);
wt265 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 136);
wt273 = _mm512_shuffle_f32x4(tmp10781, tmp10789, 221);
wt266 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 136);
wt274 = _mm512_shuffle_f32x4(tmp10783, tmp10791, 221);
wt267 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 136);
wt275 = _mm512_shuffle_f32x4(tmp10778, tmp10786, 221);
wt268 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 136);
wt276 = _mm512_shuffle_f32x4(tmp10780, tmp10788, 221);
wt269 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 136);
wt277 = _mm512_shuffle_f32x4(tmp10782, tmp10790, 221);
wt270 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 136);
wt278 = _mm512_shuffle_f32x4(tmp10784, tmp10792, 221);
wt263 = _mm512_mul_ps(wt263, postMul27);
wt264 = _mm512_mul_ps(wt264, postMul27);
wt265 = _mm512_mul_ps(wt265, postMul27);
wt266 = _mm512_mul_ps(wt266, postMul27);
wt267 = _mm512_mul_ps(wt267, postMul27);
wt268 = _mm512_mul_ps(wt268, postMul27);
wt269 = _mm512_mul_ps(wt269, postMul27);
wt270 = _mm512_mul_ps(wt270, postMul27);
wt271 = _mm512_mul_ps(wt271, postMul27);
wt272 = _mm512_mul_ps(wt272, postMul27);
wt273 = _mm512_mul_ps(wt273, postMul27);
wt274 = _mm512_mul_ps(wt274, postMul27);
wt275 = _mm512_mul_ps(wt275, postMul27);
wt276 = _mm512_mul_ps(wt276, postMul27);
wt277 = _mm512_mul_ps(wt277, postMul27);
wt278 = _mm512_mul_ps(wt278, postMul27);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(1+16*c24)+(ptrdiff_t)0, 63>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(2+16*c24)+(ptrdiff_t)0, 63>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(3+16*c24)+(ptrdiff_t)0, 63>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(4+16*c24)+(ptrdiff_t)0, 63>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(5+16*c24)+(ptrdiff_t)0, 63>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(6+16*c24)+(ptrdiff_t)0, 63>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(7+16*c24)+(ptrdiff_t)0, 63>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(8+16*c24)+(ptrdiff_t)0, 63>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(9+16*c24)+(ptrdiff_t)0, 63>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(10+16*c24)+(ptrdiff_t)0, 63>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(11+16*c24)+(ptrdiff_t)0, 63>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(12+16*c24)+(ptrdiff_t)0, 63>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(13+16*c24)+(ptrdiff_t)0, 63>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(14+16*c24)+(ptrdiff_t)0, 63>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(15+16*c24)+(ptrdiff_t)0, 63>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(16+16*c24)+(ptrdiff_t)0, 63>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(1+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(2+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(3+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(4+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(5+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(6+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(7+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(8+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(9+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(10+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(11+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(12+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(13+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(14+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(15+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+24*(16+16*c24)+(ptrdiff_t)6144, 4032>>cut11, wt278);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(1+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt263);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(2+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt264);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(3+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt265);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(4+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt266);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(5+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt267);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(6+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt268);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(7+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt269);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(8+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt270);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(9+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt271);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(10+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt272);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(11+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt273);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(12+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt274);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(13+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt275);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(14+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt276);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(15+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt277);
_mm512_mask_storeu_ps(arranged9+263168*i34+6168*l39+4*cut11+16*(16+16*c24)+(ptrdiff_t)12288, 65535-(4095>>cut11), wt278);
}
}
}
}
}

/*
 * Launches the weight-arrangement worker on the given thread team.
 *
 * team39    - thread team handed to ResNeXt50ThreaderDo1.
 * tensors51 - tensor pointer table, passed through untouched as the task
 *             payload (the worker reads it back from task->any1).
 *
 * The task describes a 3-entry hull of {8, 1, 1}; presumably each entry is
 * the extent of one coordinate the worker receives in its point array
 * (confirm against the threader implementation).
 */
static void ResNeXt50OneArrangeWts5(ResNeXt50ThreaderTeam1* team39, char** tensors51) {
	ResNeXt50ThreaderTask1 job;
	/* Iteration hull first ... */
	job.nd1 = 3;
	job.hull1[0] = 8;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	/* ... then the worker and its payload. */
	job.any1 = tensors51;
	job.callee1 = ResNeXt50OneArrangeWts5Callee1;
	ResNeXt50ThreaderDo1(team39, &job);
}

/*
 * Worker for the data-arrangement pass.
 *
 * task56 - task whose any1 field holds a char* table: entry 0 is the source
 *          buffer, entry 1 is the destination ("arranged") buffer.
 * pt33   - coordinates of this work item: pt33[0] selects a run of 128
 *          consecutive rows (128*pt33[0] .. 128*pt33[0]+127), pt33[1]
 *          selects one 256-byte column tile.
 *
 * For every selected row the routine copies one 256-byte tile (four 512-bit
 * vectors).  Source rows are 12608 bytes apart with tiles 256 bytes apart;
 * in the destination, tiles are 65536 bytes apart with rows 256 bytes apart,
 * so the copy swaps which index is the outer one.  A tile index of 49 is
 * treated as "nothing to do".
 */
static void ResNeXt50OneArrangeDats5Callee1(ResNeXt50ThreaderTask1* task56, int64_t* pt33) {
	char** bufs = task56->any1;
	const ptrdiff_t half = pt33[0];
	const ptrdiff_t tile = pt33[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	if (tile == 49) {
		return;
	}
	const ptrdiff_t rowEnd = 128*half+128;
	for (ptrdiff_t row = 128*half; row < rowEnd; ++row) {
		char* from = src+256*tile+12608*row;
		char* to = dst+65536*tile+256*row;
		__m512 v[4];
		/* Read the whole tile before writing any of it. */
		for (int q = 0; q < 4; ++q) {
			v[q] = _mm512_maskz_loadu_ps(65535, from+64*q);
		}
		for (int q = 0; q < 4; ++q) {
			_mm512_mask_storeu_ps(to+64*q, 65535, v[q]);
		}
	}
}

/*
 * Launches the data-arrangement worker on the given thread team.
 *
 * team40    - thread team handed to ResNeXt50ThreaderDo1.
 * tensors53 - tensor pointer table, forwarded as the task payload.
 *
 * The task carries a 4-entry hull of {2, 49, 1, 1}.  The worker reads the
 * first two coordinates: a 128-row half selector and a tile index.
 */
static void ResNeXt50OneArrangeDats5(ResNeXt50ThreaderTeam1* team40, char** tensors53) {
	ResNeXt50ThreaderTask1 job;
	/* Iteration hull first ... */
	job.nd1 = 4;
	job.hull1[0] = 2;
	job.hull1[1] = 49;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	/* ... then the worker and its payload. */
	job.any1 = tensors53;
	job.callee1 = ResNeXt50OneArrangeDats5Callee1;
	ResNeXt50ThreaderDo1(team40, &job);
}

/*
 * Worker for the multiply-accumulate ("apply") pass.
 *
 * task58 - task whose any1 field points at a void* pair; element 0 of that
 *          pair is a char* table with: [0] arranged weights, [1] arranged
 *          input data, [2] output buffer.
 * pt34   - coordinates of this work item: pt34[0] picks a pair of output
 *          groups (groups 2*pt34[0] and 2*pt34[0]+1), pt34[1] picks one
 *          256-byte (64-float) input/output tile.
 *
 * For each output row the routine starts from a per-row bias, adds
 * weight*data over 256 input steps, clamps the result at zero from below and
 * stores 64 floats.  Groups 0..41 hold 6 output rows each (24-byte weight
 * records); group 42 is a 4-row tail (16-byte weight records) and is handled
 * after the main loop when the loop runs up to it.  Every group occupies
 * 6168 bytes in the weight buffer, with the bias record first.
 *
 * A tile index of 49 is treated as "nothing to do".
 */
static void ResNeXt50OneApply5Callee1(ResNeXt50ThreaderTask1* task58, int64_t* pt34) {
	void** bundle = task58->any1;
	char** bufs = bundle[0];
	const ptrdiff_t tile = pt34[1];
	const ptrdiff_t slot = pt34[0];
	char*restrict wtBase = bufs[0];
	char*restrict inBase = bufs[1];
	char*restrict outBase = bufs[2];
	if (tile == 49) {
		return;
	}
	const char* inTile = inBase+65536*tile;
	char* outTile = outBase+256*tile;
	ptrdiff_t grp = 2*slot;
	/* Slots below 20 own exactly two 6-row groups; later slots run on into the tail. */
	const ptrdiff_t lastGrp = grp+(slot < 20 ? 1 : 2);
	for (; grp != 42; ++grp) {
		const char* wtGrp = wtBase+6168*grp;
		__m512 acc[6][4];
		/* Seed all four column accumulators of a row with that row's bias. */
		for (int r = 0; r < 6; ++r) {
			const __m512 bias = _mm512_set1_ps(*(const float*)(wtGrp+4*r));
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = bias;
			}
		}
		for (ptrdiff_t s = 0; s < 256; ++s) {
			const char* in = inTile+256*s;
			/* Weight records follow the 24-byte bias record. */
			const char* wtStep = wtGrp+24*s+24;
			__m512 dat[4];
			for (int c = 0; c < 4; ++c) {
				dat[c] = _mm512_loadu_ps(in+64*c);
			}
			for (int r = 0; r < 6; ++r) {
				const __m512 w = _mm512_set1_ps(*(const float*)(wtStep+4*r));
				for (int c = 0; c < 4; ++c) {
					acc[r][c] = _mm512_fmadd_ps(w, dat[c], acc[r][c]);
				}
			}
		}
		char* outGrp = outTile+75648*grp;
		for (int r = 0; r < 6; ++r) {
			for (int c = 0; c < 4; ++c) {
				/* Zero is deliberately the first operand of the max. */
				const __m512 clamped = _mm512_max_ps(_mm512_setzero_ps(), acc[r][c]);
				_mm512_mask_storeu_ps(outGrp+12608*r+64*c, 65535, clamped);
			}
		}
		if (grp >= lastGrp) {
			return;
		}
	}
	/* Tail group: only four output rows, so records are 16 bytes wide. */
	{
		const char* wtGrp = wtBase+6168*grp;
		__m512 acc[4][4];
		for (int r = 0; r < 4; ++r) {
			const __m512 bias = _mm512_set1_ps(*(const float*)(wtGrp+4*r));
			for (int c = 0; c < 4; ++c) {
				acc[r][c] = bias;
			}
		}
		for (ptrdiff_t s = 0; s < 256; ++s) {
			const char* in = inTile+256*s;
			const char* wtStep = wtGrp+16*s+16;
			__m512 dat[4];
			for (int c = 0; c < 4; ++c) {
				dat[c] = _mm512_loadu_ps(in+64*c);
			}
			for (int r = 0; r < 4; ++r) {
				const __m512 w = _mm512_set1_ps(*(const float*)(wtStep+4*r));
				for (int c = 0; c < 4; ++c) {
					acc[r][c] = _mm512_fmadd_ps(w, dat[c], acc[r][c]);
				}
			}
		}
		char* outGrp = outTile+75648*grp;
		for (int r = 0; r < 4; ++r) {
			for (int c = 0; c < 4; ++c) {
				const __m512 clamped = _mm512_max_ps(_mm512_setzero_ps(), acc[r][c]);
				_mm512_mask_storeu_ps(outGrp+12608*r+64*c, 65535, clamped);
			}
		}
	}
}

/*
 * Launch ResNeXt50OneApply5Callee1 through the threader.
 *
 * team41    - threader team handed straight to ResNeXt50ThreaderDo1.
 * tensors55 - tensor pointer table; stored in slot 0 of a two-entry
 *             context array that the task carries in its any1 field.
 *
 * The task is declared with 3 dimensions and a hull of 21 x 49 x 1.
 * NOTE(review): presumably the threader invokes the callee once per
 * point of that hull -- confirm against ResNeXt50ThreaderDo1.
 */
static void ResNeXt50OneApply5(ResNeXt50ThreaderTeam1* team41, char** tensors55) {
    /* Context passed to the callee: the tensor table, then a null slot. */
    void* ctx[2];
    ctx[0] = tensors55;
    ctx[1] = 0;

    ResNeXt50ThreaderTask1 job;
    job.callee1 = ResNeXt50OneApply5Callee1;
    job.any1 = ctx;

    /* Iteration space: three dimensions with extents 21, 49 and 1. */
    job.nd1 = 3;
    job.hull1[0] = 21;
    job.hull1[1] = 49;
    job.hull1[2] = 1;

    ResNeXt50ThreaderDo1(team41, &job);
}

static void ResNeXt50OneArrangeWts6Callee1(ResNeXt50ThreaderTask1* task68, int64_t* pt39) {
char** tensors66 = task68->any1;
ptrdiff_t b69 = pt39[0];
char*restrict wtPtr11 = tensors66[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr11 = tensors66[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr12 = tensors66[2]+(ptrdiff_t)8*512*0;
char*restrict arranged11 = tensors66[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)526336*0;
ptrdiff_t ii28 = 1;
for (ptrdiff_t i42 = 0; i42 < ii28; ++i42) {
ptrdiff_t j35 = 2*b69;
ptrdiff_t jj38 = j35+2;
for (; j35 < jj38; ++j35) {
if (j35 < 31) {
ptrdiff_t k118 = 0+16*(j35-0);
ptrdiff_t l50 = (size_t)(0+k118)/6;
ptrdiff_t cut14 = (size_t)(0+k118)%6;
switch (cut14) {
case 0:;
case 2: {
__m512 sum267 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k118);
__m512i pmMul21 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd21 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo17 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k118+512*i42));
__m512 masHi17 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k118+512*i42)+(ptrdiff_t)64);
__m512 postMul34 = _mm512_permutex2var_ps(masLo17, pmMul21, masHi17);
__m512 postAdd22 = _mm512_permutex2var_ps(masLo17, pmAdd21, masHi17);
sum267 = _mm512_fmadd_ps(sum267, postMul34, postAdd22);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum267);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)6144, 4032>>cut14, sum267);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)12288, 65535-(4095>>cut14), sum267);
ptrdiff_t c32 = 0;
for (; c32 != 16; ++c32) {
__m512 wt343 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)0);
__m512 wt344 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)1024);
__m512 wt345 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)2048);
__m512 wt346 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)3072);
__m512 wt347 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)4096);
__m512 wt348 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)5120);
__m512 wt349 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)6144);
__m512 wt350 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)7168);
__m512 wt351 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)8192);
__m512 wt352 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)9216);
__m512 wt353 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)10240);
__m512 wt354 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)11264);
__m512 wt355 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)12288);
__m512 wt356 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)13312);
__m512 wt357 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)14336);
__m512 wt358 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c32+(ptrdiff_t)15360);
__m512 tmp10793 = _mm512_unpacklo_ps(wt343, wt344);
__m512 tmp10794 = _mm512_unpackhi_ps(wt343, wt344);
__m512 tmp10795 = _mm512_unpacklo_ps(wt345, wt346);
__m512 tmp10796 = _mm512_unpackhi_ps(wt345, wt346);
__m512 tmp10797 = _mm512_unpacklo_ps(wt347, wt348);
__m512 tmp10798 = _mm512_unpackhi_ps(wt347, wt348);
__m512 tmp10799 = _mm512_unpacklo_ps(wt349, wt350);
__m512 tmp10800 = _mm512_unpackhi_ps(wt349, wt350);
__m512 tmp10801 = _mm512_unpacklo_ps(wt351, wt352);
__m512 tmp10802 = _mm512_unpackhi_ps(wt351, wt352);
__m512 tmp10803 = _mm512_unpacklo_ps(wt353, wt354);
__m512 tmp10804 = _mm512_unpackhi_ps(wt353, wt354);
__m512 tmp10805 = _mm512_unpacklo_ps(wt355, wt356);
__m512 tmp10806 = _mm512_unpackhi_ps(wt355, wt356);
__m512 tmp10807 = _mm512_unpacklo_ps(wt357, wt358);
__m512 tmp10808 = _mm512_unpackhi_ps(wt357, wt358);
__m512 tmp10809 = _mm512_shuffle_ps(tmp10793, tmp10795, 68);
__m512 tmp10810 = _mm512_shuffle_ps(tmp10793, tmp10795, 238);
__m512 tmp10811 = _mm512_shuffle_ps(tmp10794, tmp10796, 68);
__m512 tmp10812 = _mm512_shuffle_ps(tmp10794, tmp10796, 238);
__m512 tmp10813 = _mm512_shuffle_ps(tmp10797, tmp10799, 68);
__m512 tmp10814 = _mm512_shuffle_ps(tmp10797, tmp10799, 238);
__m512 tmp10815 = _mm512_shuffle_ps(tmp10798, tmp10800, 68);
__m512 tmp10816 = _mm512_shuffle_ps(tmp10798, tmp10800, 238);
__m512 tmp10817 = _mm512_shuffle_ps(tmp10801, tmp10803, 68);
__m512 tmp10818 = _mm512_shuffle_ps(tmp10801, tmp10803, 238);
__m512 tmp10819 = _mm512_shuffle_ps(tmp10802, tmp10804, 68);
__m512 tmp10820 = _mm512_shuffle_ps(tmp10802, tmp10804, 238);
__m512 tmp10821 = _mm512_shuffle_ps(tmp10805, tmp10807, 68);
__m512 tmp10822 = _mm512_shuffle_ps(tmp10805, tmp10807, 238);
__m512 tmp10823 = _mm512_shuffle_ps(tmp10806, tmp10808, 68);
__m512 tmp10824 = _mm512_shuffle_ps(tmp10806, tmp10808, 238);
__m512 tmp10825 = _mm512_shuffle_f32x4(tmp10809, tmp10813, 136);
__m512 tmp10826 = _mm512_shuffle_f32x4(tmp10809, tmp10813, 221);
__m512 tmp10827 = _mm512_shuffle_f32x4(tmp10810, tmp10814, 136);
__m512 tmp10828 = _mm512_shuffle_f32x4(tmp10810, tmp10814, 221);
__m512 tmp10829 = _mm512_shuffle_f32x4(tmp10811, tmp10815, 136);
__m512 tmp10830 = _mm512_shuffle_f32x4(tmp10811, tmp10815, 221);
__m512 tmp10831 = _mm512_shuffle_f32x4(tmp10812, tmp10816, 136);
__m512 tmp10832 = _mm512_shuffle_f32x4(tmp10812, tmp10816, 221);
__m512 tmp10833 = _mm512_shuffle_f32x4(tmp10817, tmp10821, 136);
__m512 tmp10834 = _mm512_shuffle_f32x4(tmp10817, tmp10821, 221);
__m512 tmp10835 = _mm512_shuffle_f32x4(tmp10818, tmp10822, 136);
__m512 tmp10836 = _mm512_shuffle_f32x4(tmp10818, tmp10822, 221);
__m512 tmp10837 = _mm512_shuffle_f32x4(tmp10819, tmp10823, 136);
__m512 tmp10838 = _mm512_shuffle_f32x4(tmp10819, tmp10823, 221);
__m512 tmp10839 = _mm512_shuffle_f32x4(tmp10820, tmp10824, 136);
__m512 tmp10840 = _mm512_shuffle_f32x4(tmp10820, tmp10824, 221);
wt343 = _mm512_shuffle_f32x4(tmp10825, tmp10833, 136);
wt351 = _mm512_shuffle_f32x4(tmp10825, tmp10833, 221);
wt344 = _mm512_shuffle_f32x4(tmp10827, tmp10835, 136);
wt352 = _mm512_shuffle_f32x4(tmp10827, tmp10835, 221);
wt345 = _mm512_shuffle_f32x4(tmp10829, tmp10837, 136);
wt353 = _mm512_shuffle_f32x4(tmp10829, tmp10837, 221);
wt346 = _mm512_shuffle_f32x4(tmp10831, tmp10839, 136);
wt354 = _mm512_shuffle_f32x4(tmp10831, tmp10839, 221);
wt347 = _mm512_shuffle_f32x4(tmp10826, tmp10834, 136);
wt355 = _mm512_shuffle_f32x4(tmp10826, tmp10834, 221);
wt348 = _mm512_shuffle_f32x4(tmp10828, tmp10836, 136);
wt356 = _mm512_shuffle_f32x4(tmp10828, tmp10836, 221);
wt349 = _mm512_shuffle_f32x4(tmp10830, tmp10838, 136);
wt357 = _mm512_shuffle_f32x4(tmp10830, tmp10838, 221);
wt350 = _mm512_shuffle_f32x4(tmp10832, tmp10840, 136);
wt358 = _mm512_shuffle_f32x4(tmp10832, tmp10840, 221);
wt343 = _mm512_mul_ps(wt343, postMul34);
wt344 = _mm512_mul_ps(wt344, postMul34);
wt345 = _mm512_mul_ps(wt345, postMul34);
wt346 = _mm512_mul_ps(wt346, postMul34);
wt347 = _mm512_mul_ps(wt347, postMul34);
wt348 = _mm512_mul_ps(wt348, postMul34);
wt349 = _mm512_mul_ps(wt349, postMul34);
wt350 = _mm512_mul_ps(wt350, postMul34);
wt351 = _mm512_mul_ps(wt351, postMul34);
wt352 = _mm512_mul_ps(wt352, postMul34);
wt353 = _mm512_mul_ps(wt353, postMul34);
wt354 = _mm512_mul_ps(wt354, postMul34);
wt355 = _mm512_mul_ps(wt355, postMul34);
wt356 = _mm512_mul_ps(wt356, postMul34);
wt357 = _mm512_mul_ps(wt357, postMul34);
wt358 = _mm512_mul_ps(wt358, postMul34);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)0, 63>>cut14, wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)0, 63>>cut14, wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)0, 63>>cut14, wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)0, 63>>cut14, wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)0, 63>>cut14, wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)0, 63>>cut14, wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)0, 63>>cut14, wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)0, 63>>cut14, wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)0, 63>>cut14, wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)0, 63>>cut14, wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)0, 63>>cut14, wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)0, 63>>cut14, wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)0, 63>>cut14, wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)0, 63>>cut14, wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)0, 63>>cut14, wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)0, 63>>cut14, wt358);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)6144, 4032>>cut14, wt358);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt343);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt344);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt345);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt346);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt347);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt348);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt349);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt350);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt351);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt352);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt353);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt354);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt355);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt356);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt357);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c32)+(ptrdiff_t)12288, 65535-(4095>>cut14), wt358);
}
break;
}
default: {
cut14 = 4;
__m512 sum268 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k118);
__m512i pmMul22 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd22 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo18 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k118+512*i42));
__m512 masHi18 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k118+512*i42)+(ptrdiff_t)64);
__m512 postMul35 = _mm512_permutex2var_ps(masLo18, pmMul22, masHi18);
__m512 postAdd23 = _mm512_permutex2var_ps(masLo18, pmAdd22, masHi18);
sum268 = _mm512_fmadd_ps(sum268, postMul35, postAdd23);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)0, 63>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)6144, 4032>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)12288, 258048>>cut14, sum268);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*0+(ptrdiff_t)18432, 65535-(262143>>cut14), sum268);
ptrdiff_t c33 = 0;
for (; c33 != 16; ++c33) {
__m512 wt359 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)0);
__m512 wt360 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)1024);
__m512 wt361 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)2048);
__m512 wt362 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)3072);
__m512 wt363 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)4096);
__m512 wt364 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)5120);
__m512 wt365 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)6144);
__m512 wt366 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)7168);
__m512 wt367 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)8192);
__m512 wt368 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)9216);
__m512 wt369 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)10240);
__m512 wt370 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)11264);
__m512 wt371 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)12288);
__m512 wt372 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)13312);
__m512 wt373 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)14336);
__m512 wt374 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k118+64*c33+(ptrdiff_t)15360);
__m512 tmp10841 = _mm512_unpacklo_ps(wt359, wt360);
__m512 tmp10842 = _mm512_unpackhi_ps(wt359, wt360);
__m512 tmp10843 = _mm512_unpacklo_ps(wt361, wt362);
__m512 tmp10844 = _mm512_unpackhi_ps(wt361, wt362);
__m512 tmp10845 = _mm512_unpacklo_ps(wt363, wt364);
__m512 tmp10846 = _mm512_unpackhi_ps(wt363, wt364);
__m512 tmp10847 = _mm512_unpacklo_ps(wt365, wt366);
__m512 tmp10848 = _mm512_unpackhi_ps(wt365, wt366);
__m512 tmp10849 = _mm512_unpacklo_ps(wt367, wt368);
__m512 tmp10850 = _mm512_unpackhi_ps(wt367, wt368);
__m512 tmp10851 = _mm512_unpacklo_ps(wt369, wt370);
__m512 tmp10852 = _mm512_unpackhi_ps(wt369, wt370);
__m512 tmp10853 = _mm512_unpacklo_ps(wt371, wt372);
__m512 tmp10854 = _mm512_unpackhi_ps(wt371, wt372);
__m512 tmp10855 = _mm512_unpacklo_ps(wt373, wt374);
__m512 tmp10856 = _mm512_unpackhi_ps(wt373, wt374);
__m512 tmp10857 = _mm512_shuffle_ps(tmp10841, tmp10843, 68);
__m512 tmp10858 = _mm512_shuffle_ps(tmp10841, tmp10843, 238);
__m512 tmp10859 = _mm512_shuffle_ps(tmp10842, tmp10844, 68);
__m512 tmp10860 = _mm512_shuffle_ps(tmp10842, tmp10844, 238);
__m512 tmp10861 = _mm512_shuffle_ps(tmp10845, tmp10847, 68);
__m512 tmp10862 = _mm512_shuffle_ps(tmp10845, tmp10847, 238);
__m512 tmp10863 = _mm512_shuffle_ps(tmp10846, tmp10848, 68);
__m512 tmp10864 = _mm512_shuffle_ps(tmp10846, tmp10848, 238);
__m512 tmp10865 = _mm512_shuffle_ps(tmp10849, tmp10851, 68);
__m512 tmp10866 = _mm512_shuffle_ps(tmp10849, tmp10851, 238);
__m512 tmp10867 = _mm512_shuffle_ps(tmp10850, tmp10852, 68);
__m512 tmp10868 = _mm512_shuffle_ps(tmp10850, tmp10852, 238);
__m512 tmp10869 = _mm512_shuffle_ps(tmp10853, tmp10855, 68);
__m512 tmp10870 = _mm512_shuffle_ps(tmp10853, tmp10855, 238);
__m512 tmp10871 = _mm512_shuffle_ps(tmp10854, tmp10856, 68);
__m512 tmp10872 = _mm512_shuffle_ps(tmp10854, tmp10856, 238);
__m512 tmp10873 = _mm512_shuffle_f32x4(tmp10857, tmp10861, 136);
__m512 tmp10874 = _mm512_shuffle_f32x4(tmp10857, tmp10861, 221);
__m512 tmp10875 = _mm512_shuffle_f32x4(tmp10858, tmp10862, 136);
__m512 tmp10876 = _mm512_shuffle_f32x4(tmp10858, tmp10862, 221);
__m512 tmp10877 = _mm512_shuffle_f32x4(tmp10859, tmp10863, 136);
__m512 tmp10878 = _mm512_shuffle_f32x4(tmp10859, tmp10863, 221);
__m512 tmp10879 = _mm512_shuffle_f32x4(tmp10860, tmp10864, 136);
__m512 tmp10880 = _mm512_shuffle_f32x4(tmp10860, tmp10864, 221);
__m512 tmp10881 = _mm512_shuffle_f32x4(tmp10865, tmp10869, 136);
__m512 tmp10882 = _mm512_shuffle_f32x4(tmp10865, tmp10869, 221);
__m512 tmp10883 = _mm512_shuffle_f32x4(tmp10866, tmp10870, 136);
__m512 tmp10884 = _mm512_shuffle_f32x4(tmp10866, tmp10870, 221);
__m512 tmp10885 = _mm512_shuffle_f32x4(tmp10867, tmp10871, 136);
__m512 tmp10886 = _mm512_shuffle_f32x4(tmp10867, tmp10871, 221);
__m512 tmp10887 = _mm512_shuffle_f32x4(tmp10868, tmp10872, 136);
__m512 tmp10888 = _mm512_shuffle_f32x4(tmp10868, tmp10872, 221);
wt359 = _mm512_shuffle_f32x4(tmp10873, tmp10881, 136);
wt367 = _mm512_shuffle_f32x4(tmp10873, tmp10881, 221);
wt360 = _mm512_shuffle_f32x4(tmp10875, tmp10883, 136);
wt368 = _mm512_shuffle_f32x4(tmp10875, tmp10883, 221);
wt361 = _mm512_shuffle_f32x4(tmp10877, tmp10885, 136);
wt369 = _mm512_shuffle_f32x4(tmp10877, tmp10885, 221);
wt362 = _mm512_shuffle_f32x4(tmp10879, tmp10887, 136);
wt370 = _mm512_shuffle_f32x4(tmp10879, tmp10887, 221);
wt363 = _mm512_shuffle_f32x4(tmp10874, tmp10882, 136);
wt371 = _mm512_shuffle_f32x4(tmp10874, tmp10882, 221);
wt364 = _mm512_shuffle_f32x4(tmp10876, tmp10884, 136);
wt372 = _mm512_shuffle_f32x4(tmp10876, tmp10884, 221);
wt365 = _mm512_shuffle_f32x4(tmp10878, tmp10886, 136);
wt373 = _mm512_shuffle_f32x4(tmp10878, tmp10886, 221);
wt366 = _mm512_shuffle_f32x4(tmp10880, tmp10888, 136);
wt374 = _mm512_shuffle_f32x4(tmp10880, tmp10888, 221);
wt359 = _mm512_mul_ps(wt359, postMul35);
wt360 = _mm512_mul_ps(wt360, postMul35);
wt361 = _mm512_mul_ps(wt361, postMul35);
wt362 = _mm512_mul_ps(wt362, postMul35);
wt363 = _mm512_mul_ps(wt363, postMul35);
wt364 = _mm512_mul_ps(wt364, postMul35);
wt365 = _mm512_mul_ps(wt365, postMul35);
wt366 = _mm512_mul_ps(wt366, postMul35);
wt367 = _mm512_mul_ps(wt367, postMul35);
wt368 = _mm512_mul_ps(wt368, postMul35);
wt369 = _mm512_mul_ps(wt369, postMul35);
wt370 = _mm512_mul_ps(wt370, postMul35);
wt371 = _mm512_mul_ps(wt371, postMul35);
wt372 = _mm512_mul_ps(wt372, postMul35);
wt373 = _mm512_mul_ps(wt373, postMul35);
wt374 = _mm512_mul_ps(wt374, postMul35);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)0, 63>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)0, 63>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)0, 63>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)0, 63>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)0, 63>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)0, 63>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)0, 63>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)0, 63>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)0, 63>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)0, 63>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)0, 63>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)0, 63>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)0, 63>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)0, 63>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)0, 63>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)0, 63>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)6144, 4032>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)12288, 258048>>cut14, wt374);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(1+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt359);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(2+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt360);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(3+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt361);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(4+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt362);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(5+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt363);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(6+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt364);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(7+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt365);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(8+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt366);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(9+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt367);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(10+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt368);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(11+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt369);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(12+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt370);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(13+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt371);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(14+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt372);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(15+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt373);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l50+4*cut14+24*(16+16*c33)+(ptrdiff_t)18432, 65535-(262143>>cut14), wt374);
}
}
}
} else {
ptrdiff_t k117 = 496;
ptrdiff_t l49 = (size_t)(0+k117)/6;
ptrdiff_t cut13 = (size_t)(0+k117)%6;
__m512 sum266 = _mm512_maskz_loadu_ps(65535, biasPtr11+2048*i42+4*k117);
__m512i pmMul23 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd23 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo19 = _mm512_loadu_ps(bnPtr12+(ptrdiff_t)8*(k117+512*i42));
__m512 masHi19 = _mm512_maskz_loadu_ps(65535, bnPtr12+(ptrdiff_t)8*(k117+512*i42)+(ptrdiff_t)64);
__m512 postMul33 = _mm512_permutex2var_ps(masLo19, pmMul23, masHi19);
__m512 postAdd21 = _mm512_permutex2var_ps(masLo19, pmAdd23, masHi19);
sum266 = _mm512_fmadd_ps(sum266, postMul33, postAdd21);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)0, 63>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)6144, 4032>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*0+(ptrdiff_t)12288, 258048>>cut13, sum266);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*0+(ptrdiff_t)18432, 65535-(262143>>cut13), sum266);
ptrdiff_t c31 = 0;
for (; c31 != 16; ++c31) {
__m512 wt327 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)0);
__m512 wt328 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)1024);
__m512 wt329 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)2048);
__m512 wt330 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)3072);
__m512 wt331 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)4096);
__m512 wt332 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)5120);
__m512 wt333 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)6144);
__m512 wt334 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)7168);
__m512 wt335 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)8192);
__m512 wt336 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)9216);
__m512 wt337 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)10240);
__m512 wt338 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)11264);
__m512 wt339 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)12288);
__m512 wt340 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)13312);
__m512 wt341 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)14336);
__m512 wt342 = _mm512_maskz_loadu_ps(65535, wtPtr11+524288*i42+1024*k117+64*c31+(ptrdiff_t)15360);
__m512 tmp10889 = _mm512_unpacklo_ps(wt327, wt328);
__m512 tmp10890 = _mm512_unpackhi_ps(wt327, wt328);
__m512 tmp10891 = _mm512_unpacklo_ps(wt329, wt330);
__m512 tmp10892 = _mm512_unpackhi_ps(wt329, wt330);
__m512 tmp10893 = _mm512_unpacklo_ps(wt331, wt332);
__m512 tmp10894 = _mm512_unpackhi_ps(wt331, wt332);
__m512 tmp10895 = _mm512_unpacklo_ps(wt333, wt334);
__m512 tmp10896 = _mm512_unpackhi_ps(wt333, wt334);
__m512 tmp10897 = _mm512_unpacklo_ps(wt335, wt336);
__m512 tmp10898 = _mm512_unpackhi_ps(wt335, wt336);
__m512 tmp10899 = _mm512_unpacklo_ps(wt337, wt338);
__m512 tmp10900 = _mm512_unpackhi_ps(wt337, wt338);
__m512 tmp10901 = _mm512_unpacklo_ps(wt339, wt340);
__m512 tmp10902 = _mm512_unpackhi_ps(wt339, wt340);
__m512 tmp10903 = _mm512_unpacklo_ps(wt341, wt342);
__m512 tmp10904 = _mm512_unpackhi_ps(wt341, wt342);
__m512 tmp10905 = _mm512_shuffle_ps(tmp10889, tmp10891, 68);
__m512 tmp10906 = _mm512_shuffle_ps(tmp10889, tmp10891, 238);
__m512 tmp10907 = _mm512_shuffle_ps(tmp10890, tmp10892, 68);
__m512 tmp10908 = _mm512_shuffle_ps(tmp10890, tmp10892, 238);
__m512 tmp10909 = _mm512_shuffle_ps(tmp10893, tmp10895, 68);
__m512 tmp10910 = _mm512_shuffle_ps(tmp10893, tmp10895, 238);
__m512 tmp10911 = _mm512_shuffle_ps(tmp10894, tmp10896, 68);
__m512 tmp10912 = _mm512_shuffle_ps(tmp10894, tmp10896, 238);
__m512 tmp10913 = _mm512_shuffle_ps(tmp10897, tmp10899, 68);
__m512 tmp10914 = _mm512_shuffle_ps(tmp10897, tmp10899, 238);
__m512 tmp10915 = _mm512_shuffle_ps(tmp10898, tmp10900, 68);
__m512 tmp10916 = _mm512_shuffle_ps(tmp10898, tmp10900, 238);
__m512 tmp10917 = _mm512_shuffle_ps(tmp10901, tmp10903, 68);
__m512 tmp10918 = _mm512_shuffle_ps(tmp10901, tmp10903, 238);
__m512 tmp10919 = _mm512_shuffle_ps(tmp10902, tmp10904, 68);
__m512 tmp10920 = _mm512_shuffle_ps(tmp10902, tmp10904, 238);
__m512 tmp10921 = _mm512_shuffle_f32x4(tmp10905, tmp10909, 136);
__m512 tmp10922 = _mm512_shuffle_f32x4(tmp10905, tmp10909, 221);
__m512 tmp10923 = _mm512_shuffle_f32x4(tmp10906, tmp10910, 136);
__m512 tmp10924 = _mm512_shuffle_f32x4(tmp10906, tmp10910, 221);
__m512 tmp10925 = _mm512_shuffle_f32x4(tmp10907, tmp10911, 136);
__m512 tmp10926 = _mm512_shuffle_f32x4(tmp10907, tmp10911, 221);
__m512 tmp10927 = _mm512_shuffle_f32x4(tmp10908, tmp10912, 136);
__m512 tmp10928 = _mm512_shuffle_f32x4(tmp10908, tmp10912, 221);
__m512 tmp10929 = _mm512_shuffle_f32x4(tmp10913, tmp10917, 136);
__m512 tmp10930 = _mm512_shuffle_f32x4(tmp10913, tmp10917, 221);
__m512 tmp10931 = _mm512_shuffle_f32x4(tmp10914, tmp10918, 136);
__m512 tmp10932 = _mm512_shuffle_f32x4(tmp10914, tmp10918, 221);
__m512 tmp10933 = _mm512_shuffle_f32x4(tmp10915, tmp10919, 136);
__m512 tmp10934 = _mm512_shuffle_f32x4(tmp10915, tmp10919, 221);
__m512 tmp10935 = _mm512_shuffle_f32x4(tmp10916, tmp10920, 136);
__m512 tmp10936 = _mm512_shuffle_f32x4(tmp10916, tmp10920, 221);
wt327 = _mm512_shuffle_f32x4(tmp10921, tmp10929, 136);
wt335 = _mm512_shuffle_f32x4(tmp10921, tmp10929, 221);
wt328 = _mm512_shuffle_f32x4(tmp10923, tmp10931, 136);
wt336 = _mm512_shuffle_f32x4(tmp10923, tmp10931, 221);
wt329 = _mm512_shuffle_f32x4(tmp10925, tmp10933, 136);
wt337 = _mm512_shuffle_f32x4(tmp10925, tmp10933, 221);
wt330 = _mm512_shuffle_f32x4(tmp10927, tmp10935, 136);
wt338 = _mm512_shuffle_f32x4(tmp10927, tmp10935, 221);
wt331 = _mm512_shuffle_f32x4(tmp10922, tmp10930, 136);
wt339 = _mm512_shuffle_f32x4(tmp10922, tmp10930, 221);
wt332 = _mm512_shuffle_f32x4(tmp10924, tmp10932, 136);
wt340 = _mm512_shuffle_f32x4(tmp10924, tmp10932, 221);
wt333 = _mm512_shuffle_f32x4(tmp10926, tmp10934, 136);
wt341 = _mm512_shuffle_f32x4(tmp10926, tmp10934, 221);
wt334 = _mm512_shuffle_f32x4(tmp10928, tmp10936, 136);
wt342 = _mm512_shuffle_f32x4(tmp10928, tmp10936, 221);
wt327 = _mm512_mul_ps(wt327, postMul33);
wt328 = _mm512_mul_ps(wt328, postMul33);
wt329 = _mm512_mul_ps(wt329, postMul33);
wt330 = _mm512_mul_ps(wt330, postMul33);
wt331 = _mm512_mul_ps(wt331, postMul33);
wt332 = _mm512_mul_ps(wt332, postMul33);
wt333 = _mm512_mul_ps(wt333, postMul33);
wt334 = _mm512_mul_ps(wt334, postMul33);
wt335 = _mm512_mul_ps(wt335, postMul33);
wt336 = _mm512_mul_ps(wt336, postMul33);
wt337 = _mm512_mul_ps(wt337, postMul33);
wt338 = _mm512_mul_ps(wt338, postMul33);
wt339 = _mm512_mul_ps(wt339, postMul33);
wt340 = _mm512_mul_ps(wt340, postMul33);
wt341 = _mm512_mul_ps(wt341, postMul33);
wt342 = _mm512_mul_ps(wt342, postMul33);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)0, 63>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)0, 63>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)0, 63>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)0, 63>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)0, 63>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)0, 63>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)0, 63>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)0, 63>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)0, 63>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)0, 63>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)0, 63>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)0, 63>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)0, 63>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)0, 63>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)0, 63>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)0, 63>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)6144, 4032>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(1+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(2+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(3+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(4+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(5+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(6+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(7+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(8+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(9+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(10+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(11+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(12+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(13+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(14+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(15+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+24*(16+16*c31)+(ptrdiff_t)12288, 258048>>cut13, wt342);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(1+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt327);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(2+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt328);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(3+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt329);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(4+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt330);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(5+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt331);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(6+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt332);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(7+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt333);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(8+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt334);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(9+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt335);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(10+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt336);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(11+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt337);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(12+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt338);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(13+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt339);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(14+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt340);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(15+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt341);
_mm512_mask_storeu_ps(arranged11+526336*i42+6168*l49+4*cut13+8*(16+16*c31)+(ptrdiff_t)18432, 65535-(262143>>cut13), wt342);
}
}
}
}
}

/*
 * Builds a task descriptor for ResNeXt50OneArrangeWts6Callee1 and hands it to
 * ResNeXt50ThreaderDo1 together with the given team.
 *
 * team46    - thread team passed straight through to ResNeXt50ThreaderDo1.
 * tensors65 - tensor pointer table; stored in the task's any1 field, which is
 *             where the callee picks it up.
 *
 * The task declares nd1 = 3 dimensions with hull1 = {16, 1, 1}
 * (presumably the per-dimension extents of the coordinates given to the
 * callee - confirm against the threader implementation).
 */
static void ResNeXt50OneArrangeWts6(ResNeXt50ThreaderTeam1* team46, char** tensors65) {
    ResNeXt50ThreaderTask1 job;

    /* Shape of the work: three dimensions, only the first is wider than 1. */
    job.nd1 = 3;
    job.hull1[0] = 16;
    job.hull1[1] = 1;
    job.hull1[2] = 1;

    /* Work function and its opaque argument. */
    job.any1 = tensors65;
    job.callee1 = ResNeXt50OneArrangeWts6Callee1;

    ResNeXt50ThreaderDo1(team46, &job);
}

/*
 * Worker for ResNeXt50OneArrangeDats6: copies one tile from the source buffer
 * (entry 0 of the tensor table in task70->any1) into the arranged buffer
 * (entry 1 of that table).
 *
 * pt40[0] selects a run of 128 consecutive indices k, starting at 128*pt40[0].
 *         Successive k are 3136 bytes apart in the source.
 * pt40[1] selects a column block, located at 256*block bytes in the source and
 *         65536*block bytes in the destination.
 *
 * Any block other than 12 is copied as four 64-byte vectors per k, with a
 * 256-byte destination stride. Block 12 is the narrow case: one 64-byte vector
 * per k, with a 64-byte destination stride.
 */
static void ResNeXt50OneArrangeDats6Callee1(ResNeXt50ThreaderTask1* task70, int64_t* pt40) {
    char** bufs = task70->any1;
    const ptrdiff_t run = pt40[0];
    const ptrdiff_t block = pt40[1];

    /* Base addresses of the selected column block in each buffer. */
    char*restrict src = bufs[0]+256*block;
    char*restrict dst = bufs[1]+65536*block;

    const ptrdiff_t kBegin = 128*run;
    const ptrdiff_t kEnd = kBegin+128;

    if (block != 12) {
        /* Wide block: four full vectors per k. */
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            const char* in = src+3136*k;
            char* out = dst+256*k;
            __m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
            __m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
            __m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
            __m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
        }
    } else {
        /* Narrow block: a single vector per k, packed at a 64-byte stride. */
        for (ptrdiff_t k = kBegin; k < kEnd; ++k) {
            __m512 v = _mm512_maskz_loadu_ps(65535, src+3136*k+(ptrdiff_t)0);
            _mm512_mask_storeu_ps(dst+64*k+(ptrdiff_t)0, 65535, v);
        }
    }
}

/*
 * Builds a task descriptor for ResNeXt50OneArrangeDats6Callee1 and hands it to
 * ResNeXt50ThreaderDo1 together with the given team.
 *
 * team47    - thread team passed straight through to ResNeXt50ThreaderDo1.
 * tensors67 - tensor pointer table; stored in the task's any1 field, which is
 *             where the callee picks it up.
 *
 * The task declares nd1 = 4 dimensions with hull1 = {2, 13, 1, 1}. The callee
 * reads only the first two coordinates: a 128-wide run selector and a column
 * block selector, where block 12 is its narrow case.
 */
static void ResNeXt50OneArrangeDats6(ResNeXt50ThreaderTeam1* team47, char** tensors67) {
    ResNeXt50ThreaderTask1 job;

    /* Shape of the work: 2 x 13, padded with two unit dimensions. */
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 13;
    job.hull1[2] = 1;
    job.hull1[3] = 1;

    /* Work function and its opaque argument. */
    job.any1 = tensors67;
    job.callee1 = ResNeXt50OneArrangeDats6Callee1;

    ResNeXt50ThreaderDo1(team47, &job);
}

// Worker for ResNeXt50OneApply6. For one tile it computes
//   out = max(0, init + sum over s in [0,256) of wt[s]*dat[s] + addend)
// where init and wt come from the arranged weights (tensors70[0]), dat from the
// arranged data (tensors70[1]), addend is loaded from tensors70[2], and the
// result is stored to tensors70[3].
//
// pt41[0] (w54) selects a pair of weight groups k = 2*w54 and 2*w54+1. A group
//   occupies 6168 bytes: a 24-byte slot read as the initial accumulator values
//   (presumably the folded bias - confirm against the weight arranger),
//   followed by 256 slots of 24 bytes (6 floats, one per output row).
//   Group 85 is a remainder with 8-byte slots (2 output rows).
// pt41[1] (d14) selects a column block. Blocks other than 12 are four vectors
//   (256 bytes) wide; block 12 is one vector (64 bytes) wide.
//
// Every path returns after handling a single column block, so one call
// produces at most one (column block, group pair) tile.
static void ResNeXt50OneApply6Callee1(ResNeXt50ThreaderTask1* task72, int64_t* pt41) {
void** pair16 = task72->any1;
// Only the first element of the pair is read in this function.
char** tensors70 = pair16[0];
// e21 and g23 are fixed at 0, so the offsets built from them below vanish.
ptrdiff_t e21 = 0;
ptrdiff_t g23 = 0;
ptrdiff_t d14 = pt41[1];
ptrdiff_t w54 = pt41[0];
char*restrict arrangedWts6 = tensors70[0]+1712128*e21+(ptrdiff_t)526336*1*g23;
char*restrict arrangedDats6 = tensors70[1]+2618560*e21+(ptrdiff_t)802816*1*g23;
// datPtr21 is only loaded from (the addend); datPtr22 is only stored to (the output).
char*restrict datPtr21 = tensors70[2]+(ptrdiff_t)1605632*1*g23;
char*restrict datPtr22 = tensors70[3]+(ptrdiff_t)1605632*1*g23;
ptrdiff_t ii30 = 1;
// Single-trip loop: i44 is always 0.
for (ptrdiff_t i44 = 0; i44 < ii30; ++i44) {
ptrdiff_t j37 = 1*d14;
// jj40 == j37 on entry, so the "j37 >= jj40" check at the bottom of this loop
// always fires: the wide body runs at most once, and not at all when d14 == 12.
ptrdiff_t jj40 = j37+0;
for (; j37 != 12; ++j37) {
// ---- Wide column block (four vectors per row) ----
ptrdiff_t k121 = 2*w54;
// kk39 marks the second group of the pair; the loop returns after finishing it.
ptrdiff_t kk39 = k121+1;
for (; k121 != 85; ++k121) {
// With s36 == -1 the address 24*s36+24 resolves to the start of the group,
// i.e. the slot holding the six initial accumulator values.
ptrdiff_t s36 = -1;
__m512 sum269 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)24));
__m512 sum273 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)28));
__m512 sum277 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)32));
__m512 sum281 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)36));
__m512 sum285 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)40));
__m512 sum289 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)44));
// 6 rows x 4 vectors = 24 accumulators; each row's four vectors start from
// the same broadcast initial value.
__m512 sum270 = sum269;
__m512 sum271 = sum269;
__m512 sum272 = sum269;
__m512 sum274 = sum273;
__m512 sum275 = sum273;
__m512 sum276 = sum273;
__m512 sum278 = sum277;
__m512 sum279 = sum277;
__m512 sum280 = sum277;
__m512 sum282 = sum281;
__m512 sum283 = sum281;
__m512 sum284 = sum281;
__m512 sum286 = sum285;
__m512 sum287 = sum285;
__m512 sum288 = sum285;
__m512 sum290 = sum289;
__m512 sum291 = sum289;
__m512 sum292 = sum289;
// Reduction over the 256 slots: load four data vectors once, then for each
// of the six rows broadcast its weight and accumulate into that row's four sums.
for (s36 = 0; s36 < 256; ++s36) {
__m512 dat2008 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)0);
__m512 dat2009 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)64);
__m512 dat2010 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)128);
__m512 dat2011 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s36+(ptrdiff_t)192);
__m512 wt375 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)24));
sum269 = _mm512_fmadd_ps(wt375, dat2008, sum269);
sum270 = _mm512_fmadd_ps(wt375, dat2009, sum270);
sum271 = _mm512_fmadd_ps(wt375, dat2010, sum271);
sum272 = _mm512_fmadd_ps(wt375, dat2011, sum272);
__m512 wt376 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)28));
sum273 = _mm512_fmadd_ps(wt376, dat2008, sum273);
sum274 = _mm512_fmadd_ps(wt376, dat2009, sum274);
sum275 = _mm512_fmadd_ps(wt376, dat2010, sum275);
sum276 = _mm512_fmadd_ps(wt376, dat2011, sum276);
__m512 wt377 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)32));
sum277 = _mm512_fmadd_ps(wt377, dat2008, sum277);
sum278 = _mm512_fmadd_ps(wt377, dat2009, sum278);
sum279 = _mm512_fmadd_ps(wt377, dat2010, sum279);
sum280 = _mm512_fmadd_ps(wt377, dat2011, sum280);
__m512 wt378 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)36));
sum281 = _mm512_fmadd_ps(wt378, dat2008, sum281);
sum282 = _mm512_fmadd_ps(wt378, dat2009, sum282);
sum283 = _mm512_fmadd_ps(wt378, dat2010, sum283);
sum284 = _mm512_fmadd_ps(wt378, dat2011, sum284);
__m512 wt379 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)40));
sum285 = _mm512_fmadd_ps(wt379, dat2008, sum285);
sum286 = _mm512_fmadd_ps(wt379, dat2009, sum286);
sum287 = _mm512_fmadd_ps(wt379, dat2010, sum287);
sum288 = _mm512_fmadd_ps(wt379, dat2011, sum288);
__m512 wt380 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+24*s36+(ptrdiff_t)44));
sum289 = _mm512_fmadd_ps(wt380, dat2008, sum289);
sum290 = _mm512_fmadd_ps(wt380, dat2009, sum290);
sum291 = _mm512_fmadd_ps(wt380, dat2010, sum291);
sum292 = _mm512_fmadd_ps(wt380, dat2011, sum292);
}
// Epilogue, one output row at a time (rows are 3136 bytes apart, groups
// 18816 bytes apart): add the vector loaded from datPtr21, clamp negatives to
// zero, store to the same offset in datPtr22.
// Row 0.
sum269 = _mm512_add_ps(sum269, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0));
sum270 = _mm512_add_ps(sum270, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64));
sum271 = _mm512_add_ps(sum271, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128));
sum272 = _mm512_add_ps(sum272, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192));
sum269 = _mm512_max_ps(_mm512_setzero_ps(), sum269);
sum270 = _mm512_max_ps(_mm512_setzero_ps(), sum270);
sum271 = _mm512_max_ps(_mm512_setzero_ps(), sum271);
sum272 = _mm512_max_ps(_mm512_setzero_ps(), sum272);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0, 65535, sum269);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64, 65535, sum270);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128, 65535, sum271);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192, 65535, sum272);
// Row 1.
sum273 = _mm512_add_ps(sum273, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136));
sum274 = _mm512_add_ps(sum274, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200));
sum275 = _mm512_add_ps(sum275, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264));
sum276 = _mm512_add_ps(sum276, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328));
sum273 = _mm512_max_ps(_mm512_setzero_ps(), sum273);
sum274 = _mm512_max_ps(_mm512_setzero_ps(), sum274);
sum275 = _mm512_max_ps(_mm512_setzero_ps(), sum275);
sum276 = _mm512_max_ps(_mm512_setzero_ps(), sum276);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136, 65535, sum273);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200, 65535, sum274);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264, 65535, sum275);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328, 65535, sum276);
// Row 2.
sum277 = _mm512_add_ps(sum277, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6272));
sum278 = _mm512_add_ps(sum278, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6336));
sum279 = _mm512_add_ps(sum279, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6400));
sum280 = _mm512_add_ps(sum280, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6464));
sum277 = _mm512_max_ps(_mm512_setzero_ps(), sum277);
sum278 = _mm512_max_ps(_mm512_setzero_ps(), sum278);
sum279 = _mm512_max_ps(_mm512_setzero_ps(), sum279);
sum280 = _mm512_max_ps(_mm512_setzero_ps(), sum280);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6272, 65535, sum277);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6336, 65535, sum278);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6400, 65535, sum279);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)6464, 65535, sum280);
// Row 3.
sum281 = _mm512_add_ps(sum281, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9408));
sum282 = _mm512_add_ps(sum282, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9472));
sum283 = _mm512_add_ps(sum283, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9536));
sum284 = _mm512_add_ps(sum284, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9600));
sum281 = _mm512_max_ps(_mm512_setzero_ps(), sum281);
sum282 = _mm512_max_ps(_mm512_setzero_ps(), sum282);
sum283 = _mm512_max_ps(_mm512_setzero_ps(), sum283);
sum284 = _mm512_max_ps(_mm512_setzero_ps(), sum284);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9408, 65535, sum281);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9472, 65535, sum282);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9536, 65535, sum283);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)9600, 65535, sum284);
// Row 4.
sum285 = _mm512_add_ps(sum285, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12544));
sum286 = _mm512_add_ps(sum286, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12608));
sum287 = _mm512_add_ps(sum287, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12672));
sum288 = _mm512_add_ps(sum288, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12736));
sum285 = _mm512_max_ps(_mm512_setzero_ps(), sum285);
sum286 = _mm512_max_ps(_mm512_setzero_ps(), sum286);
sum287 = _mm512_max_ps(_mm512_setzero_ps(), sum287);
sum288 = _mm512_max_ps(_mm512_setzero_ps(), sum288);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12544, 65535, sum285);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12608, 65535, sum286);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12672, 65535, sum287);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)12736, 65535, sum288);
// Row 5.
sum289 = _mm512_add_ps(sum289, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15680));
sum290 = _mm512_add_ps(sum290, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15744));
sum291 = _mm512_add_ps(sum291, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15808));
sum292 = _mm512_add_ps(sum292, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15872));
sum289 = _mm512_max_ps(_mm512_setzero_ps(), sum289);
sum290 = _mm512_max_ps(_mm512_setzero_ps(), sum290);
sum291 = _mm512_max_ps(_mm512_setzero_ps(), sum291);
sum292 = _mm512_max_ps(_mm512_setzero_ps(), sum292);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15680, 65535, sum289);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15744, 65535, sum290);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15808, 65535, sum291);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)15872, 65535, sum292);
// Done once the second group of the pair has been written.
if (k121 >= kk39) return;
}
// Reached only when the loop above ran off its end, i.e. k121 == 85:
// the remainder group with two output rows and 8-byte weight slots.
ptrdiff_t s37 = -1;
__m512 sum293 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)8));
__m512 sum297 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)12));
__m512 sum294 = sum293;
__m512 sum295 = sum293;
__m512 sum296 = sum293;
__m512 sum298 = sum297;
__m512 sum299 = sum297;
__m512 sum300 = sum297;
for (s37 = 0; s37 < 256; ++s37) {
__m512 dat2012 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)0);
__m512 dat2013 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)64);
__m512 dat2014 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)128);
__m512 dat2015 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+256*s37+(ptrdiff_t)192);
__m512 wt381 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)8));
sum293 = _mm512_fmadd_ps(wt381, dat2012, sum293);
sum294 = _mm512_fmadd_ps(wt381, dat2013, sum294);
sum295 = _mm512_fmadd_ps(wt381, dat2014, sum295);
sum296 = _mm512_fmadd_ps(wt381, dat2015, sum296);
__m512 wt382 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k121+8*s37+(ptrdiff_t)12));
sum297 = _mm512_fmadd_ps(wt382, dat2012, sum297);
sum298 = _mm512_fmadd_ps(wt382, dat2013, sum298);
sum299 = _mm512_fmadd_ps(wt382, dat2014, sum299);
sum300 = _mm512_fmadd_ps(wt382, dat2015, sum300);
}
// Remainder row 0.
sum293 = _mm512_add_ps(sum293, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0));
sum294 = _mm512_add_ps(sum294, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64));
sum295 = _mm512_add_ps(sum295, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128));
sum296 = _mm512_add_ps(sum296, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192));
sum293 = _mm512_max_ps(_mm512_setzero_ps(), sum293);
sum294 = _mm512_max_ps(_mm512_setzero_ps(), sum294);
sum295 = _mm512_max_ps(_mm512_setzero_ps(), sum295);
sum296 = _mm512_max_ps(_mm512_setzero_ps(), sum296);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)0, 65535, sum293);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)64, 65535, sum294);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)128, 65535, sum295);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)192, 65535, sum296);
// Remainder row 1.
sum297 = _mm512_add_ps(sum297, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136));
sum298 = _mm512_add_ps(sum298, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200));
sum299 = _mm512_add_ps(sum299, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264));
sum300 = _mm512_add_ps(sum300, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328));
sum297 = _mm512_max_ps(_mm512_setzero_ps(), sum297);
sum298 = _mm512_max_ps(_mm512_setzero_ps(), sum298);
sum299 = _mm512_max_ps(_mm512_setzero_ps(), sum299);
sum300 = _mm512_max_ps(_mm512_setzero_ps(), sum300);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3136, 65535, sum297);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3200, 65535, sum298);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3264, 65535, sum299);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k121+(ptrdiff_t)3328, 65535, sum300);
// Always true here (j37 is unchanged since jj40 was set), so a wide block never
// falls through into the narrow-block code below.
if (j37 >= jj40) return;
}
// ---- Narrow column block (j37 == 12): one vector per row, and the arranged
// data is read at a 64-byte stride per slot instead of 256. ----
ptrdiff_t k122 = 2*w54;
ptrdiff_t kk40 = k122+1;
for (; k122 != 85; ++k122) {
// Same s == -1 trick as above to read the group's six initial values.
ptrdiff_t s38 = -1;
__m512 sum301 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)24));
__m512 sum302 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)28));
__m512 sum303 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)32));
__m512 sum304 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)36));
__m512 sum305 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)40));
__m512 sum306 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)44));
for (s38 = 0; s38 < 256; ++s38) {
__m512 dat2016 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+64*s38+(ptrdiff_t)0);
__m512 wt383 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)24));
sum301 = _mm512_fmadd_ps(wt383, dat2016, sum301);
__m512 wt384 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)28));
sum302 = _mm512_fmadd_ps(wt384, dat2016, sum302);
__m512 wt385 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)32));
sum303 = _mm512_fmadd_ps(wt385, dat2016, sum303);
__m512 wt386 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)36));
sum304 = _mm512_fmadd_ps(wt386, dat2016, sum304);
__m512 wt387 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)40));
sum305 = _mm512_fmadd_ps(wt387, dat2016, sum305);
__m512 wt388 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+24*s38+(ptrdiff_t)44));
sum306 = _mm512_fmadd_ps(wt388, dat2016, sum306);
}
// Per-row epilogue: add the datPtr21 vector, clamp at zero, store to datPtr22.
sum301 = _mm512_add_ps(sum301, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0));
sum301 = _mm512_max_ps(_mm512_setzero_ps(), sum301);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0, 65535, sum301);
sum302 = _mm512_add_ps(sum302, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136));
sum302 = _mm512_max_ps(_mm512_setzero_ps(), sum302);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136, 65535, sum302);
sum303 = _mm512_add_ps(sum303, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)6272));
sum303 = _mm512_max_ps(_mm512_setzero_ps(), sum303);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)6272, 65535, sum303);
sum304 = _mm512_add_ps(sum304, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)9408));
sum304 = _mm512_max_ps(_mm512_setzero_ps(), sum304);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)9408, 65535, sum304);
sum305 = _mm512_add_ps(sum305, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)12544));
sum305 = _mm512_max_ps(_mm512_setzero_ps(), sum305);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)12544, 65535, sum305);
sum306 = _mm512_add_ps(sum306, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)15680));
sum306 = _mm512_max_ps(_mm512_setzero_ps(), sum306);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)15680, 65535, sum306);
// Done once the second group of the pair has been written.
if (k122 >= kk40) return;
}
// Reached only with k122 == 85: remainder group (two rows) for the narrow block.
ptrdiff_t s39 = -1;
__m512 sum307 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)8));
__m512 sum308 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)12));
for (s39 = 0; s39 < 256; ++s39) {
__m512 dat2017 = _mm512_loadu_ps(arrangedDats6+802816*i44+65536*j37+64*s39+(ptrdiff_t)0);
__m512 wt389 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)8));
sum307 = _mm512_fmadd_ps(wt389, dat2017, sum307);
__m512 wt390 = _mm512_set1_ps(*(float*)(arrangedWts6+526336*i44+6168*k122+8*s39+(ptrdiff_t)12));
sum308 = _mm512_fmadd_ps(wt390, dat2017, sum308);
}
sum307 = _mm512_add_ps(sum307, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0));
sum307 = _mm512_max_ps(_mm512_setzero_ps(), sum307);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)0, 65535, sum307);
sum308 = _mm512_add_ps(sum308, _mm512_maskz_loadu_ps(65535, datPtr21+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136));
sum308 = _mm512_max_ps(_mm512_setzero_ps(), sum308);
_mm512_mask_storeu_ps(datPtr22+1605632*i44+256*j37+18816*k122+(ptrdiff_t)3136, 65535, sum308);
}
}

/*
 * Builds a task descriptor for ResNeXt50OneApply6Callee1 and hands it to
 * ResNeXt50ThreaderDo1 together with the given team.
 *
 * team48    - thread team passed straight through to ResNeXt50ThreaderDo1.
 * tensors69 - tensor pointer table; wrapped as the first element of a
 *             two-element pointer array (second element null) whose address
 *             goes into the task's any1 field.
 *
 * The task declares nd1 = 3 dimensions with hull1 = {43, 13, 1}. The callee
 * reads the first coordinate as a weight-group pair index and the second as a
 * column block index, where block 12 is its narrow case.
 *
 * The argument array lives on this function's stack (presumably
 * ResNeXt50ThreaderDo1 does not return until the callee invocations have
 * finished - confirm against the threader implementation).
 */
static void ResNeXt50OneApply6(ResNeXt50ThreaderTeam1* team48, char** tensors69) {
    void* args[2];
    ResNeXt50ThreaderTask1 job;

    /* Callee unpacks element 0 as the tensor table; element 1 stays null. */
    args[0] = tensors69;
    args[1] = 0;

    /* Shape of the work: 43 x 13, padded with one unit dimension. */
    job.nd1 = 3;
    job.hull1[0] = 43;
    job.hull1[1] = 13;
    job.hull1[2] = 1;

    /* Work function and its opaque argument. */
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply6Callee1;

    ResNeXt50ThreaderDo1(team48, &job);
}

static void ResNeXt50OneArrangeWts7Callee1(ResNeXt50ThreaderTask1* task74, int64_t* pt42) {
char** tensors72 = task74->any1;
ptrdiff_t b70 = pt42[0];
char*restrict wtPtr12 = tensors72[0]+(ptrdiff_t)3340*0+(ptrdiff_t)524288*0;
char*restrict biasPtr12 = tensors72[1]+(ptrdiff_t)1024*0;
char*restrict bnPtr13 = tensors72[2]+(ptrdiff_t)8*256*0;
char*restrict arranged13 = tensors72[3]+(ptrdiff_t)856064*0+(ptrdiff_t)525312*0;
ptrdiff_t ii31 = 1;
for (ptrdiff_t i45 = 0; i45 < ii31; ++i45) {
ptrdiff_t j38 = 1*b70;
ptrdiff_t jj41 = j38+1;
for (; j38 < jj41; ++j38) {
if (j38 < 15) {
ptrdiff_t k124 = 0+16*(j38-0);
ptrdiff_t l52 = (size_t)(0+k124)/6;
ptrdiff_t cut16 = (size_t)(0+k124)%6;
switch (cut16) {
case 0:;
case 2: {
__m512 sum310 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k124);
__m512i pmMul24 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd24 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo20 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k124+256*i45));
__m512 masHi20 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k124+256*i45)+(ptrdiff_t)64);
__m512 postMul37 = _mm512_permutex2var_ps(masLo20, pmMul24, masHi20);
__m512 postAdd25 = _mm512_permutex2var_ps(masLo20, pmAdd24, masHi20);
sum310 = _mm512_fmadd_ps(sum310, postMul37, postAdd25);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum310);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum310);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)24576, 65535-(4095>>cut16), sum310);
ptrdiff_t c36 = 0;
for (; c36 != 32; ++c36) {
__m512 wt407 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)0);
__m512 wt408 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)2048);
__m512 wt409 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)4096);
__m512 wt410 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)6144);
__m512 wt411 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)8192);
__m512 wt412 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)10240);
__m512 wt413 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)12288);
__m512 wt414 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)14336);
__m512 wt415 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)16384);
__m512 wt416 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)18432);
__m512 wt417 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)20480);
__m512 wt418 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)22528);
__m512 wt419 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)24576);
__m512 wt420 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)26624);
__m512 wt421 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)28672);
__m512 wt422 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c36+(ptrdiff_t)30720);
__m512 tmp10937 = _mm512_unpacklo_ps(wt407, wt408);
__m512 tmp10938 = _mm512_unpackhi_ps(wt407, wt408);
__m512 tmp10939 = _mm512_unpacklo_ps(wt409, wt410);
__m512 tmp10940 = _mm512_unpackhi_ps(wt409, wt410);
__m512 tmp10941 = _mm512_unpacklo_ps(wt411, wt412);
__m512 tmp10942 = _mm512_unpackhi_ps(wt411, wt412);
__m512 tmp10943 = _mm512_unpacklo_ps(wt413, wt414);
__m512 tmp10944 = _mm512_unpackhi_ps(wt413, wt414);
__m512 tmp10945 = _mm512_unpacklo_ps(wt415, wt416);
__m512 tmp10946 = _mm512_unpackhi_ps(wt415, wt416);
__m512 tmp10947 = _mm512_unpacklo_ps(wt417, wt418);
__m512 tmp10948 = _mm512_unpackhi_ps(wt417, wt418);
__m512 tmp10949 = _mm512_unpacklo_ps(wt419, wt420);
__m512 tmp10950 = _mm512_unpackhi_ps(wt419, wt420);
__m512 tmp10951 = _mm512_unpacklo_ps(wt421, wt422);
__m512 tmp10952 = _mm512_unpackhi_ps(wt421, wt422);
__m512 tmp10953 = _mm512_shuffle_ps(tmp10937, tmp10939, 68);
__m512 tmp10954 = _mm512_shuffle_ps(tmp10937, tmp10939, 238);
__m512 tmp10955 = _mm512_shuffle_ps(tmp10938, tmp10940, 68);
__m512 tmp10956 = _mm512_shuffle_ps(tmp10938, tmp10940, 238);
__m512 tmp10957 = _mm512_shuffle_ps(tmp10941, tmp10943, 68);
__m512 tmp10958 = _mm512_shuffle_ps(tmp10941, tmp10943, 238);
__m512 tmp10959 = _mm512_shuffle_ps(tmp10942, tmp10944, 68);
__m512 tmp10960 = _mm512_shuffle_ps(tmp10942, tmp10944, 238);
__m512 tmp10961 = _mm512_shuffle_ps(tmp10945, tmp10947, 68);
__m512 tmp10962 = _mm512_shuffle_ps(tmp10945, tmp10947, 238);
__m512 tmp10963 = _mm512_shuffle_ps(tmp10946, tmp10948, 68);
__m512 tmp10964 = _mm512_shuffle_ps(tmp10946, tmp10948, 238);
__m512 tmp10965 = _mm512_shuffle_ps(tmp10949, tmp10951, 68);
__m512 tmp10966 = _mm512_shuffle_ps(tmp10949, tmp10951, 238);
__m512 tmp10967 = _mm512_shuffle_ps(tmp10950, tmp10952, 68);
__m512 tmp10968 = _mm512_shuffle_ps(tmp10950, tmp10952, 238);
__m512 tmp10969 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 136);
__m512 tmp10970 = _mm512_shuffle_f32x4(tmp10953, tmp10957, 221);
__m512 tmp10971 = _mm512_shuffle_f32x4(tmp10954, tmp10958, 136);
__m512 tmp10972 = _mm512_shuffle_f32x4(tmp10954, tmp10958, 221);
__m512 tmp10973 = _mm512_shuffle_f32x4(tmp10955, tmp10959, 136);
__m512 tmp10974 = _mm512_shuffle_f32x4(tmp10955, tmp10959, 221);
__m512 tmp10975 = _mm512_shuffle_f32x4(tmp10956, tmp10960, 136);
__m512 tmp10976 = _mm512_shuffle_f32x4(tmp10956, tmp10960, 221);
__m512 tmp10977 = _mm512_shuffle_f32x4(tmp10961, tmp10965, 136);
__m512 tmp10978 = _mm512_shuffle_f32x4(tmp10961, tmp10965, 221);
__m512 tmp10979 = _mm512_shuffle_f32x4(tmp10962, tmp10966, 136);
__m512 tmp10980 = _mm512_shuffle_f32x4(tmp10962, tmp10966, 221);
__m512 tmp10981 = _mm512_shuffle_f32x4(tmp10963, tmp10967, 136);
__m512 tmp10982 = _mm512_shuffle_f32x4(tmp10963, tmp10967, 221);
__m512 tmp10983 = _mm512_shuffle_f32x4(tmp10964, tmp10968, 136);
__m512 tmp10984 = _mm512_shuffle_f32x4(tmp10964, tmp10968, 221);
wt407 = _mm512_shuffle_f32x4(tmp10969, tmp10977, 136);
wt415 = _mm512_shuffle_f32x4(tmp10969, tmp10977, 221);
wt408 = _mm512_shuffle_f32x4(tmp10971, tmp10979, 136);
wt416 = _mm512_shuffle_f32x4(tmp10971, tmp10979, 221);
wt409 = _mm512_shuffle_f32x4(tmp10973, tmp10981, 136);
wt417 = _mm512_shuffle_f32x4(tmp10973, tmp10981, 221);
wt410 = _mm512_shuffle_f32x4(tmp10975, tmp10983, 136);
wt418 = _mm512_shuffle_f32x4(tmp10975, tmp10983, 221);
wt411 = _mm512_shuffle_f32x4(tmp10970, tmp10978, 136);
wt419 = _mm512_shuffle_f32x4(tmp10970, tmp10978, 221);
wt412 = _mm512_shuffle_f32x4(tmp10972, tmp10980, 136);
wt420 = _mm512_shuffle_f32x4(tmp10972, tmp10980, 221);
wt413 = _mm512_shuffle_f32x4(tmp10974, tmp10982, 136);
wt421 = _mm512_shuffle_f32x4(tmp10974, tmp10982, 221);
wt414 = _mm512_shuffle_f32x4(tmp10976, tmp10984, 136);
wt422 = _mm512_shuffle_f32x4(tmp10976, tmp10984, 221);
wt407 = _mm512_mul_ps(wt407, postMul37);
wt408 = _mm512_mul_ps(wt408, postMul37);
wt409 = _mm512_mul_ps(wt409, postMul37);
wt410 = _mm512_mul_ps(wt410, postMul37);
wt411 = _mm512_mul_ps(wt411, postMul37);
wt412 = _mm512_mul_ps(wt412, postMul37);
wt413 = _mm512_mul_ps(wt413, postMul37);
wt414 = _mm512_mul_ps(wt414, postMul37);
wt415 = _mm512_mul_ps(wt415, postMul37);
wt416 = _mm512_mul_ps(wt416, postMul37);
wt417 = _mm512_mul_ps(wt417, postMul37);
wt418 = _mm512_mul_ps(wt418, postMul37);
wt419 = _mm512_mul_ps(wt419, postMul37);
wt420 = _mm512_mul_ps(wt420, postMul37);
wt421 = _mm512_mul_ps(wt421, postMul37);
wt422 = _mm512_mul_ps(wt422, postMul37);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)0, 63>>cut16, wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)0, 63>>cut16, wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)0, 63>>cut16, wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)0, 63>>cut16, wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)0, 63>>cut16, wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)0, 63>>cut16, wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)0, 63>>cut16, wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)0, 63>>cut16, wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)0, 63>>cut16, wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)0, 63>>cut16, wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)0, 63>>cut16, wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)0, 63>>cut16, wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)0, 63>>cut16, wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)0, 63>>cut16, wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)0, 63>>cut16, wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)0, 63>>cut16, wt422);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)12288, 4032>>cut16, wt422);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt407);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt408);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt409);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt410);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt411);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt412);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt413);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt414);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt415);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt416);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt417);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt418);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt419);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt420);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt421);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c36)+(ptrdiff_t)24576, 65535-(4095>>cut16), wt422);
}
break;
}
default: {
cut16 = 4;
__m512 sum311 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k124);
__m512i pmMul25 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd25 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo21 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k124+256*i45));
__m512 masHi21 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k124+256*i45)+(ptrdiff_t)64);
__m512 postMul38 = _mm512_permutex2var_ps(masLo21, pmMul25, masHi21);
__m512 postAdd26 = _mm512_permutex2var_ps(masLo21, pmAdd25, masHi21);
sum311 = _mm512_fmadd_ps(sum311, postMul38, postAdd26);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)0, 63>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)12288, 4032>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)24576, 258048>>cut16, sum311);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*0+(ptrdiff_t)36864, 65535-(262143>>cut16), sum311);
ptrdiff_t c37 = 0;
for (; c37 != 32; ++c37) {
__m512 wt423 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)0);
__m512 wt424 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)2048);
__m512 wt425 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)4096);
__m512 wt426 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)6144);
__m512 wt427 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)8192);
__m512 wt428 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)10240);
__m512 wt429 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)12288);
__m512 wt430 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)14336);
__m512 wt431 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)16384);
__m512 wt432 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)18432);
__m512 wt433 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)20480);
__m512 wt434 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)22528);
__m512 wt435 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)24576);
__m512 wt436 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)26624);
__m512 wt437 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)28672);
__m512 wt438 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k124+64*c37+(ptrdiff_t)30720);
__m512 tmp10985 = _mm512_unpacklo_ps(wt423, wt424);
__m512 tmp10986 = _mm512_unpackhi_ps(wt423, wt424);
__m512 tmp10987 = _mm512_unpacklo_ps(wt425, wt426);
__m512 tmp10988 = _mm512_unpackhi_ps(wt425, wt426);
__m512 tmp10989 = _mm512_unpacklo_ps(wt427, wt428);
__m512 tmp10990 = _mm512_unpackhi_ps(wt427, wt428);
__m512 tmp10991 = _mm512_unpacklo_ps(wt429, wt430);
__m512 tmp10992 = _mm512_unpackhi_ps(wt429, wt430);
__m512 tmp10993 = _mm512_unpacklo_ps(wt431, wt432);
__m512 tmp10994 = _mm512_unpackhi_ps(wt431, wt432);
__m512 tmp10995 = _mm512_unpacklo_ps(wt433, wt434);
__m512 tmp10996 = _mm512_unpackhi_ps(wt433, wt434);
__m512 tmp10997 = _mm512_unpacklo_ps(wt435, wt436);
__m512 tmp10998 = _mm512_unpackhi_ps(wt435, wt436);
__m512 tmp10999 = _mm512_unpacklo_ps(wt437, wt438);
__m512 tmp11000 = _mm512_unpackhi_ps(wt437, wt438);
__m512 tmp11001 = _mm512_shuffle_ps(tmp10985, tmp10987, 68);
__m512 tmp11002 = _mm512_shuffle_ps(tmp10985, tmp10987, 238);
__m512 tmp11003 = _mm512_shuffle_ps(tmp10986, tmp10988, 68);
__m512 tmp11004 = _mm512_shuffle_ps(tmp10986, tmp10988, 238);
__m512 tmp11005 = _mm512_shuffle_ps(tmp10989, tmp10991, 68);
__m512 tmp11006 = _mm512_shuffle_ps(tmp10989, tmp10991, 238);
__m512 tmp11007 = _mm512_shuffle_ps(tmp10990, tmp10992, 68);
__m512 tmp11008 = _mm512_shuffle_ps(tmp10990, tmp10992, 238);
__m512 tmp11009 = _mm512_shuffle_ps(tmp10993, tmp10995, 68);
__m512 tmp11010 = _mm512_shuffle_ps(tmp10993, tmp10995, 238);
__m512 tmp11011 = _mm512_shuffle_ps(tmp10994, tmp10996, 68);
__m512 tmp11012 = _mm512_shuffle_ps(tmp10994, tmp10996, 238);
__m512 tmp11013 = _mm512_shuffle_ps(tmp10997, tmp10999, 68);
__m512 tmp11014 = _mm512_shuffle_ps(tmp10997, tmp10999, 238);
__m512 tmp11015 = _mm512_shuffle_ps(tmp10998, tmp11000, 68);
__m512 tmp11016 = _mm512_shuffle_ps(tmp10998, tmp11000, 238);
__m512 tmp11017 = _mm512_shuffle_f32x4(tmp11001, tmp11005, 136);
__m512 tmp11018 = _mm512_shuffle_f32x4(tmp11001, tmp11005, 221);
__m512 tmp11019 = _mm512_shuffle_f32x4(tmp11002, tmp11006, 136);
__m512 tmp11020 = _mm512_shuffle_f32x4(tmp11002, tmp11006, 221);
__m512 tmp11021 = _mm512_shuffle_f32x4(tmp11003, tmp11007, 136);
__m512 tmp11022 = _mm512_shuffle_f32x4(tmp11003, tmp11007, 221);
__m512 tmp11023 = _mm512_shuffle_f32x4(tmp11004, tmp11008, 136);
__m512 tmp11024 = _mm512_shuffle_f32x4(tmp11004, tmp11008, 221);
__m512 tmp11025 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 136);
__m512 tmp11026 = _mm512_shuffle_f32x4(tmp11009, tmp11013, 221);
__m512 tmp11027 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 136);
__m512 tmp11028 = _mm512_shuffle_f32x4(tmp11010, tmp11014, 221);
__m512 tmp11029 = _mm512_shuffle_f32x4(tmp11011, tmp11015, 136);
__m512 tmp11030 = _mm512_shuffle_f32x4(tmp11011, tmp11015, 221);
__m512 tmp11031 = _mm512_shuffle_f32x4(tmp11012, tmp11016, 136);
__m512 tmp11032 = _mm512_shuffle_f32x4(tmp11012, tmp11016, 221);
wt423 = _mm512_shuffle_f32x4(tmp11017, tmp11025, 136);
wt431 = _mm512_shuffle_f32x4(tmp11017, tmp11025, 221);
wt424 = _mm512_shuffle_f32x4(tmp11019, tmp11027, 136);
wt432 = _mm512_shuffle_f32x4(tmp11019, tmp11027, 221);
wt425 = _mm512_shuffle_f32x4(tmp11021, tmp11029, 136);
wt433 = _mm512_shuffle_f32x4(tmp11021, tmp11029, 221);
wt426 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 136);
wt434 = _mm512_shuffle_f32x4(tmp11023, tmp11031, 221);
wt427 = _mm512_shuffle_f32x4(tmp11018, tmp11026, 136);
wt435 = _mm512_shuffle_f32x4(tmp11018, tmp11026, 221);
wt428 = _mm512_shuffle_f32x4(tmp11020, tmp11028, 136);
wt436 = _mm512_shuffle_f32x4(tmp11020, tmp11028, 221);
wt429 = _mm512_shuffle_f32x4(tmp11022, tmp11030, 136);
wt437 = _mm512_shuffle_f32x4(tmp11022, tmp11030, 221);
wt430 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 136);
wt438 = _mm512_shuffle_f32x4(tmp11024, tmp11032, 221);
wt423 = _mm512_mul_ps(wt423, postMul38);
wt424 = _mm512_mul_ps(wt424, postMul38);
wt425 = _mm512_mul_ps(wt425, postMul38);
wt426 = _mm512_mul_ps(wt426, postMul38);
wt427 = _mm512_mul_ps(wt427, postMul38);
wt428 = _mm512_mul_ps(wt428, postMul38);
wt429 = _mm512_mul_ps(wt429, postMul38);
wt430 = _mm512_mul_ps(wt430, postMul38);
wt431 = _mm512_mul_ps(wt431, postMul38);
wt432 = _mm512_mul_ps(wt432, postMul38);
wt433 = _mm512_mul_ps(wt433, postMul38);
wt434 = _mm512_mul_ps(wt434, postMul38);
wt435 = _mm512_mul_ps(wt435, postMul38);
wt436 = _mm512_mul_ps(wt436, postMul38);
wt437 = _mm512_mul_ps(wt437, postMul38);
wt438 = _mm512_mul_ps(wt438, postMul38);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)0, 63>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)0, 63>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)0, 63>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)0, 63>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)0, 63>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)0, 63>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)0, 63>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)0, 63>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)0, 63>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)0, 63>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)0, 63>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)0, 63>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)0, 63>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)0, 63>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)0, 63>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)0, 63>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)12288, 4032>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)24576, 258048>>cut16, wt438);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(1+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt423);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(2+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt424);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(3+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt425);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(4+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt426);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(5+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt427);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(6+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt428);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(7+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt429);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(8+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt430);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(9+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt431);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(10+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt432);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(11+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt433);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(12+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt434);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(13+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt435);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(14+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt436);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(15+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt437);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l52+4*cut16+24*(16+16*c37)+(ptrdiff_t)36864, 65535-(262143>>cut16), wt438);
}
}
}
} else {
ptrdiff_t k123 = 240;
ptrdiff_t l51 = (size_t)(0+k123)/6;
ptrdiff_t cut15 = (size_t)(0+k123)%6;
__m512 sum309 = _mm512_maskz_loadu_ps(65535, biasPtr12+1024*i45+4*k123);
__m512i pmMul26 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd26 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo22 = _mm512_loadu_ps(bnPtr13+(ptrdiff_t)8*(k123+256*i45));
__m512 masHi22 = _mm512_maskz_loadu_ps(65535, bnPtr13+(ptrdiff_t)8*(k123+256*i45)+(ptrdiff_t)64);
__m512 postMul36 = _mm512_permutex2var_ps(masLo22, pmMul26, masHi22);
__m512 postAdd24 = _mm512_permutex2var_ps(masLo22, pmAdd26, masHi22);
sum309 = _mm512_fmadd_ps(sum309, postMul36, postAdd24);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*0+(ptrdiff_t)0, 63>>cut15, sum309);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*0+(ptrdiff_t)12288, 4032>>cut15, sum309);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*0+(ptrdiff_t)24576, 65535-(4095>>cut15), sum309);
ptrdiff_t c35 = 0;
for (; c35 != 32; ++c35) {
__m512 wt391 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)0);
__m512 wt392 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)2048);
__m512 wt393 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)4096);
__m512 wt394 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)6144);
__m512 wt395 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)8192);
__m512 wt396 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)10240);
__m512 wt397 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)12288);
__m512 wt398 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)14336);
__m512 wt399 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)16384);
__m512 wt400 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)18432);
__m512 wt401 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)20480);
__m512 wt402 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)22528);
__m512 wt403 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)24576);
__m512 wt404 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)26624);
__m512 wt405 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)28672);
__m512 wt406 = _mm512_maskz_loadu_ps(65535, wtPtr12+524288*i45+2048*k123+64*c35+(ptrdiff_t)30720);
__m512 tmp11033 = _mm512_unpacklo_ps(wt391, wt392);
__m512 tmp11034 = _mm512_unpackhi_ps(wt391, wt392);
__m512 tmp11035 = _mm512_unpacklo_ps(wt393, wt394);
__m512 tmp11036 = _mm512_unpackhi_ps(wt393, wt394);
__m512 tmp11037 = _mm512_unpacklo_ps(wt395, wt396);
__m512 tmp11038 = _mm512_unpackhi_ps(wt395, wt396);
__m512 tmp11039 = _mm512_unpacklo_ps(wt397, wt398);
__m512 tmp11040 = _mm512_unpackhi_ps(wt397, wt398);
__m512 tmp11041 = _mm512_unpacklo_ps(wt399, wt400);
__m512 tmp11042 = _mm512_unpackhi_ps(wt399, wt400);
__m512 tmp11043 = _mm512_unpacklo_ps(wt401, wt402);
__m512 tmp11044 = _mm512_unpackhi_ps(wt401, wt402);
__m512 tmp11045 = _mm512_unpacklo_ps(wt403, wt404);
__m512 tmp11046 = _mm512_unpackhi_ps(wt403, wt404);
__m512 tmp11047 = _mm512_unpacklo_ps(wt405, wt406);
__m512 tmp11048 = _mm512_unpackhi_ps(wt405, wt406);
__m512 tmp11049 = _mm512_shuffle_ps(tmp11033, tmp11035, 68);
__m512 tmp11050 = _mm512_shuffle_ps(tmp11033, tmp11035, 238);
__m512 tmp11051 = _mm512_shuffle_ps(tmp11034, tmp11036, 68);
__m512 tmp11052 = _mm512_shuffle_ps(tmp11034, tmp11036, 238);
__m512 tmp11053 = _mm512_shuffle_ps(tmp11037, tmp11039, 68);
__m512 tmp11054 = _mm512_shuffle_ps(tmp11037, tmp11039, 238);
__m512 tmp11055 = _mm512_shuffle_ps(tmp11038, tmp11040, 68);
__m512 tmp11056 = _mm512_shuffle_ps(tmp11038, tmp11040, 238);
__m512 tmp11057 = _mm512_shuffle_ps(tmp11041, tmp11043, 68);
__m512 tmp11058 = _mm512_shuffle_ps(tmp11041, tmp11043, 238);
__m512 tmp11059 = _mm512_shuffle_ps(tmp11042, tmp11044, 68);
__m512 tmp11060 = _mm512_shuffle_ps(tmp11042, tmp11044, 238);
__m512 tmp11061 = _mm512_shuffle_ps(tmp11045, tmp11047, 68);
__m512 tmp11062 = _mm512_shuffle_ps(tmp11045, tmp11047, 238);
__m512 tmp11063 = _mm512_shuffle_ps(tmp11046, tmp11048, 68);
__m512 tmp11064 = _mm512_shuffle_ps(tmp11046, tmp11048, 238);
__m512 tmp11065 = _mm512_shuffle_f32x4(tmp11049, tmp11053, 136);
__m512 tmp11066 = _mm512_shuffle_f32x4(tmp11049, tmp11053, 221);
__m512 tmp11067 = _mm512_shuffle_f32x4(tmp11050, tmp11054, 136);
__m512 tmp11068 = _mm512_shuffle_f32x4(tmp11050, tmp11054, 221);
__m512 tmp11069 = _mm512_shuffle_f32x4(tmp11051, tmp11055, 136);
__m512 tmp11070 = _mm512_shuffle_f32x4(tmp11051, tmp11055, 221);
__m512 tmp11071 = _mm512_shuffle_f32x4(tmp11052, tmp11056, 136);
__m512 tmp11072 = _mm512_shuffle_f32x4(tmp11052, tmp11056, 221);
__m512 tmp11073 = _mm512_shuffle_f32x4(tmp11057, tmp11061, 136);
__m512 tmp11074 = _mm512_shuffle_f32x4(tmp11057, tmp11061, 221);
__m512 tmp11075 = _mm512_shuffle_f32x4(tmp11058, tmp11062, 136);
__m512 tmp11076 = _mm512_shuffle_f32x4(tmp11058, tmp11062, 221);
__m512 tmp11077 = _mm512_shuffle_f32x4(tmp11059, tmp11063, 136);
__m512 tmp11078 = _mm512_shuffle_f32x4(tmp11059, tmp11063, 221);
__m512 tmp11079 = _mm512_shuffle_f32x4(tmp11060, tmp11064, 136);
__m512 tmp11080 = _mm512_shuffle_f32x4(tmp11060, tmp11064, 221);
wt391 = _mm512_shuffle_f32x4(tmp11065, tmp11073, 136);
wt399 = _mm512_shuffle_f32x4(tmp11065, tmp11073, 221);
wt392 = _mm512_shuffle_f32x4(tmp11067, tmp11075, 136);
wt400 = _mm512_shuffle_f32x4(tmp11067, tmp11075, 221);
wt393 = _mm512_shuffle_f32x4(tmp11069, tmp11077, 136);
wt401 = _mm512_shuffle_f32x4(tmp11069, tmp11077, 221);
wt394 = _mm512_shuffle_f32x4(tmp11071, tmp11079, 136);
wt402 = _mm512_shuffle_f32x4(tmp11071, tmp11079, 221);
wt395 = _mm512_shuffle_f32x4(tmp11066, tmp11074, 136);
wt403 = _mm512_shuffle_f32x4(tmp11066, tmp11074, 221);
wt396 = _mm512_shuffle_f32x4(tmp11068, tmp11076, 136);
wt404 = _mm512_shuffle_f32x4(tmp11068, tmp11076, 221);
wt397 = _mm512_shuffle_f32x4(tmp11070, tmp11078, 136);
wt405 = _mm512_shuffle_f32x4(tmp11070, tmp11078, 221);
wt398 = _mm512_shuffle_f32x4(tmp11072, tmp11080, 136);
wt406 = _mm512_shuffle_f32x4(tmp11072, tmp11080, 221);
wt391 = _mm512_mul_ps(wt391, postMul36);
wt392 = _mm512_mul_ps(wt392, postMul36);
wt393 = _mm512_mul_ps(wt393, postMul36);
wt394 = _mm512_mul_ps(wt394, postMul36);
wt395 = _mm512_mul_ps(wt395, postMul36);
wt396 = _mm512_mul_ps(wt396, postMul36);
wt397 = _mm512_mul_ps(wt397, postMul36);
wt398 = _mm512_mul_ps(wt398, postMul36);
wt399 = _mm512_mul_ps(wt399, postMul36);
wt400 = _mm512_mul_ps(wt400, postMul36);
wt401 = _mm512_mul_ps(wt401, postMul36);
wt402 = _mm512_mul_ps(wt402, postMul36);
wt403 = _mm512_mul_ps(wt403, postMul36);
wt404 = _mm512_mul_ps(wt404, postMul36);
wt405 = _mm512_mul_ps(wt405, postMul36);
wt406 = _mm512_mul_ps(wt406, postMul36);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(1+16*c35)+(ptrdiff_t)0, 63>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(2+16*c35)+(ptrdiff_t)0, 63>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(3+16*c35)+(ptrdiff_t)0, 63>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(4+16*c35)+(ptrdiff_t)0, 63>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(5+16*c35)+(ptrdiff_t)0, 63>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(6+16*c35)+(ptrdiff_t)0, 63>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(7+16*c35)+(ptrdiff_t)0, 63>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(8+16*c35)+(ptrdiff_t)0, 63>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(9+16*c35)+(ptrdiff_t)0, 63>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(10+16*c35)+(ptrdiff_t)0, 63>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(11+16*c35)+(ptrdiff_t)0, 63>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(12+16*c35)+(ptrdiff_t)0, 63>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(13+16*c35)+(ptrdiff_t)0, 63>>cut15, wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(14+16*c35)+(ptrdiff_t)0, 63>>cut15, wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(15+16*c35)+(ptrdiff_t)0, 63>>cut15, wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(16+16*c35)+(ptrdiff_t)0, 63>>cut15, wt406);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(1+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(2+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(3+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(4+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(5+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(6+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(7+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(8+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(9+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(10+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(11+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(12+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(13+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(14+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(15+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+24*(16+16*c35)+(ptrdiff_t)12288, 4032>>cut15, wt406);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(1+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt391);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(2+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt392);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(3+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt393);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(4+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt394);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(5+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt395);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(6+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt396);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(7+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt397);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(8+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt398);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(9+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt399);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(10+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt400);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(11+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt401);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(12+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt402);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(13+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt403);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(14+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt404);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(15+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt405);
_mm512_mask_storeu_ps(arranged13+525312*i45+12312*l51+4*cut15+16*(16+16*c35)+(ptrdiff_t)24576, 65535-(4095>>cut15), wt406);
}
}
}
}
}

// Dispatches ResNeXt50OneArrangeWts7Callee1 across the thread team over a
// three-dimensional 16 x 1 x 1 index hull. `tensors71` is forwarded untouched
// to the callee through the task's `any1` slot.
static void ResNeXt50OneArrangeWts7(ResNeXt50ThreaderTeam1* team49, char** tensors71) {
	ResNeXt50ThreaderTask1 job;
	// Index space: only the first dimension has more than one point.
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 16;
	// Work function and its opaque argument.
	job.any1 = tensors71;
	job.callee1 = ResNeXt50OneArrangeWts7Callee1;
	ResNeXt50ThreaderDo1(team49, &job);
}

// Worker for ResNeXt50OneArrangeDats7: copies one tile from the source buffer
// (tensors[0]) into the rearranged buffer (tensors[1]).
//   pt43[0] picks a run of 128 consecutive source rows (rows are 3136 bytes apart).
//   pt43[1] picks a 256-byte column block inside each row; block 12 is the
//           last one and is only 64 bytes (one vector) wide.
// In the destination each column block occupies 131072 bytes, and rows are
// packed back to back inside it (256 bytes per row, or 64 for block 12).
static void ResNeXt50OneArrangeDats7Callee1(ResNeXt50ThreaderTask1* task76, int64_t* pt43) {
	char** bufs = task76->any1;
	const ptrdiff_t rowGroup = pt43[0];
	const ptrdiff_t colBlock = pt43[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const __mmask16 allLanes = 65535;
	const ptrdiff_t rowBegin = 128*rowGroup;
	const ptrdiff_t rowEnd = rowBegin+128;
	if (colBlock != 12) {
		// Full-width block: four 16-float vectors per row.
		for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
			char* in = src+256*colBlock+3136*row;
			char* out = dst+131072*colBlock+256*row;
			__m512 v0 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)0);
			__m512 v1 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)64);
			__m512 v2 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)128);
			__m512 v3 = _mm512_maskz_loadu_ps(allLanes, in+(ptrdiff_t)192);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)0, allLanes, v0);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)64, allLanes, v1);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)128, allLanes, v2);
			_mm512_mask_storeu_ps(out+(ptrdiff_t)192, allLanes, v3);
		}
		return;
	}
	// Final block: a single 16-float vector per row, packed at a 64-byte stride.
	for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
		__m512 v = _mm512_maskz_loadu_ps(allLanes, src+256*colBlock+3136*row);
		_mm512_mask_storeu_ps(dst+131072*colBlock+64*row, allLanes, v);
	}
}

// Dispatches ResNeXt50OneArrangeDats7Callee1 across the thread team over a
// four-dimensional 4 x 13 x 1 x 1 index hull. The callee reads the first two
// coordinates; `tensors73` is forwarded to it through the task's `any1` slot.
static void ResNeXt50OneArrangeDats7(ResNeXt50ThreaderTeam1* team50, char** tensors73) {
	ResNeXt50ThreaderTask1 job;
	// Index space.
	job.nd1 = 4;
	job.hull1[3] = 1;
	job.hull1[2] = 1;
	job.hull1[1] = 13;
	job.hull1[0] = 4;
	// Work function and its opaque argument.
	job.any1 = tensors73;
	job.callee1 = ResNeXt50OneArrangeDats7Callee1;
	ResNeXt50ThreaderDo1(team50, &job);
}

// Worker for ResNeXt50OneApply7: computes one output tile as a 512-step
// multiply-accumulate of broadcast weights against data vectors, clamps the
// result at zero (max with 0) and stores it.
//
//   task78->any1 : array whose entry [0] is the tensor pointer table:
//                  [0] arranged weights, [1] arranged data, [2] output.
//   pt44[0] (w55): weight group. Groups 0..41 hold 6 floats per step (24-byte
//                  step stride) and produce 6 output planes; group 42 holds
//                  4 floats per step (16-byte stride) and produces 4 planes.
//   pt44[1] (d15): data column block. Blocks 0..11 are four vectors wide
//                  (256 bytes per step); block 12 is one vector wide (64 bytes).
//
// Output planes are 3136 bytes apart (presumably one per output channel --
// TODO confirm against the tensor layout). Accumulators are seeded from the
// weight slot that precedes step 0 (index s = -1); presumably that slot holds
// the bias written by the weight-arranging pass -- verify against its writer.
//
// Every `for (; x != N; ++x)` loop below executes its body at most once: the
// trailing `if (x >= xx) return;` always fires because xx equals the starting
// value. The loops therefore act as "if (x != N) { ...; return; }" and the
// code after each loop handles the x == N edge case.
static void ResNeXt50OneApply7Callee1(ResNeXt50ThreaderTask1* task78, int64_t* pt44) {
void** pair18 = task78->any1;
char** tensors76 = pair18[0];
// e22 and g24 are fixed at zero here, so the base offsets below vanish.
ptrdiff_t e22 = 0;
ptrdiff_t g24 = 0;
ptrdiff_t d15 = pt44[1];
ptrdiff_t w55 = pt44[0];
char*restrict arrangedWts7 = tensors76[0]+856064*e22+(ptrdiff_t)525312*1*g24;
char*restrict arrangedDats7 = tensors76[1]+2618560*e22+(ptrdiff_t)1605632*1*g24;
char*restrict datPtr24 = tensors76[2]+(ptrdiff_t)802816*1*g24;
ptrdiff_t ii33 = 1;
for (ptrdiff_t i47 = 0; i47 < ii33; ++i47) {
ptrdiff_t j40 = 1*d15;
ptrdiff_t jj43 = j40+0;
// ---- Column blocks 0..11: four data vectors per step. ----
for (; j40 != 12; ++j40) {
ptrdiff_t k127 = 1*w55;
ptrdiff_t kk43 = k127+0;
// -- Weight groups 0..41: 6 planes x 4 vectors = 24 accumulators. --
for (; k127 != 42; ++k127) {
// s41 = -1 makes the offsets below address the slot before step 0.
ptrdiff_t s41 = -1;
__m512 sum312 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)24));
__m512 sum316 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)28));
__m512 sum320 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)32));
__m512 sum324 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)36));
__m512 sum328 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)40));
__m512 sum332 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)44));
// The four vectors of each plane start from the same seed value.
__m512 sum313 = sum312;
__m512 sum314 = sum312;
__m512 sum315 = sum312;
__m512 sum317 = sum316;
__m512 sum318 = sum316;
__m512 sum319 = sum316;
__m512 sum321 = sum320;
__m512 sum322 = sum320;
__m512 sum323 = sum320;
__m512 sum325 = sum324;
__m512 sum326 = sum324;
__m512 sum327 = sum324;
__m512 sum329 = sum328;
__m512 sum330 = sum328;
__m512 sum331 = sum328;
__m512 sum333 = sum332;
__m512 sum334 = sum332;
__m512 sum335 = sum332;
// Accumulate over 512 steps: load 4 data vectors once, reuse them for 6 weights.
for (s41 = 0; s41 < 512; ++s41) {
__m512 dat2023 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)0);
__m512 dat2024 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)64);
__m512 dat2025 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)128);
__m512 dat2026 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s41+(ptrdiff_t)192);
__m512 wt439 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)24));
sum312 = _mm512_fmadd_ps(wt439, dat2023, sum312);
sum313 = _mm512_fmadd_ps(wt439, dat2024, sum313);
sum314 = _mm512_fmadd_ps(wt439, dat2025, sum314);
sum315 = _mm512_fmadd_ps(wt439, dat2026, sum315);
__m512 wt440 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)28));
sum316 = _mm512_fmadd_ps(wt440, dat2023, sum316);
sum317 = _mm512_fmadd_ps(wt440, dat2024, sum317);
sum318 = _mm512_fmadd_ps(wt440, dat2025, sum318);
sum319 = _mm512_fmadd_ps(wt440, dat2026, sum319);
__m512 wt441 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)32));
sum320 = _mm512_fmadd_ps(wt441, dat2023, sum320);
sum321 = _mm512_fmadd_ps(wt441, dat2024, sum321);
sum322 = _mm512_fmadd_ps(wt441, dat2025, sum322);
sum323 = _mm512_fmadd_ps(wt441, dat2026, sum323);
__m512 wt442 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)36));
sum324 = _mm512_fmadd_ps(wt442, dat2023, sum324);
sum325 = _mm512_fmadd_ps(wt442, dat2024, sum325);
sum326 = _mm512_fmadd_ps(wt442, dat2025, sum326);
sum327 = _mm512_fmadd_ps(wt442, dat2026, sum327);
__m512 wt443 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)40));
sum328 = _mm512_fmadd_ps(wt443, dat2023, sum328);
sum329 = _mm512_fmadd_ps(wt443, dat2024, sum329);
sum330 = _mm512_fmadd_ps(wt443, dat2025, sum330);
sum331 = _mm512_fmadd_ps(wt443, dat2026, sum331);
__m512 wt444 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+24*s41+(ptrdiff_t)44));
sum332 = _mm512_fmadd_ps(wt444, dat2023, sum332);
sum333 = _mm512_fmadd_ps(wt444, dat2024, sum333);
sum334 = _mm512_fmadd_ps(wt444, dat2025, sum334);
sum335 = _mm512_fmadd_ps(wt444, dat2026, sum335);
}
// Clamp at zero and store; each plane is 3136 bytes after the previous one.
sum312 = _mm512_max_ps(_mm512_setzero_ps(), sum312);
sum313 = _mm512_max_ps(_mm512_setzero_ps(), sum313);
sum314 = _mm512_max_ps(_mm512_setzero_ps(), sum314);
sum315 = _mm512_max_ps(_mm512_setzero_ps(), sum315);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)0, 65535, sum312);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)64, 65535, sum313);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)128, 65535, sum314);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)192, 65535, sum315);
sum316 = _mm512_max_ps(_mm512_setzero_ps(), sum316);
sum317 = _mm512_max_ps(_mm512_setzero_ps(), sum317);
sum318 = _mm512_max_ps(_mm512_setzero_ps(), sum318);
sum319 = _mm512_max_ps(_mm512_setzero_ps(), sum319);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3136, 65535, sum316);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3200, 65535, sum317);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3264, 65535, sum318);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3328, 65535, sum319);
sum320 = _mm512_max_ps(_mm512_setzero_ps(), sum320);
sum321 = _mm512_max_ps(_mm512_setzero_ps(), sum321);
sum322 = _mm512_max_ps(_mm512_setzero_ps(), sum322);
sum323 = _mm512_max_ps(_mm512_setzero_ps(), sum323);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6272, 65535, sum320);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6336, 65535, sum321);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6400, 65535, sum322);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6464, 65535, sum323);
sum324 = _mm512_max_ps(_mm512_setzero_ps(), sum324);
sum325 = _mm512_max_ps(_mm512_setzero_ps(), sum325);
sum326 = _mm512_max_ps(_mm512_setzero_ps(), sum326);
sum327 = _mm512_max_ps(_mm512_setzero_ps(), sum327);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9408, 65535, sum324);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9472, 65535, sum325);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9536, 65535, sum326);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9600, 65535, sum327);
sum328 = _mm512_max_ps(_mm512_setzero_ps(), sum328);
sum329 = _mm512_max_ps(_mm512_setzero_ps(), sum329);
sum330 = _mm512_max_ps(_mm512_setzero_ps(), sum330);
sum331 = _mm512_max_ps(_mm512_setzero_ps(), sum331);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12544, 65535, sum328);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12608, 65535, sum329);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12672, 65535, sum330);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)12736, 65535, sum331);
sum332 = _mm512_max_ps(_mm512_setzero_ps(), sum332);
sum333 = _mm512_max_ps(_mm512_setzero_ps(), sum333);
sum334 = _mm512_max_ps(_mm512_setzero_ps(), sum334);
sum335 = _mm512_max_ps(_mm512_setzero_ps(), sum335);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15680, 65535, sum332);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15744, 65535, sum333);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15808, 65535, sum334);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)15872, 65535, sum335);
// Always true on the first pass (kk43 is the starting k127): one group per call.
if (k127 >= kk43) return;
}
// -- Weight group 42 (k127 == 42 here): 4 planes x 4 vectors, 16-byte step stride. --
ptrdiff_t s42 = -1;
__m512 sum336 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)16));
__m512 sum340 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)20));
__m512 sum344 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)24));
__m512 sum348 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)28));
__m512 sum337 = sum336;
__m512 sum338 = sum336;
__m512 sum339 = sum336;
__m512 sum341 = sum340;
__m512 sum342 = sum340;
__m512 sum343 = sum340;
__m512 sum345 = sum344;
__m512 sum346 = sum344;
__m512 sum347 = sum344;
__m512 sum349 = sum348;
__m512 sum350 = sum348;
__m512 sum351 = sum348;
for (s42 = 0; s42 < 512; ++s42) {
__m512 dat2027 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)0);
__m512 dat2028 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)64);
__m512 dat2029 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)128);
__m512 dat2030 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+256*s42+(ptrdiff_t)192);
__m512 wt445 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)16));
sum336 = _mm512_fmadd_ps(wt445, dat2027, sum336);
sum337 = _mm512_fmadd_ps(wt445, dat2028, sum337);
sum338 = _mm512_fmadd_ps(wt445, dat2029, sum338);
sum339 = _mm512_fmadd_ps(wt445, dat2030, sum339);
__m512 wt446 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)20));
sum340 = _mm512_fmadd_ps(wt446, dat2027, sum340);
sum341 = _mm512_fmadd_ps(wt446, dat2028, sum341);
sum342 = _mm512_fmadd_ps(wt446, dat2029, sum342);
sum343 = _mm512_fmadd_ps(wt446, dat2030, sum343);
__m512 wt447 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)24));
sum344 = _mm512_fmadd_ps(wt447, dat2027, sum344);
sum345 = _mm512_fmadd_ps(wt447, dat2028, sum345);
sum346 = _mm512_fmadd_ps(wt447, dat2029, sum346);
sum347 = _mm512_fmadd_ps(wt447, dat2030, sum347);
__m512 wt448 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k127+16*s42+(ptrdiff_t)28));
sum348 = _mm512_fmadd_ps(wt448, dat2027, sum348);
sum349 = _mm512_fmadd_ps(wt448, dat2028, sum349);
sum350 = _mm512_fmadd_ps(wt448, dat2029, sum350);
sum351 = _mm512_fmadd_ps(wt448, dat2030, sum351);
}
sum336 = _mm512_max_ps(_mm512_setzero_ps(), sum336);
sum337 = _mm512_max_ps(_mm512_setzero_ps(), sum337);
sum338 = _mm512_max_ps(_mm512_setzero_ps(), sum338);
sum339 = _mm512_max_ps(_mm512_setzero_ps(), sum339);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)0, 65535, sum336);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)64, 65535, sum337);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)128, 65535, sum338);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)192, 65535, sum339);
sum340 = _mm512_max_ps(_mm512_setzero_ps(), sum340);
sum341 = _mm512_max_ps(_mm512_setzero_ps(), sum341);
sum342 = _mm512_max_ps(_mm512_setzero_ps(), sum342);
sum343 = _mm512_max_ps(_mm512_setzero_ps(), sum343);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3136, 65535, sum340);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3200, 65535, sum341);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3264, 65535, sum342);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)3328, 65535, sum343);
sum344 = _mm512_max_ps(_mm512_setzero_ps(), sum344);
sum345 = _mm512_max_ps(_mm512_setzero_ps(), sum345);
sum346 = _mm512_max_ps(_mm512_setzero_ps(), sum346);
sum347 = _mm512_max_ps(_mm512_setzero_ps(), sum347);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6272, 65535, sum344);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6336, 65535, sum345);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6400, 65535, sum346);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)6464, 65535, sum347);
sum348 = _mm512_max_ps(_mm512_setzero_ps(), sum348);
sum349 = _mm512_max_ps(_mm512_setzero_ps(), sum349);
sum350 = _mm512_max_ps(_mm512_setzero_ps(), sum350);
sum351 = _mm512_max_ps(_mm512_setzero_ps(), sum351);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9408, 65535, sum348);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9472, 65535, sum349);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9536, 65535, sum350);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k127+(ptrdiff_t)9600, 65535, sum351);
// Always true on the first pass (jj43 is the starting j40): one column block per call.
if (j40 >= jj43) return;
}
// ---- Column block 12 (j40 == 12 here): one data vector per step, 64-byte stride. ----
ptrdiff_t k128 = 1*w55;
ptrdiff_t kk44 = k128+0;
// -- Weight groups 0..41: 6 planes x 1 vector. --
for (; k128 != 42; ++k128) {
ptrdiff_t s43 = -1;
__m512 sum352 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)24));
__m512 sum353 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)28));
__m512 sum354 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)32));
__m512 sum355 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)36));
__m512 sum356 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)40));
__m512 sum357 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)44));
for (s43 = 0; s43 < 512; ++s43) {
__m512 dat2031 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+64*s43+(ptrdiff_t)0);
__m512 wt449 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)24));
sum352 = _mm512_fmadd_ps(wt449, dat2031, sum352);
__m512 wt450 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)28));
sum353 = _mm512_fmadd_ps(wt450, dat2031, sum353);
__m512 wt451 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)32));
sum354 = _mm512_fmadd_ps(wt451, dat2031, sum354);
__m512 wt452 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)36));
sum355 = _mm512_fmadd_ps(wt452, dat2031, sum355);
__m512 wt453 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)40));
sum356 = _mm512_fmadd_ps(wt453, dat2031, sum356);
__m512 wt454 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+24*s43+(ptrdiff_t)44));
sum357 = _mm512_fmadd_ps(wt454, dat2031, sum357);
}
sum352 = _mm512_max_ps(_mm512_setzero_ps(), sum352);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)0, 65535, sum352);
sum353 = _mm512_max_ps(_mm512_setzero_ps(), sum353);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)3136, 65535, sum353);
sum354 = _mm512_max_ps(_mm512_setzero_ps(), sum354);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)6272, 65535, sum354);
sum355 = _mm512_max_ps(_mm512_setzero_ps(), sum355);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)9408, 65535, sum355);
sum356 = _mm512_max_ps(_mm512_setzero_ps(), sum356);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)12544, 65535, sum356);
sum357 = _mm512_max_ps(_mm512_setzero_ps(), sum357);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)15680, 65535, sum357);
// Always true on the first pass: one group per call.
if (k128 >= kk44) return;
}
// -- Weight group 42 (k128 == 42 here): 4 planes x 1 vector, the corner tile. --
ptrdiff_t s44 = -1;
__m512 sum358 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)16));
__m512 sum359 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)20));
__m512 sum360 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)24));
__m512 sum361 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)28));
for (s44 = 0; s44 < 512; ++s44) {
__m512 dat2032 = _mm512_loadu_ps(arrangedDats7+1605632*i47+131072*j40+64*s44+(ptrdiff_t)0);
__m512 wt455 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)16));
sum358 = _mm512_fmadd_ps(wt455, dat2032, sum358);
__m512 wt456 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)20));
sum359 = _mm512_fmadd_ps(wt456, dat2032, sum359);
__m512 wt457 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)24));
sum360 = _mm512_fmadd_ps(wt457, dat2032, sum360);
__m512 wt458 = _mm512_set1_ps(*(float*)(arrangedWts7+525312*i47+12312*k128+16*s44+(ptrdiff_t)28));
sum361 = _mm512_fmadd_ps(wt458, dat2032, sum361);
}
sum358 = _mm512_max_ps(_mm512_setzero_ps(), sum358);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)0, 65535, sum358);
sum359 = _mm512_max_ps(_mm512_setzero_ps(), sum359);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)3136, 65535, sum359);
sum360 = _mm512_max_ps(_mm512_setzero_ps(), sum360);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)6272, 65535, sum360);
sum361 = _mm512_max_ps(_mm512_setzero_ps(), sum361);
_mm512_mask_storeu_ps(datPtr24+802816*i47+256*j40+18816*k128+(ptrdiff_t)9408, 65535, sum361);
}
}

// Dispatches ResNeXt50OneApply7Callee1 across the thread team over a
// three-dimensional 43 x 13 x 1 index hull. The callee receives a two-entry
// pointer array through `any1`: entry 0 is `tensors75`, entry 1 is null.
static void ResNeXt50OneApply7(ResNeXt50ThreaderTeam1* team51, char** tensors75) {
	// Lives on this stack frame for the duration of the ResNeXt50ThreaderDo1 call.
	void* args[] = {tensors75, 0};
	ResNeXt50ThreaderTask1 job;
	// Index space.
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 13;
	job.hull1[0] = 43;
	// Work function and its opaque argument.
	job.any1 = args;
	job.callee1 = ResNeXt50OneApply7Callee1;
	ResNeXt50ThreaderDo1(team51, &job);
}

static void ResNeXt50OneArrangeWts8Callee1(ResNeXt50ThreaderTask1* task88, int64_t* pt49) {
char** tensors86 = task88->any1;
ptrdiff_t b74 = pt49[0];
char*restrict wtPtr14 = tensors86[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr14 = tensors86[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr15 = tensors86[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged15 = tensors86[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)2101248*0;
ptrdiff_t ii38 = 1;
for (ptrdiff_t i53 = 0; i53 < ii38; ++i53) {
ptrdiff_t j45 = 1*b74;
ptrdiff_t jj45 = j45+1;
for (; j45 < jj45; ++j45) {
if (j45 < 63) {
ptrdiff_t k142 = 0+16*(j45-0);
ptrdiff_t l61 = (size_t)(0+k142)/6;
ptrdiff_t cut19 = (size_t)(0+k142)%6;
switch (cut19) {
case 0:;
case 2: {
__m512 sum391 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k142);
__m512i pmMul28 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd28 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo23 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1024*i53));
__m512 masHi23 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1024*i53)+(ptrdiff_t)64);
__m512 postMul45 = _mm512_permutex2var_ps(masLo23, pmMul28, masHi23);
__m512 postAdd29 = _mm512_permutex2var_ps(masLo23, pmAdd28, masHi23);
sum391 = _mm512_fmadd_ps(sum391, postMul45, postAdd29);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum391);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum391);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)24576, 65535-(4095>>cut19), sum391);
ptrdiff_t c41 = 0;
for (; c41 != 32; ++c41) {
__m512 wt479 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)0);
__m512 wt480 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)2048);
__m512 wt481 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)4096);
__m512 wt482 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)6144);
__m512 wt483 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)8192);
__m512 wt484 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)10240);
__m512 wt485 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)12288);
__m512 wt486 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)14336);
__m512 wt487 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)16384);
__m512 wt488 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)18432);
__m512 wt489 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)20480);
__m512 wt490 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)22528);
__m512 wt491 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)24576);
__m512 wt492 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)26624);
__m512 wt493 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)28672);
__m512 wt494 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c41+(ptrdiff_t)30720);
__m512 tmp13699 = _mm512_unpacklo_ps(wt479, wt480);
__m512 tmp13700 = _mm512_unpackhi_ps(wt479, wt480);
__m512 tmp13701 = _mm512_unpacklo_ps(wt481, wt482);
__m512 tmp13702 = _mm512_unpackhi_ps(wt481, wt482);
__m512 tmp13703 = _mm512_unpacklo_ps(wt483, wt484);
__m512 tmp13704 = _mm512_unpackhi_ps(wt483, wt484);
__m512 tmp13705 = _mm512_unpacklo_ps(wt485, wt486);
__m512 tmp13706 = _mm512_unpackhi_ps(wt485, wt486);
__m512 tmp13707 = _mm512_unpacklo_ps(wt487, wt488);
__m512 tmp13708 = _mm512_unpackhi_ps(wt487, wt488);
__m512 tmp13709 = _mm512_unpacklo_ps(wt489, wt490);
__m512 tmp13710 = _mm512_unpackhi_ps(wt489, wt490);
__m512 tmp13711 = _mm512_unpacklo_ps(wt491, wt492);
__m512 tmp13712 = _mm512_unpackhi_ps(wt491, wt492);
__m512 tmp13713 = _mm512_unpacklo_ps(wt493, wt494);
__m512 tmp13714 = _mm512_unpackhi_ps(wt493, wt494);
__m512 tmp13715 = _mm512_shuffle_ps(tmp13699, tmp13701, 68);
__m512 tmp13716 = _mm512_shuffle_ps(tmp13699, tmp13701, 238);
__m512 tmp13717 = _mm512_shuffle_ps(tmp13700, tmp13702, 68);
__m512 tmp13718 = _mm512_shuffle_ps(tmp13700, tmp13702, 238);
__m512 tmp13719 = _mm512_shuffle_ps(tmp13703, tmp13705, 68);
__m512 tmp13720 = _mm512_shuffle_ps(tmp13703, tmp13705, 238);
__m512 tmp13721 = _mm512_shuffle_ps(tmp13704, tmp13706, 68);
__m512 tmp13722 = _mm512_shuffle_ps(tmp13704, tmp13706, 238);
__m512 tmp13723 = _mm512_shuffle_ps(tmp13707, tmp13709, 68);
__m512 tmp13724 = _mm512_shuffle_ps(tmp13707, tmp13709, 238);
__m512 tmp13725 = _mm512_shuffle_ps(tmp13708, tmp13710, 68);
__m512 tmp13726 = _mm512_shuffle_ps(tmp13708, tmp13710, 238);
__m512 tmp13727 = _mm512_shuffle_ps(tmp13711, tmp13713, 68);
__m512 tmp13728 = _mm512_shuffle_ps(tmp13711, tmp13713, 238);
__m512 tmp13729 = _mm512_shuffle_ps(tmp13712, tmp13714, 68);
__m512 tmp13730 = _mm512_shuffle_ps(tmp13712, tmp13714, 238);
__m512 tmp13731 = _mm512_shuffle_f32x4(tmp13715, tmp13719, 136);
__m512 tmp13732 = _mm512_shuffle_f32x4(tmp13715, tmp13719, 221);
__m512 tmp13733 = _mm512_shuffle_f32x4(tmp13716, tmp13720, 136);
__m512 tmp13734 = _mm512_shuffle_f32x4(tmp13716, tmp13720, 221);
__m512 tmp13735 = _mm512_shuffle_f32x4(tmp13717, tmp13721, 136);
__m512 tmp13736 = _mm512_shuffle_f32x4(tmp13717, tmp13721, 221);
__m512 tmp13737 = _mm512_shuffle_f32x4(tmp13718, tmp13722, 136);
__m512 tmp13738 = _mm512_shuffle_f32x4(tmp13718, tmp13722, 221);
__m512 tmp13739 = _mm512_shuffle_f32x4(tmp13723, tmp13727, 136);
__m512 tmp13740 = _mm512_shuffle_f32x4(tmp13723, tmp13727, 221);
__m512 tmp13741 = _mm512_shuffle_f32x4(tmp13724, tmp13728, 136);
__m512 tmp13742 = _mm512_shuffle_f32x4(tmp13724, tmp13728, 221);
__m512 tmp13743 = _mm512_shuffle_f32x4(tmp13725, tmp13729, 136);
__m512 tmp13744 = _mm512_shuffle_f32x4(tmp13725, tmp13729, 221);
__m512 tmp13745 = _mm512_shuffle_f32x4(tmp13726, tmp13730, 136);
__m512 tmp13746 = _mm512_shuffle_f32x4(tmp13726, tmp13730, 221);
wt479 = _mm512_shuffle_f32x4(tmp13731, tmp13739, 136);
wt487 = _mm512_shuffle_f32x4(tmp13731, tmp13739, 221);
wt480 = _mm512_shuffle_f32x4(tmp13733, tmp13741, 136);
wt488 = _mm512_shuffle_f32x4(tmp13733, tmp13741, 221);
wt481 = _mm512_shuffle_f32x4(tmp13735, tmp13743, 136);
wt489 = _mm512_shuffle_f32x4(tmp13735, tmp13743, 221);
wt482 = _mm512_shuffle_f32x4(tmp13737, tmp13745, 136);
wt490 = _mm512_shuffle_f32x4(tmp13737, tmp13745, 221);
wt483 = _mm512_shuffle_f32x4(tmp13732, tmp13740, 136);
wt491 = _mm512_shuffle_f32x4(tmp13732, tmp13740, 221);
wt484 = _mm512_shuffle_f32x4(tmp13734, tmp13742, 136);
wt492 = _mm512_shuffle_f32x4(tmp13734, tmp13742, 221);
wt485 = _mm512_shuffle_f32x4(tmp13736, tmp13744, 136);
wt493 = _mm512_shuffle_f32x4(tmp13736, tmp13744, 221);
wt486 = _mm512_shuffle_f32x4(tmp13738, tmp13746, 136);
wt494 = _mm512_shuffle_f32x4(tmp13738, tmp13746, 221);
wt479 = _mm512_mul_ps(wt479, postMul45);
wt480 = _mm512_mul_ps(wt480, postMul45);
wt481 = _mm512_mul_ps(wt481, postMul45);
wt482 = _mm512_mul_ps(wt482, postMul45);
wt483 = _mm512_mul_ps(wt483, postMul45);
wt484 = _mm512_mul_ps(wt484, postMul45);
wt485 = _mm512_mul_ps(wt485, postMul45);
wt486 = _mm512_mul_ps(wt486, postMul45);
wt487 = _mm512_mul_ps(wt487, postMul45);
wt488 = _mm512_mul_ps(wt488, postMul45);
wt489 = _mm512_mul_ps(wt489, postMul45);
wt490 = _mm512_mul_ps(wt490, postMul45);
wt491 = _mm512_mul_ps(wt491, postMul45);
wt492 = _mm512_mul_ps(wt492, postMul45);
wt493 = _mm512_mul_ps(wt493, postMul45);
wt494 = _mm512_mul_ps(wt494, postMul45);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)0, 63>>cut19, wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)0, 63>>cut19, wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)0, 63>>cut19, wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)0, 63>>cut19, wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)0, 63>>cut19, wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)0, 63>>cut19, wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)0, 63>>cut19, wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)0, 63>>cut19, wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)0, 63>>cut19, wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)0, 63>>cut19, wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)0, 63>>cut19, wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)0, 63>>cut19, wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)0, 63>>cut19, wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)0, 63>>cut19, wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)0, 63>>cut19, wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)0, 63>>cut19, wt494);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)12288, 4032>>cut19, wt494);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt479);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt480);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt481);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt482);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt483);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt484);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt485);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt486);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt487);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt488);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt489);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt490);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt491);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt492);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt493);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c41)+(ptrdiff_t)24576, 65535-(4095>>cut19), wt494);
}
break;
}
default: {
cut19 = 4;
__m512 sum392 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k142);
__m512i pmMul29 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd29 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo24 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k142+1024*i53));
__m512 masHi24 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k142+1024*i53)+(ptrdiff_t)64);
__m512 postMul46 = _mm512_permutex2var_ps(masLo24, pmMul29, masHi24);
__m512 postAdd30 = _mm512_permutex2var_ps(masLo24, pmAdd29, masHi24);
sum392 = _mm512_fmadd_ps(sum392, postMul46, postAdd30);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)0, 63>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)12288, 4032>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)24576, 258048>>cut19, sum392);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*0+(ptrdiff_t)36864, 65535-(262143>>cut19), sum392);
ptrdiff_t c42 = 0;
for (; c42 != 32; ++c42) {
__m512 wt495 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)0);
__m512 wt496 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)2048);
__m512 wt497 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)4096);
__m512 wt498 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)6144);
__m512 wt499 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)8192);
__m512 wt500 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)10240);
__m512 wt501 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)12288);
__m512 wt502 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)14336);
__m512 wt503 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)16384);
__m512 wt504 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)18432);
__m512 wt505 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)20480);
__m512 wt506 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)22528);
__m512 wt507 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)24576);
__m512 wt508 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)26624);
__m512 wt509 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)28672);
__m512 wt510 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k142+64*c42+(ptrdiff_t)30720);
__m512 tmp13747 = _mm512_unpacklo_ps(wt495, wt496);
__m512 tmp13748 = _mm512_unpackhi_ps(wt495, wt496);
__m512 tmp13749 = _mm512_unpacklo_ps(wt497, wt498);
__m512 tmp13750 = _mm512_unpackhi_ps(wt497, wt498);
__m512 tmp13751 = _mm512_unpacklo_ps(wt499, wt500);
__m512 tmp13752 = _mm512_unpackhi_ps(wt499, wt500);
__m512 tmp13753 = _mm512_unpacklo_ps(wt501, wt502);
__m512 tmp13754 = _mm512_unpackhi_ps(wt501, wt502);
__m512 tmp13755 = _mm512_unpacklo_ps(wt503, wt504);
__m512 tmp13756 = _mm512_unpackhi_ps(wt503, wt504);
__m512 tmp13757 = _mm512_unpacklo_ps(wt505, wt506);
__m512 tmp13758 = _mm512_unpackhi_ps(wt505, wt506);
__m512 tmp13759 = _mm512_unpacklo_ps(wt507, wt508);
__m512 tmp13760 = _mm512_unpackhi_ps(wt507, wt508);
__m512 tmp13761 = _mm512_unpacklo_ps(wt509, wt510);
__m512 tmp13762 = _mm512_unpackhi_ps(wt509, wt510);
__m512 tmp13763 = _mm512_shuffle_ps(tmp13747, tmp13749, 68);
__m512 tmp13764 = _mm512_shuffle_ps(tmp13747, tmp13749, 238);
__m512 tmp13765 = _mm512_shuffle_ps(tmp13748, tmp13750, 68);
__m512 tmp13766 = _mm512_shuffle_ps(tmp13748, tmp13750, 238);
__m512 tmp13767 = _mm512_shuffle_ps(tmp13751, tmp13753, 68);
__m512 tmp13768 = _mm512_shuffle_ps(tmp13751, tmp13753, 238);
__m512 tmp13769 = _mm512_shuffle_ps(tmp13752, tmp13754, 68);
__m512 tmp13770 = _mm512_shuffle_ps(tmp13752, tmp13754, 238);
__m512 tmp13771 = _mm512_shuffle_ps(tmp13755, tmp13757, 68);
__m512 tmp13772 = _mm512_shuffle_ps(tmp13755, tmp13757, 238);
__m512 tmp13773 = _mm512_shuffle_ps(tmp13756, tmp13758, 68);
__m512 tmp13774 = _mm512_shuffle_ps(tmp13756, tmp13758, 238);
__m512 tmp13775 = _mm512_shuffle_ps(tmp13759, tmp13761, 68);
__m512 tmp13776 = _mm512_shuffle_ps(tmp13759, tmp13761, 238);
__m512 tmp13777 = _mm512_shuffle_ps(tmp13760, tmp13762, 68);
__m512 tmp13778 = _mm512_shuffle_ps(tmp13760, tmp13762, 238);
__m512 tmp13779 = _mm512_shuffle_f32x4(tmp13763, tmp13767, 136);
__m512 tmp13780 = _mm512_shuffle_f32x4(tmp13763, tmp13767, 221);
__m512 tmp13781 = _mm512_shuffle_f32x4(tmp13764, tmp13768, 136);
__m512 tmp13782 = _mm512_shuffle_f32x4(tmp13764, tmp13768, 221);
__m512 tmp13783 = _mm512_shuffle_f32x4(tmp13765, tmp13769, 136);
__m512 tmp13784 = _mm512_shuffle_f32x4(tmp13765, tmp13769, 221);
__m512 tmp13785 = _mm512_shuffle_f32x4(tmp13766, tmp13770, 136);
__m512 tmp13786 = _mm512_shuffle_f32x4(tmp13766, tmp13770, 221);
__m512 tmp13787 = _mm512_shuffle_f32x4(tmp13771, tmp13775, 136);
__m512 tmp13788 = _mm512_shuffle_f32x4(tmp13771, tmp13775, 221);
__m512 tmp13789 = _mm512_shuffle_f32x4(tmp13772, tmp13776, 136);
__m512 tmp13790 = _mm512_shuffle_f32x4(tmp13772, tmp13776, 221);
__m512 tmp13791 = _mm512_shuffle_f32x4(tmp13773, tmp13777, 136);
__m512 tmp13792 = _mm512_shuffle_f32x4(tmp13773, tmp13777, 221);
__m512 tmp13793 = _mm512_shuffle_f32x4(tmp13774, tmp13778, 136);
__m512 tmp13794 = _mm512_shuffle_f32x4(tmp13774, tmp13778, 221);
wt495 = _mm512_shuffle_f32x4(tmp13779, tmp13787, 136);
wt503 = _mm512_shuffle_f32x4(tmp13779, tmp13787, 221);
wt496 = _mm512_shuffle_f32x4(tmp13781, tmp13789, 136);
wt504 = _mm512_shuffle_f32x4(tmp13781, tmp13789, 221);
wt497 = _mm512_shuffle_f32x4(tmp13783, tmp13791, 136);
wt505 = _mm512_shuffle_f32x4(tmp13783, tmp13791, 221);
wt498 = _mm512_shuffle_f32x4(tmp13785, tmp13793, 136);
wt506 = _mm512_shuffle_f32x4(tmp13785, tmp13793, 221);
wt499 = _mm512_shuffle_f32x4(tmp13780, tmp13788, 136);
wt507 = _mm512_shuffle_f32x4(tmp13780, tmp13788, 221);
wt500 = _mm512_shuffle_f32x4(tmp13782, tmp13790, 136);
wt508 = _mm512_shuffle_f32x4(tmp13782, tmp13790, 221);
wt501 = _mm512_shuffle_f32x4(tmp13784, tmp13792, 136);
wt509 = _mm512_shuffle_f32x4(tmp13784, tmp13792, 221);
wt502 = _mm512_shuffle_f32x4(tmp13786, tmp13794, 136);
wt510 = _mm512_shuffle_f32x4(tmp13786, tmp13794, 221);
wt495 = _mm512_mul_ps(wt495, postMul46);
wt496 = _mm512_mul_ps(wt496, postMul46);
wt497 = _mm512_mul_ps(wt497, postMul46);
wt498 = _mm512_mul_ps(wt498, postMul46);
wt499 = _mm512_mul_ps(wt499, postMul46);
wt500 = _mm512_mul_ps(wt500, postMul46);
wt501 = _mm512_mul_ps(wt501, postMul46);
wt502 = _mm512_mul_ps(wt502, postMul46);
wt503 = _mm512_mul_ps(wt503, postMul46);
wt504 = _mm512_mul_ps(wt504, postMul46);
wt505 = _mm512_mul_ps(wt505, postMul46);
wt506 = _mm512_mul_ps(wt506, postMul46);
wt507 = _mm512_mul_ps(wt507, postMul46);
wt508 = _mm512_mul_ps(wt508, postMul46);
wt509 = _mm512_mul_ps(wt509, postMul46);
wt510 = _mm512_mul_ps(wt510, postMul46);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)0, 63>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)0, 63>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)0, 63>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)0, 63>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)0, 63>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)0, 63>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)0, 63>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)0, 63>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)0, 63>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)0, 63>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)0, 63>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)0, 63>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)0, 63>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)0, 63>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)0, 63>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)0, 63>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)12288, 4032>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)24576, 258048>>cut19, wt510);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(1+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt495);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(2+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt496);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(3+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt497);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(4+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt498);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(5+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt499);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(6+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt500);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(7+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt501);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(8+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt502);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(9+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt503);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(10+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt504);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(11+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt505);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(12+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt506);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(13+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt507);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(14+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt508);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(15+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt509);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l61+4*cut19+24*(16+16*c42)+(ptrdiff_t)36864, 65535-(262143>>cut19), wt510);
}
}
}
} else {
ptrdiff_t k141 = 1008;
ptrdiff_t l60 = (size_t)(0+k141)/6;
ptrdiff_t cut18 = (size_t)(0+k141)%6;
__m512 sum390 = _mm512_maskz_loadu_ps(65535, biasPtr14+4096*i53+4*k141);
__m512i pmMul30 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd30 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo25 = _mm512_loadu_ps(bnPtr15+(ptrdiff_t)8*(k141+1024*i53));
__m512 masHi25 = _mm512_maskz_loadu_ps(65535, bnPtr15+(ptrdiff_t)8*(k141+1024*i53)+(ptrdiff_t)64);
__m512 postMul44 = _mm512_permutex2var_ps(masLo25, pmMul30, masHi25);
__m512 postAdd28 = _mm512_permutex2var_ps(masLo25, pmAdd30, masHi25);
sum390 = _mm512_fmadd_ps(sum390, postMul44, postAdd28);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*0+(ptrdiff_t)0, 63>>cut18, sum390);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*0+(ptrdiff_t)12288, 4032>>cut18, sum390);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*0+(ptrdiff_t)24576, 65535-(4095>>cut18), sum390);
ptrdiff_t c40 = 0;
for (; c40 != 32; ++c40) {
__m512 wt463 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)0);
__m512 wt464 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)2048);
__m512 wt465 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)4096);
__m512 wt466 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)6144);
__m512 wt467 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)8192);
__m512 wt468 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)10240);
__m512 wt469 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)12288);
__m512 wt470 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)14336);
__m512 wt471 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)16384);
__m512 wt472 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)18432);
__m512 wt473 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)20480);
__m512 wt474 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)22528);
__m512 wt475 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)24576);
__m512 wt476 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)26624);
__m512 wt477 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)28672);
__m512 wt478 = _mm512_maskz_loadu_ps(65535, wtPtr14+2097152*i53+2048*k141+64*c40+(ptrdiff_t)30720);
__m512 tmp13795 = _mm512_unpacklo_ps(wt463, wt464);
__m512 tmp13796 = _mm512_unpackhi_ps(wt463, wt464);
__m512 tmp13797 = _mm512_unpacklo_ps(wt465, wt466);
__m512 tmp13798 = _mm512_unpackhi_ps(wt465, wt466);
__m512 tmp13799 = _mm512_unpacklo_ps(wt467, wt468);
__m512 tmp13800 = _mm512_unpackhi_ps(wt467, wt468);
__m512 tmp13801 = _mm512_unpacklo_ps(wt469, wt470);
__m512 tmp13802 = _mm512_unpackhi_ps(wt469, wt470);
__m512 tmp13803 = _mm512_unpacklo_ps(wt471, wt472);
__m512 tmp13804 = _mm512_unpackhi_ps(wt471, wt472);
__m512 tmp13805 = _mm512_unpacklo_ps(wt473, wt474);
__m512 tmp13806 = _mm512_unpackhi_ps(wt473, wt474);
__m512 tmp13807 = _mm512_unpacklo_ps(wt475, wt476);
__m512 tmp13808 = _mm512_unpackhi_ps(wt475, wt476);
__m512 tmp13809 = _mm512_unpacklo_ps(wt477, wt478);
__m512 tmp13810 = _mm512_unpackhi_ps(wt477, wt478);
__m512 tmp13811 = _mm512_shuffle_ps(tmp13795, tmp13797, 68);
__m512 tmp13812 = _mm512_shuffle_ps(tmp13795, tmp13797, 238);
__m512 tmp13813 = _mm512_shuffle_ps(tmp13796, tmp13798, 68);
__m512 tmp13814 = _mm512_shuffle_ps(tmp13796, tmp13798, 238);
__m512 tmp13815 = _mm512_shuffle_ps(tmp13799, tmp13801, 68);
__m512 tmp13816 = _mm512_shuffle_ps(tmp13799, tmp13801, 238);
__m512 tmp13817 = _mm512_shuffle_ps(tmp13800, tmp13802, 68);
__m512 tmp13818 = _mm512_shuffle_ps(tmp13800, tmp13802, 238);
__m512 tmp13819 = _mm512_shuffle_ps(tmp13803, tmp13805, 68);
__m512 tmp13820 = _mm512_shuffle_ps(tmp13803, tmp13805, 238);
__m512 tmp13821 = _mm512_shuffle_ps(tmp13804, tmp13806, 68);
__m512 tmp13822 = _mm512_shuffle_ps(tmp13804, tmp13806, 238);
__m512 tmp13823 = _mm512_shuffle_ps(tmp13807, tmp13809, 68);
__m512 tmp13824 = _mm512_shuffle_ps(tmp13807, tmp13809, 238);
__m512 tmp13825 = _mm512_shuffle_ps(tmp13808, tmp13810, 68);
__m512 tmp13826 = _mm512_shuffle_ps(tmp13808, tmp13810, 238);
__m512 tmp13827 = _mm512_shuffle_f32x4(tmp13811, tmp13815, 136);
__m512 tmp13828 = _mm512_shuffle_f32x4(tmp13811, tmp13815, 221);
__m512 tmp13829 = _mm512_shuffle_f32x4(tmp13812, tmp13816, 136);
__m512 tmp13830 = _mm512_shuffle_f32x4(tmp13812, tmp13816, 221);
__m512 tmp13831 = _mm512_shuffle_f32x4(tmp13813, tmp13817, 136);
__m512 tmp13832 = _mm512_shuffle_f32x4(tmp13813, tmp13817, 221);
__m512 tmp13833 = _mm512_shuffle_f32x4(tmp13814, tmp13818, 136);
__m512 tmp13834 = _mm512_shuffle_f32x4(tmp13814, tmp13818, 221);
__m512 tmp13835 = _mm512_shuffle_f32x4(tmp13819, tmp13823, 136);
__m512 tmp13836 = _mm512_shuffle_f32x4(tmp13819, tmp13823, 221);
__m512 tmp13837 = _mm512_shuffle_f32x4(tmp13820, tmp13824, 136);
__m512 tmp13838 = _mm512_shuffle_f32x4(tmp13820, tmp13824, 221);
__m512 tmp13839 = _mm512_shuffle_f32x4(tmp13821, tmp13825, 136);
__m512 tmp13840 = _mm512_shuffle_f32x4(tmp13821, tmp13825, 221);
__m512 tmp13841 = _mm512_shuffle_f32x4(tmp13822, tmp13826, 136);
__m512 tmp13842 = _mm512_shuffle_f32x4(tmp13822, tmp13826, 221);
wt463 = _mm512_shuffle_f32x4(tmp13827, tmp13835, 136);
wt471 = _mm512_shuffle_f32x4(tmp13827, tmp13835, 221);
wt464 = _mm512_shuffle_f32x4(tmp13829, tmp13837, 136);
wt472 = _mm512_shuffle_f32x4(tmp13829, tmp13837, 221);
wt465 = _mm512_shuffle_f32x4(tmp13831, tmp13839, 136);
wt473 = _mm512_shuffle_f32x4(tmp13831, tmp13839, 221);
wt466 = _mm512_shuffle_f32x4(tmp13833, tmp13841, 136);
wt474 = _mm512_shuffle_f32x4(tmp13833, tmp13841, 221);
wt467 = _mm512_shuffle_f32x4(tmp13828, tmp13836, 136);
wt475 = _mm512_shuffle_f32x4(tmp13828, tmp13836, 221);
wt468 = _mm512_shuffle_f32x4(tmp13830, tmp13838, 136);
wt476 = _mm512_shuffle_f32x4(tmp13830, tmp13838, 221);
wt469 = _mm512_shuffle_f32x4(tmp13832, tmp13840, 136);
wt477 = _mm512_shuffle_f32x4(tmp13832, tmp13840, 221);
wt470 = _mm512_shuffle_f32x4(tmp13834, tmp13842, 136);
wt478 = _mm512_shuffle_f32x4(tmp13834, tmp13842, 221);
wt463 = _mm512_mul_ps(wt463, postMul44);
wt464 = _mm512_mul_ps(wt464, postMul44);
wt465 = _mm512_mul_ps(wt465, postMul44);
wt466 = _mm512_mul_ps(wt466, postMul44);
wt467 = _mm512_mul_ps(wt467, postMul44);
wt468 = _mm512_mul_ps(wt468, postMul44);
wt469 = _mm512_mul_ps(wt469, postMul44);
wt470 = _mm512_mul_ps(wt470, postMul44);
wt471 = _mm512_mul_ps(wt471, postMul44);
wt472 = _mm512_mul_ps(wt472, postMul44);
wt473 = _mm512_mul_ps(wt473, postMul44);
wt474 = _mm512_mul_ps(wt474, postMul44);
wt475 = _mm512_mul_ps(wt475, postMul44);
wt476 = _mm512_mul_ps(wt476, postMul44);
wt477 = _mm512_mul_ps(wt477, postMul44);
wt478 = _mm512_mul_ps(wt478, postMul44);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(1+16*c40)+(ptrdiff_t)0, 63>>cut18, wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(2+16*c40)+(ptrdiff_t)0, 63>>cut18, wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(3+16*c40)+(ptrdiff_t)0, 63>>cut18, wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(4+16*c40)+(ptrdiff_t)0, 63>>cut18, wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(5+16*c40)+(ptrdiff_t)0, 63>>cut18, wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(6+16*c40)+(ptrdiff_t)0, 63>>cut18, wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(7+16*c40)+(ptrdiff_t)0, 63>>cut18, wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(8+16*c40)+(ptrdiff_t)0, 63>>cut18, wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(9+16*c40)+(ptrdiff_t)0, 63>>cut18, wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(10+16*c40)+(ptrdiff_t)0, 63>>cut18, wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(11+16*c40)+(ptrdiff_t)0, 63>>cut18, wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(12+16*c40)+(ptrdiff_t)0, 63>>cut18, wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(13+16*c40)+(ptrdiff_t)0, 63>>cut18, wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(14+16*c40)+(ptrdiff_t)0, 63>>cut18, wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(15+16*c40)+(ptrdiff_t)0, 63>>cut18, wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(16+16*c40)+(ptrdiff_t)0, 63>>cut18, wt478);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(1+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(2+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(3+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(4+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(5+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(6+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(7+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(8+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(9+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(10+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(11+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(12+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(13+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(14+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(15+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+24*(16+16*c40)+(ptrdiff_t)12288, 4032>>cut18, wt478);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(1+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt463);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(2+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt464);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(3+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt465);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(4+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt466);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(5+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt467);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(6+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt468);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(7+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt469);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(8+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt470);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(9+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt471);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(10+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt472);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(11+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt473);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(12+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt474);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(13+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt475);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(14+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt476);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(15+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt477);
_mm512_mask_storeu_ps(arranged15+2101248*i53+12312*l60+4*cut18+16*(16+16*c40)+(ptrdiff_t)24576, 65535-(4095>>cut18), wt478);
}
}
}
}
}

// Builds a threader task that runs ResNeXt50OneArrangeWts8Callee1 and hands it
// to ResNeXt50ThreaderDo1 on the given team. The tensor pointer array is passed
// through unchanged as the task's opaque argument. The task is described with
// nd1 = 3 and hull1 = {64, 1, 1} (presumably the extents of the index space the
// threader iterates over -- confirm against ResNeXt50ThreaderDo1).
static void ResNeXt50OneArrangeWts8(ResNeXt50ThreaderTeam1* team56, char** tensors85) {
ResNeXt50ThreaderTask1 arrangeJob;
// Opaque payload and entry point for the workers.
arrangeJob.any1 = tensors85;
arrangeJob.callee1 = ResNeXt50OneArrangeWts8Callee1;
// Three dimensions: 64 x 1 x 1.
arrangeJob.nd1 = 3;
arrangeJob.hull1[2] = 1;
arrangeJob.hull1[1] = 1;
arrangeJob.hull1[0] = 64;
ResNeXt50ThreaderDo1(team56, &arrangeJob);
}

// Worker callback that repacks a slice of the source tensor (tensors88[0]) into
// the "arranged" buffer (tensors88[1]).
//
// task90: threader task; task90->any1 is the char* array of tensor base pointers.
// pt50:   task coordinates. pt50[0] (s47) selects a run of 128 consecutive k
//         indices [128*s47, 128*s47+128); pt50[1] (c43) selects the j block.
//
// For every k, pairs of masked 512-bit loads are merged by a permute that keeps
// only the even-numbered floats of the 32-float concatenation, and the merged
// vector is written to the arranged buffer. Blocks j < 3 handle four such pairs
// per k (source offsets 0, 224, 448, 672 bytes); block j == 3 handles two.
// NOTE(review): the 112-byte h stride, the 224-byte step between pairs and the
// even-lane selection look like stride-2 subsampling of 28-float rows -- confirm
// against the graph description for this layer.
static void ResNeXt50OneArrangeDats8Callee1(ResNeXt50ThreaderTask1* task90, int64_t* pt50) {
char** tensors88 = task90->any1;
ptrdiff_t s47 = pt50[0];
ptrdiff_t c43 = pt50[1];
// All extra offsets below are multiplied by 0, so these are just the base pointers.
char*restrict datPtr27 = tensors88[0]+(ptrdiff_t)0+(ptrdiff_t)2618560*0+(ptrdiff_t)1605632*0;
char*restrict arranged16 = tensors88[1]+(ptrdiff_t)748160*0+(ptrdiff_t)458752*0;
// Single-iteration outer loop (ii39 is 1).
ptrdiff_t ii39 = 1;
for (ptrdiff_t i54 = 0; i54 < ii39; ++i54) {
// jj46 equals the starting j46, so the "j46 >= jj46" checks below end the work
// after exactly one j block has been processed.
ptrdiff_t j46 = 1*c43;
ptrdiff_t jj46 = j46+0;
if (j46 < 3) {
// Source row offset for this block: h48 = 8*j46, scaled by 112 bytes in the loads.
ptrdiff_t h48 = 0+((size_t)j46-0)/1*8;
switch (((size_t)j46-0)%1) {
default: {
wrap5:;
ptrdiff_t k143 = 128*s47;
ptrdiff_t kk45 = k143+128;
for (; k143 < kk45; ++k143) {
// Each pair: low 15 lanes (mask 32767) from the first 64 bytes and low 11 lanes
// (mask 2047) from the next 64 bytes; remaining lanes are zeroed by maskz.
__m512 dat2259 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)0);
__m512 dat2260 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)64);
// Indices 0,2,...,30: output lane i takes element 2*i of the (first, second) pair.
// Output lanes 14 and 15 come from zero-masked lanes of the second load.
__m512i pm201 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2261 = _mm512_permutex2var_ps(dat2259, pm201, dat2260);
__m512 dat2262 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)224);
__m512 dat2263 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)288);
__m512i pm202 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2264 = _mm512_permutex2var_ps(dat2262, pm202, dat2263);
__m512 dat2265 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)448);
__m512 dat2266 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)512);
__m512i pm203 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2267 = _mm512_permutex2var_ps(dat2265, pm203, dat2266);
__m512 dat2268 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)672);
__m512 dat2269 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*h48+3136*k143+(ptrdiff_t)736);
__m512i pm204 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2270 = _mm512_permutex2var_ps(dat2268, pm204, dat2269);
// Four packed vectors per k: 256 bytes per k, 131072 bytes per j block.
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+256*k143+(ptrdiff_t)0, dat2261);
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+256*k143+(ptrdiff_t)64, dat2264);
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+256*k143+(ptrdiff_t)128, dat2267);
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+256*k143+(ptrdiff_t)192, dat2270);
}
// With jj46 == starting j46 this jump is always taken; the advance-and-repeat
// code after it only matters if jj46 were larger.
if (j46 >= jj46) goto next8;
if (j46 >= 2) break;
++j46;
h48 += 8;
goto wrap5;
}
}
j46 = 3;
}
// Final block (j == 3): fixed source row offset 112*24 and only two pairs per k.
switch ((size_t)j46-3) {
default: {
j46 = 3;
ptrdiff_t k144 = 128*s47;
ptrdiff_t kk46 = k144+128;
for (; k144 < kk46; ++k144) {
__m512 dat2271 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*(ptrdiff_t)24+3136*k144+(ptrdiff_t)0);
__m512 dat2272 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*(ptrdiff_t)24+3136*k144+(ptrdiff_t)64);
__m512i pm205 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2273 = _mm512_permutex2var_ps(dat2271, pm205, dat2272);
__m512 dat2274 = _mm512_maskz_loadu_ps(32767, datPtr27+1605632*i54+112*(ptrdiff_t)24+3136*k144+(ptrdiff_t)224);
__m512 dat2275 = _mm512_maskz_loadu_ps(2047, datPtr27+1605632*i54+112*(ptrdiff_t)24+3136*k144+(ptrdiff_t)288);
__m512i pm206 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512 dat2276 = _mm512_permutex2var_ps(dat2274, pm206, dat2275);
// Two packed vectors per k here, hence the 128-byte k stride.
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+128*k144+(ptrdiff_t)0, dat2273);
_mm512_storeu_ps(arranged16+458752*i54+131072*j46+128*k144+(ptrdiff_t)64, dat2276);
}
if (j46 >= jj46) goto next8;
}
}
j46 = 4;
next8:;
}
}

// Builds a threader task that runs ResNeXt50OneArrangeDats8Callee1 and hands it
// to ResNeXt50ThreaderDo1 on the given team. The tensor pointer array is passed
// through unchanged as the task's opaque argument. The task is described with
// nd1 = 4 and hull1 = {4, 4, 1, 1} (presumably the extents of the index space
// the threader iterates over -- confirm against ResNeXt50ThreaderDo1).
static void ResNeXt50OneArrangeDats8(ResNeXt50ThreaderTeam1* team57, char** tensors87) {
ResNeXt50ThreaderTask1 arrangeJob;
// Opaque payload and entry point for the workers.
arrangeJob.any1 = tensors87;
arrangeJob.callee1 = ResNeXt50OneArrangeDats8Callee1;
// Four dimensions: 4 x 4 x 1 x 1.
arrangeJob.nd1 = 4;
arrangeJob.hull1[3] = 1;
arrangeJob.hull1[2] = 1;
arrangeJob.hull1[1] = 4;
arrangeJob.hull1[0] = 4;
ResNeXt50ThreaderDo1(team57, &arrangeJob);
}

// Worker callback that computes one output tile from the arranged weights and
// arranged data produced by the two "arrange" stages.
//
// task92: threader task; task92->any1 is a void* pair whose first entry is the
//         char* array of tensor base pointers (weights, data, output).
// pt51:   task coordinates. pt51[0] (w63) selects the weight group k;
//         pt51[1] (d18) selects the data block j.
//
// Weight groups are 12312 bytes apart. Groups k != 170 hold 6 floats per slot
// (24-byte slot stride); group k == 170 holds 4 floats per slot (16-byte slot
// stride). Accumulators start from slot 0 of the group (presumably the bias,
// possibly with batch-norm folded in -- confirm against the ArrangeWts8 callee)
// and then add 512 broadcast-weight * data-vector products, one per s.
// Results are written with mask 16383 (low 14 lanes) in 56-byte steps; each
// output row of a group is 832 bytes after the previous one.
static void ResNeXt50OneApply8Callee1(ResNeXt50ThreaderTask1* task92, int64_t* pt51) {
void** pair22 = task92->any1;
char** tensors90 = pair22[0];
// e26 and g29 are fixed at 0, so the three pointers below are the plain bases.
ptrdiff_t e26 = 0;
ptrdiff_t g29 = 0;
ptrdiff_t d18 = pt51[1];
ptrdiff_t w63 = pt51[0];
char*restrict arrangedWts8 = tensors90[0]+3424256*e26+(ptrdiff_t)2101248*1*g29;
char*restrict arrangedDats8 = tensors90[1]+748160*e26+(ptrdiff_t)458752*1*g29;
char*restrict datPtr28 = tensors90[2]+(ptrdiff_t)851968*1*g29;
// Single-iteration outer loop (ii40 is 1).
ptrdiff_t ii40 = 1;
for (ptrdiff_t i55 = 0; i55 < ii40; ++i55) {
// jj47 equals the starting j47, so exactly one j block is handled per call.
ptrdiff_t j47 = 1*d18;
ptrdiff_t jj47 = j47+0;
if (j47 < 3) {
// Blocks j < 3: four data vectors per s; output row offset h49 = 4*j47.
ptrdiff_t h49 = 0+((size_t)j47-0)/1*4;
switch (((size_t)j47-0)%1) {
default: {
wrap6:;
ptrdiff_t k145 = 1*w63;
ptrdiff_t kk47 = k145+0;
// Six-wide weight groups. kk47 equals the starting k145, so the return at the
// bottom of the body fires after the first iteration; the loop is skipped
// entirely only when w63 == 170.
for (; k145 != 170; ++k145) {
// s48 = -1 makes 24*s48+24 == 0: the accumulators are seeded from slot 0.
ptrdiff_t s48 = -1;
__m512 sum393 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)24));
__m512 sum397 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)28));
__m512 sum401 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)32));
__m512 sum405 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)36));
__m512 sum409 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)40));
__m512 sum413 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)44));
// Each of the 6 seeds is replicated across the 4 data vectors: 24 accumulators.
__m512 sum394 = sum393;
__m512 sum395 = sum393;
__m512 sum396 = sum393;
__m512 sum398 = sum397;
__m512 sum399 = sum397;
__m512 sum400 = sum397;
__m512 sum402 = sum401;
__m512 sum403 = sum401;
__m512 sum404 = sum401;
__m512 sum406 = sum405;
__m512 sum407 = sum405;
__m512 sum408 = sum405;
__m512 sum410 = sum409;
__m512 sum411 = sum409;
__m512 sum412 = sum409;
__m512 sum414 = sum413;
__m512 sum415 = sum413;
__m512 sum416 = sum413;
// Reduction over s: load 4 data vectors once, then for each of the 6 weights
// broadcast it and fused-multiply-add into its 4 accumulators.
for (s48 = 0; s48 < 512; ++s48) {
__m512 dat2277 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)0);
__m512 dat2278 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)64);
__m512 dat2279 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)128);
__m512 dat2280 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s48+(ptrdiff_t)192);
__m512 wt511 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)24));
sum393 = _mm512_fmadd_ps(wt511, dat2277, sum393);
sum394 = _mm512_fmadd_ps(wt511, dat2278, sum394);
sum395 = _mm512_fmadd_ps(wt511, dat2279, sum395);
sum396 = _mm512_fmadd_ps(wt511, dat2280, sum396);
__m512 wt512 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)28));
sum397 = _mm512_fmadd_ps(wt512, dat2277, sum397);
sum398 = _mm512_fmadd_ps(wt512, dat2278, sum398);
sum399 = _mm512_fmadd_ps(wt512, dat2279, sum399);
sum400 = _mm512_fmadd_ps(wt512, dat2280, sum400);
__m512 wt513 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)32));
sum401 = _mm512_fmadd_ps(wt513, dat2277, sum401);
sum402 = _mm512_fmadd_ps(wt513, dat2278, sum402);
sum403 = _mm512_fmadd_ps(wt513, dat2279, sum403);
sum404 = _mm512_fmadd_ps(wt513, dat2280, sum404);
__m512 wt514 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)36));
sum405 = _mm512_fmadd_ps(wt514, dat2277, sum405);
sum406 = _mm512_fmadd_ps(wt514, dat2278, sum406);
sum407 = _mm512_fmadd_ps(wt514, dat2279, sum407);
sum408 = _mm512_fmadd_ps(wt514, dat2280, sum408);
__m512 wt515 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)40));
sum409 = _mm512_fmadd_ps(wt515, dat2277, sum409);
sum410 = _mm512_fmadd_ps(wt515, dat2278, sum410);
sum411 = _mm512_fmadd_ps(wt515, dat2279, sum411);
sum412 = _mm512_fmadd_ps(wt515, dat2280, sum412);
__m512 wt516 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+24*s48+(ptrdiff_t)44));
sum413 = _mm512_fmadd_ps(wt516, dat2277, sum413);
sum414 = _mm512_fmadd_ps(wt516, dat2278, sum414);
sum415 = _mm512_fmadd_ps(wt516, dat2279, sum415);
sum416 = _mm512_fmadd_ps(wt516, dat2280, sum416);
}
// Write-back: 4 stores of 14 floats (56 bytes apart) for each of the 6 weight
// rows, with consecutive weight rows 832 bytes apart (4992 bytes per group).
__m512 dat2281 = sum393;
__m512 dat2282 = sum394;
__m512 dat2283 = sum395;
__m512 dat2284 = sum396;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)0, 16383, dat2281);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)56, 16383, dat2282);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)112, 16383, dat2283);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)168, 16383, dat2284);
__m512 dat2285 = sum397;
__m512 dat2286 = sum398;
__m512 dat2287 = sum399;
__m512 dat2288 = sum400;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)832, 16383, dat2285);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)888, 16383, dat2286);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)944, 16383, dat2287);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1000, 16383, dat2288);
__m512 dat2289 = sum401;
__m512 dat2290 = sum402;
__m512 dat2291 = sum403;
__m512 dat2292 = sum404;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1664, 16383, dat2289);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1720, 16383, dat2290);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1776, 16383, dat2291);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1832, 16383, dat2292);
__m512 dat2293 = sum405;
__m512 dat2294 = sum406;
__m512 dat2295 = sum407;
__m512 dat2296 = sum408;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2496, 16383, dat2293);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2552, 16383, dat2294);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2608, 16383, dat2295);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2664, 16383, dat2296);
__m512 dat2297 = sum409;
__m512 dat2298 = sum410;
__m512 dat2299 = sum411;
__m512 dat2300 = sum412;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3328, 16383, dat2297);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3384, 16383, dat2298);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3440, 16383, dat2299);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)3496, 16383, dat2300);
__m512 dat2301 = sum413;
__m512 dat2302 = sum414;
__m512 dat2303 = sum415;
__m512 dat2304 = sum416;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4160, 16383, dat2301);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4216, 16383, dat2302);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4272, 16383, dat2303);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)4328, 16383, dat2304);
// One group per call: always true on the first pass because kk47 == starting k145.
if (k145 >= kk47) return;
}
// Only reached with k145 == 170: the last, four-wide weight group. Slots are
// 16 bytes apart, and s49 = -1 again addresses slot 0 for the seeds.
ptrdiff_t s49 = -1;
__m512 sum417 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)16));
__m512 sum421 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)20));
__m512 sum425 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)24));
__m512 sum429 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)28));
__m512 sum418 = sum417;
__m512 sum419 = sum417;
__m512 sum420 = sum417;
__m512 sum422 = sum421;
__m512 sum423 = sum421;
__m512 sum424 = sum421;
__m512 sum426 = sum425;
__m512 sum427 = sum425;
__m512 sum428 = sum425;
__m512 sum430 = sum429;
__m512 sum431 = sum429;
__m512 sum432 = sum429;
for (s49 = 0; s49 < 512; ++s49) {
__m512 dat2305 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)0);
__m512 dat2306 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)64);
__m512 dat2307 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)128);
__m512 dat2308 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+256*s49+(ptrdiff_t)192);
__m512 wt517 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)16));
sum417 = _mm512_fmadd_ps(wt517, dat2305, sum417);
sum418 = _mm512_fmadd_ps(wt517, dat2306, sum418);
sum419 = _mm512_fmadd_ps(wt517, dat2307, sum419);
sum420 = _mm512_fmadd_ps(wt517, dat2308, sum420);
__m512 wt518 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)20));
sum421 = _mm512_fmadd_ps(wt518, dat2305, sum421);
sum422 = _mm512_fmadd_ps(wt518, dat2306, sum422);
sum423 = _mm512_fmadd_ps(wt518, dat2307, sum423);
sum424 = _mm512_fmadd_ps(wt518, dat2308, sum424);
__m512 wt519 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)24));
sum425 = _mm512_fmadd_ps(wt519, dat2305, sum425);
sum426 = _mm512_fmadd_ps(wt519, dat2306, sum426);
sum427 = _mm512_fmadd_ps(wt519, dat2307, sum427);
sum428 = _mm512_fmadd_ps(wt519, dat2308, sum428);
__m512 wt520 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k145+16*s49+(ptrdiff_t)28));
sum429 = _mm512_fmadd_ps(wt520, dat2305, sum429);
sum430 = _mm512_fmadd_ps(wt520, dat2306, sum430);
sum431 = _mm512_fmadd_ps(wt520, dat2307, sum431);
sum432 = _mm512_fmadd_ps(wt520, dat2308, sum432);
}
__m512 dat2309 = sum417;
__m512 dat2310 = sum418;
__m512 dat2311 = sum419;
__m512 dat2312 = sum420;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)0, 16383, dat2309);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)56, 16383, dat2310);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)112, 16383, dat2311);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)168, 16383, dat2312);
__m512 dat2313 = sum421;
__m512 dat2314 = sum422;
__m512 dat2315 = sum423;
__m512 dat2316 = sum424;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)832, 16383, dat2313);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)888, 16383, dat2314);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)944, 16383, dat2315);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1000, 16383, dat2316);
__m512 dat2317 = sum425;
__m512 dat2318 = sum426;
__m512 dat2319 = sum427;
__m512 dat2320 = sum428;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1664, 16383, dat2317);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1720, 16383, dat2318);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1776, 16383, dat2319);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)1832, 16383, dat2320);
__m512 dat2321 = sum429;
__m512 dat2322 = sum430;
__m512 dat2323 = sum431;
__m512 dat2324 = sum432;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2496, 16383, dat2321);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2552, 16383, dat2322);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2608, 16383, dat2323);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h49+4992*k145+(ptrdiff_t)2664, 16383, dat2324);
// With jj47 == starting j47 this return is always taken; the advance-and-repeat
// code after it only matters if jj47 were larger.
if (j47 >= jj47) return;
if (j47 >= 2) break;
++j47;
h49 += 4;
goto wrap6;
}
}
j47 = 3;
}
// Final data block (j == 3): only two data vectors per s (128-byte s stride)
// and a fixed output row offset h50 = 12.
ptrdiff_t h50 = 12;
switch (j47) {
default: {
j47 = 3;
ptrdiff_t k146 = 1*w63;
ptrdiff_t kk48 = k146+0;
// Six-wide weight groups; as above, returns after the first iteration.
for (; k146 != 170; ++k146) {
ptrdiff_t s50 = -1;
__m512 sum433 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)24));
__m512 sum435 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)28));
__m512 sum437 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)32));
__m512 sum439 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)36));
__m512 sum441 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)40));
__m512 sum443 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)44));
__m512 sum434 = sum433;
__m512 sum436 = sum435;
__m512 sum438 = sum437;
__m512 sum440 = sum439;
__m512 sum442 = sum441;
__m512 sum444 = sum443;
for (s50 = 0; s50 < 512; ++s50) {
__m512 dat2325 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s50+(ptrdiff_t)0);
__m512 dat2326 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s50+(ptrdiff_t)64);
__m512 wt521 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)24));
sum433 = _mm512_fmadd_ps(wt521, dat2325, sum433);
sum434 = _mm512_fmadd_ps(wt521, dat2326, sum434);
__m512 wt522 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)28));
sum435 = _mm512_fmadd_ps(wt522, dat2325, sum435);
sum436 = _mm512_fmadd_ps(wt522, dat2326, sum436);
__m512 wt523 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)32));
sum437 = _mm512_fmadd_ps(wt523, dat2325, sum437);
sum438 = _mm512_fmadd_ps(wt523, dat2326, sum438);
__m512 wt524 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)36));
sum439 = _mm512_fmadd_ps(wt524, dat2325, sum439);
sum440 = _mm512_fmadd_ps(wt524, dat2326, sum440);
__m512 wt525 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)40));
sum441 = _mm512_fmadd_ps(wt525, dat2325, sum441);
sum442 = _mm512_fmadd_ps(wt525, dat2326, sum442);
__m512 wt526 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+24*s50+(ptrdiff_t)44));
sum443 = _mm512_fmadd_ps(wt526, dat2325, sum443);
sum444 = _mm512_fmadd_ps(wt526, dat2326, sum444);
}
__m512 dat2327 = sum433;
__m512 dat2328 = sum434;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)0, 16383, dat2327);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)56, 16383, dat2328);
__m512 dat2329 = sum435;
__m512 dat2330 = sum436;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)832, 16383, dat2329);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)888, 16383, dat2330);
__m512 dat2331 = sum437;
__m512 dat2332 = sum438;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1664, 16383, dat2331);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1720, 16383, dat2332);
__m512 dat2333 = sum439;
__m512 dat2334 = sum440;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2496, 16383, dat2333);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2552, 16383, dat2334);
__m512 dat2335 = sum441;
__m512 dat2336 = sum442;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)3328, 16383, dat2335);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)3384, 16383, dat2336);
__m512 dat2337 = sum443;
__m512 dat2338 = sum444;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)4160, 16383, dat2337);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)4216, 16383, dat2338);
if (k146 >= kk48) return;
}
// Only reached with k146 == 170: the four-wide weight group for this data block.
ptrdiff_t s51 = -1;
__m512 sum445 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)16));
__m512 sum447 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)20));
__m512 sum449 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)24));
__m512 sum451 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)28));
__m512 sum446 = sum445;
__m512 sum448 = sum447;
__m512 sum450 = sum449;
__m512 sum452 = sum451;
for (s51 = 0; s51 < 512; ++s51) {
__m512 dat2339 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s51+(ptrdiff_t)0);
__m512 dat2340 = _mm512_loadu_ps(arrangedDats8+458752*i55+131072*j47+128*s51+(ptrdiff_t)64);
__m512 wt527 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)16));
sum445 = _mm512_fmadd_ps(wt527, dat2339, sum445);
sum446 = _mm512_fmadd_ps(wt527, dat2340, sum446);
__m512 wt528 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)20));
sum447 = _mm512_fmadd_ps(wt528, dat2339, sum447);
sum448 = _mm512_fmadd_ps(wt528, dat2340, sum448);
__m512 wt529 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)24));
sum449 = _mm512_fmadd_ps(wt529, dat2339, sum449);
sum450 = _mm512_fmadd_ps(wt529, dat2340, sum450);
__m512 wt530 = _mm512_set1_ps(*(float*)(arrangedWts8+2101248*i55+12312*k146+16*s51+(ptrdiff_t)28));
sum451 = _mm512_fmadd_ps(wt530, dat2339, sum451);
sum452 = _mm512_fmadd_ps(wt530, dat2340, sum452);
}
__m512 dat2341 = sum445;
__m512 dat2342 = sum446;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)0, 16383, dat2341);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)56, 16383, dat2342);
__m512 dat2343 = sum447;
__m512 dat2344 = sum448;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)832, 16383, dat2343);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)888, 16383, dat2344);
__m512 dat2345 = sum449;
__m512 dat2346 = sum450;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1664, 16383, dat2345);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)1720, 16383, dat2346);
__m512 dat2347 = sum451;
__m512 dat2348 = sum452;
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2496, 16383, dat2347);
_mm512_mask_storeu_ps(datPtr28+851968*i55+56*h50+4992*k146+(ptrdiff_t)2552, 16383, dat2348);
if (j47 >= jj47) return;
}
}
j47 = 4;
}
}

// Builds a threader task that runs ResNeXt50OneApply8Callee1 and hands it to
// ResNeXt50ThreaderDo1 on the given team. The callee receives a two-entry
// pointer pair: the tensor pointer array followed by a null entry. The task is
// described with nd1 = 3 and hull1 = {171, 4, 1} (presumably the extents of the
// index space the threader iterates over -- confirm against ResNeXt50ThreaderDo1).
static void ResNeXt50OneApply8(ResNeXt50ThreaderTeam1* team58, char** tensors89) {
// Lives on this stack frame; the pointer handed to the task refers to it.
void* calleeArgs[] = {tensors89, 0};
ResNeXt50ThreaderTask1 applyJob;
// Opaque payload and entry point for the workers.
applyJob.any1 = calleeArgs;
applyJob.callee1 = ResNeXt50OneApply8Callee1;
// Three dimensions: 171 x 4 x 1.
applyJob.nd1 = 3;
applyJob.hull1[2] = 1;
applyJob.hull1[1] = 4;
applyJob.hull1[0] = 171;
ResNeXt50ThreaderDo1(team58, &applyJob);
}

static void ResNeXt50OneArrangeWts9Callee1(ResNeXt50ThreaderTask1* task94, int64_t* pt52) {
char** tensors92 = task94->any1;
ptrdiff_t b75 = pt52[0];
char*restrict wtPtr15 = tensors92[0]+(ptrdiff_t)3340*0+(ptrdiff_t)1048576*0;
char*restrict biasPtr15 = tensors92[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr16 = tensors92[2]+(ptrdiff_t)8*512*0;
char*restrict arranged17 = tensors92[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)1050624*0;
ptrdiff_t ii41 = 1;
for (ptrdiff_t i56 = 0; i56 < ii41; ++i56) {
ptrdiff_t j48 = 1*b75;
ptrdiff_t jj48 = j48+1;
for (; j48 < jj48; ++j48) {
if (j48 < 31) {
ptrdiff_t k148 = 0+16*(j48-0);
ptrdiff_t l63 = (size_t)(0+k148)/6;
ptrdiff_t cut21 = (size_t)(0+k148)%6;
switch (cut21) {
case 0:;
case 2: {
__m512 sum454 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k148);
__m512i pmMul31 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd31 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo26 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k148+512*i56));
__m512 masHi26 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k148+512*i56)+(ptrdiff_t)64);
__m512 postMul48 = _mm512_permutex2var_ps(masLo26, pmMul31, masHi26);
__m512 postAdd32 = _mm512_permutex2var_ps(masLo26, pmAdd31, masHi26);
sum454 = _mm512_fmadd_ps(sum454, postMul48, postAdd32);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum454);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum454);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)24576, 65535-(4095>>cut21), sum454);
ptrdiff_t c45 = 0;
for (; c45 != 32; ++c45) {
__m512 wt547 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)0);
__m512 wt548 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)2048);
__m512 wt549 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)4096);
__m512 wt550 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)6144);
__m512 wt551 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)8192);
__m512 wt552 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)10240);
__m512 wt553 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)12288);
__m512 wt554 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)14336);
__m512 wt555 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)16384);
__m512 wt556 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)18432);
__m512 wt557 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)20480);
__m512 wt558 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)22528);
__m512 wt559 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)24576);
__m512 wt560 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)26624);
__m512 wt561 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)28672);
__m512 wt562 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c45+(ptrdiff_t)30720);
__m512 tmp13843 = _mm512_unpacklo_ps(wt547, wt548);
__m512 tmp13844 = _mm512_unpackhi_ps(wt547, wt548);
__m512 tmp13845 = _mm512_unpacklo_ps(wt549, wt550);
__m512 tmp13846 = _mm512_unpackhi_ps(wt549, wt550);
__m512 tmp13847 = _mm512_unpacklo_ps(wt551, wt552);
__m512 tmp13848 = _mm512_unpackhi_ps(wt551, wt552);
__m512 tmp13849 = _mm512_unpacklo_ps(wt553, wt554);
__m512 tmp13850 = _mm512_unpackhi_ps(wt553, wt554);
__m512 tmp13851 = _mm512_unpacklo_ps(wt555, wt556);
__m512 tmp13852 = _mm512_unpackhi_ps(wt555, wt556);
__m512 tmp13853 = _mm512_unpacklo_ps(wt557, wt558);
__m512 tmp13854 = _mm512_unpackhi_ps(wt557, wt558);
__m512 tmp13855 = _mm512_unpacklo_ps(wt559, wt560);
__m512 tmp13856 = _mm512_unpackhi_ps(wt559, wt560);
__m512 tmp13857 = _mm512_unpacklo_ps(wt561, wt562);
__m512 tmp13858 = _mm512_unpackhi_ps(wt561, wt562);
__m512 tmp13859 = _mm512_shuffle_ps(tmp13843, tmp13845, 68);
__m512 tmp13860 = _mm512_shuffle_ps(tmp13843, tmp13845, 238);
__m512 tmp13861 = _mm512_shuffle_ps(tmp13844, tmp13846, 68);
__m512 tmp13862 = _mm512_shuffle_ps(tmp13844, tmp13846, 238);
__m512 tmp13863 = _mm512_shuffle_ps(tmp13847, tmp13849, 68);
__m512 tmp13864 = _mm512_shuffle_ps(tmp13847, tmp13849, 238);
__m512 tmp13865 = _mm512_shuffle_ps(tmp13848, tmp13850, 68);
__m512 tmp13866 = _mm512_shuffle_ps(tmp13848, tmp13850, 238);
__m512 tmp13867 = _mm512_shuffle_ps(tmp13851, tmp13853, 68);
__m512 tmp13868 = _mm512_shuffle_ps(tmp13851, tmp13853, 238);
__m512 tmp13869 = _mm512_shuffle_ps(tmp13852, tmp13854, 68);
__m512 tmp13870 = _mm512_shuffle_ps(tmp13852, tmp13854, 238);
__m512 tmp13871 = _mm512_shuffle_ps(tmp13855, tmp13857, 68);
__m512 tmp13872 = _mm512_shuffle_ps(tmp13855, tmp13857, 238);
__m512 tmp13873 = _mm512_shuffle_ps(tmp13856, tmp13858, 68);
__m512 tmp13874 = _mm512_shuffle_ps(tmp13856, tmp13858, 238);
__m512 tmp13875 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 136);
__m512 tmp13876 = _mm512_shuffle_f32x4(tmp13859, tmp13863, 221);
__m512 tmp13877 = _mm512_shuffle_f32x4(tmp13860, tmp13864, 136);
__m512 tmp13878 = _mm512_shuffle_f32x4(tmp13860, tmp13864, 221);
__m512 tmp13879 = _mm512_shuffle_f32x4(tmp13861, tmp13865, 136);
__m512 tmp13880 = _mm512_shuffle_f32x4(tmp13861, tmp13865, 221);
__m512 tmp13881 = _mm512_shuffle_f32x4(tmp13862, tmp13866, 136);
__m512 tmp13882 = _mm512_shuffle_f32x4(tmp13862, tmp13866, 221);
__m512 tmp13883 = _mm512_shuffle_f32x4(tmp13867, tmp13871, 136);
__m512 tmp13884 = _mm512_shuffle_f32x4(tmp13867, tmp13871, 221);
__m512 tmp13885 = _mm512_shuffle_f32x4(tmp13868, tmp13872, 136);
__m512 tmp13886 = _mm512_shuffle_f32x4(tmp13868, tmp13872, 221);
__m512 tmp13887 = _mm512_shuffle_f32x4(tmp13869, tmp13873, 136);
__m512 tmp13888 = _mm512_shuffle_f32x4(tmp13869, tmp13873, 221);
__m512 tmp13889 = _mm512_shuffle_f32x4(tmp13870, tmp13874, 136);
__m512 tmp13890 = _mm512_shuffle_f32x4(tmp13870, tmp13874, 221);
wt547 = _mm512_shuffle_f32x4(tmp13875, tmp13883, 136);
wt555 = _mm512_shuffle_f32x4(tmp13875, tmp13883, 221);
wt548 = _mm512_shuffle_f32x4(tmp13877, tmp13885, 136);
wt556 = _mm512_shuffle_f32x4(tmp13877, tmp13885, 221);
wt549 = _mm512_shuffle_f32x4(tmp13879, tmp13887, 136);
wt557 = _mm512_shuffle_f32x4(tmp13879, tmp13887, 221);
wt550 = _mm512_shuffle_f32x4(tmp13881, tmp13889, 136);
wt558 = _mm512_shuffle_f32x4(tmp13881, tmp13889, 221);
wt551 = _mm512_shuffle_f32x4(tmp13876, tmp13884, 136);
wt559 = _mm512_shuffle_f32x4(tmp13876, tmp13884, 221);
wt552 = _mm512_shuffle_f32x4(tmp13878, tmp13886, 136);
wt560 = _mm512_shuffle_f32x4(tmp13878, tmp13886, 221);
wt553 = _mm512_shuffle_f32x4(tmp13880, tmp13888, 136);
wt561 = _mm512_shuffle_f32x4(tmp13880, tmp13888, 221);
wt554 = _mm512_shuffle_f32x4(tmp13882, tmp13890, 136);
wt562 = _mm512_shuffle_f32x4(tmp13882, tmp13890, 221);
wt547 = _mm512_mul_ps(wt547, postMul48);
wt548 = _mm512_mul_ps(wt548, postMul48);
wt549 = _mm512_mul_ps(wt549, postMul48);
wt550 = _mm512_mul_ps(wt550, postMul48);
wt551 = _mm512_mul_ps(wt551, postMul48);
wt552 = _mm512_mul_ps(wt552, postMul48);
wt553 = _mm512_mul_ps(wt553, postMul48);
wt554 = _mm512_mul_ps(wt554, postMul48);
wt555 = _mm512_mul_ps(wt555, postMul48);
wt556 = _mm512_mul_ps(wt556, postMul48);
wt557 = _mm512_mul_ps(wt557, postMul48);
wt558 = _mm512_mul_ps(wt558, postMul48);
wt559 = _mm512_mul_ps(wt559, postMul48);
wt560 = _mm512_mul_ps(wt560, postMul48);
wt561 = _mm512_mul_ps(wt561, postMul48);
wt562 = _mm512_mul_ps(wt562, postMul48);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)0, 63>>cut21, wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)0, 63>>cut21, wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)0, 63>>cut21, wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)0, 63>>cut21, wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)0, 63>>cut21, wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)0, 63>>cut21, wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)0, 63>>cut21, wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)0, 63>>cut21, wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)0, 63>>cut21, wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)0, 63>>cut21, wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)0, 63>>cut21, wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)0, 63>>cut21, wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)0, 63>>cut21, wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)0, 63>>cut21, wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)0, 63>>cut21, wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)0, 63>>cut21, wt562);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)12288, 4032>>cut21, wt562);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt547);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt548);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt549);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt550);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt551);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt552);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt553);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt554);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt555);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt556);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt557);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt558);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt559);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt560);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt561);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c45)+(ptrdiff_t)24576, 65535-(4095>>cut21), wt562);
}
break;
}
default: {
cut21 = 4;
__m512 sum455 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k148);
__m512i pmMul32 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd32 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo27 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k148+512*i56));
__m512 masHi27 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k148+512*i56)+(ptrdiff_t)64);
__m512 postMul49 = _mm512_permutex2var_ps(masLo27, pmMul32, masHi27);
__m512 postAdd33 = _mm512_permutex2var_ps(masLo27, pmAdd32, masHi27);
sum455 = _mm512_fmadd_ps(sum455, postMul49, postAdd33);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)0, 63>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)12288, 4032>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)24576, 258048>>cut21, sum455);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*0+(ptrdiff_t)36864, 65535-(262143>>cut21), sum455);
ptrdiff_t c46 = 0;
for (; c46 != 32; ++c46) {
__m512 wt563 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)0);
__m512 wt564 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)2048);
__m512 wt565 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)4096);
__m512 wt566 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)6144);
__m512 wt567 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)8192);
__m512 wt568 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)10240);
__m512 wt569 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)12288);
__m512 wt570 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)14336);
__m512 wt571 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)16384);
__m512 wt572 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)18432);
__m512 wt573 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)20480);
__m512 wt574 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)22528);
__m512 wt575 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)24576);
__m512 wt576 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)26624);
__m512 wt577 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)28672);
__m512 wt578 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k148+64*c46+(ptrdiff_t)30720);
__m512 tmp13891 = _mm512_unpacklo_ps(wt563, wt564);
__m512 tmp13892 = _mm512_unpackhi_ps(wt563, wt564);
__m512 tmp13893 = _mm512_unpacklo_ps(wt565, wt566);
__m512 tmp13894 = _mm512_unpackhi_ps(wt565, wt566);
__m512 tmp13895 = _mm512_unpacklo_ps(wt567, wt568);
__m512 tmp13896 = _mm512_unpackhi_ps(wt567, wt568);
__m512 tmp13897 = _mm512_unpacklo_ps(wt569, wt570);
__m512 tmp13898 = _mm512_unpackhi_ps(wt569, wt570);
__m512 tmp13899 = _mm512_unpacklo_ps(wt571, wt572);
__m512 tmp13900 = _mm512_unpackhi_ps(wt571, wt572);
__m512 tmp13901 = _mm512_unpacklo_ps(wt573, wt574);
__m512 tmp13902 = _mm512_unpackhi_ps(wt573, wt574);
__m512 tmp13903 = _mm512_unpacklo_ps(wt575, wt576);
__m512 tmp13904 = _mm512_unpackhi_ps(wt575, wt576);
__m512 tmp13905 = _mm512_unpacklo_ps(wt577, wt578);
__m512 tmp13906 = _mm512_unpackhi_ps(wt577, wt578);
__m512 tmp13907 = _mm512_shuffle_ps(tmp13891, tmp13893, 68);
__m512 tmp13908 = _mm512_shuffle_ps(tmp13891, tmp13893, 238);
__m512 tmp13909 = _mm512_shuffle_ps(tmp13892, tmp13894, 68);
__m512 tmp13910 = _mm512_shuffle_ps(tmp13892, tmp13894, 238);
__m512 tmp13911 = _mm512_shuffle_ps(tmp13895, tmp13897, 68);
__m512 tmp13912 = _mm512_shuffle_ps(tmp13895, tmp13897, 238);
__m512 tmp13913 = _mm512_shuffle_ps(tmp13896, tmp13898, 68);
__m512 tmp13914 = _mm512_shuffle_ps(tmp13896, tmp13898, 238);
__m512 tmp13915 = _mm512_shuffle_ps(tmp13899, tmp13901, 68);
__m512 tmp13916 = _mm512_shuffle_ps(tmp13899, tmp13901, 238);
__m512 tmp13917 = _mm512_shuffle_ps(tmp13900, tmp13902, 68);
__m512 tmp13918 = _mm512_shuffle_ps(tmp13900, tmp13902, 238);
__m512 tmp13919 = _mm512_shuffle_ps(tmp13903, tmp13905, 68);
__m512 tmp13920 = _mm512_shuffle_ps(tmp13903, tmp13905, 238);
__m512 tmp13921 = _mm512_shuffle_ps(tmp13904, tmp13906, 68);
__m512 tmp13922 = _mm512_shuffle_ps(tmp13904, tmp13906, 238);
__m512 tmp13923 = _mm512_shuffle_f32x4(tmp13907, tmp13911, 136);
__m512 tmp13924 = _mm512_shuffle_f32x4(tmp13907, tmp13911, 221);
__m512 tmp13925 = _mm512_shuffle_f32x4(tmp13908, tmp13912, 136);
__m512 tmp13926 = _mm512_shuffle_f32x4(tmp13908, tmp13912, 221);
__m512 tmp13927 = _mm512_shuffle_f32x4(tmp13909, tmp13913, 136);
__m512 tmp13928 = _mm512_shuffle_f32x4(tmp13909, tmp13913, 221);
__m512 tmp13929 = _mm512_shuffle_f32x4(tmp13910, tmp13914, 136);
__m512 tmp13930 = _mm512_shuffle_f32x4(tmp13910, tmp13914, 221);
__m512 tmp13931 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 136);
__m512 tmp13932 = _mm512_shuffle_f32x4(tmp13915, tmp13919, 221);
__m512 tmp13933 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 136);
__m512 tmp13934 = _mm512_shuffle_f32x4(tmp13916, tmp13920, 221);
__m512 tmp13935 = _mm512_shuffle_f32x4(tmp13917, tmp13921, 136);
__m512 tmp13936 = _mm512_shuffle_f32x4(tmp13917, tmp13921, 221);
__m512 tmp13937 = _mm512_shuffle_f32x4(tmp13918, tmp13922, 136);
__m512 tmp13938 = _mm512_shuffle_f32x4(tmp13918, tmp13922, 221);
wt563 = _mm512_shuffle_f32x4(tmp13923, tmp13931, 136);
wt571 = _mm512_shuffle_f32x4(tmp13923, tmp13931, 221);
wt564 = _mm512_shuffle_f32x4(tmp13925, tmp13933, 136);
wt572 = _mm512_shuffle_f32x4(tmp13925, tmp13933, 221);
wt565 = _mm512_shuffle_f32x4(tmp13927, tmp13935, 136);
wt573 = _mm512_shuffle_f32x4(tmp13927, tmp13935, 221);
wt566 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 136);
wt574 = _mm512_shuffle_f32x4(tmp13929, tmp13937, 221);
wt567 = _mm512_shuffle_f32x4(tmp13924, tmp13932, 136);
wt575 = _mm512_shuffle_f32x4(tmp13924, tmp13932, 221);
wt568 = _mm512_shuffle_f32x4(tmp13926, tmp13934, 136);
wt576 = _mm512_shuffle_f32x4(tmp13926, tmp13934, 221);
wt569 = _mm512_shuffle_f32x4(tmp13928, tmp13936, 136);
wt577 = _mm512_shuffle_f32x4(tmp13928, tmp13936, 221);
wt570 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 136);
wt578 = _mm512_shuffle_f32x4(tmp13930, tmp13938, 221);
wt563 = _mm512_mul_ps(wt563, postMul49);
wt564 = _mm512_mul_ps(wt564, postMul49);
wt565 = _mm512_mul_ps(wt565, postMul49);
wt566 = _mm512_mul_ps(wt566, postMul49);
wt567 = _mm512_mul_ps(wt567, postMul49);
wt568 = _mm512_mul_ps(wt568, postMul49);
wt569 = _mm512_mul_ps(wt569, postMul49);
wt570 = _mm512_mul_ps(wt570, postMul49);
wt571 = _mm512_mul_ps(wt571, postMul49);
wt572 = _mm512_mul_ps(wt572, postMul49);
wt573 = _mm512_mul_ps(wt573, postMul49);
wt574 = _mm512_mul_ps(wt574, postMul49);
wt575 = _mm512_mul_ps(wt575, postMul49);
wt576 = _mm512_mul_ps(wt576, postMul49);
wt577 = _mm512_mul_ps(wt577, postMul49);
wt578 = _mm512_mul_ps(wt578, postMul49);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)0, 63>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)0, 63>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)0, 63>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)0, 63>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)0, 63>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)0, 63>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)0, 63>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)0, 63>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)0, 63>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)0, 63>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)0, 63>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)0, 63>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)0, 63>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)0, 63>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)0, 63>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)0, 63>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)12288, 4032>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)24576, 258048>>cut21, wt578);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(1+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt563);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(2+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt564);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(3+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt565);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(4+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt566);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(5+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt567);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(6+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt568);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(7+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt569);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(8+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt570);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(9+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt571);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(10+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt572);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(11+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt573);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(12+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt574);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(13+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt575);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(14+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt576);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(15+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt577);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l63+4*cut21+24*(16+16*c46)+(ptrdiff_t)36864, 65535-(262143>>cut21), wt578);
}
}
}
} else {
ptrdiff_t k147 = 496;
ptrdiff_t l62 = (size_t)(0+k147)/6;
ptrdiff_t cut20 = (size_t)(0+k147)%6;
__m512 sum453 = _mm512_maskz_loadu_ps(65535, biasPtr15+2048*i56+4*k147);
__m512i pmMul33 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd33 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo28 = _mm512_loadu_ps(bnPtr16+(ptrdiff_t)8*(k147+512*i56));
__m512 masHi28 = _mm512_maskz_loadu_ps(65535, bnPtr16+(ptrdiff_t)8*(k147+512*i56)+(ptrdiff_t)64);
__m512 postMul47 = _mm512_permutex2var_ps(masLo28, pmMul33, masHi28);
__m512 postAdd31 = _mm512_permutex2var_ps(masLo28, pmAdd33, masHi28);
sum453 = _mm512_fmadd_ps(sum453, postMul47, postAdd31);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)0, 63>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)12288, 4032>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*0+(ptrdiff_t)24576, 258048>>cut20, sum453);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*0+(ptrdiff_t)36864, 65535-(262143>>cut20), sum453);
ptrdiff_t c44 = 0;
for (; c44 != 32; ++c44) {
__m512 wt531 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)0);
__m512 wt532 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)2048);
__m512 wt533 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)4096);
__m512 wt534 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)6144);
__m512 wt535 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)8192);
__m512 wt536 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)10240);
__m512 wt537 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)12288);
__m512 wt538 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)14336);
__m512 wt539 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)16384);
__m512 wt540 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)18432);
__m512 wt541 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)20480);
__m512 wt542 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)22528);
__m512 wt543 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)24576);
__m512 wt544 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)26624);
__m512 wt545 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)28672);
__m512 wt546 = _mm512_maskz_loadu_ps(65535, wtPtr15+1048576*i56+2048*k147+64*c44+(ptrdiff_t)30720);
__m512 tmp13939 = _mm512_unpacklo_ps(wt531, wt532);
__m512 tmp13940 = _mm512_unpackhi_ps(wt531, wt532);
__m512 tmp13941 = _mm512_unpacklo_ps(wt533, wt534);
__m512 tmp13942 = _mm512_unpackhi_ps(wt533, wt534);
__m512 tmp13943 = _mm512_unpacklo_ps(wt535, wt536);
__m512 tmp13944 = _mm512_unpackhi_ps(wt535, wt536);
__m512 tmp13945 = _mm512_unpacklo_ps(wt537, wt538);
__m512 tmp13946 = _mm512_unpackhi_ps(wt537, wt538);
__m512 tmp13947 = _mm512_unpacklo_ps(wt539, wt540);
__m512 tmp13948 = _mm512_unpackhi_ps(wt539, wt540);
__m512 tmp13949 = _mm512_unpacklo_ps(wt541, wt542);
__m512 tmp13950 = _mm512_unpackhi_ps(wt541, wt542);
__m512 tmp13951 = _mm512_unpacklo_ps(wt543, wt544);
__m512 tmp13952 = _mm512_unpackhi_ps(wt543, wt544);
__m512 tmp13953 = _mm512_unpacklo_ps(wt545, wt546);
__m512 tmp13954 = _mm512_unpackhi_ps(wt545, wt546);
__m512 tmp13955 = _mm512_shuffle_ps(tmp13939, tmp13941, 68);
__m512 tmp13956 = _mm512_shuffle_ps(tmp13939, tmp13941, 238);
__m512 tmp13957 = _mm512_shuffle_ps(tmp13940, tmp13942, 68);
__m512 tmp13958 = _mm512_shuffle_ps(tmp13940, tmp13942, 238);
__m512 tmp13959 = _mm512_shuffle_ps(tmp13943, tmp13945, 68);
__m512 tmp13960 = _mm512_shuffle_ps(tmp13943, tmp13945, 238);
__m512 tmp13961 = _mm512_shuffle_ps(tmp13944, tmp13946, 68);
__m512 tmp13962 = _mm512_shuffle_ps(tmp13944, tmp13946, 238);
__m512 tmp13963 = _mm512_shuffle_ps(tmp13947, tmp13949, 68);
__m512 tmp13964 = _mm512_shuffle_ps(tmp13947, tmp13949, 238);
__m512 tmp13965 = _mm512_shuffle_ps(tmp13948, tmp13950, 68);
__m512 tmp13966 = _mm512_shuffle_ps(tmp13948, tmp13950, 238);
__m512 tmp13967 = _mm512_shuffle_ps(tmp13951, tmp13953, 68);
__m512 tmp13968 = _mm512_shuffle_ps(tmp13951, tmp13953, 238);
__m512 tmp13969 = _mm512_shuffle_ps(tmp13952, tmp13954, 68);
__m512 tmp13970 = _mm512_shuffle_ps(tmp13952, tmp13954, 238);
__m512 tmp13971 = _mm512_shuffle_f32x4(tmp13955, tmp13959, 136);
__m512 tmp13972 = _mm512_shuffle_f32x4(tmp13955, tmp13959, 221);
__m512 tmp13973 = _mm512_shuffle_f32x4(tmp13956, tmp13960, 136);
__m512 tmp13974 = _mm512_shuffle_f32x4(tmp13956, tmp13960, 221);
__m512 tmp13975 = _mm512_shuffle_f32x4(tmp13957, tmp13961, 136);
__m512 tmp13976 = _mm512_shuffle_f32x4(tmp13957, tmp13961, 221);
__m512 tmp13977 = _mm512_shuffle_f32x4(tmp13958, tmp13962, 136);
__m512 tmp13978 = _mm512_shuffle_f32x4(tmp13958, tmp13962, 221);
__m512 tmp13979 = _mm512_shuffle_f32x4(tmp13963, tmp13967, 136);
__m512 tmp13980 = _mm512_shuffle_f32x4(tmp13963, tmp13967, 221);
__m512 tmp13981 = _mm512_shuffle_f32x4(tmp13964, tmp13968, 136);
__m512 tmp13982 = _mm512_shuffle_f32x4(tmp13964, tmp13968, 221);
__m512 tmp13983 = _mm512_shuffle_f32x4(tmp13965, tmp13969, 136);
__m512 tmp13984 = _mm512_shuffle_f32x4(tmp13965, tmp13969, 221);
__m512 tmp13985 = _mm512_shuffle_f32x4(tmp13966, tmp13970, 136);
__m512 tmp13986 = _mm512_shuffle_f32x4(tmp13966, tmp13970, 221);
wt531 = _mm512_shuffle_f32x4(tmp13971, tmp13979, 136);
wt539 = _mm512_shuffle_f32x4(tmp13971, tmp13979, 221);
wt532 = _mm512_shuffle_f32x4(tmp13973, tmp13981, 136);
wt540 = _mm512_shuffle_f32x4(tmp13973, tmp13981, 221);
wt533 = _mm512_shuffle_f32x4(tmp13975, tmp13983, 136);
wt541 = _mm512_shuffle_f32x4(tmp13975, tmp13983, 221);
wt534 = _mm512_shuffle_f32x4(tmp13977, tmp13985, 136);
wt542 = _mm512_shuffle_f32x4(tmp13977, tmp13985, 221);
wt535 = _mm512_shuffle_f32x4(tmp13972, tmp13980, 136);
wt543 = _mm512_shuffle_f32x4(tmp13972, tmp13980, 221);
wt536 = _mm512_shuffle_f32x4(tmp13974, tmp13982, 136);
wt544 = _mm512_shuffle_f32x4(tmp13974, tmp13982, 221);
wt537 = _mm512_shuffle_f32x4(tmp13976, tmp13984, 136);
wt545 = _mm512_shuffle_f32x4(tmp13976, tmp13984, 221);
wt538 = _mm512_shuffle_f32x4(tmp13978, tmp13986, 136);
wt546 = _mm512_shuffle_f32x4(tmp13978, tmp13986, 221);
wt531 = _mm512_mul_ps(wt531, postMul47);
wt532 = _mm512_mul_ps(wt532, postMul47);
wt533 = _mm512_mul_ps(wt533, postMul47);
wt534 = _mm512_mul_ps(wt534, postMul47);
wt535 = _mm512_mul_ps(wt535, postMul47);
wt536 = _mm512_mul_ps(wt536, postMul47);
wt537 = _mm512_mul_ps(wt537, postMul47);
wt538 = _mm512_mul_ps(wt538, postMul47);
wt539 = _mm512_mul_ps(wt539, postMul47);
wt540 = _mm512_mul_ps(wt540, postMul47);
wt541 = _mm512_mul_ps(wt541, postMul47);
wt542 = _mm512_mul_ps(wt542, postMul47);
wt543 = _mm512_mul_ps(wt543, postMul47);
wt544 = _mm512_mul_ps(wt544, postMul47);
wt545 = _mm512_mul_ps(wt545, postMul47);
wt546 = _mm512_mul_ps(wt546, postMul47);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)0, 63>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)0, 63>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)0, 63>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)0, 63>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)0, 63>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)0, 63>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)0, 63>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)0, 63>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)0, 63>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)0, 63>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)0, 63>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)0, 63>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)0, 63>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)0, 63>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)0, 63>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)0, 63>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)12288, 4032>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(1+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(2+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(3+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(4+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(5+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(6+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(7+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(8+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(9+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(10+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(11+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(12+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(13+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(14+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(15+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+24*(16+16*c44)+(ptrdiff_t)24576, 258048>>cut20, wt546);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(1+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt531);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(2+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt532);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(3+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt533);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(4+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt534);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(5+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt535);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(6+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt536);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(7+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt537);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(8+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt538);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(9+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt539);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(10+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt540);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(11+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt541);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(12+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt542);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(13+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt543);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(14+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt544);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(15+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt545);
_mm512_mask_storeu_ps(arranged17+1050624*i56+12312*l62+4*cut20+8*(16+16*c44)+(ptrdiff_t)36864, 65535-(262143>>cut20), wt546);
}
}
}
}
}

// Hands ResNeXt50OneArrangeWts9Callee1 to the threader as a 3-dimensional
// task with hull {32, 1, 1}; the tensor table is forwarded untouched via any1.
// NOTE(review): hull1 presumably holds the per-dimension task counts, i.e. the
// callee is invoked for pt[0] in 0..31 -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts9(ResNeXt50ThreaderTeam1* team59, char** tensors91) {
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 32;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors91;
    job.callee1 = ResNeXt50OneArrangeWts9Callee1;
    ResNeXt50ThreaderDo1(team59, &job);
}

// Per-task worker that repacks a slice of tensors94[0] into tensors94[1].
//
// pt53[0] selects a run of 128 consecutive source rows (row pitch 3136 bytes),
// pt53[1] selects a 256-byte column block inside each row.
//   - column block != 12: four 64-byte vectors per row are copied to
//     dst + 131072*col + 256*row.
//   - column block == 12: a single 64-byte vector per row is copied to
//     dst + 131072*col + 64*row.
// NOTE(review): 3136 bytes = 784 floats = 12*64 + 16, so block 12 looks like
// the 16-float remainder of each row -- confirm against the layer shape.
static void ResNeXt50OneArrangeDats9Callee1(ResNeXt50ThreaderTask1* task96, int64_t* pt53) {
    char** bufs = task96->any1;
    const ptrdiff_t slab = pt53[0];
    const ptrdiff_t col = pt53[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];
    const ptrdiff_t rowBegin = 128*slab;
    const ptrdiff_t rowEnd = rowBegin+128;
    ptrdiff_t row;

    if (col != 12) {
        // Full-width column block: 4 x 16 floats per row.
        for (row = rowBegin; row < rowEnd; ++row) {
            char* in = src+256*col+3136*row;
            char* out = dst+131072*col+256*row;
            __m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
            __m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
            __m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
            __m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
            _mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
        }
    } else {
        // Last column block: only one vector per row, packed at a 64-byte pitch.
        for (row = rowBegin; row < rowEnd; ++row) {
            __m512 v = _mm512_maskz_loadu_ps(65535, src+256*col+3136*row+(ptrdiff_t)0);
            _mm512_mask_storeu_ps(dst+131072*col+64*row+(ptrdiff_t)0, 65535, v);
        }
    }
}

// Hands ResNeXt50OneArrangeDats9Callee1 to the threader as a 4-dimensional
// task with hull {4, 13, 1, 1}; the tensor table is forwarded via any1.
// The callee reads only pt[0] and pt[1] and special-cases pt[1] == 12.
// NOTE(review): hull1 presumably holds per-dimension task counts (pt[0] in
// 0..3, pt[1] in 0..12) -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeDats9(ResNeXt50ThreaderTeam1* team60, char** tensors93) {
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 4;
    job.hull1[1] = 13;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors93;
    job.callee1 = ResNeXt50OneArrangeDats9Callee1;
    ResNeXt50ThreaderDo1(team60, &job);
}

// Per-task worker of the "OneApply9" stage.
//
// For the task coordinate (w64 = pt54[0], d19 = pt54[1]) it computes one tile
// of output: every output vector starts from a broadcast scalar read from the
// arranged weights, accumulates 512 broadcast-weight * data FMA steps, is
// clamped below at zero (max with 0, i.e. ReLU) and is stored to tensors96[2].
//
// Buffers (all taken from the tensor table in pair24[0]):
//   tensors96[0]  arranged weights, 12312 bytes (= 24*513) per weight group
//   tensors96[1]  arranged data, 131072 bytes per column block
//   tensors96[2]  output, 3136 bytes per output row, 18816 (= 6*3136) per group
//
// Four tile shapes are handled, selected by the task coordinate:
//   d19 != 12, w64 != 85 : 6 output rows x 4 vectors (24 accumulators)
//   d19 != 12, w64 == 85 : 2 output rows x 4 vectors
//   d19 == 12, w64 != 85 : 6 output rows x 1 vector
//   d19 == 12, w64 == 85 : 2 output rows x 1 vector
// NOTE(review): presumably 512 output rows = 85*6 + 2 and 784 floats per row
// = 12*64 + 16, which would explain the two remainder cases -- confirm.
static void ResNeXt50OneApply9Callee1(ResNeXt50ThreaderTask1* task98, int64_t* pt54) {
// any1 points at a pointer array; only element 0 (the tensor table) is used here.
void** pair24 = task98->any1;
char** tensors96 = pair24[0];
// e27 and g30 are fixed at 0, so the base-pointer offsets below all evaluate to 0.
ptrdiff_t e27 = 0;
ptrdiff_t g30 = 0;
ptrdiff_t d19 = pt54[1];
ptrdiff_t w64 = pt54[0];
char*restrict arrangedWts9 = tensors96[0]+1712128*e27+(ptrdiff_t)1050624*1*g30;
char*restrict arrangedDats9 = tensors96[1]+2618560*e27+(ptrdiff_t)1605632*1*g30;
char*restrict datPtr30 = tensors96[2]+(ptrdiff_t)1605632*1*g30;
// ii43 == 1: the i58 loop body runs exactly once (i58 == 0).
ptrdiff_t ii43 = 1;
for (ptrdiff_t i58 = 0; i58 < ii43; ++i58) {
// j50 is the column block. This loop is entered only when d19 != 12 and never
// iterates twice: jj50 == d19, so the "j50 >= jj50" check at the bottom returns.
ptrdiff_t j50 = 1*d19;
ptrdiff_t jj50 = j50+0;
for (; j50 != 12; ++j50) {
// k151 is the weight/output group. Same single-trip idiom: entered only when
// w64 != 85, and "k151 >= kk51" returns after the first pass.
ptrdiff_t k151 = 1*w64;
ptrdiff_t kk51 = k151+0;
for (; k151 != 85; ++k151) {
// ---- Tile: 6 output rows x 4 vectors ----
// With s53 == -1 the expression 24*s53+24 is 0, so the six initial values come
// from the first 24 bytes of the weight group.
// NOTE(review): presumably the (batch-norm folded) bias written by the
// weight-arranging step -- confirm there.
ptrdiff_t s53 = -1;
__m512 sum456 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)24));
__m512 sum460 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)28));
__m512 sum464 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)32));
__m512 sum468 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)36));
__m512 sum472 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)40));
__m512 sum476 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)44));
// Each output row owns four accumulators (one per data vector); all four start
// from that row's broadcast initial value.
__m512 sum457 = sum456;
__m512 sum458 = sum456;
__m512 sum459 = sum456;
__m512 sum461 = sum460;
__m512 sum462 = sum460;
__m512 sum463 = sum460;
__m512 sum465 = sum464;
__m512 sum466 = sum464;
__m512 sum467 = sum464;
__m512 sum469 = sum468;
__m512 sum470 = sum468;
__m512 sum471 = sum468;
__m512 sum473 = sum472;
__m512 sum474 = sum472;
__m512 sum475 = sum472;
__m512 sum477 = sum476;
__m512 sum478 = sum476;
__m512 sum479 = sum476;
// 512 reduction steps: load four data vectors (256 bytes per step), then for
// each of the six output rows broadcast one weight and FMA it into that row's
// four accumulators.
for (s53 = 0; s53 < 512; ++s53) {
__m512 dat2354 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)0);
__m512 dat2355 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)64);
__m512 dat2356 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)128);
__m512 dat2357 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s53+(ptrdiff_t)192);
__m512 wt579 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)24));
sum456 = _mm512_fmadd_ps(wt579, dat2354, sum456);
sum457 = _mm512_fmadd_ps(wt579, dat2355, sum457);
sum458 = _mm512_fmadd_ps(wt579, dat2356, sum458);
sum459 = _mm512_fmadd_ps(wt579, dat2357, sum459);
__m512 wt580 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)28));
sum460 = _mm512_fmadd_ps(wt580, dat2354, sum460);
sum461 = _mm512_fmadd_ps(wt580, dat2355, sum461);
sum462 = _mm512_fmadd_ps(wt580, dat2356, sum462);
sum463 = _mm512_fmadd_ps(wt580, dat2357, sum463);
__m512 wt581 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)32));
sum464 = _mm512_fmadd_ps(wt581, dat2354, sum464);
sum465 = _mm512_fmadd_ps(wt581, dat2355, sum465);
sum466 = _mm512_fmadd_ps(wt581, dat2356, sum466);
sum467 = _mm512_fmadd_ps(wt581, dat2357, sum467);
__m512 wt582 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)36));
sum468 = _mm512_fmadd_ps(wt582, dat2354, sum468);
sum469 = _mm512_fmadd_ps(wt582, dat2355, sum469);
sum470 = _mm512_fmadd_ps(wt582, dat2356, sum470);
sum471 = _mm512_fmadd_ps(wt582, dat2357, sum471);
__m512 wt583 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)40));
sum472 = _mm512_fmadd_ps(wt583, dat2354, sum472);
sum473 = _mm512_fmadd_ps(wt583, dat2355, sum473);
sum474 = _mm512_fmadd_ps(wt583, dat2356, sum474);
sum475 = _mm512_fmadd_ps(wt583, dat2357, sum475);
__m512 wt584 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+24*s53+(ptrdiff_t)44));
sum476 = _mm512_fmadd_ps(wt584, dat2354, sum476);
sum477 = _mm512_fmadd_ps(wt584, dat2355, sum477);
sum478 = _mm512_fmadd_ps(wt584, dat2356, sum478);
sum479 = _mm512_fmadd_ps(wt584, dat2357, sum479);
}
// Clamp at zero and store. Output rows of this group are 3136 bytes apart;
// the four vectors of a row are contiguous (64 bytes each).
sum456 = _mm512_max_ps(_mm512_setzero_ps(), sum456);
sum457 = _mm512_max_ps(_mm512_setzero_ps(), sum457);
sum458 = _mm512_max_ps(_mm512_setzero_ps(), sum458);
sum459 = _mm512_max_ps(_mm512_setzero_ps(), sum459);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)0, 65535, sum456);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)64, 65535, sum457);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)128, 65535, sum458);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)192, 65535, sum459);
sum460 = _mm512_max_ps(_mm512_setzero_ps(), sum460);
sum461 = _mm512_max_ps(_mm512_setzero_ps(), sum461);
sum462 = _mm512_max_ps(_mm512_setzero_ps(), sum462);
sum463 = _mm512_max_ps(_mm512_setzero_ps(), sum463);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3136, 65535, sum460);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3200, 65535, sum461);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3264, 65535, sum462);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3328, 65535, sum463);
sum464 = _mm512_max_ps(_mm512_setzero_ps(), sum464);
sum465 = _mm512_max_ps(_mm512_setzero_ps(), sum465);
sum466 = _mm512_max_ps(_mm512_setzero_ps(), sum466);
sum467 = _mm512_max_ps(_mm512_setzero_ps(), sum467);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6272, 65535, sum464);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6336, 65535, sum465);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6400, 65535, sum466);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)6464, 65535, sum467);
sum468 = _mm512_max_ps(_mm512_setzero_ps(), sum468);
sum469 = _mm512_max_ps(_mm512_setzero_ps(), sum469);
sum470 = _mm512_max_ps(_mm512_setzero_ps(), sum470);
sum471 = _mm512_max_ps(_mm512_setzero_ps(), sum471);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9408, 65535, sum468);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9472, 65535, sum469);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9536, 65535, sum470);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)9600, 65535, sum471);
sum472 = _mm512_max_ps(_mm512_setzero_ps(), sum472);
sum473 = _mm512_max_ps(_mm512_setzero_ps(), sum473);
sum474 = _mm512_max_ps(_mm512_setzero_ps(), sum474);
sum475 = _mm512_max_ps(_mm512_setzero_ps(), sum475);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12544, 65535, sum472);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12608, 65535, sum473);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12672, 65535, sum474);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)12736, 65535, sum475);
sum476 = _mm512_max_ps(_mm512_setzero_ps(), sum476);
sum477 = _mm512_max_ps(_mm512_setzero_ps(), sum477);
sum478 = _mm512_max_ps(_mm512_setzero_ps(), sum478);
sum479 = _mm512_max_ps(_mm512_setzero_ps(), sum479);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15680, 65535, sum476);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15744, 65535, sum477);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15808, 65535, sum478);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)15872, 65535, sum479);
// Always true on the first pass (kk51 is the starting k151): one tile per task.
if (k151 >= kk51) return;
}
// ---- Tile: 2 output rows x 4 vectors (reached only when w64 == 85) ----
// The last weight group holds two floats per step (8-byte pitch); with
// s54 == -1 the initial values are read from byte offsets 0 and 4 of the group.
ptrdiff_t s54 = -1;
__m512 sum480 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)8));
__m512 sum484 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)12));
__m512 sum481 = sum480;
__m512 sum482 = sum480;
__m512 sum483 = sum480;
__m512 sum485 = sum484;
__m512 sum486 = sum484;
__m512 sum487 = sum484;
for (s54 = 0; s54 < 512; ++s54) {
__m512 dat2358 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)0);
__m512 dat2359 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)64);
__m512 dat2360 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)128);
__m512 dat2361 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+256*s54+(ptrdiff_t)192);
__m512 wt585 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)8));
sum480 = _mm512_fmadd_ps(wt585, dat2358, sum480);
sum481 = _mm512_fmadd_ps(wt585, dat2359, sum481);
sum482 = _mm512_fmadd_ps(wt585, dat2360, sum482);
sum483 = _mm512_fmadd_ps(wt585, dat2361, sum483);
__m512 wt586 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k151+8*s54+(ptrdiff_t)12));
sum484 = _mm512_fmadd_ps(wt586, dat2358, sum484);
sum485 = _mm512_fmadd_ps(wt586, dat2359, sum485);
sum486 = _mm512_fmadd_ps(wt586, dat2360, sum486);
sum487 = _mm512_fmadd_ps(wt586, dat2361, sum487);
}
sum480 = _mm512_max_ps(_mm512_setzero_ps(), sum480);
sum481 = _mm512_max_ps(_mm512_setzero_ps(), sum481);
sum482 = _mm512_max_ps(_mm512_setzero_ps(), sum482);
sum483 = _mm512_max_ps(_mm512_setzero_ps(), sum483);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)0, 65535, sum480);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)64, 65535, sum481);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)128, 65535, sum482);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)192, 65535, sum483);
sum484 = _mm512_max_ps(_mm512_setzero_ps(), sum484);
sum485 = _mm512_max_ps(_mm512_setzero_ps(), sum485);
sum486 = _mm512_max_ps(_mm512_setzero_ps(), sum486);
sum487 = _mm512_max_ps(_mm512_setzero_ps(), sum487);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3136, 65535, sum484);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3200, 65535, sum485);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3264, 65535, sum486);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k151+(ptrdiff_t)3328, 65535, sum487);
// Always true here (jj50 is the starting j50): the task is finished.
if (j50 >= jj50) return;
}
// Reached only when d19 == 12 (j50 == 12): the last column block carries a
// single data vector per step, stored at a 64-byte pitch in arrangedDats9.
ptrdiff_t k152 = 1*w64;
ptrdiff_t kk52 = k152+0;
for (; k152 != 85; ++k152) {
// ---- Tile: 6 output rows x 1 vector ----
ptrdiff_t s55 = -1;
__m512 sum488 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)24));
__m512 sum489 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)28));
__m512 sum490 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)32));
__m512 sum491 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)36));
__m512 sum492 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)40));
__m512 sum493 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)44));
for (s55 = 0; s55 < 512; ++s55) {
__m512 dat2362 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+64*s55+(ptrdiff_t)0);
__m512 wt587 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)24));
sum488 = _mm512_fmadd_ps(wt587, dat2362, sum488);
__m512 wt588 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)28));
sum489 = _mm512_fmadd_ps(wt588, dat2362, sum489);
__m512 wt589 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)32));
sum490 = _mm512_fmadd_ps(wt589, dat2362, sum490);
__m512 wt590 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)36));
sum491 = _mm512_fmadd_ps(wt590, dat2362, sum491);
__m512 wt591 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)40));
sum492 = _mm512_fmadd_ps(wt591, dat2362, sum492);
__m512 wt592 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+24*s55+(ptrdiff_t)44));
sum493 = _mm512_fmadd_ps(wt592, dat2362, sum493);
}
sum488 = _mm512_max_ps(_mm512_setzero_ps(), sum488);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)0, 65535, sum488);
sum489 = _mm512_max_ps(_mm512_setzero_ps(), sum489);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)3136, 65535, sum489);
sum490 = _mm512_max_ps(_mm512_setzero_ps(), sum490);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)6272, 65535, sum490);
sum491 = _mm512_max_ps(_mm512_setzero_ps(), sum491);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)9408, 65535, sum491);
sum492 = _mm512_max_ps(_mm512_setzero_ps(), sum492);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)12544, 65535, sum492);
sum493 = _mm512_max_ps(_mm512_setzero_ps(), sum493);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)15680, 65535, sum493);
// Always true on the first pass: one tile per task.
if (k152 >= kk52) return;
}
// ---- Tile: 2 output rows x 1 vector (d19 == 12 and w64 == 85) ----
ptrdiff_t s56 = -1;
__m512 sum494 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)8));
__m512 sum495 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)12));
for (s56 = 0; s56 < 512; ++s56) {
__m512 dat2363 = _mm512_loadu_ps(arrangedDats9+1605632*i58+131072*j50+64*s56+(ptrdiff_t)0);
__m512 wt593 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)8));
sum494 = _mm512_fmadd_ps(wt593, dat2363, sum494);
__m512 wt594 = _mm512_set1_ps(*(float*)(arrangedWts9+1050624*i58+12312*k152+8*s56+(ptrdiff_t)12));
sum495 = _mm512_fmadd_ps(wt594, dat2363, sum495);
}
sum494 = _mm512_max_ps(_mm512_setzero_ps(), sum494);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)0, 65535, sum494);
sum495 = _mm512_max_ps(_mm512_setzero_ps(), sum495);
_mm512_mask_storeu_ps(datPtr30+1605632*i58+256*j50+18816*k152+(ptrdiff_t)3136, 65535, sum495);
}
}

// Hands ResNeXt50OneApply9Callee1 to the threader as a 3-dimensional task with
// hull {86, 13, 1}. The callee receives a two-slot pointer array through any1:
// slot 0 is the tensor table, slot 1 is a null pointer.
// NOTE(review): the array lives on this stack frame, so this relies on
// ResNeXt50ThreaderDo1 not using the task after it returns -- confirm.
static void ResNeXt50OneApply9(ResNeXt50ThreaderTeam1* team61, char** tensors95) {
    void* args[] = {tensors95, 0};
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 86;
    job.hull1[1] = 13;
    job.hull1[2] = 1;
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply9Callee1;
    ResNeXt50ThreaderDo1(team61, &job);
}

static void ResNeXt50OneArrangeWts10Callee1(ResNeXt50ThreaderTask1* task108, int64_t* pt59) {
char** tensors106 = task108->any1;
ptrdiff_t b81 = pt59[0];
char*restrict wtPtr17 = tensors106[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr17 = tensors106[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr18 = tensors106[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged19 = tensors106[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)2101248*0;
ptrdiff_t ii46 = 1;
for (ptrdiff_t i64 = 0; i64 < ii46; ++i64) {
ptrdiff_t j56 = 1*b81;
ptrdiff_t jj54 = j56+1;
for (; j56 < jj54; ++j56) {
if (j56 < 63) {
ptrdiff_t k161 = 0+16*(j56-0);
ptrdiff_t l69 = (size_t)(0+k161)/6;
ptrdiff_t cut23 = (size_t)(0+k161)%6;
switch (cut23) {
case 0:;
case 2: {
__m512 sum497 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k161);
__m512i pmMul35 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd35 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo29 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k161+1024*i64));
__m512 masHi29 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k161+1024*i64)+(ptrdiff_t)64);
__m512 postMul54 = _mm512_permutex2var_ps(masLo29, pmMul35, masHi29);
__m512 postAdd36 = _mm512_permutex2var_ps(masLo29, pmAdd35, masHi29);
sum497 = _mm512_fmadd_ps(sum497, postMul54, postAdd36);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum497);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)12288, 4032>>cut23, sum497);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)24576, 65535-(4095>>cut23), sum497);
ptrdiff_t c52 = 0;
for (; c52 != 32; ++c52) {
__m512 wt617 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)0);
__m512 wt618 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)2048);
__m512 wt619 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)4096);
__m512 wt620 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)6144);
__m512 wt621 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)8192);
__m512 wt622 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)10240);
__m512 wt623 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)12288);
__m512 wt624 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)14336);
__m512 wt625 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)16384);
__m512 wt626 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)18432);
__m512 wt627 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)20480);
__m512 wt628 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)22528);
__m512 wt629 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)24576);
__m512 wt630 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)26624);
__m512 wt631 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)28672);
__m512 wt632 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c52+(ptrdiff_t)30720);
__m512 tmp13987 = _mm512_unpacklo_ps(wt617, wt618);
__m512 tmp13988 = _mm512_unpackhi_ps(wt617, wt618);
__m512 tmp13989 = _mm512_unpacklo_ps(wt619, wt620);
__m512 tmp13990 = _mm512_unpackhi_ps(wt619, wt620);
__m512 tmp13991 = _mm512_unpacklo_ps(wt621, wt622);
__m512 tmp13992 = _mm512_unpackhi_ps(wt621, wt622);
__m512 tmp13993 = _mm512_unpacklo_ps(wt623, wt624);
__m512 tmp13994 = _mm512_unpackhi_ps(wt623, wt624);
__m512 tmp13995 = _mm512_unpacklo_ps(wt625, wt626);
__m512 tmp13996 = _mm512_unpackhi_ps(wt625, wt626);
__m512 tmp13997 = _mm512_unpacklo_ps(wt627, wt628);
__m512 tmp13998 = _mm512_unpackhi_ps(wt627, wt628);
__m512 tmp13999 = _mm512_unpacklo_ps(wt629, wt630);
__m512 tmp14000 = _mm512_unpackhi_ps(wt629, wt630);
__m512 tmp14001 = _mm512_unpacklo_ps(wt631, wt632);
__m512 tmp14002 = _mm512_unpackhi_ps(wt631, wt632);
__m512 tmp14003 = _mm512_shuffle_ps(tmp13987, tmp13989, 68);
__m512 tmp14004 = _mm512_shuffle_ps(tmp13987, tmp13989, 238);
__m512 tmp14005 = _mm512_shuffle_ps(tmp13988, tmp13990, 68);
__m512 tmp14006 = _mm512_shuffle_ps(tmp13988, tmp13990, 238);
__m512 tmp14007 = _mm512_shuffle_ps(tmp13991, tmp13993, 68);
__m512 tmp14008 = _mm512_shuffle_ps(tmp13991, tmp13993, 238);
__m512 tmp14009 = _mm512_shuffle_ps(tmp13992, tmp13994, 68);
__m512 tmp14010 = _mm512_shuffle_ps(tmp13992, tmp13994, 238);
__m512 tmp14011 = _mm512_shuffle_ps(tmp13995, tmp13997, 68);
__m512 tmp14012 = _mm512_shuffle_ps(tmp13995, tmp13997, 238);
__m512 tmp14013 = _mm512_shuffle_ps(tmp13996, tmp13998, 68);
__m512 tmp14014 = _mm512_shuffle_ps(tmp13996, tmp13998, 238);
__m512 tmp14015 = _mm512_shuffle_ps(tmp13999, tmp14001, 68);
__m512 tmp14016 = _mm512_shuffle_ps(tmp13999, tmp14001, 238);
__m512 tmp14017 = _mm512_shuffle_ps(tmp14000, tmp14002, 68);
__m512 tmp14018 = _mm512_shuffle_ps(tmp14000, tmp14002, 238);
__m512 tmp14019 = _mm512_shuffle_f32x4(tmp14003, tmp14007, 136);
__m512 tmp14020 = _mm512_shuffle_f32x4(tmp14003, tmp14007, 221);
__m512 tmp14021 = _mm512_shuffle_f32x4(tmp14004, tmp14008, 136);
__m512 tmp14022 = _mm512_shuffle_f32x4(tmp14004, tmp14008, 221);
__m512 tmp14023 = _mm512_shuffle_f32x4(tmp14005, tmp14009, 136);
__m512 tmp14024 = _mm512_shuffle_f32x4(tmp14005, tmp14009, 221);
__m512 tmp14025 = _mm512_shuffle_f32x4(tmp14006, tmp14010, 136);
__m512 tmp14026 = _mm512_shuffle_f32x4(tmp14006, tmp14010, 221);
__m512 tmp14027 = _mm512_shuffle_f32x4(tmp14011, tmp14015, 136);
__m512 tmp14028 = _mm512_shuffle_f32x4(tmp14011, tmp14015, 221);
__m512 tmp14029 = _mm512_shuffle_f32x4(tmp14012, tmp14016, 136);
__m512 tmp14030 = _mm512_shuffle_f32x4(tmp14012, tmp14016, 221);
__m512 tmp14031 = _mm512_shuffle_f32x4(tmp14013, tmp14017, 136);
__m512 tmp14032 = _mm512_shuffle_f32x4(tmp14013, tmp14017, 221);
__m512 tmp14033 = _mm512_shuffle_f32x4(tmp14014, tmp14018, 136);
__m512 tmp14034 = _mm512_shuffle_f32x4(tmp14014, tmp14018, 221);
wt617 = _mm512_shuffle_f32x4(tmp14019, tmp14027, 136);
wt625 = _mm512_shuffle_f32x4(tmp14019, tmp14027, 221);
wt618 = _mm512_shuffle_f32x4(tmp14021, tmp14029, 136);
wt626 = _mm512_shuffle_f32x4(tmp14021, tmp14029, 221);
wt619 = _mm512_shuffle_f32x4(tmp14023, tmp14031, 136);
wt627 = _mm512_shuffle_f32x4(tmp14023, tmp14031, 221);
wt620 = _mm512_shuffle_f32x4(tmp14025, tmp14033, 136);
wt628 = _mm512_shuffle_f32x4(tmp14025, tmp14033, 221);
wt621 = _mm512_shuffle_f32x4(tmp14020, tmp14028, 136);
wt629 = _mm512_shuffle_f32x4(tmp14020, tmp14028, 221);
wt622 = _mm512_shuffle_f32x4(tmp14022, tmp14030, 136);
wt630 = _mm512_shuffle_f32x4(tmp14022, tmp14030, 221);
wt623 = _mm512_shuffle_f32x4(tmp14024, tmp14032, 136);
wt631 = _mm512_shuffle_f32x4(tmp14024, tmp14032, 221);
wt624 = _mm512_shuffle_f32x4(tmp14026, tmp14034, 136);
wt632 = _mm512_shuffle_f32x4(tmp14026, tmp14034, 221);
wt617 = _mm512_mul_ps(wt617, postMul54);
wt618 = _mm512_mul_ps(wt618, postMul54);
wt619 = _mm512_mul_ps(wt619, postMul54);
wt620 = _mm512_mul_ps(wt620, postMul54);
wt621 = _mm512_mul_ps(wt621, postMul54);
wt622 = _mm512_mul_ps(wt622, postMul54);
wt623 = _mm512_mul_ps(wt623, postMul54);
wt624 = _mm512_mul_ps(wt624, postMul54);
wt625 = _mm512_mul_ps(wt625, postMul54);
wt626 = _mm512_mul_ps(wt626, postMul54);
wt627 = _mm512_mul_ps(wt627, postMul54);
wt628 = _mm512_mul_ps(wt628, postMul54);
wt629 = _mm512_mul_ps(wt629, postMul54);
wt630 = _mm512_mul_ps(wt630, postMul54);
wt631 = _mm512_mul_ps(wt631, postMul54);
wt632 = _mm512_mul_ps(wt632, postMul54);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)0, 63>>cut23, wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)0, 63>>cut23, wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)0, 63>>cut23, wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)0, 63>>cut23, wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)0, 63>>cut23, wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)0, 63>>cut23, wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)0, 63>>cut23, wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)0, 63>>cut23, wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)0, 63>>cut23, wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)0, 63>>cut23, wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)0, 63>>cut23, wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)0, 63>>cut23, wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)0, 63>>cut23, wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)0, 63>>cut23, wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)0, 63>>cut23, wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)0, 63>>cut23, wt632);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)12288, 4032>>cut23, wt632);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt617);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt618);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt619);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt620);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt621);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt622);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt623);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt624);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt625);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt626);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt627);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt628);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt629);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt630);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt631);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c52)+(ptrdiff_t)24576, 65535-(4095>>cut23), wt632);
}
break;
}
default: {
cut23 = 4;
__m512 sum498 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k161);
__m512i pmMul36 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd36 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo30 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k161+1024*i64));
__m512 masHi30 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k161+1024*i64)+(ptrdiff_t)64);
__m512 postMul55 = _mm512_permutex2var_ps(masLo30, pmMul36, masHi30);
__m512 postAdd37 = _mm512_permutex2var_ps(masLo30, pmAdd36, masHi30);
sum498 = _mm512_fmadd_ps(sum498, postMul55, postAdd37);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)0, 63>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)12288, 4032>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)24576, 258048>>cut23, sum498);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*0+(ptrdiff_t)36864, 65535-(262143>>cut23), sum498);
ptrdiff_t c53 = 0;
for (; c53 != 32; ++c53) {
__m512 wt633 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)0);
__m512 wt634 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)2048);
__m512 wt635 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)4096);
__m512 wt636 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)6144);
__m512 wt637 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)8192);
__m512 wt638 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)10240);
__m512 wt639 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)12288);
__m512 wt640 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)14336);
__m512 wt641 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)16384);
__m512 wt642 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)18432);
__m512 wt643 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)20480);
__m512 wt644 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)22528);
__m512 wt645 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)24576);
__m512 wt646 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)26624);
__m512 wt647 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)28672);
__m512 wt648 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k161+64*c53+(ptrdiff_t)30720);
__m512 tmp14035 = _mm512_unpacklo_ps(wt633, wt634);
__m512 tmp14036 = _mm512_unpackhi_ps(wt633, wt634);
__m512 tmp14037 = _mm512_unpacklo_ps(wt635, wt636);
__m512 tmp14038 = _mm512_unpackhi_ps(wt635, wt636);
__m512 tmp14039 = _mm512_unpacklo_ps(wt637, wt638);
__m512 tmp14040 = _mm512_unpackhi_ps(wt637, wt638);
__m512 tmp14041 = _mm512_unpacklo_ps(wt639, wt640);
__m512 tmp14042 = _mm512_unpackhi_ps(wt639, wt640);
__m512 tmp14043 = _mm512_unpacklo_ps(wt641, wt642);
__m512 tmp14044 = _mm512_unpackhi_ps(wt641, wt642);
__m512 tmp14045 = _mm512_unpacklo_ps(wt643, wt644);
__m512 tmp14046 = _mm512_unpackhi_ps(wt643, wt644);
__m512 tmp14047 = _mm512_unpacklo_ps(wt645, wt646);
__m512 tmp14048 = _mm512_unpackhi_ps(wt645, wt646);
__m512 tmp14049 = _mm512_unpacklo_ps(wt647, wt648);
__m512 tmp14050 = _mm512_unpackhi_ps(wt647, wt648);
__m512 tmp14051 = _mm512_shuffle_ps(tmp14035, tmp14037, 68);
__m512 tmp14052 = _mm512_shuffle_ps(tmp14035, tmp14037, 238);
__m512 tmp14053 = _mm512_shuffle_ps(tmp14036, tmp14038, 68);
__m512 tmp14054 = _mm512_shuffle_ps(tmp14036, tmp14038, 238);
__m512 tmp14055 = _mm512_shuffle_ps(tmp14039, tmp14041, 68);
__m512 tmp14056 = _mm512_shuffle_ps(tmp14039, tmp14041, 238);
__m512 tmp14057 = _mm512_shuffle_ps(tmp14040, tmp14042, 68);
__m512 tmp14058 = _mm512_shuffle_ps(tmp14040, tmp14042, 238);
__m512 tmp14059 = _mm512_shuffle_ps(tmp14043, tmp14045, 68);
__m512 tmp14060 = _mm512_shuffle_ps(tmp14043, tmp14045, 238);
__m512 tmp14061 = _mm512_shuffle_ps(tmp14044, tmp14046, 68);
__m512 tmp14062 = _mm512_shuffle_ps(tmp14044, tmp14046, 238);
__m512 tmp14063 = _mm512_shuffle_ps(tmp14047, tmp14049, 68);
__m512 tmp14064 = _mm512_shuffle_ps(tmp14047, tmp14049, 238);
__m512 tmp14065 = _mm512_shuffle_ps(tmp14048, tmp14050, 68);
__m512 tmp14066 = _mm512_shuffle_ps(tmp14048, tmp14050, 238);
__m512 tmp14067 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 136);
__m512 tmp14068 = _mm512_shuffle_f32x4(tmp14051, tmp14055, 221);
__m512 tmp14069 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 136);
__m512 tmp14070 = _mm512_shuffle_f32x4(tmp14052, tmp14056, 221);
__m512 tmp14071 = _mm512_shuffle_f32x4(tmp14053, tmp14057, 136);
__m512 tmp14072 = _mm512_shuffle_f32x4(tmp14053, tmp14057, 221);
__m512 tmp14073 = _mm512_shuffle_f32x4(tmp14054, tmp14058, 136);
__m512 tmp14074 = _mm512_shuffle_f32x4(tmp14054, tmp14058, 221);
__m512 tmp14075 = _mm512_shuffle_f32x4(tmp14059, tmp14063, 136);
__m512 tmp14076 = _mm512_shuffle_f32x4(tmp14059, tmp14063, 221);
__m512 tmp14077 = _mm512_shuffle_f32x4(tmp14060, tmp14064, 136);
__m512 tmp14078 = _mm512_shuffle_f32x4(tmp14060, tmp14064, 221);
__m512 tmp14079 = _mm512_shuffle_f32x4(tmp14061, tmp14065, 136);
__m512 tmp14080 = _mm512_shuffle_f32x4(tmp14061, tmp14065, 221);
__m512 tmp14081 = _mm512_shuffle_f32x4(tmp14062, tmp14066, 136);
__m512 tmp14082 = _mm512_shuffle_f32x4(tmp14062, tmp14066, 221);
wt633 = _mm512_shuffle_f32x4(tmp14067, tmp14075, 136);
wt641 = _mm512_shuffle_f32x4(tmp14067, tmp14075, 221);
wt634 = _mm512_shuffle_f32x4(tmp14069, tmp14077, 136);
wt642 = _mm512_shuffle_f32x4(tmp14069, tmp14077, 221);
wt635 = _mm512_shuffle_f32x4(tmp14071, tmp14079, 136);
wt643 = _mm512_shuffle_f32x4(tmp14071, tmp14079, 221);
wt636 = _mm512_shuffle_f32x4(tmp14073, tmp14081, 136);
wt644 = _mm512_shuffle_f32x4(tmp14073, tmp14081, 221);
wt637 = _mm512_shuffle_f32x4(tmp14068, tmp14076, 136);
wt645 = _mm512_shuffle_f32x4(tmp14068, tmp14076, 221);
wt638 = _mm512_shuffle_f32x4(tmp14070, tmp14078, 136);
wt646 = _mm512_shuffle_f32x4(tmp14070, tmp14078, 221);
wt639 = _mm512_shuffle_f32x4(tmp14072, tmp14080, 136);
wt647 = _mm512_shuffle_f32x4(tmp14072, tmp14080, 221);
wt640 = _mm512_shuffle_f32x4(tmp14074, tmp14082, 136);
wt648 = _mm512_shuffle_f32x4(tmp14074, tmp14082, 221);
wt633 = _mm512_mul_ps(wt633, postMul55);
wt634 = _mm512_mul_ps(wt634, postMul55);
wt635 = _mm512_mul_ps(wt635, postMul55);
wt636 = _mm512_mul_ps(wt636, postMul55);
wt637 = _mm512_mul_ps(wt637, postMul55);
wt638 = _mm512_mul_ps(wt638, postMul55);
wt639 = _mm512_mul_ps(wt639, postMul55);
wt640 = _mm512_mul_ps(wt640, postMul55);
wt641 = _mm512_mul_ps(wt641, postMul55);
wt642 = _mm512_mul_ps(wt642, postMul55);
wt643 = _mm512_mul_ps(wt643, postMul55);
wt644 = _mm512_mul_ps(wt644, postMul55);
wt645 = _mm512_mul_ps(wt645, postMul55);
wt646 = _mm512_mul_ps(wt646, postMul55);
wt647 = _mm512_mul_ps(wt647, postMul55);
wt648 = _mm512_mul_ps(wt648, postMul55);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)0, 63>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)0, 63>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)0, 63>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)0, 63>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)0, 63>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)0, 63>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)0, 63>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)0, 63>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)0, 63>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)0, 63>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)0, 63>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)0, 63>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)0, 63>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)0, 63>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)0, 63>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)0, 63>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)12288, 4032>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)24576, 258048>>cut23, wt648);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(1+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt633);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(2+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt634);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(3+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt635);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(4+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt636);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(5+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt637);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(6+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt638);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(7+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt639);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(8+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt640);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(9+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt641);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(10+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt642);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(11+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt643);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(12+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt644);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(13+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt645);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(14+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt646);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(15+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt647);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l69+4*cut23+24*(16+16*c53)+(ptrdiff_t)36864, 65535-(262143>>cut23), wt648);
}
}
}
} else {
ptrdiff_t k160 = 1008;
ptrdiff_t l68 = (size_t)(0+k160)/6;
ptrdiff_t cut22 = (size_t)(0+k160)%6;
__m512 sum496 = _mm512_maskz_loadu_ps(65535, biasPtr17+4096*i64+4*k160);
__m512i pmMul37 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd37 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo31 = _mm512_loadu_ps(bnPtr18+(ptrdiff_t)8*(k160+1024*i64));
__m512 masHi31 = _mm512_maskz_loadu_ps(65535, bnPtr18+(ptrdiff_t)8*(k160+1024*i64)+(ptrdiff_t)64);
__m512 postMul53 = _mm512_permutex2var_ps(masLo31, pmMul37, masHi31);
__m512 postAdd35 = _mm512_permutex2var_ps(masLo31, pmAdd37, masHi31);
sum496 = _mm512_fmadd_ps(sum496, postMul53, postAdd35);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*0+(ptrdiff_t)0, 63>>cut22, sum496);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*0+(ptrdiff_t)12288, 4032>>cut22, sum496);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*0+(ptrdiff_t)24576, 65535-(4095>>cut22), sum496);
ptrdiff_t c51 = 0;
for (; c51 != 32; ++c51) {
__m512 wt601 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)0);
__m512 wt602 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)2048);
__m512 wt603 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)4096);
__m512 wt604 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)6144);
__m512 wt605 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)8192);
__m512 wt606 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)10240);
__m512 wt607 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)12288);
__m512 wt608 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)14336);
__m512 wt609 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)16384);
__m512 wt610 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)18432);
__m512 wt611 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)20480);
__m512 wt612 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)22528);
__m512 wt613 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)24576);
__m512 wt614 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)26624);
__m512 wt615 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)28672);
__m512 wt616 = _mm512_maskz_loadu_ps(65535, wtPtr17+2097152*i64+2048*k160+64*c51+(ptrdiff_t)30720);
__m512 tmp14083 = _mm512_unpacklo_ps(wt601, wt602);
__m512 tmp14084 = _mm512_unpackhi_ps(wt601, wt602);
__m512 tmp14085 = _mm512_unpacklo_ps(wt603, wt604);
__m512 tmp14086 = _mm512_unpackhi_ps(wt603, wt604);
__m512 tmp14087 = _mm512_unpacklo_ps(wt605, wt606);
__m512 tmp14088 = _mm512_unpackhi_ps(wt605, wt606);
__m512 tmp14089 = _mm512_unpacklo_ps(wt607, wt608);
__m512 tmp14090 = _mm512_unpackhi_ps(wt607, wt608);
__m512 tmp14091 = _mm512_unpacklo_ps(wt609, wt610);
__m512 tmp14092 = _mm512_unpackhi_ps(wt609, wt610);
__m512 tmp14093 = _mm512_unpacklo_ps(wt611, wt612);
__m512 tmp14094 = _mm512_unpackhi_ps(wt611, wt612);
__m512 tmp14095 = _mm512_unpacklo_ps(wt613, wt614);
__m512 tmp14096 = _mm512_unpackhi_ps(wt613, wt614);
__m512 tmp14097 = _mm512_unpacklo_ps(wt615, wt616);
__m512 tmp14098 = _mm512_unpackhi_ps(wt615, wt616);
__m512 tmp14099 = _mm512_shuffle_ps(tmp14083, tmp14085, 68);
__m512 tmp14100 = _mm512_shuffle_ps(tmp14083, tmp14085, 238);
__m512 tmp14101 = _mm512_shuffle_ps(tmp14084, tmp14086, 68);
__m512 tmp14102 = _mm512_shuffle_ps(tmp14084, tmp14086, 238);
__m512 tmp14103 = _mm512_shuffle_ps(tmp14087, tmp14089, 68);
__m512 tmp14104 = _mm512_shuffle_ps(tmp14087, tmp14089, 238);
__m512 tmp14105 = _mm512_shuffle_ps(tmp14088, tmp14090, 68);
__m512 tmp14106 = _mm512_shuffle_ps(tmp14088, tmp14090, 238);
__m512 tmp14107 = _mm512_shuffle_ps(tmp14091, tmp14093, 68);
__m512 tmp14108 = _mm512_shuffle_ps(tmp14091, tmp14093, 238);
__m512 tmp14109 = _mm512_shuffle_ps(tmp14092, tmp14094, 68);
__m512 tmp14110 = _mm512_shuffle_ps(tmp14092, tmp14094, 238);
__m512 tmp14111 = _mm512_shuffle_ps(tmp14095, tmp14097, 68);
__m512 tmp14112 = _mm512_shuffle_ps(tmp14095, tmp14097, 238);
__m512 tmp14113 = _mm512_shuffle_ps(tmp14096, tmp14098, 68);
__m512 tmp14114 = _mm512_shuffle_ps(tmp14096, tmp14098, 238);
__m512 tmp14115 = _mm512_shuffle_f32x4(tmp14099, tmp14103, 136);
__m512 tmp14116 = _mm512_shuffle_f32x4(tmp14099, tmp14103, 221);
__m512 tmp14117 = _mm512_shuffle_f32x4(tmp14100, tmp14104, 136);
__m512 tmp14118 = _mm512_shuffle_f32x4(tmp14100, tmp14104, 221);
__m512 tmp14119 = _mm512_shuffle_f32x4(tmp14101, tmp14105, 136);
__m512 tmp14120 = _mm512_shuffle_f32x4(tmp14101, tmp14105, 221);
__m512 tmp14121 = _mm512_shuffle_f32x4(tmp14102, tmp14106, 136);
__m512 tmp14122 = _mm512_shuffle_f32x4(tmp14102, tmp14106, 221);
__m512 tmp14123 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 136);
__m512 tmp14124 = _mm512_shuffle_f32x4(tmp14107, tmp14111, 221);
__m512 tmp14125 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 136);
__m512 tmp14126 = _mm512_shuffle_f32x4(tmp14108, tmp14112, 221);
__m512 tmp14127 = _mm512_shuffle_f32x4(tmp14109, tmp14113, 136);
__m512 tmp14128 = _mm512_shuffle_f32x4(tmp14109, tmp14113, 221);
__m512 tmp14129 = _mm512_shuffle_f32x4(tmp14110, tmp14114, 136);
__m512 tmp14130 = _mm512_shuffle_f32x4(tmp14110, tmp14114, 221);
wt601 = _mm512_shuffle_f32x4(tmp14115, tmp14123, 136);
wt609 = _mm512_shuffle_f32x4(tmp14115, tmp14123, 221);
wt602 = _mm512_shuffle_f32x4(tmp14117, tmp14125, 136);
wt610 = _mm512_shuffle_f32x4(tmp14117, tmp14125, 221);
wt603 = _mm512_shuffle_f32x4(tmp14119, tmp14127, 136);
wt611 = _mm512_shuffle_f32x4(tmp14119, tmp14127, 221);
wt604 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 136);
wt612 = _mm512_shuffle_f32x4(tmp14121, tmp14129, 221);
wt605 = _mm512_shuffle_f32x4(tmp14116, tmp14124, 136);
wt613 = _mm512_shuffle_f32x4(tmp14116, tmp14124, 221);
wt606 = _mm512_shuffle_f32x4(tmp14118, tmp14126, 136);
wt614 = _mm512_shuffle_f32x4(tmp14118, tmp14126, 221);
wt607 = _mm512_shuffle_f32x4(tmp14120, tmp14128, 136);
wt615 = _mm512_shuffle_f32x4(tmp14120, tmp14128, 221);
wt608 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 136);
wt616 = _mm512_shuffle_f32x4(tmp14122, tmp14130, 221);
wt601 = _mm512_mul_ps(wt601, postMul53);
wt602 = _mm512_mul_ps(wt602, postMul53);
wt603 = _mm512_mul_ps(wt603, postMul53);
wt604 = _mm512_mul_ps(wt604, postMul53);
wt605 = _mm512_mul_ps(wt605, postMul53);
wt606 = _mm512_mul_ps(wt606, postMul53);
wt607 = _mm512_mul_ps(wt607, postMul53);
wt608 = _mm512_mul_ps(wt608, postMul53);
wt609 = _mm512_mul_ps(wt609, postMul53);
wt610 = _mm512_mul_ps(wt610, postMul53);
wt611 = _mm512_mul_ps(wt611, postMul53);
wt612 = _mm512_mul_ps(wt612, postMul53);
wt613 = _mm512_mul_ps(wt613, postMul53);
wt614 = _mm512_mul_ps(wt614, postMul53);
wt615 = _mm512_mul_ps(wt615, postMul53);
wt616 = _mm512_mul_ps(wt616, postMul53);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(1+16*c51)+(ptrdiff_t)0, 63>>cut22, wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(2+16*c51)+(ptrdiff_t)0, 63>>cut22, wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(3+16*c51)+(ptrdiff_t)0, 63>>cut22, wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(4+16*c51)+(ptrdiff_t)0, 63>>cut22, wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(5+16*c51)+(ptrdiff_t)0, 63>>cut22, wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(6+16*c51)+(ptrdiff_t)0, 63>>cut22, wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(7+16*c51)+(ptrdiff_t)0, 63>>cut22, wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(8+16*c51)+(ptrdiff_t)0, 63>>cut22, wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(9+16*c51)+(ptrdiff_t)0, 63>>cut22, wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(10+16*c51)+(ptrdiff_t)0, 63>>cut22, wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(11+16*c51)+(ptrdiff_t)0, 63>>cut22, wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(12+16*c51)+(ptrdiff_t)0, 63>>cut22, wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(13+16*c51)+(ptrdiff_t)0, 63>>cut22, wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(14+16*c51)+(ptrdiff_t)0, 63>>cut22, wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(15+16*c51)+(ptrdiff_t)0, 63>>cut22, wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(16+16*c51)+(ptrdiff_t)0, 63>>cut22, wt616);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(1+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(2+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(3+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(4+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(5+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(6+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(7+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(8+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(9+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(10+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(11+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(12+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(13+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(14+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(15+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+24*(16+16*c51)+(ptrdiff_t)12288, 4032>>cut22, wt616);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(1+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt601);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(2+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt602);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(3+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt603);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(4+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt604);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(5+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt605);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(6+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt606);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(7+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt607);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(8+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt608);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(9+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt609);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(10+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt610);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(11+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt611);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(12+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt612);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(13+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt613);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(14+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt614);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(15+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt615);
_mm512_mask_storeu_ps(arranged19+2101248*i64+12312*l68+4*cut22+16*(16+16*c51)+(ptrdiff_t)24576, 65535-(4095>>cut22), wt616);
}
}
}
}
}

// Builds a threader task whose worker is ResNeXt50OneArrangeWts10Callee1
// and submits it through ResNeXt50ThreaderDo1.
//   team66     - thread team forwarded unchanged to ResNeXt50ThreaderDo1.
//   tensors105 - tensor pointer table, passed to the worker via any1.
// The task is described with nd1 = 3 and hull1 = {64, 1, 1}
// (presumably the extents of the index space the threader iterates over).
static void ResNeXt50OneArrangeWts10(ResNeXt50ThreaderTeam1* team66, char** tensors105) {
	ResNeXt50ThreaderTask1 job;
	// Shape of the work description.
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 1;
	job.hull1[0] = 64;
	// Payload and worker entry point.
	job.any1 = tensors105;
	job.callee1 = ResNeXt50OneArrangeWts10Callee1;
	ResNeXt50ThreaderDo1(team66, &job);
}

// Worker that copies one tile of the source tensor (tensors[0]) into the
// arranged buffer (tensors[1]).
//   pt60[0] - selects a run of 128 consecutive rows (128*pt60[0] .. +127);
//             source rows are 832 bytes apart.
//   pt60[1] - selects the column chunk. Any value other than 3 copies four
//             full 16-float vectors (256 bytes) per row; the value 3 copies
//             only the low 4 floats of a single vector per row.
// In the arranged buffer the chunks are 131072 bytes apart; rows inside a
// full chunk are 256 bytes apart, rows inside the short chunk 64 bytes apart.
static void ResNeXt50OneArrangeDats10Callee1(ResNeXt50ThreaderTask1* task110, int64_t* pt60) {
	char** bufs = task110->any1;
	const ptrdiff_t slab = pt60[0];
	const ptrdiff_t chunk = pt60[1];
	char*restrict src = bufs[0]+(ptrdiff_t)0+(ptrdiff_t)694720*0+(ptrdiff_t)425984*0;
	char*restrict dst = bufs[1]+(ptrdiff_t)694720*0+(ptrdiff_t)425984*0;
	const ptrdiff_t rowBegin = 128*slab;
	const ptrdiff_t rowEnd = rowBegin+128;
	for (ptrdiff_t pass = 0; pass < 1; ++pass) {
		if (chunk != 3) {
			// Full chunk: four whole vectors per row.
			for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
				char* in = src+425984*pass+256*chunk+832*row;
				char* out = dst+425984*pass+131072*chunk+256*row;
				__m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
				__m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
				__m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
				__m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
				_mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
				_mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
				_mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
				_mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
			}
		} else {
			// Short chunk: mask 15 moves only lanes 0..3 of one vector per row.
			for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
				char* in = src+425984*pass+256*chunk+832*row;
				char* out = dst+425984*pass+131072*chunk+64*row;
				__m512 v = _mm512_maskz_loadu_ps(15, in+(ptrdiff_t)0);
				_mm512_mask_storeu_ps(out+(ptrdiff_t)0, 15, v);
			}
		}
	}
}

// Builds a threader task whose worker is ResNeXt50OneArrangeDats10Callee1
// and submits it through ResNeXt50ThreaderDo1.
//   team67     - thread team forwarded unchanged to ResNeXt50ThreaderDo1.
//   tensors107 - tensor pointer table, passed to the worker via any1.
// The task is described with nd1 = 4 and hull1 = {4, 4, 1, 1}; the worker
// reads only the first two coordinates (row run and column chunk).
static void ResNeXt50OneArrangeDats10(ResNeXt50ThreaderTeam1* team67, char** tensors107) {
	ResNeXt50ThreaderTask1 job;
	// Shape of the work description.
	job.nd1 = 4;
	job.hull1[3] = 1;
	job.hull1[2] = 1;
	job.hull1[1] = 4;
	job.hull1[0] = 4;
	// Payload and worker entry point.
	job.any1 = tensors107;
	job.callee1 = ResNeXt50OneArrangeDats10Callee1;
	ResNeXt50ThreaderDo1(team67, &job);
}

// Worker for the multiply-accumulate ("apply") step that consumes the
// buffers produced by the ArrangeWts10 / ArrangeDats10 passes.
//
// task112->any1 is a pair whose first element is the tensor pointer table:
//   tensors110[0] - arranged weights (12312-byte record per row group)
//   tensors110[1] - arranged data
//   tensors110[2] - tensor added element-wise to the accumulated result
//   tensors110[3] - destination tensor
// pt61[0] (w68) selects the row group: any value other than 170 takes the
//   6-row path (24 bytes of weights per step), 170 takes the 4-row tail path
//   (16 bytes of weights per step).
// pt61[1] (d22) selects the column chunk: any value other than 3 processes
//   four 16-float vectors per row, 3 processes one vector of which only the
//   low 4 lanes are read from / written to the result tensors.
//
// Each call computes exactly one (w68, d22) tile: the loops below are written
// as "for" statements, but every path returns after its first tile.
// For each tile: accumulators start from a broadcast scalar stored at the
// head of the weight record (presumably a folded bias - confirm against the
// weight-arranging pass), 512 fused multiply-adds are applied, the matching
// values from tensors110[2] are added, the result is clamped below at zero,
// and it is stored to tensors110[3].
static void ResNeXt50OneApply10Callee1(ResNeXt50ThreaderTask1* task112, int64_t* pt61) {
void** pair26 = task112->any1;
char** tensors110 = pair26[0];
// e32 and g35 are fixed at zero here, so the offset terms built from them vanish.
ptrdiff_t e32 = 0;
ptrdiff_t g35 = 0;
ptrdiff_t d22 = pt61[1];
ptrdiff_t w68 = pt61[0];
char*restrict arrangedWts10 = tensors110[0]+3424256*e32+(ptrdiff_t)2101248*1*g35;
char*restrict arrangedDats10 = tensors110[1]+694720*e32+(ptrdiff_t)425984*1*g35;
char*restrict datPtr34 = tensors110[2]+(ptrdiff_t)851968*1*g35;
char*restrict datPtr35 = tensors110[3]+(ptrdiff_t)851968*1*g35;
ptrdiff_t ii48 = 1;
for (ptrdiff_t i66 = 0; i66 < ii48; ++i66) {
ptrdiff_t j58 = 1*d22;
ptrdiff_t jj56 = j58+0;
// Entered only when d22 != 3: full-width column chunk (four vectors per row).
for (; j58 != 3; ++j58) {
ptrdiff_t k164 = 1*w68;
ptrdiff_t kk56 = k164+0;
// Entered only when w68 != 170: 6 rows x 4 vectors = 24 accumulators.
for (; k164 != 170; ++k164) {
// With s63 == -1 the expression 24*s63+24 is 0, so these read the six
// floats at the start of the 12312-byte weight record.
ptrdiff_t s63 = -1;
__m512 sum499 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)24));
__m512 sum503 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)28));
__m512 sum507 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)32));
__m512 sum511 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)36));
__m512 sum515 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)40));
__m512 sum519 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)44));
// Every vector of a row starts from that row's broadcast scalar.
__m512 sum500 = sum499;
__m512 sum501 = sum499;
__m512 sum502 = sum499;
__m512 sum504 = sum503;
__m512 sum505 = sum503;
__m512 sum506 = sum503;
__m512 sum508 = sum507;
__m512 sum509 = sum507;
__m512 sum510 = sum507;
__m512 sum512 = sum511;
__m512 sum513 = sum511;
__m512 sum514 = sum511;
__m512 sum516 = sum515;
__m512 sum517 = sum515;
__m512 sum518 = sum515;
__m512 sum520 = sum519;
__m512 sum521 = sum519;
__m512 sum522 = sum519;
// Reduction over 512 steps: four data vectors are loaded once per step and
// reused against six broadcast weights.
for (s63 = 0; s63 < 512; ++s63) {
__m512 dat2457 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)0);
__m512 dat2458 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)64);
__m512 dat2459 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)128);
__m512 dat2460 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s63+(ptrdiff_t)192);
__m512 wt649 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)24));
sum499 = _mm512_fmadd_ps(wt649, dat2457, sum499);
sum500 = _mm512_fmadd_ps(wt649, dat2458, sum500);
sum501 = _mm512_fmadd_ps(wt649, dat2459, sum501);
sum502 = _mm512_fmadd_ps(wt649, dat2460, sum502);
__m512 wt650 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)28));
sum503 = _mm512_fmadd_ps(wt650, dat2457, sum503);
sum504 = _mm512_fmadd_ps(wt650, dat2458, sum504);
sum505 = _mm512_fmadd_ps(wt650, dat2459, sum505);
sum506 = _mm512_fmadd_ps(wt650, dat2460, sum506);
__m512 wt651 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)32));
sum507 = _mm512_fmadd_ps(wt651, dat2457, sum507);
sum508 = _mm512_fmadd_ps(wt651, dat2458, sum508);
sum509 = _mm512_fmadd_ps(wt651, dat2459, sum509);
sum510 = _mm512_fmadd_ps(wt651, dat2460, sum510);
__m512 wt652 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)36));
sum511 = _mm512_fmadd_ps(wt652, dat2457, sum511);
sum512 = _mm512_fmadd_ps(wt652, dat2458, sum512);
sum513 = _mm512_fmadd_ps(wt652, dat2459, sum513);
sum514 = _mm512_fmadd_ps(wt652, dat2460, sum514);
__m512 wt653 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)40));
sum515 = _mm512_fmadd_ps(wt653, dat2457, sum515);
sum516 = _mm512_fmadd_ps(wt653, dat2458, sum516);
sum517 = _mm512_fmadd_ps(wt653, dat2459, sum517);
sum518 = _mm512_fmadd_ps(wt653, dat2460, sum518);
__m512 wt654 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+24*s63+(ptrdiff_t)44));
sum519 = _mm512_fmadd_ps(wt654, dat2457, sum519);
sum520 = _mm512_fmadd_ps(wt654, dat2458, sum520);
sum521 = _mm512_fmadd_ps(wt654, dat2459, sum521);
sum522 = _mm512_fmadd_ps(wt654, dat2460, sum522);
}
// Epilogue, one row at a time (rows are 832 bytes apart in datPtr34/datPtr35):
// add the matching datPtr34 values, clamp below at zero, store to datPtr35.
sum499 = _mm512_add_ps(sum499, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)0));
sum500 = _mm512_add_ps(sum500, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)64));
sum501 = _mm512_add_ps(sum501, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)128));
sum502 = _mm512_add_ps(sum502, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)192));
sum499 = _mm512_max_ps(_mm512_setzero_ps(), sum499);
sum500 = _mm512_max_ps(_mm512_setzero_ps(), sum500);
sum501 = _mm512_max_ps(_mm512_setzero_ps(), sum501);
sum502 = _mm512_max_ps(_mm512_setzero_ps(), sum502);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)0, 65535, sum499);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)64, 65535, sum500);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)128, 65535, sum501);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)192, 65535, sum502);
sum503 = _mm512_add_ps(sum503, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)832));
sum504 = _mm512_add_ps(sum504, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)896));
sum505 = _mm512_add_ps(sum505, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)960));
sum506 = _mm512_add_ps(sum506, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024));
sum503 = _mm512_max_ps(_mm512_setzero_ps(), sum503);
sum504 = _mm512_max_ps(_mm512_setzero_ps(), sum504);
sum505 = _mm512_max_ps(_mm512_setzero_ps(), sum505);
sum506 = _mm512_max_ps(_mm512_setzero_ps(), sum506);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)832, 65535, sum503);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)896, 65535, sum504);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)960, 65535, sum505);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024, 65535, sum506);
sum507 = _mm512_add_ps(sum507, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664));
sum508 = _mm512_add_ps(sum508, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728));
sum509 = _mm512_add_ps(sum509, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792));
sum510 = _mm512_add_ps(sum510, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856));
sum507 = _mm512_max_ps(_mm512_setzero_ps(), sum507);
sum508 = _mm512_max_ps(_mm512_setzero_ps(), sum508);
sum509 = _mm512_max_ps(_mm512_setzero_ps(), sum509);
sum510 = _mm512_max_ps(_mm512_setzero_ps(), sum510);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664, 65535, sum507);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728, 65535, sum508);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792, 65535, sum509);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856, 65535, sum510);
sum511 = _mm512_add_ps(sum511, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496));
sum512 = _mm512_add_ps(sum512, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560));
sum513 = _mm512_add_ps(sum513, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624));
sum514 = _mm512_add_ps(sum514, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688));
sum511 = _mm512_max_ps(_mm512_setzero_ps(), sum511);
sum512 = _mm512_max_ps(_mm512_setzero_ps(), sum512);
sum513 = _mm512_max_ps(_mm512_setzero_ps(), sum513);
sum514 = _mm512_max_ps(_mm512_setzero_ps(), sum514);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496, 65535, sum511);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560, 65535, sum512);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624, 65535, sum513);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688, 65535, sum514);
sum515 = _mm512_add_ps(sum515, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3328));
sum516 = _mm512_add_ps(sum516, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3392));
sum517 = _mm512_add_ps(sum517, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3456));
sum518 = _mm512_add_ps(sum518, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)3520));
sum515 = _mm512_max_ps(_mm512_setzero_ps(), sum515);
sum516 = _mm512_max_ps(_mm512_setzero_ps(), sum516);
sum517 = _mm512_max_ps(_mm512_setzero_ps(), sum517);
sum518 = _mm512_max_ps(_mm512_setzero_ps(), sum518);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3328, 65535, sum515);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3392, 65535, sum516);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3456, 65535, sum517);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)3520, 65535, sum518);
sum519 = _mm512_add_ps(sum519, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4160));
sum520 = _mm512_add_ps(sum520, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4224));
sum521 = _mm512_add_ps(sum521, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4288));
sum522 = _mm512_add_ps(sum522, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)4352));
sum519 = _mm512_max_ps(_mm512_setzero_ps(), sum519);
sum520 = _mm512_max_ps(_mm512_setzero_ps(), sum520);
sum521 = _mm512_max_ps(_mm512_setzero_ps(), sum521);
sum522 = _mm512_max_ps(_mm512_setzero_ps(), sum522);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4160, 65535, sum519);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4224, 65535, sum520);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4288, 65535, sum521);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)4352, 65535, sum522);
// kk56 equals the starting k164, so this always fires: one tile per call.
if (k164 >= kk56) return;
}
// Reached only when w68 == 170: 4-row tail group with full-width columns
// (16 accumulators, 16 bytes of weights per step).
ptrdiff_t s64 = -1;
__m512 sum523 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)16));
__m512 sum527 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)20));
__m512 sum531 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)24));
__m512 sum535 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)28));
__m512 sum524 = sum523;
__m512 sum525 = sum523;
__m512 sum526 = sum523;
__m512 sum528 = sum527;
__m512 sum529 = sum527;
__m512 sum530 = sum527;
__m512 sum532 = sum531;
__m512 sum533 = sum531;
__m512 sum534 = sum531;
__m512 sum536 = sum535;
__m512 sum537 = sum535;
__m512 sum538 = sum535;
for (s64 = 0; s64 < 512; ++s64) {
__m512 dat2461 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)0);
__m512 dat2462 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)64);
__m512 dat2463 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)128);
__m512 dat2464 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+256*s64+(ptrdiff_t)192);
__m512 wt655 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)16));
sum523 = _mm512_fmadd_ps(wt655, dat2461, sum523);
sum524 = _mm512_fmadd_ps(wt655, dat2462, sum524);
sum525 = _mm512_fmadd_ps(wt655, dat2463, sum525);
sum526 = _mm512_fmadd_ps(wt655, dat2464, sum526);
__m512 wt656 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)20));
sum527 = _mm512_fmadd_ps(wt656, dat2461, sum527);
sum528 = _mm512_fmadd_ps(wt656, dat2462, sum528);
sum529 = _mm512_fmadd_ps(wt656, dat2463, sum529);
sum530 = _mm512_fmadd_ps(wt656, dat2464, sum530);
__m512 wt657 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)24));
sum531 = _mm512_fmadd_ps(wt657, dat2461, sum531);
sum532 = _mm512_fmadd_ps(wt657, dat2462, sum532);
sum533 = _mm512_fmadd_ps(wt657, dat2463, sum533);
sum534 = _mm512_fmadd_ps(wt657, dat2464, sum534);
__m512 wt658 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k164+16*s64+(ptrdiff_t)28));
sum535 = _mm512_fmadd_ps(wt658, dat2461, sum535);
sum536 = _mm512_fmadd_ps(wt658, dat2462, sum536);
sum537 = _mm512_fmadd_ps(wt658, dat2463, sum537);
sum538 = _mm512_fmadd_ps(wt658, dat2464, sum538);
}
// Same add / clamp / store epilogue as above, for four rows.
sum523 = _mm512_add_ps(sum523, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)0));
sum524 = _mm512_add_ps(sum524, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)64));
sum525 = _mm512_add_ps(sum525, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)128));
sum526 = _mm512_add_ps(sum526, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)192));
sum523 = _mm512_max_ps(_mm512_setzero_ps(), sum523);
sum524 = _mm512_max_ps(_mm512_setzero_ps(), sum524);
sum525 = _mm512_max_ps(_mm512_setzero_ps(), sum525);
sum526 = _mm512_max_ps(_mm512_setzero_ps(), sum526);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)0, 65535, sum523);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)64, 65535, sum524);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)128, 65535, sum525);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)192, 65535, sum526);
sum527 = _mm512_add_ps(sum527, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)832));
sum528 = _mm512_add_ps(sum528, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)896));
sum529 = _mm512_add_ps(sum529, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)960));
sum530 = _mm512_add_ps(sum530, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024));
sum527 = _mm512_max_ps(_mm512_setzero_ps(), sum527);
sum528 = _mm512_max_ps(_mm512_setzero_ps(), sum528);
sum529 = _mm512_max_ps(_mm512_setzero_ps(), sum529);
sum530 = _mm512_max_ps(_mm512_setzero_ps(), sum530);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)832, 65535, sum527);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)896, 65535, sum528);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)960, 65535, sum529);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1024, 65535, sum530);
sum531 = _mm512_add_ps(sum531, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664));
sum532 = _mm512_add_ps(sum532, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728));
sum533 = _mm512_add_ps(sum533, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792));
sum534 = _mm512_add_ps(sum534, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856));
sum531 = _mm512_max_ps(_mm512_setzero_ps(), sum531);
sum532 = _mm512_max_ps(_mm512_setzero_ps(), sum532);
sum533 = _mm512_max_ps(_mm512_setzero_ps(), sum533);
sum534 = _mm512_max_ps(_mm512_setzero_ps(), sum534);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1664, 65535, sum531);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1728, 65535, sum532);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1792, 65535, sum533);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)1856, 65535, sum534);
sum535 = _mm512_add_ps(sum535, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496));
sum536 = _mm512_add_ps(sum536, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560));
sum537 = _mm512_add_ps(sum537, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624));
sum538 = _mm512_add_ps(sum538, _mm512_maskz_loadu_ps(65535, datPtr34+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688));
sum535 = _mm512_max_ps(_mm512_setzero_ps(), sum535);
sum536 = _mm512_max_ps(_mm512_setzero_ps(), sum536);
sum537 = _mm512_max_ps(_mm512_setzero_ps(), sum537);
sum538 = _mm512_max_ps(_mm512_setzero_ps(), sum538);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2496, 65535, sum535);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2560, 65535, sum536);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2624, 65535, sum537);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k164+(ptrdiff_t)2688, 65535, sum538);
// jj56 equals the starting j58, so this always fires.
if (j58 >= jj56) return;
}
// Reached only when d22 == 3 (j58 is 3 here): short column chunk. One data
// vector per step (64-byte pitch in the arranged data) and mask 15, so only
// lanes 0..3 touch datPtr34 / datPtr35.
ptrdiff_t k165 = 1*w68;
ptrdiff_t kk57 = k165+0;
// Entered only when w68 != 170: 6 rows x 1 vector.
for (; k165 != 170; ++k165) {
ptrdiff_t s65 = -1;
__m512 sum539 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)24));
__m512 sum540 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)28));
__m512 sum541 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)32));
__m512 sum542 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)36));
__m512 sum543 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)40));
__m512 sum544 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)44));
for (s65 = 0; s65 < 512; ++s65) {
__m512 dat2465 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+64*s65+(ptrdiff_t)0);
__m512 wt659 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)24));
sum539 = _mm512_fmadd_ps(wt659, dat2465, sum539);
__m512 wt660 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)28));
sum540 = _mm512_fmadd_ps(wt660, dat2465, sum540);
__m512 wt661 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)32));
sum541 = _mm512_fmadd_ps(wt661, dat2465, sum541);
__m512 wt662 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)36));
sum542 = _mm512_fmadd_ps(wt662, dat2465, sum542);
__m512 wt663 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)40));
sum543 = _mm512_fmadd_ps(wt663, dat2465, sum543);
__m512 wt664 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+24*s65+(ptrdiff_t)44));
sum544 = _mm512_fmadd_ps(wt664, dat2465, sum544);
}
sum539 = _mm512_add_ps(sum539, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)0));
sum539 = _mm512_max_ps(_mm512_setzero_ps(), sum539);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)0, 15, sum539);
sum540 = _mm512_add_ps(sum540, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)832));
sum540 = _mm512_max_ps(_mm512_setzero_ps(), sum540);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)832, 15, sum540);
sum541 = _mm512_add_ps(sum541, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664));
sum541 = _mm512_max_ps(_mm512_setzero_ps(), sum541);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664, 15, sum541);
sum542 = _mm512_add_ps(sum542, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496));
sum542 = _mm512_max_ps(_mm512_setzero_ps(), sum542);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496, 15, sum542);
sum543 = _mm512_add_ps(sum543, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)3328));
sum543 = _mm512_max_ps(_mm512_setzero_ps(), sum543);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)3328, 15, sum543);
sum544 = _mm512_add_ps(sum544, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)4160));
sum544 = _mm512_max_ps(_mm512_setzero_ps(), sum544);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)4160, 15, sum544);
// kk57 equals the starting k165, so this always fires.
if (k165 >= kk57) return;
}
// Reached only when d22 == 3 and w68 == 170: 4 rows x 1 vector corner tile.
ptrdiff_t s66 = -1;
__m512 sum545 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)16));
__m512 sum546 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)20));
__m512 sum547 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)24));
__m512 sum548 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)28));
for (s66 = 0; s66 < 512; ++s66) {
__m512 dat2466 = _mm512_loadu_ps(arrangedDats10+425984*i66+131072*j58+64*s66+(ptrdiff_t)0);
__m512 wt665 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)16));
sum545 = _mm512_fmadd_ps(wt665, dat2466, sum545);
__m512 wt666 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)20));
sum546 = _mm512_fmadd_ps(wt666, dat2466, sum546);
__m512 wt667 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)24));
sum547 = _mm512_fmadd_ps(wt667, dat2466, sum547);
__m512 wt668 = _mm512_set1_ps(*(float*)(arrangedWts10+2101248*i66+12312*k165+16*s66+(ptrdiff_t)28));
sum548 = _mm512_fmadd_ps(wt668, dat2466, sum548);
}
sum545 = _mm512_add_ps(sum545, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)0));
sum545 = _mm512_max_ps(_mm512_setzero_ps(), sum545);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)0, 15, sum545);
sum546 = _mm512_add_ps(sum546, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)832));
sum546 = _mm512_max_ps(_mm512_setzero_ps(), sum546);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)832, 15, sum546);
sum547 = _mm512_add_ps(sum547, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664));
sum547 = _mm512_max_ps(_mm512_setzero_ps(), sum547);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)1664, 15, sum547);
sum548 = _mm512_add_ps(sum548, _mm512_maskz_loadu_ps(15, datPtr34+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496));
sum548 = _mm512_max_ps(_mm512_setzero_ps(), sum548);
_mm512_mask_storeu_ps(datPtr35+851968*i66+256*j58+4992*k165+(ptrdiff_t)2496, 15, sum548);
}
}

/*
 * Dispatch ResNeXt50OneApply10Callee1 over a 3-dimensional task hull
 * through the threader team.
 *
 * team68     - threader team handed straight to ResNeXt50ThreaderDo1.
 * tensors109 - tensor pointer table; passed to the callee via the task's
 *              `any1` slot (wrapped in a two-entry array whose second
 *              entry is a null pointer).
 *
 * NOTE(review): the argument array and the task descriptor live on this
 * function's stack, so this presumably relies on ResNeXt50ThreaderDo1 not
 * returning until the callee is done with them -- confirm against the
 * threader implementation.
 */
static void ResNeXt50OneApply10(ResNeXt50ThreaderTeam1* team68, char** tensors109) {
	/* Argument bundle for the callee: slot 0 = tensor table, slot 1 = null. */
	void* calleeArgs[2];
	calleeArgs[0] = tensors109;
	calleeArgs[1] = 0;

	ResNeXt50ThreaderTask1 job;

	/* Hull: 171 x 4 x 1 (presumably the per-dimension extents of the
	 * index space the callee is invoked over -- TODO confirm). */
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 4;
	job.hull1[0] = 171;

	job.any1 = calleeArgs;
	job.callee1 = ResNeXt50OneApply10Callee1;

	ResNeXt50ThreaderDo1(team68, &job);
}

static void ResNeXt50OneArrangeWts11Callee1(ResNeXt50ThreaderTask1* task114, int64_t* pt62) {
char** tensors112 = task114->any1;
ptrdiff_t b82 = pt62[0];
char*restrict wtPtr18 = tensors112[0]+(ptrdiff_t)3340*0+(ptrdiff_t)2097152*0;
char*restrict biasPtr18 = tensors112[1]+(ptrdiff_t)2048*0;
char*restrict bnPtr19 = tensors112[2]+(ptrdiff_t)8*512*0;
char*restrict arranged21 = tensors112[3]+(ptrdiff_t)1712128*0+(ptrdiff_t)2099200*0;
ptrdiff_t ii49 = 1;
for (ptrdiff_t i67 = 0; i67 < ii49; ++i67) {
ptrdiff_t j59 = 1*b82;
ptrdiff_t jj57 = j59+1;
for (; j59 < jj57; ++j59) {
if (j59 < 31) {
ptrdiff_t k167 = 0+16*(j59-0);
ptrdiff_t l71 = (size_t)(0+k167)/6;
ptrdiff_t cut25 = (size_t)(0+k167)%6;
switch (cut25) {
case 0:;
case 2: {
__m512 sum550 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k167);
__m512i pmMul38 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd38 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo32 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k167+512*i67));
__m512 masHi32 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k167+512*i67)+(ptrdiff_t)64);
__m512 postMul57 = _mm512_permutex2var_ps(masLo32, pmMul38, masHi32);
__m512 postAdd39 = _mm512_permutex2var_ps(masLo32, pmAdd38, masHi32);
sum550 = _mm512_fmadd_ps(sum550, postMul57, postAdd39);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum550);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum550);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)49152, 65535-(4095>>cut25), sum550);
ptrdiff_t c56 = 0;
for (; c56 != 64; ++c56) {
__m512 wt685 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)0);
__m512 wt686 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)4096);
__m512 wt687 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)8192);
__m512 wt688 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)12288);
__m512 wt689 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)16384);
__m512 wt690 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)20480);
__m512 wt691 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)24576);
__m512 wt692 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)28672);
__m512 wt693 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)32768);
__m512 wt694 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)36864);
__m512 wt695 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)40960);
__m512 wt696 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)45056);
__m512 wt697 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)49152);
__m512 wt698 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)53248);
__m512 wt699 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)57344);
__m512 wt700 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c56+(ptrdiff_t)61440);
__m512 tmp14131 = _mm512_unpacklo_ps(wt685, wt686);
__m512 tmp14132 = _mm512_unpackhi_ps(wt685, wt686);
__m512 tmp14133 = _mm512_unpacklo_ps(wt687, wt688);
__m512 tmp14134 = _mm512_unpackhi_ps(wt687, wt688);
__m512 tmp14135 = _mm512_unpacklo_ps(wt689, wt690);
__m512 tmp14136 = _mm512_unpackhi_ps(wt689, wt690);
__m512 tmp14137 = _mm512_unpacklo_ps(wt691, wt692);
__m512 tmp14138 = _mm512_unpackhi_ps(wt691, wt692);
__m512 tmp14139 = _mm512_unpacklo_ps(wt693, wt694);
__m512 tmp14140 = _mm512_unpackhi_ps(wt693, wt694);
__m512 tmp14141 = _mm512_unpacklo_ps(wt695, wt696);
__m512 tmp14142 = _mm512_unpackhi_ps(wt695, wt696);
__m512 tmp14143 = _mm512_unpacklo_ps(wt697, wt698);
__m512 tmp14144 = _mm512_unpackhi_ps(wt697, wt698);
__m512 tmp14145 = _mm512_unpacklo_ps(wt699, wt700);
__m512 tmp14146 = _mm512_unpackhi_ps(wt699, wt700);
__m512 tmp14147 = _mm512_shuffle_ps(tmp14131, tmp14133, 68);
__m512 tmp14148 = _mm512_shuffle_ps(tmp14131, tmp14133, 238);
__m512 tmp14149 = _mm512_shuffle_ps(tmp14132, tmp14134, 68);
__m512 tmp14150 = _mm512_shuffle_ps(tmp14132, tmp14134, 238);
__m512 tmp14151 = _mm512_shuffle_ps(tmp14135, tmp14137, 68);
__m512 tmp14152 = _mm512_shuffle_ps(tmp14135, tmp14137, 238);
__m512 tmp14153 = _mm512_shuffle_ps(tmp14136, tmp14138, 68);
__m512 tmp14154 = _mm512_shuffle_ps(tmp14136, tmp14138, 238);
__m512 tmp14155 = _mm512_shuffle_ps(tmp14139, tmp14141, 68);
__m512 tmp14156 = _mm512_shuffle_ps(tmp14139, tmp14141, 238);
__m512 tmp14157 = _mm512_shuffle_ps(tmp14140, tmp14142, 68);
__m512 tmp14158 = _mm512_shuffle_ps(tmp14140, tmp14142, 238);
__m512 tmp14159 = _mm512_shuffle_ps(tmp14143, tmp14145, 68);
__m512 tmp14160 = _mm512_shuffle_ps(tmp14143, tmp14145, 238);
__m512 tmp14161 = _mm512_shuffle_ps(tmp14144, tmp14146, 68);
__m512 tmp14162 = _mm512_shuffle_ps(tmp14144, tmp14146, 238);
__m512 tmp14163 = _mm512_shuffle_f32x4(tmp14147, tmp14151, 136);
__m512 tmp14164 = _mm512_shuffle_f32x4(tmp14147, tmp14151, 221);
__m512 tmp14165 = _mm512_shuffle_f32x4(tmp14148, tmp14152, 136);
__m512 tmp14166 = _mm512_shuffle_f32x4(tmp14148, tmp14152, 221);
__m512 tmp14167 = _mm512_shuffle_f32x4(tmp14149, tmp14153, 136);
__m512 tmp14168 = _mm512_shuffle_f32x4(tmp14149, tmp14153, 221);
__m512 tmp14169 = _mm512_shuffle_f32x4(tmp14150, tmp14154, 136);
__m512 tmp14170 = _mm512_shuffle_f32x4(tmp14150, tmp14154, 221);
__m512 tmp14171 = _mm512_shuffle_f32x4(tmp14155, tmp14159, 136);
__m512 tmp14172 = _mm512_shuffle_f32x4(tmp14155, tmp14159, 221);
__m512 tmp14173 = _mm512_shuffle_f32x4(tmp14156, tmp14160, 136);
__m512 tmp14174 = _mm512_shuffle_f32x4(tmp14156, tmp14160, 221);
__m512 tmp14175 = _mm512_shuffle_f32x4(tmp14157, tmp14161, 136);
__m512 tmp14176 = _mm512_shuffle_f32x4(tmp14157, tmp14161, 221);
__m512 tmp14177 = _mm512_shuffle_f32x4(tmp14158, tmp14162, 136);
__m512 tmp14178 = _mm512_shuffle_f32x4(tmp14158, tmp14162, 221);
wt685 = _mm512_shuffle_f32x4(tmp14163, tmp14171, 136);
wt693 = _mm512_shuffle_f32x4(tmp14163, tmp14171, 221);
wt686 = _mm512_shuffle_f32x4(tmp14165, tmp14173, 136);
wt694 = _mm512_shuffle_f32x4(tmp14165, tmp14173, 221);
wt687 = _mm512_shuffle_f32x4(tmp14167, tmp14175, 136);
wt695 = _mm512_shuffle_f32x4(tmp14167, tmp14175, 221);
wt688 = _mm512_shuffle_f32x4(tmp14169, tmp14177, 136);
wt696 = _mm512_shuffle_f32x4(tmp14169, tmp14177, 221);
wt689 = _mm512_shuffle_f32x4(tmp14164, tmp14172, 136);
wt697 = _mm512_shuffle_f32x4(tmp14164, tmp14172, 221);
wt690 = _mm512_shuffle_f32x4(tmp14166, tmp14174, 136);
wt698 = _mm512_shuffle_f32x4(tmp14166, tmp14174, 221);
wt691 = _mm512_shuffle_f32x4(tmp14168, tmp14176, 136);
wt699 = _mm512_shuffle_f32x4(tmp14168, tmp14176, 221);
wt692 = _mm512_shuffle_f32x4(tmp14170, tmp14178, 136);
wt700 = _mm512_shuffle_f32x4(tmp14170, tmp14178, 221);
wt685 = _mm512_mul_ps(wt685, postMul57);
wt686 = _mm512_mul_ps(wt686, postMul57);
wt687 = _mm512_mul_ps(wt687, postMul57);
wt688 = _mm512_mul_ps(wt688, postMul57);
wt689 = _mm512_mul_ps(wt689, postMul57);
wt690 = _mm512_mul_ps(wt690, postMul57);
wt691 = _mm512_mul_ps(wt691, postMul57);
wt692 = _mm512_mul_ps(wt692, postMul57);
wt693 = _mm512_mul_ps(wt693, postMul57);
wt694 = _mm512_mul_ps(wt694, postMul57);
wt695 = _mm512_mul_ps(wt695, postMul57);
wt696 = _mm512_mul_ps(wt696, postMul57);
wt697 = _mm512_mul_ps(wt697, postMul57);
wt698 = _mm512_mul_ps(wt698, postMul57);
wt699 = _mm512_mul_ps(wt699, postMul57);
wt700 = _mm512_mul_ps(wt700, postMul57);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)0, 63>>cut25, wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)0, 63>>cut25, wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)0, 63>>cut25, wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)0, 63>>cut25, wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)0, 63>>cut25, wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)0, 63>>cut25, wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)0, 63>>cut25, wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)0, 63>>cut25, wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)0, 63>>cut25, wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)0, 63>>cut25, wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)0, 63>>cut25, wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)0, 63>>cut25, wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)0, 63>>cut25, wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)0, 63>>cut25, wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)0, 63>>cut25, wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)0, 63>>cut25, wt700);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)24576, 4032>>cut25, wt700);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt685);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt686);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt687);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt688);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt689);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt690);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt691);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt692);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt693);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt694);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt695);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt696);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt697);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt698);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt699);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c56)+(ptrdiff_t)49152, 65535-(4095>>cut25), wt700);
}
break;
}
default: {
cut25 = 4;
__m512 sum551 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k167);
__m512i pmMul39 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd39 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo33 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k167+512*i67));
__m512 masHi33 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k167+512*i67)+(ptrdiff_t)64);
__m512 postMul58 = _mm512_permutex2var_ps(masLo33, pmMul39, masHi33);
__m512 postAdd40 = _mm512_permutex2var_ps(masLo33, pmAdd39, masHi33);
sum551 = _mm512_fmadd_ps(sum551, postMul58, postAdd40);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)0, 63>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)24576, 4032>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)49152, 258048>>cut25, sum551);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*0+(ptrdiff_t)73728, 65535-(262143>>cut25), sum551);
ptrdiff_t c57 = 0;
for (; c57 != 64; ++c57) {
__m512 wt701 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)0);
__m512 wt702 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)4096);
__m512 wt703 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)8192);
__m512 wt704 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)12288);
__m512 wt705 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)16384);
__m512 wt706 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)20480);
__m512 wt707 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)24576);
__m512 wt708 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)28672);
__m512 wt709 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)32768);
__m512 wt710 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)36864);
__m512 wt711 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)40960);
__m512 wt712 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)45056);
__m512 wt713 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)49152);
__m512 wt714 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)53248);
__m512 wt715 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)57344);
__m512 wt716 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k167+64*c57+(ptrdiff_t)61440);
__m512 tmp14179 = _mm512_unpacklo_ps(wt701, wt702);
__m512 tmp14180 = _mm512_unpackhi_ps(wt701, wt702);
__m512 tmp14181 = _mm512_unpacklo_ps(wt703, wt704);
__m512 tmp14182 = _mm512_unpackhi_ps(wt703, wt704);
__m512 tmp14183 = _mm512_unpacklo_ps(wt705, wt706);
__m512 tmp14184 = _mm512_unpackhi_ps(wt705, wt706);
__m512 tmp14185 = _mm512_unpacklo_ps(wt707, wt708);
__m512 tmp14186 = _mm512_unpackhi_ps(wt707, wt708);
__m512 tmp14187 = _mm512_unpacklo_ps(wt709, wt710);
__m512 tmp14188 = _mm512_unpackhi_ps(wt709, wt710);
__m512 tmp14189 = _mm512_unpacklo_ps(wt711, wt712);
__m512 tmp14190 = _mm512_unpackhi_ps(wt711, wt712);
__m512 tmp14191 = _mm512_unpacklo_ps(wt713, wt714);
__m512 tmp14192 = _mm512_unpackhi_ps(wt713, wt714);
__m512 tmp14193 = _mm512_unpacklo_ps(wt715, wt716);
__m512 tmp14194 = _mm512_unpackhi_ps(wt715, wt716);
__m512 tmp14195 = _mm512_shuffle_ps(tmp14179, tmp14181, 68);
__m512 tmp14196 = _mm512_shuffle_ps(tmp14179, tmp14181, 238);
__m512 tmp14197 = _mm512_shuffle_ps(tmp14180, tmp14182, 68);
__m512 tmp14198 = _mm512_shuffle_ps(tmp14180, tmp14182, 238);
__m512 tmp14199 = _mm512_shuffle_ps(tmp14183, tmp14185, 68);
__m512 tmp14200 = _mm512_shuffle_ps(tmp14183, tmp14185, 238);
__m512 tmp14201 = _mm512_shuffle_ps(tmp14184, tmp14186, 68);
__m512 tmp14202 = _mm512_shuffle_ps(tmp14184, tmp14186, 238);
__m512 tmp14203 = _mm512_shuffle_ps(tmp14187, tmp14189, 68);
__m512 tmp14204 = _mm512_shuffle_ps(tmp14187, tmp14189, 238);
__m512 tmp14205 = _mm512_shuffle_ps(tmp14188, tmp14190, 68);
__m512 tmp14206 = _mm512_shuffle_ps(tmp14188, tmp14190, 238);
__m512 tmp14207 = _mm512_shuffle_ps(tmp14191, tmp14193, 68);
__m512 tmp14208 = _mm512_shuffle_ps(tmp14191, tmp14193, 238);
__m512 tmp14209 = _mm512_shuffle_ps(tmp14192, tmp14194, 68);
__m512 tmp14210 = _mm512_shuffle_ps(tmp14192, tmp14194, 238);
__m512 tmp14211 = _mm512_shuffle_f32x4(tmp14195, tmp14199, 136);
__m512 tmp14212 = _mm512_shuffle_f32x4(tmp14195, tmp14199, 221);
__m512 tmp14213 = _mm512_shuffle_f32x4(tmp14196, tmp14200, 136);
__m512 tmp14214 = _mm512_shuffle_f32x4(tmp14196, tmp14200, 221);
__m512 tmp14215 = _mm512_shuffle_f32x4(tmp14197, tmp14201, 136);
__m512 tmp14216 = _mm512_shuffle_f32x4(tmp14197, tmp14201, 221);
__m512 tmp14217 = _mm512_shuffle_f32x4(tmp14198, tmp14202, 136);
__m512 tmp14218 = _mm512_shuffle_f32x4(tmp14198, tmp14202, 221);
__m512 tmp14219 = _mm512_shuffle_f32x4(tmp14203, tmp14207, 136);
__m512 tmp14220 = _mm512_shuffle_f32x4(tmp14203, tmp14207, 221);
__m512 tmp14221 = _mm512_shuffle_f32x4(tmp14204, tmp14208, 136);
__m512 tmp14222 = _mm512_shuffle_f32x4(tmp14204, tmp14208, 221);
__m512 tmp14223 = _mm512_shuffle_f32x4(tmp14205, tmp14209, 136);
__m512 tmp14224 = _mm512_shuffle_f32x4(tmp14205, tmp14209, 221);
__m512 tmp14225 = _mm512_shuffle_f32x4(tmp14206, tmp14210, 136);
__m512 tmp14226 = _mm512_shuffle_f32x4(tmp14206, tmp14210, 221);
wt701 = _mm512_shuffle_f32x4(tmp14211, tmp14219, 136);
wt709 = _mm512_shuffle_f32x4(tmp14211, tmp14219, 221);
wt702 = _mm512_shuffle_f32x4(tmp14213, tmp14221, 136);
wt710 = _mm512_shuffle_f32x4(tmp14213, tmp14221, 221);
wt703 = _mm512_shuffle_f32x4(tmp14215, tmp14223, 136);
wt711 = _mm512_shuffle_f32x4(tmp14215, tmp14223, 221);
wt704 = _mm512_shuffle_f32x4(tmp14217, tmp14225, 136);
wt712 = _mm512_shuffle_f32x4(tmp14217, tmp14225, 221);
wt705 = _mm512_shuffle_f32x4(tmp14212, tmp14220, 136);
wt713 = _mm512_shuffle_f32x4(tmp14212, tmp14220, 221);
wt706 = _mm512_shuffle_f32x4(tmp14214, tmp14222, 136);
wt714 = _mm512_shuffle_f32x4(tmp14214, tmp14222, 221);
wt707 = _mm512_shuffle_f32x4(tmp14216, tmp14224, 136);
wt715 = _mm512_shuffle_f32x4(tmp14216, tmp14224, 221);
wt708 = _mm512_shuffle_f32x4(tmp14218, tmp14226, 136);
wt716 = _mm512_shuffle_f32x4(tmp14218, tmp14226, 221);
wt701 = _mm512_mul_ps(wt701, postMul58);
wt702 = _mm512_mul_ps(wt702, postMul58);
wt703 = _mm512_mul_ps(wt703, postMul58);
wt704 = _mm512_mul_ps(wt704, postMul58);
wt705 = _mm512_mul_ps(wt705, postMul58);
wt706 = _mm512_mul_ps(wt706, postMul58);
wt707 = _mm512_mul_ps(wt707, postMul58);
wt708 = _mm512_mul_ps(wt708, postMul58);
wt709 = _mm512_mul_ps(wt709, postMul58);
wt710 = _mm512_mul_ps(wt710, postMul58);
wt711 = _mm512_mul_ps(wt711, postMul58);
wt712 = _mm512_mul_ps(wt712, postMul58);
wt713 = _mm512_mul_ps(wt713, postMul58);
wt714 = _mm512_mul_ps(wt714, postMul58);
wt715 = _mm512_mul_ps(wt715, postMul58);
wt716 = _mm512_mul_ps(wt716, postMul58);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)0, 63>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)0, 63>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)0, 63>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)0, 63>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)0, 63>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)0, 63>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)0, 63>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)0, 63>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)0, 63>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)0, 63>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)0, 63>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)0, 63>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)0, 63>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)0, 63>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)0, 63>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)0, 63>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)24576, 4032>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)49152, 258048>>cut25, wt716);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(1+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt701);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(2+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt702);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(3+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt703);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(4+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt704);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(5+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt705);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(6+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt706);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(7+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt707);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(8+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt708);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(9+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt709);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(10+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt710);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(11+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt711);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(12+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt712);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(13+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt713);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(14+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt714);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(15+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt715);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l71+4*cut25+24*(16+16*c57)+(ptrdiff_t)73728, 65535-(262143>>cut25), wt716);
}
}
}
} else {
ptrdiff_t k166 = 496;
ptrdiff_t l70 = (size_t)(0+k166)/6;
ptrdiff_t cut24 = (size_t)(0+k166)%6;
__m512 sum549 = _mm512_maskz_loadu_ps(65535, biasPtr18+2048*i67+4*k166);
__m512i pmMul40 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd40 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo34 = _mm512_loadu_ps(bnPtr19+(ptrdiff_t)8*(k166+512*i67));
__m512 masHi34 = _mm512_maskz_loadu_ps(65535, bnPtr19+(ptrdiff_t)8*(k166+512*i67)+(ptrdiff_t)64);
__m512 postMul56 = _mm512_permutex2var_ps(masLo34, pmMul40, masHi34);
__m512 postAdd38 = _mm512_permutex2var_ps(masLo34, pmAdd40, masHi34);
sum549 = _mm512_fmadd_ps(sum549, postMul56, postAdd38);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)0, 63>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)24576, 4032>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*0+(ptrdiff_t)49152, 258048>>cut24, sum549);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*0+(ptrdiff_t)73728, 65535-(262143>>cut24), sum549);
ptrdiff_t c55 = 0;
for (; c55 != 64; ++c55) {
__m512 wt669 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)0);
__m512 wt670 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)4096);
__m512 wt671 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)8192);
__m512 wt672 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)12288);
__m512 wt673 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)16384);
__m512 wt674 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)20480);
__m512 wt675 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)24576);
__m512 wt676 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)28672);
__m512 wt677 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)32768);
__m512 wt678 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)36864);
__m512 wt679 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)40960);
__m512 wt680 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)45056);
__m512 wt681 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)49152);
__m512 wt682 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)53248);
__m512 wt683 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)57344);
__m512 wt684 = _mm512_maskz_loadu_ps(65535, wtPtr18+2097152*i67+4096*k166+64*c55+(ptrdiff_t)61440);
__m512 tmp14227 = _mm512_unpacklo_ps(wt669, wt670);
__m512 tmp14228 = _mm512_unpackhi_ps(wt669, wt670);
__m512 tmp14229 = _mm512_unpacklo_ps(wt671, wt672);
__m512 tmp14230 = _mm512_unpackhi_ps(wt671, wt672);
__m512 tmp14231 = _mm512_unpacklo_ps(wt673, wt674);
__m512 tmp14232 = _mm512_unpackhi_ps(wt673, wt674);
__m512 tmp14233 = _mm512_unpacklo_ps(wt675, wt676);
__m512 tmp14234 = _mm512_unpackhi_ps(wt675, wt676);
__m512 tmp14235 = _mm512_unpacklo_ps(wt677, wt678);
__m512 tmp14236 = _mm512_unpackhi_ps(wt677, wt678);
__m512 tmp14237 = _mm512_unpacklo_ps(wt679, wt680);
__m512 tmp14238 = _mm512_unpackhi_ps(wt679, wt680);
__m512 tmp14239 = _mm512_unpacklo_ps(wt681, wt682);
__m512 tmp14240 = _mm512_unpackhi_ps(wt681, wt682);
__m512 tmp14241 = _mm512_unpacklo_ps(wt683, wt684);
__m512 tmp14242 = _mm512_unpackhi_ps(wt683, wt684);
__m512 tmp14243 = _mm512_shuffle_ps(tmp14227, tmp14229, 68);
__m512 tmp14244 = _mm512_shuffle_ps(tmp14227, tmp14229, 238);
__m512 tmp14245 = _mm512_shuffle_ps(tmp14228, tmp14230, 68);
__m512 tmp14246 = _mm512_shuffle_ps(tmp14228, tmp14230, 238);
__m512 tmp14247 = _mm512_shuffle_ps(tmp14231, tmp14233, 68);
__m512 tmp14248 = _mm512_shuffle_ps(tmp14231, tmp14233, 238);
__m512 tmp14249 = _mm512_shuffle_ps(tmp14232, tmp14234, 68);
__m512 tmp14250 = _mm512_shuffle_ps(tmp14232, tmp14234, 238);
__m512 tmp14251 = _mm512_shuffle_ps(tmp14235, tmp14237, 68);
__m512 tmp14252 = _mm512_shuffle_ps(tmp14235, tmp14237, 238);
__m512 tmp14253 = _mm512_shuffle_ps(tmp14236, tmp14238, 68);
__m512 tmp14254 = _mm512_shuffle_ps(tmp14236, tmp14238, 238);
__m512 tmp14255 = _mm512_shuffle_ps(tmp14239, tmp14241, 68);
__m512 tmp14256 = _mm512_shuffle_ps(tmp14239, tmp14241, 238);
__m512 tmp14257 = _mm512_shuffle_ps(tmp14240, tmp14242, 68);
__m512 tmp14258 = _mm512_shuffle_ps(tmp14240, tmp14242, 238);
__m512 tmp14259 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 136);
__m512 tmp14260 = _mm512_shuffle_f32x4(tmp14243, tmp14247, 221);
__m512 tmp14261 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 136);
__m512 tmp14262 = _mm512_shuffle_f32x4(tmp14244, tmp14248, 221);
__m512 tmp14263 = _mm512_shuffle_f32x4(tmp14245, tmp14249, 136);
__m512 tmp14264 = _mm512_shuffle_f32x4(tmp14245, tmp14249, 221);
__m512 tmp14265 = _mm512_shuffle_f32x4(tmp14246, tmp14250, 136);
__m512 tmp14266 = _mm512_shuffle_f32x4(tmp14246, tmp14250, 221);
__m512 tmp14267 = _mm512_shuffle_f32x4(tmp14251, tmp14255, 136);
__m512 tmp14268 = _mm512_shuffle_f32x4(tmp14251, tmp14255, 221);
__m512 tmp14269 = _mm512_shuffle_f32x4(tmp14252, tmp14256, 136);
__m512 tmp14270 = _mm512_shuffle_f32x4(tmp14252, tmp14256, 221);
__m512 tmp14271 = _mm512_shuffle_f32x4(tmp14253, tmp14257, 136);
__m512 tmp14272 = _mm512_shuffle_f32x4(tmp14253, tmp14257, 221);
__m512 tmp14273 = _mm512_shuffle_f32x4(tmp14254, tmp14258, 136);
__m512 tmp14274 = _mm512_shuffle_f32x4(tmp14254, tmp14258, 221);
wt669 = _mm512_shuffle_f32x4(tmp14259, tmp14267, 136);
wt677 = _mm512_shuffle_f32x4(tmp14259, tmp14267, 221);
wt670 = _mm512_shuffle_f32x4(tmp14261, tmp14269, 136);
wt678 = _mm512_shuffle_f32x4(tmp14261, tmp14269, 221);
wt671 = _mm512_shuffle_f32x4(tmp14263, tmp14271, 136);
wt679 = _mm512_shuffle_f32x4(tmp14263, tmp14271, 221);
wt672 = _mm512_shuffle_f32x4(tmp14265, tmp14273, 136);
wt680 = _mm512_shuffle_f32x4(tmp14265, tmp14273, 221);
wt673 = _mm512_shuffle_f32x4(tmp14260, tmp14268, 136);
wt681 = _mm512_shuffle_f32x4(tmp14260, tmp14268, 221);
wt674 = _mm512_shuffle_f32x4(tmp14262, tmp14270, 136);
wt682 = _mm512_shuffle_f32x4(tmp14262, tmp14270, 221);
wt675 = _mm512_shuffle_f32x4(tmp14264, tmp14272, 136);
wt683 = _mm512_shuffle_f32x4(tmp14264, tmp14272, 221);
wt676 = _mm512_shuffle_f32x4(tmp14266, tmp14274, 136);
wt684 = _mm512_shuffle_f32x4(tmp14266, tmp14274, 221);
wt669 = _mm512_mul_ps(wt669, postMul56);
wt670 = _mm512_mul_ps(wt670, postMul56);
wt671 = _mm512_mul_ps(wt671, postMul56);
wt672 = _mm512_mul_ps(wt672, postMul56);
wt673 = _mm512_mul_ps(wt673, postMul56);
wt674 = _mm512_mul_ps(wt674, postMul56);
wt675 = _mm512_mul_ps(wt675, postMul56);
wt676 = _mm512_mul_ps(wt676, postMul56);
wt677 = _mm512_mul_ps(wt677, postMul56);
wt678 = _mm512_mul_ps(wt678, postMul56);
wt679 = _mm512_mul_ps(wt679, postMul56);
wt680 = _mm512_mul_ps(wt680, postMul56);
wt681 = _mm512_mul_ps(wt681, postMul56);
wt682 = _mm512_mul_ps(wt682, postMul56);
wt683 = _mm512_mul_ps(wt683, postMul56);
wt684 = _mm512_mul_ps(wt684, postMul56);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)0, 63>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)0, 63>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)0, 63>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)0, 63>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)0, 63>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)0, 63>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)0, 63>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)0, 63>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)0, 63>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)0, 63>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)0, 63>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)0, 63>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)0, 63>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)0, 63>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)0, 63>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)0, 63>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)24576, 4032>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(1+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(2+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(3+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(4+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(5+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(6+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(7+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(8+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(9+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(10+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(11+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(12+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(13+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(14+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(15+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+24*(16+16*c55)+(ptrdiff_t)49152, 258048>>cut24, wt684);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(1+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt669);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(2+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt670);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(3+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt671);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(4+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt672);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(5+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt673);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(6+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt674);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(7+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt675);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(8+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt676);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(9+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt677);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(10+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt678);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(11+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt679);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(12+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt680);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(13+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt681);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(14+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt682);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(15+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt683);
_mm512_mask_storeu_ps(arranged21+2099200*i67+24600*l70+4*cut24+8*(16+16*c55)+(ptrdiff_t)73728, 65535-(262143>>cut24), wt684);
}
}
}
}
}

// Launches the weight-arrangement callee for layer 11 on the given team.
//
// team69:     threader team handed straight to ResNeXt50ThreaderDo1.
// tensors111: tensor pointer table; passed to the callee through task.any1.
//
// The task is described as a 3-dimensional hull of extents 32 x 1 x 1.
// Presumably the threader invokes the callee once per point of that hull;
// the threader itself is not visible here, so confirm against its definition.
static void ResNeXt50OneArrangeWts11(ResNeXt50ThreaderTeam1* team69, char** tensors111) {
    // One extent per hull dimension.
    static const int64_t extents[] = {32, 1, 1};
    enum { rank = sizeof(extents)/sizeof(extents[0]) };

    ResNeXt50ThreaderTask1 job;
    job.nd1 = rank;
    for (ptrdiff_t d = 0; d < rank; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors111;
    job.callee1 = ResNeXt50OneArrangeWts11Callee1;
    ResNeXt50ThreaderDo1(team69, &job);
}

// Copies one tile of the input tensor into the "arranged" buffer used by the
// layer-11 apply kernel.
//
// task116: task record; any1 holds a char** table where entry 0 is the source
//          data and entry 1 is the arranged destination.
// pt63:    task coordinates. pt63[0] selects a run of 128 consecutive rows
//          (rows 128*pt63[0] .. 128*pt63[0]+127); pt63[1] selects the column
//          block within each row.
//
// Source rows are addressed with an 832-byte pitch. Column blocks 0..2 are
// 256 bytes (four 16-float vectors) wide; column block 3 is the tail and only
// 4 floats (mask 15) are moved. In the destination, each column block is a
// separate 262144-byte plane; rows are packed at 256 bytes in the full planes
// and at 64 bytes in the tail plane.
//
// NOTE(review): the tail reads bytes 768..783 of each 832-byte source row, so
// bytes 784..831 are never read here. Confirm the source rows really are
// padded to 832 bytes.
// NOTE(review): rows presumably correspond to input channels and columns to
// spatial positions; that is not established by this function alone.
static void ResNeXt50OneArrangeDats11Callee1(ResNeXt50ThreaderTask1* task116, int64_t* pt63) {
    char** bufs = task116->any1;
    const ptrdiff_t slab = pt63[0];
    const ptrdiff_t col = pt63[1];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    const ptrdiff_t rowBegin = 128*slab;
    const ptrdiff_t rowEnd = rowBegin+128;

    if (col == 3) {
        // Tail column block: a single partial vector (low 4 lanes) per row.
        for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
            __m512 tail = _mm512_maskz_loadu_ps(15, src+256*col+832*row);
            _mm512_mask_storeu_ps(dst+262144*col+64*row, 15, tail);
        }
        return;
    }

    // Full column block: four whole vectors per row, all loaded before any
    // store is issued.
    for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
        const char* in = src+256*col+832*row;
        char* out = dst+262144*col+256*row;
        __m512 v0 = _mm512_maskz_loadu_ps(65535, in);
        __m512 v1 = _mm512_maskz_loadu_ps(65535, in+64);
        __m512 v2 = _mm512_maskz_loadu_ps(65535, in+128);
        __m512 v3 = _mm512_maskz_loadu_ps(65535, in+192);
        _mm512_mask_storeu_ps(out, 65535, v0);
        _mm512_mask_storeu_ps(out+64, 65535, v1);
        _mm512_mask_storeu_ps(out+128, 65535, v2);
        _mm512_mask_storeu_ps(out+192, 65535, v3);
    }
}

// Launches the data-arrangement callee for layer 11 on the given team.
//
// team70:     threader team handed straight to ResNeXt50ThreaderDo1.
// tensors113: tensor pointer table; passed to the callee through task.any1.
//
// The task is described as a 4-dimensional hull of extents 8 x 4 x 1 x 1.
// The callee reads only the first two coordinates (8 row slabs, 4 column
// blocks). Presumably the threader invokes the callee once per hull point;
// confirm against the threader's definition.
static void ResNeXt50OneArrangeDats11(ResNeXt50ThreaderTeam1* team70, char** tensors113) {
    // One extent per hull dimension.
    static const int64_t extents[] = {8, 4, 1, 1};
    enum { rank = sizeof(extents)/sizeof(extents[0]) };

    ResNeXt50ThreaderTask1 job;
    job.nd1 = rank;
    for (ptrdiff_t d = 0; d < rank; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = tensors113;
    job.callee1 = ResNeXt50OneArrangeDats11Callee1;
    ResNeXt50ThreaderDo1(team70, &job);
}

// Computes one output tile of the layer-11 product between the arranged
// weights and the arranged data, clamps it at zero, and stores it.
//
// task118: task record; any1 is a void*[] whose entry 0 is a char** tensor
//          table: [0] arranged weights, [1] arranged data, [2] output.
//          Only entry 0 of the pair is read here.
// pt64:    task coordinates. pt64[0] selects the output-row group (k) and
//          pt64[1] selects the data column block (j).
//
// Each call handles exactly one (j, k) cell and then returns; the loops below
// are scaffolding that falls into one of four cases:
//   j != 3, k != 85 : 6 output rows x 4 vectors (64 floats)
//   j != 3, k == 85 : 2 output rows x 4 vectors
//   j == 3, k != 85 : 6 output rows x 1 vector, low 4 lanes stored (mask 15)
//   j == 3, k == 85 : 2 output rows x 1 vector, low 4 lanes stored
// Every case accumulates over 1024 s-steps with fused multiply-adds.
//
// Weight layout as read here: each group k starts at 24600*k bytes; slot s
// holds 6 floats (24 bytes) for k != 85 or 2 floats (8 bytes) for k == 85.
// Accumulators are seeded from the slot addressed with s == -1, i.e. the
// first slot of the group. NOTE(review): that slot is presumably the bias
// (possibly with batch-norm folded in) written by the weight-arrange step;
// confirm there.
//
// Output rows have an 832-byte pitch (4992 = 6*832 per group).
// NOTE(review): for j == 3 only bytes 768..783 of each output row are
// written, leaving 784..831 untouched; confirm rows are padded to 832 bytes.
static void ResNeXt50OneApply11Callee1(ResNeXt50ThreaderTask1* task118, int64_t* pt64) {
void** pair28 = task118->any1;
char** tensors116 = pair28[0];
// e33 and g36 are fixed at zero, so the base offsets below vanish.
ptrdiff_t e33 = 0;
ptrdiff_t g36 = 0;
ptrdiff_t d23 = pt64[1];
ptrdiff_t w69 = pt64[0];
char*restrict arrangedWts11 = tensors116[0]+1712128*e33+(ptrdiff_t)2099200*1*g36;
char*restrict arrangedDats11 = tensors116[1]+694720*e33+(ptrdiff_t)851968*1*g36;
char*restrict datPtr37 = tensors116[2]+(ptrdiff_t)425984*1*g36;
ptrdiff_t ii51 = 1;
for (ptrdiff_t i69 = 0; i69 < ii51; ++i69) {
ptrdiff_t j61 = 1*d23;
ptrdiff_t jj59 = j61+0;
// Full-width column blocks (j != 3): four data vectors per s-step.
for (; j61 != 3; ++j61) {
ptrdiff_t k170 = 1*w69;
ptrdiff_t kk60 = k170+0;
// Full weight groups (k != 85): six output rows per group.
for (; k170 != 85; ++k170) {
// s == -1 addresses the first slot of the group; one scalar per output row
// is broadcast and used as the initial value of that row's accumulators.
ptrdiff_t s68 = -1;
__m512 sum552 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)24));
__m512 sum556 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)28));
__m512 sum560 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)32));
__m512 sum564 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)36));
__m512 sum568 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)40));
__m512 sum572 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)44));
// The remaining three accumulators of each row start from the same seed.
__m512 sum553 = sum552;
__m512 sum554 = sum552;
__m512 sum555 = sum552;
__m512 sum557 = sum556;
__m512 sum558 = sum556;
__m512 sum559 = sum556;
__m512 sum561 = sum560;
__m512 sum562 = sum560;
__m512 sum563 = sum560;
__m512 sum565 = sum564;
__m512 sum566 = sum564;
__m512 sum567 = sum564;
__m512 sum569 = sum568;
__m512 sum570 = sum568;
__m512 sum571 = sum568;
__m512 sum573 = sum572;
__m512 sum574 = sum572;
__m512 sum575 = sum572;
// Reduction: per s-step, load 4 data vectors once and reuse them for all six
// output rows, each scaled by its own broadcast weight.
for (s68 = 0; s68 < 1024; ++s68) {
__m512 dat2472 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)0);
__m512 dat2473 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)64);
__m512 dat2474 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)128);
__m512 dat2475 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s68+(ptrdiff_t)192);
__m512 wt717 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)24));
sum552 = _mm512_fmadd_ps(wt717, dat2472, sum552);
sum553 = _mm512_fmadd_ps(wt717, dat2473, sum553);
sum554 = _mm512_fmadd_ps(wt717, dat2474, sum554);
sum555 = _mm512_fmadd_ps(wt717, dat2475, sum555);
__m512 wt718 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)28));
sum556 = _mm512_fmadd_ps(wt718, dat2472, sum556);
sum557 = _mm512_fmadd_ps(wt718, dat2473, sum557);
sum558 = _mm512_fmadd_ps(wt718, dat2474, sum558);
sum559 = _mm512_fmadd_ps(wt718, dat2475, sum559);
__m512 wt719 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)32));
sum560 = _mm512_fmadd_ps(wt719, dat2472, sum560);
sum561 = _mm512_fmadd_ps(wt719, dat2473, sum561);
sum562 = _mm512_fmadd_ps(wt719, dat2474, sum562);
sum563 = _mm512_fmadd_ps(wt719, dat2475, sum563);
__m512 wt720 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)36));
sum564 = _mm512_fmadd_ps(wt720, dat2472, sum564);
sum565 = _mm512_fmadd_ps(wt720, dat2473, sum565);
sum566 = _mm512_fmadd_ps(wt720, dat2474, sum566);
sum567 = _mm512_fmadd_ps(wt720, dat2475, sum567);
__m512 wt721 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)40));
sum568 = _mm512_fmadd_ps(wt721, dat2472, sum568);
sum569 = _mm512_fmadd_ps(wt721, dat2473, sum569);
sum570 = _mm512_fmadd_ps(wt721, dat2474, sum570);
sum571 = _mm512_fmadd_ps(wt721, dat2475, sum571);
__m512 wt722 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+24*s68+(ptrdiff_t)44));
sum572 = _mm512_fmadd_ps(wt722, dat2472, sum572);
sum573 = _mm512_fmadd_ps(wt722, dat2473, sum573);
sum574 = _mm512_fmadd_ps(wt722, dat2474, sum574);
sum575 = _mm512_fmadd_ps(wt722, dat2475, sum575);
}
// Epilogue: max(0, x) on every accumulator, then store. Output row r of the
// group lands at byte offset 832*r; its four vectors sit 64 bytes apart.
sum552 = _mm512_max_ps(_mm512_setzero_ps(), sum552);
sum553 = _mm512_max_ps(_mm512_setzero_ps(), sum553);
sum554 = _mm512_max_ps(_mm512_setzero_ps(), sum554);
sum555 = _mm512_max_ps(_mm512_setzero_ps(), sum555);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)0, 65535, sum552);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)64, 65535, sum553);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)128, 65535, sum554);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)192, 65535, sum555);
sum556 = _mm512_max_ps(_mm512_setzero_ps(), sum556);
sum557 = _mm512_max_ps(_mm512_setzero_ps(), sum557);
sum558 = _mm512_max_ps(_mm512_setzero_ps(), sum558);
sum559 = _mm512_max_ps(_mm512_setzero_ps(), sum559);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)832, 65535, sum556);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)896, 65535, sum557);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)960, 65535, sum558);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1024, 65535, sum559);
sum560 = _mm512_max_ps(_mm512_setzero_ps(), sum560);
sum561 = _mm512_max_ps(_mm512_setzero_ps(), sum561);
sum562 = _mm512_max_ps(_mm512_setzero_ps(), sum562);
sum563 = _mm512_max_ps(_mm512_setzero_ps(), sum563);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1664, 65535, sum560);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1728, 65535, sum561);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1792, 65535, sum562);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1856, 65535, sum563);
sum564 = _mm512_max_ps(_mm512_setzero_ps(), sum564);
sum565 = _mm512_max_ps(_mm512_setzero_ps(), sum565);
sum566 = _mm512_max_ps(_mm512_setzero_ps(), sum566);
sum567 = _mm512_max_ps(_mm512_setzero_ps(), sum567);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2496, 65535, sum564);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2560, 65535, sum565);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2624, 65535, sum566);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)2688, 65535, sum567);
sum568 = _mm512_max_ps(_mm512_setzero_ps(), sum568);
sum569 = _mm512_max_ps(_mm512_setzero_ps(), sum569);
sum570 = _mm512_max_ps(_mm512_setzero_ps(), sum570);
sum571 = _mm512_max_ps(_mm512_setzero_ps(), sum571);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3328, 65535, sum568);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3392, 65535, sum569);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3456, 65535, sum570);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)3520, 65535, sum571);
sum572 = _mm512_max_ps(_mm512_setzero_ps(), sum572);
sum573 = _mm512_max_ps(_mm512_setzero_ps(), sum573);
sum574 = _mm512_max_ps(_mm512_setzero_ps(), sum574);
sum575 = _mm512_max_ps(_mm512_setzero_ps(), sum575);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4160, 65535, sum572);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4224, 65535, sum573);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4288, 65535, sum574);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)4352, 65535, sum575);
// kk60 equals the starting k, so this always fires: one group per call.
if (k170 >= kk60) return;
}
// Reached only when k == 85: the last weight group holds two output rows,
// packed at 8 bytes per slot instead of 24.
ptrdiff_t s69 = -1;
__m512 sum576 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)8));
__m512 sum580 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)12));
__m512 sum577 = sum576;
__m512 sum578 = sum576;
__m512 sum579 = sum576;
__m512 sum581 = sum580;
__m512 sum582 = sum580;
__m512 sum583 = sum580;
for (s69 = 0; s69 < 1024; ++s69) {
__m512 dat2476 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)0);
__m512 dat2477 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)64);
__m512 dat2478 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)128);
__m512 dat2479 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+256*s69+(ptrdiff_t)192);
__m512 wt723 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)8));
sum576 = _mm512_fmadd_ps(wt723, dat2476, sum576);
sum577 = _mm512_fmadd_ps(wt723, dat2477, sum577);
sum578 = _mm512_fmadd_ps(wt723, dat2478, sum578);
sum579 = _mm512_fmadd_ps(wt723, dat2479, sum579);
__m512 wt724 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k170+8*s69+(ptrdiff_t)12));
sum580 = _mm512_fmadd_ps(wt724, dat2476, sum580);
sum581 = _mm512_fmadd_ps(wt724, dat2477, sum581);
sum582 = _mm512_fmadd_ps(wt724, dat2478, sum582);
sum583 = _mm512_fmadd_ps(wt724, dat2479, sum583);
}
sum576 = _mm512_max_ps(_mm512_setzero_ps(), sum576);
sum577 = _mm512_max_ps(_mm512_setzero_ps(), sum577);
sum578 = _mm512_max_ps(_mm512_setzero_ps(), sum578);
sum579 = _mm512_max_ps(_mm512_setzero_ps(), sum579);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)0, 65535, sum576);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)64, 65535, sum577);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)128, 65535, sum578);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)192, 65535, sum579);
sum580 = _mm512_max_ps(_mm512_setzero_ps(), sum580);
sum581 = _mm512_max_ps(_mm512_setzero_ps(), sum581);
sum582 = _mm512_max_ps(_mm512_setzero_ps(), sum582);
sum583 = _mm512_max_ps(_mm512_setzero_ps(), sum583);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)832, 65535, sum580);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)896, 65535, sum581);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)960, 65535, sum582);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k170+(ptrdiff_t)1024, 65535, sum583);
// jj59 equals the starting j, so this always fires: one column block per call.
if (j61 >= jj59) return;
}
// Reached only when j == 3: the tail column block. Arranged data is read as
// one vector per s-step (64-byte row pitch) and only the low 4 lanes of each
// result are stored.
ptrdiff_t k171 = 1*w69;
ptrdiff_t kk61 = k171+0;
for (; k171 != 85; ++k171) {
ptrdiff_t s70 = -1;
__m512 sum584 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)24));
__m512 sum585 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)28));
__m512 sum586 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)32));
__m512 sum587 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)36));
__m512 sum588 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)40));
__m512 sum589 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)44));
for (s70 = 0; s70 < 1024; ++s70) {
__m512 dat2480 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+64*s70+(ptrdiff_t)0);
__m512 wt725 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)24));
sum584 = _mm512_fmadd_ps(wt725, dat2480, sum584);
__m512 wt726 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)28));
sum585 = _mm512_fmadd_ps(wt726, dat2480, sum585);
__m512 wt727 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)32));
sum586 = _mm512_fmadd_ps(wt727, dat2480, sum586);
__m512 wt728 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)36));
sum587 = _mm512_fmadd_ps(wt728, dat2480, sum587);
__m512 wt729 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)40));
sum588 = _mm512_fmadd_ps(wt729, dat2480, sum588);
__m512 wt730 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+24*s70+(ptrdiff_t)44));
sum589 = _mm512_fmadd_ps(wt730, dat2480, sum589);
}
sum584 = _mm512_max_ps(_mm512_setzero_ps(), sum584);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)0, 15, sum584);
sum585 = _mm512_max_ps(_mm512_setzero_ps(), sum585);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)832, 15, sum585);
sum586 = _mm512_max_ps(_mm512_setzero_ps(), sum586);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)1664, 15, sum586);
sum587 = _mm512_max_ps(_mm512_setzero_ps(), sum587);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)2496, 15, sum587);
sum588 = _mm512_max_ps(_mm512_setzero_ps(), sum588);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)3328, 15, sum588);
sum589 = _mm512_max_ps(_mm512_setzero_ps(), sum589);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)4160, 15, sum589);
// Always fires on the first iteration: one group per call.
if (k171 >= kk61) return;
}
// Corner case j == 3 and k == 85: two output rows, one partial vector each.
ptrdiff_t s71 = -1;
__m512 sum590 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)8));
__m512 sum591 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)12));
for (s71 = 0; s71 < 1024; ++s71) {
__m512 dat2481 = _mm512_loadu_ps(arrangedDats11+851968*i69+262144*j61+64*s71+(ptrdiff_t)0);
__m512 wt731 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)8));
sum590 = _mm512_fmadd_ps(wt731, dat2481, sum590);
__m512 wt732 = _mm512_set1_ps(*(float*)(arrangedWts11+2099200*i69+24600*k171+8*s71+(ptrdiff_t)12));
sum591 = _mm512_fmadd_ps(wt732, dat2481, sum591);
}
sum590 = _mm512_max_ps(_mm512_setzero_ps(), sum590);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)0, 15, sum590);
sum591 = _mm512_max_ps(_mm512_setzero_ps(), sum591);
_mm512_mask_storeu_ps(datPtr37+425984*i69+256*j61+4992*k171+(ptrdiff_t)832, 15, sum591);
}
}

// Launches the layer-11 apply kernel on the given team.
//
// team71:     threader team handed straight to ResNeXt50ThreaderDo1.
// tensors115: tensor pointer table; wrapped as element 0 of a two-entry
//             void* array (element 1 is null) and passed through task.any1.
//
// The task is described as a 3-dimensional hull of extents 86 x 4 x 1, which
// matches the callee's 86 weight groups (k = 0..85) and 4 column blocks
// (j = 0..3). Presumably the threader invokes the callee once per hull point;
// confirm against the threader's definition.
static void ResNeXt50OneApply11(ResNeXt50ThreaderTeam1* team71, char** tensors115) {
    // One extent per hull dimension.
    static const int64_t extents[] = {86, 4, 1};
    enum { rank = sizeof(extents)/sizeof(extents[0]) };

    // The callee expects any1 to point at a pair whose first slot is the
    // tensor table; this array must stay alive until ThreaderDo1 returns.
    void* args[2];
    args[0] = tensors115;
    args[1] = 0;

    ResNeXt50ThreaderTask1 job;
    job.nd1 = rank;
    for (ptrdiff_t d = 0; d < rank; ++d) {
        job.hull1[d] = extents[d];
    }
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply11Callee1;
    ResNeXt50ThreaderDo1(team71, &job);
}

static void ResNeXt50OneArrangeWts12Callee1(ResNeXt50ThreaderTask1* task128, int64_t* pt69) {
char** tensors126 = task128->any1;
ptrdiff_t b86 = pt69[0];
char*restrict wtPtr20 = tensors126[0]+(ptrdiff_t)3340*0+(ptrdiff_t)8388608*0;
char*restrict biasPtr20 = tensors126[1]+(ptrdiff_t)8192*0;
char*restrict bnPtr21 = tensors126[2]+(ptrdiff_t)8*2048*0;
char*restrict arranged23 = tensors126[3]+(ptrdiff_t)6848512*0+(ptrdiff_t)8396800*0;
ptrdiff_t ii55 = 1;
for (ptrdiff_t i75 = 0; i75 < ii55; ++i75) {
ptrdiff_t j66 = 1*b86;
ptrdiff_t jj61 = j66+1;
for (; j66 < jj61; ++j66) {
if (j66 < 127) {
ptrdiff_t k179 = 0+16*(j66-0);
ptrdiff_t l77 = (size_t)(0+k179)/6;
ptrdiff_t cut28 = (size_t)(0+k179)%6;
switch (cut28) {
case 0:;
case 2: {
__m512 sum629 = _mm512_maskz_loadu_ps(65535, biasPtr20+8192*i75+4*k179);
__m512i pmMul42 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd42 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo35 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k179+2048*i75));
__m512 masHi35 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k179+2048*i75)+(ptrdiff_t)64);
__m512 postMul65 = _mm512_permutex2var_ps(masLo35, pmMul42, masHi35);
__m512 postAdd43 = _mm512_permutex2var_ps(masLo35, pmAdd42, masHi35);
sum629 = _mm512_fmadd_ps(sum629, postMul65, postAdd43);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum629);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum629);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)49152, 65535-(4095>>cut28), sum629);
ptrdiff_t c61 = 0;
for (; c61 != 64; ++c61) {
__m512 wt753 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)0);
__m512 wt754 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)4096);
__m512 wt755 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)8192);
__m512 wt756 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)12288);
__m512 wt757 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)16384);
__m512 wt758 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)20480);
__m512 wt759 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)24576);
__m512 wt760 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)28672);
__m512 wt761 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)32768);
__m512 wt762 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)36864);
__m512 wt763 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)40960);
__m512 wt764 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)45056);
__m512 wt765 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)49152);
__m512 wt766 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)53248);
__m512 wt767 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)57344);
__m512 wt768 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c61+(ptrdiff_t)61440);
__m512 tmp15491 = _mm512_unpacklo_ps(wt753, wt754);
__m512 tmp15492 = _mm512_unpackhi_ps(wt753, wt754);
__m512 tmp15493 = _mm512_unpacklo_ps(wt755, wt756);
__m512 tmp15494 = _mm512_unpackhi_ps(wt755, wt756);
__m512 tmp15495 = _mm512_unpacklo_ps(wt757, wt758);
__m512 tmp15496 = _mm512_unpackhi_ps(wt757, wt758);
__m512 tmp15497 = _mm512_unpacklo_ps(wt759, wt760);
__m512 tmp15498 = _mm512_unpackhi_ps(wt759, wt760);
__m512 tmp15499 = _mm512_unpacklo_ps(wt761, wt762);
__m512 tmp15500 = _mm512_unpackhi_ps(wt761, wt762);
__m512 tmp15501 = _mm512_unpacklo_ps(wt763, wt764);
__m512 tmp15502 = _mm512_unpackhi_ps(wt763, wt764);
__m512 tmp15503 = _mm512_unpacklo_ps(wt765, wt766);
__m512 tmp15504 = _mm512_unpackhi_ps(wt765, wt766);
__m512 tmp15505 = _mm512_unpacklo_ps(wt767, wt768);
__m512 tmp15506 = _mm512_unpackhi_ps(wt767, wt768);
__m512 tmp15507 = _mm512_shuffle_ps(tmp15491, tmp15493, 68);
__m512 tmp15508 = _mm512_shuffle_ps(tmp15491, tmp15493, 238);
__m512 tmp15509 = _mm512_shuffle_ps(tmp15492, tmp15494, 68);
__m512 tmp15510 = _mm512_shuffle_ps(tmp15492, tmp15494, 238);
__m512 tmp15511 = _mm512_shuffle_ps(tmp15495, tmp15497, 68);
__m512 tmp15512 = _mm512_shuffle_ps(tmp15495, tmp15497, 238);
__m512 tmp15513 = _mm512_shuffle_ps(tmp15496, tmp15498, 68);
__m512 tmp15514 = _mm512_shuffle_ps(tmp15496, tmp15498, 238);
__m512 tmp15515 = _mm512_shuffle_ps(tmp15499, tmp15501, 68);
__m512 tmp15516 = _mm512_shuffle_ps(tmp15499, tmp15501, 238);
__m512 tmp15517 = _mm512_shuffle_ps(tmp15500, tmp15502, 68);
__m512 tmp15518 = _mm512_shuffle_ps(tmp15500, tmp15502, 238);
__m512 tmp15519 = _mm512_shuffle_ps(tmp15503, tmp15505, 68);
__m512 tmp15520 = _mm512_shuffle_ps(tmp15503, tmp15505, 238);
__m512 tmp15521 = _mm512_shuffle_ps(tmp15504, tmp15506, 68);
__m512 tmp15522 = _mm512_shuffle_ps(tmp15504, tmp15506, 238);
__m512 tmp15523 = _mm512_shuffle_f32x4(tmp15507, tmp15511, 136);
__m512 tmp15524 = _mm512_shuffle_f32x4(tmp15507, tmp15511, 221);
__m512 tmp15525 = _mm512_shuffle_f32x4(tmp15508, tmp15512, 136);
__m512 tmp15526 = _mm512_shuffle_f32x4(tmp15508, tmp15512, 221);
__m512 tmp15527 = _mm512_shuffle_f32x4(tmp15509, tmp15513, 136);
__m512 tmp15528 = _mm512_shuffle_f32x4(tmp15509, tmp15513, 221);
__m512 tmp15529 = _mm512_shuffle_f32x4(tmp15510, tmp15514, 136);
__m512 tmp15530 = _mm512_shuffle_f32x4(tmp15510, tmp15514, 221);
__m512 tmp15531 = _mm512_shuffle_f32x4(tmp15515, tmp15519, 136);
__m512 tmp15532 = _mm512_shuffle_f32x4(tmp15515, tmp15519, 221);
__m512 tmp15533 = _mm512_shuffle_f32x4(tmp15516, tmp15520, 136);
__m512 tmp15534 = _mm512_shuffle_f32x4(tmp15516, tmp15520, 221);
__m512 tmp15535 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 136);
__m512 tmp15536 = _mm512_shuffle_f32x4(tmp15517, tmp15521, 221);
__m512 tmp15537 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 136);
__m512 tmp15538 = _mm512_shuffle_f32x4(tmp15518, tmp15522, 221);
wt753 = _mm512_shuffle_f32x4(tmp15523, tmp15531, 136);
wt761 = _mm512_shuffle_f32x4(tmp15523, tmp15531, 221);
wt754 = _mm512_shuffle_f32x4(tmp15525, tmp15533, 136);
wt762 = _mm512_shuffle_f32x4(tmp15525, tmp15533, 221);
wt755 = _mm512_shuffle_f32x4(tmp15527, tmp15535, 136);
wt763 = _mm512_shuffle_f32x4(tmp15527, tmp15535, 221);
wt756 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 136);
wt764 = _mm512_shuffle_f32x4(tmp15529, tmp15537, 221);
wt757 = _mm512_shuffle_f32x4(tmp15524, tmp15532, 136);
wt765 = _mm512_shuffle_f32x4(tmp15524, tmp15532, 221);
wt758 = _mm512_shuffle_f32x4(tmp15526, tmp15534, 136);
wt766 = _mm512_shuffle_f32x4(tmp15526, tmp15534, 221);
wt759 = _mm512_shuffle_f32x4(tmp15528, tmp15536, 136);
wt767 = _mm512_shuffle_f32x4(tmp15528, tmp15536, 221);
wt760 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 136);
wt768 = _mm512_shuffle_f32x4(tmp15530, tmp15538, 221);
wt753 = _mm512_mul_ps(wt753, postMul65);
wt754 = _mm512_mul_ps(wt754, postMul65);
wt755 = _mm512_mul_ps(wt755, postMul65);
wt756 = _mm512_mul_ps(wt756, postMul65);
wt757 = _mm512_mul_ps(wt757, postMul65);
wt758 = _mm512_mul_ps(wt758, postMul65);
wt759 = _mm512_mul_ps(wt759, postMul65);
wt760 = _mm512_mul_ps(wt760, postMul65);
wt761 = _mm512_mul_ps(wt761, postMul65);
wt762 = _mm512_mul_ps(wt762, postMul65);
wt763 = _mm512_mul_ps(wt763, postMul65);
wt764 = _mm512_mul_ps(wt764, postMul65);
wt765 = _mm512_mul_ps(wt765, postMul65);
wt766 = _mm512_mul_ps(wt766, postMul65);
wt767 = _mm512_mul_ps(wt767, postMul65);
wt768 = _mm512_mul_ps(wt768, postMul65);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c61)+(ptrdiff_t)0, 63>>cut28, wt753);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c61)+(ptrdiff_t)0, 63>>cut28, wt754);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c61)+(ptrdiff_t)0, 63>>cut28, wt755);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c61)+(ptrdiff_t)0, 63>>cut28, wt756);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c61)+(ptrdiff_t)0, 63>>cut28, wt757);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c61)+(ptrdiff_t)0, 63>>cut28, wt758);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c61)+(ptrdiff_t)0, 63>>cut28, wt759);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c61)+(ptrdiff_t)0, 63>>cut28, wt760);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c61)+(ptrdiff_t)0, 63>>cut28, wt761);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c61)+(ptrdiff_t)0, 63>>cut28, wt762);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c61)+(ptrdiff_t)0, 63>>cut28, wt763);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c61)+(ptrdiff_t)0, 63>>cut28, wt764);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c61)+(ptrdiff_t)0, 63>>cut28, wt765);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c61)+(ptrdiff_t)0, 63>>cut28, wt766);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c61)+(ptrdiff_t)0, 63>>cut28, wt767);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c61)+(ptrdiff_t)0, 63>>cut28, wt768);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt753);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt754);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt755);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt756);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt757);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt758);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt759);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt760);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt761);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt762);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt763);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt764);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt765);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt766);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt767);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c61)+(ptrdiff_t)24576, 4032>>cut28, wt768);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt753);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt754);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt755);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt756);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt757);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt758);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt759);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt760);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt761);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt762);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt763);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt764);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt765);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt766);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt767);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c61)+(ptrdiff_t)49152, 65535-(4095>>cut28), wt768);
}
break;
}
default: {
cut28 = 4;
__m512 sum630 = _mm512_maskz_loadu_ps(65535, biasPtr20+8192*i75+4*k179);
__m512i pmMul43 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd43 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo36 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k179+2048*i75));
__m512 masHi36 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k179+2048*i75)+(ptrdiff_t)64);
__m512 postMul66 = _mm512_permutex2var_ps(masLo36, pmMul43, masHi36);
__m512 postAdd44 = _mm512_permutex2var_ps(masLo36, pmAdd43, masHi36);
sum630 = _mm512_fmadd_ps(sum630, postMul66, postAdd44);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)0, 63>>cut28, sum630);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)24576, 4032>>cut28, sum630);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)49152, 258048>>cut28, sum630);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*0+(ptrdiff_t)73728, 65535-(262143>>cut28), sum630);
ptrdiff_t c62 = 0;
for (; c62 != 64; ++c62) {
__m512 wt769 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)0);
__m512 wt770 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)4096);
__m512 wt771 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)8192);
__m512 wt772 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)12288);
__m512 wt773 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)16384);
__m512 wt774 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)20480);
__m512 wt775 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)24576);
__m512 wt776 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)28672);
__m512 wt777 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)32768);
__m512 wt778 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)36864);
__m512 wt779 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)40960);
__m512 wt780 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)45056);
__m512 wt781 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)49152);
__m512 wt782 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)53248);
__m512 wt783 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)57344);
__m512 wt784 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k179+64*c62+(ptrdiff_t)61440);
__m512 tmp15539 = _mm512_unpacklo_ps(wt769, wt770);
__m512 tmp15540 = _mm512_unpackhi_ps(wt769, wt770);
__m512 tmp15541 = _mm512_unpacklo_ps(wt771, wt772);
__m512 tmp15542 = _mm512_unpackhi_ps(wt771, wt772);
__m512 tmp15543 = _mm512_unpacklo_ps(wt773, wt774);
__m512 tmp15544 = _mm512_unpackhi_ps(wt773, wt774);
__m512 tmp15545 = _mm512_unpacklo_ps(wt775, wt776);
__m512 tmp15546 = _mm512_unpackhi_ps(wt775, wt776);
__m512 tmp15547 = _mm512_unpacklo_ps(wt777, wt778);
__m512 tmp15548 = _mm512_unpackhi_ps(wt777, wt778);
__m512 tmp15549 = _mm512_unpacklo_ps(wt779, wt780);
__m512 tmp15550 = _mm512_unpackhi_ps(wt779, wt780);
__m512 tmp15551 = _mm512_unpacklo_ps(wt781, wt782);
__m512 tmp15552 = _mm512_unpackhi_ps(wt781, wt782);
__m512 tmp15553 = _mm512_unpacklo_ps(wt783, wt784);
__m512 tmp15554 = _mm512_unpackhi_ps(wt783, wt784);
__m512 tmp15555 = _mm512_shuffle_ps(tmp15539, tmp15541, 68);
__m512 tmp15556 = _mm512_shuffle_ps(tmp15539, tmp15541, 238);
__m512 tmp15557 = _mm512_shuffle_ps(tmp15540, tmp15542, 68);
__m512 tmp15558 = _mm512_shuffle_ps(tmp15540, tmp15542, 238);
__m512 tmp15559 = _mm512_shuffle_ps(tmp15543, tmp15545, 68);
__m512 tmp15560 = _mm512_shuffle_ps(tmp15543, tmp15545, 238);
__m512 tmp15561 = _mm512_shuffle_ps(tmp15544, tmp15546, 68);
__m512 tmp15562 = _mm512_shuffle_ps(tmp15544, tmp15546, 238);
__m512 tmp15563 = _mm512_shuffle_ps(tmp15547, tmp15549, 68);
__m512 tmp15564 = _mm512_shuffle_ps(tmp15547, tmp15549, 238);
__m512 tmp15565 = _mm512_shuffle_ps(tmp15548, tmp15550, 68);
__m512 tmp15566 = _mm512_shuffle_ps(tmp15548, tmp15550, 238);
__m512 tmp15567 = _mm512_shuffle_ps(tmp15551, tmp15553, 68);
__m512 tmp15568 = _mm512_shuffle_ps(tmp15551, tmp15553, 238);
__m512 tmp15569 = _mm512_shuffle_ps(tmp15552, tmp15554, 68);
__m512 tmp15570 = _mm512_shuffle_ps(tmp15552, tmp15554, 238);
__m512 tmp15571 = _mm512_shuffle_f32x4(tmp15555, tmp15559, 136);
__m512 tmp15572 = _mm512_shuffle_f32x4(tmp15555, tmp15559, 221);
__m512 tmp15573 = _mm512_shuffle_f32x4(tmp15556, tmp15560, 136);
__m512 tmp15574 = _mm512_shuffle_f32x4(tmp15556, tmp15560, 221);
__m512 tmp15575 = _mm512_shuffle_f32x4(tmp15557, tmp15561, 136);
__m512 tmp15576 = _mm512_shuffle_f32x4(tmp15557, tmp15561, 221);
__m512 tmp15577 = _mm512_shuffle_f32x4(tmp15558, tmp15562, 136);
__m512 tmp15578 = _mm512_shuffle_f32x4(tmp15558, tmp15562, 221);
__m512 tmp15579 = _mm512_shuffle_f32x4(tmp15563, tmp15567, 136);
__m512 tmp15580 = _mm512_shuffle_f32x4(tmp15563, tmp15567, 221);
__m512 tmp15581 = _mm512_shuffle_f32x4(tmp15564, tmp15568, 136);
__m512 tmp15582 = _mm512_shuffle_f32x4(tmp15564, tmp15568, 221);
__m512 tmp15583 = _mm512_shuffle_f32x4(tmp15565, tmp15569, 136);
__m512 tmp15584 = _mm512_shuffle_f32x4(tmp15565, tmp15569, 221);
__m512 tmp15585 = _mm512_shuffle_f32x4(tmp15566, tmp15570, 136);
__m512 tmp15586 = _mm512_shuffle_f32x4(tmp15566, tmp15570, 221);
wt769 = _mm512_shuffle_f32x4(tmp15571, tmp15579, 136);
wt777 = _mm512_shuffle_f32x4(tmp15571, tmp15579, 221);
wt770 = _mm512_shuffle_f32x4(tmp15573, tmp15581, 136);
wt778 = _mm512_shuffle_f32x4(tmp15573, tmp15581, 221);
wt771 = _mm512_shuffle_f32x4(tmp15575, tmp15583, 136);
wt779 = _mm512_shuffle_f32x4(tmp15575, tmp15583, 221);
wt772 = _mm512_shuffle_f32x4(tmp15577, tmp15585, 136);
wt780 = _mm512_shuffle_f32x4(tmp15577, tmp15585, 221);
wt773 = _mm512_shuffle_f32x4(tmp15572, tmp15580, 136);
wt781 = _mm512_shuffle_f32x4(tmp15572, tmp15580, 221);
wt774 = _mm512_shuffle_f32x4(tmp15574, tmp15582, 136);
wt782 = _mm512_shuffle_f32x4(tmp15574, tmp15582, 221);
wt775 = _mm512_shuffle_f32x4(tmp15576, tmp15584, 136);
wt783 = _mm512_shuffle_f32x4(tmp15576, tmp15584, 221);
wt776 = _mm512_shuffle_f32x4(tmp15578, tmp15586, 136);
wt784 = _mm512_shuffle_f32x4(tmp15578, tmp15586, 221);
wt769 = _mm512_mul_ps(wt769, postMul66);
wt770 = _mm512_mul_ps(wt770, postMul66);
wt771 = _mm512_mul_ps(wt771, postMul66);
wt772 = _mm512_mul_ps(wt772, postMul66);
wt773 = _mm512_mul_ps(wt773, postMul66);
wt774 = _mm512_mul_ps(wt774, postMul66);
wt775 = _mm512_mul_ps(wt775, postMul66);
wt776 = _mm512_mul_ps(wt776, postMul66);
wt777 = _mm512_mul_ps(wt777, postMul66);
wt778 = _mm512_mul_ps(wt778, postMul66);
wt779 = _mm512_mul_ps(wt779, postMul66);
wt780 = _mm512_mul_ps(wt780, postMul66);
wt781 = _mm512_mul_ps(wt781, postMul66);
wt782 = _mm512_mul_ps(wt782, postMul66);
wt783 = _mm512_mul_ps(wt783, postMul66);
wt784 = _mm512_mul_ps(wt784, postMul66);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c62)+(ptrdiff_t)0, 63>>cut28, wt769);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c62)+(ptrdiff_t)0, 63>>cut28, wt770);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c62)+(ptrdiff_t)0, 63>>cut28, wt771);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c62)+(ptrdiff_t)0, 63>>cut28, wt772);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c62)+(ptrdiff_t)0, 63>>cut28, wt773);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c62)+(ptrdiff_t)0, 63>>cut28, wt774);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c62)+(ptrdiff_t)0, 63>>cut28, wt775);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c62)+(ptrdiff_t)0, 63>>cut28, wt776);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c62)+(ptrdiff_t)0, 63>>cut28, wt777);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c62)+(ptrdiff_t)0, 63>>cut28, wt778);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c62)+(ptrdiff_t)0, 63>>cut28, wt779);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c62)+(ptrdiff_t)0, 63>>cut28, wt780);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c62)+(ptrdiff_t)0, 63>>cut28, wt781);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c62)+(ptrdiff_t)0, 63>>cut28, wt782);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c62)+(ptrdiff_t)0, 63>>cut28, wt783);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c62)+(ptrdiff_t)0, 63>>cut28, wt784);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt769);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt770);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt771);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt772);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt773);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt774);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt775);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt776);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt777);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt778);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt779);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt780);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt781);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt782);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt783);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c62)+(ptrdiff_t)24576, 4032>>cut28, wt784);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt769);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt770);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt771);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt772);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt773);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt774);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt775);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt776);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt777);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt778);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt779);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt780);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt781);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt782);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt783);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c62)+(ptrdiff_t)49152, 258048>>cut28, wt784);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(1+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt769);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(2+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt770);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(3+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt771);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(4+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt772);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(5+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt773);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(6+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt774);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(7+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt775);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(8+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt776);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(9+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt777);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(10+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt778);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(11+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt779);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(12+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt780);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(13+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt781);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(14+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt782);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(15+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt783);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l77+4*cut28+24*(16+16*c62)+(ptrdiff_t)73728, 65535-(262143>>cut28), wt784);
}
}
}
} else {
ptrdiff_t k178 = 2032;
ptrdiff_t l76 = (size_t)(0+k178)/6;
ptrdiff_t cut27 = (size_t)(0+k178)%6;
__m512 sum628 = _mm512_maskz_loadu_ps(65535, biasPtr20+8192*i75+4*k178);
__m512i pmMul44 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd44 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo37 = _mm512_loadu_ps(bnPtr21+(ptrdiff_t)8*(k178+2048*i75));
__m512 masHi37 = _mm512_maskz_loadu_ps(65535, bnPtr21+(ptrdiff_t)8*(k178+2048*i75)+(ptrdiff_t)64);
__m512 postMul64 = _mm512_permutex2var_ps(masLo37, pmMul44, masHi37);
__m512 postAdd42 = _mm512_permutex2var_ps(masLo37, pmAdd44, masHi37);
sum628 = _mm512_fmadd_ps(sum628, postMul64, postAdd42);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*0+(ptrdiff_t)0, 63>>cut27, sum628);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*0+(ptrdiff_t)24576, 4032>>cut27, sum628);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*0+(ptrdiff_t)49152, 258048>>cut27, sum628);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*0+(ptrdiff_t)73728, 65535-(262143>>cut27), sum628);
ptrdiff_t c60 = 0;
for (; c60 != 64; ++c60) {
__m512 wt737 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)0);
__m512 wt738 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)4096);
__m512 wt739 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)8192);
__m512 wt740 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)12288);
__m512 wt741 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)16384);
__m512 wt742 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)20480);
__m512 wt743 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)24576);
__m512 wt744 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)28672);
__m512 wt745 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)32768);
__m512 wt746 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)36864);
__m512 wt747 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)40960);
__m512 wt748 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)45056);
__m512 wt749 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)49152);
__m512 wt750 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)53248);
__m512 wt751 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)57344);
__m512 wt752 = _mm512_maskz_loadu_ps(65535, wtPtr20+8388608*i75+4096*k178+64*c60+(ptrdiff_t)61440);
__m512 tmp15587 = _mm512_unpacklo_ps(wt737, wt738);
__m512 tmp15588 = _mm512_unpackhi_ps(wt737, wt738);
__m512 tmp15589 = _mm512_unpacklo_ps(wt739, wt740);
__m512 tmp15590 = _mm512_unpackhi_ps(wt739, wt740);
__m512 tmp15591 = _mm512_unpacklo_ps(wt741, wt742);
__m512 tmp15592 = _mm512_unpackhi_ps(wt741, wt742);
__m512 tmp15593 = _mm512_unpacklo_ps(wt743, wt744);
__m512 tmp15594 = _mm512_unpackhi_ps(wt743, wt744);
__m512 tmp15595 = _mm512_unpacklo_ps(wt745, wt746);
__m512 tmp15596 = _mm512_unpackhi_ps(wt745, wt746);
__m512 tmp15597 = _mm512_unpacklo_ps(wt747, wt748);
__m512 tmp15598 = _mm512_unpackhi_ps(wt747, wt748);
__m512 tmp15599 = _mm512_unpacklo_ps(wt749, wt750);
__m512 tmp15600 = _mm512_unpackhi_ps(wt749, wt750);
__m512 tmp15601 = _mm512_unpacklo_ps(wt751, wt752);
__m512 tmp15602 = _mm512_unpackhi_ps(wt751, wt752);
__m512 tmp15603 = _mm512_shuffle_ps(tmp15587, tmp15589, 68);
__m512 tmp15604 = _mm512_shuffle_ps(tmp15587, tmp15589, 238);
__m512 tmp15605 = _mm512_shuffle_ps(tmp15588, tmp15590, 68);
__m512 tmp15606 = _mm512_shuffle_ps(tmp15588, tmp15590, 238);
__m512 tmp15607 = _mm512_shuffle_ps(tmp15591, tmp15593, 68);
__m512 tmp15608 = _mm512_shuffle_ps(tmp15591, tmp15593, 238);
__m512 tmp15609 = _mm512_shuffle_ps(tmp15592, tmp15594, 68);
__m512 tmp15610 = _mm512_shuffle_ps(tmp15592, tmp15594, 238);
__m512 tmp15611 = _mm512_shuffle_ps(tmp15595, tmp15597, 68);
__m512 tmp15612 = _mm512_shuffle_ps(tmp15595, tmp15597, 238);
__m512 tmp15613 = _mm512_shuffle_ps(tmp15596, tmp15598, 68);
__m512 tmp15614 = _mm512_shuffle_ps(tmp15596, tmp15598, 238);
__m512 tmp15615 = _mm512_shuffle_ps(tmp15599, tmp15601, 68);
__m512 tmp15616 = _mm512_shuffle_ps(tmp15599, tmp15601, 238);
__m512 tmp15617 = _mm512_shuffle_ps(tmp15600, tmp15602, 68);
__m512 tmp15618 = _mm512_shuffle_ps(tmp15600, tmp15602, 238);
__m512 tmp15619 = _mm512_shuffle_f32x4(tmp15603, tmp15607, 136);
__m512 tmp15620 = _mm512_shuffle_f32x4(tmp15603, tmp15607, 221);
__m512 tmp15621 = _mm512_shuffle_f32x4(tmp15604, tmp15608, 136);
__m512 tmp15622 = _mm512_shuffle_f32x4(tmp15604, tmp15608, 221);
__m512 tmp15623 = _mm512_shuffle_f32x4(tmp15605, tmp15609, 136);
__m512 tmp15624 = _mm512_shuffle_f32x4(tmp15605, tmp15609, 221);
__m512 tmp15625 = _mm512_shuffle_f32x4(tmp15606, tmp15610, 136);
__m512 tmp15626 = _mm512_shuffle_f32x4(tmp15606, tmp15610, 221);
__m512 tmp15627 = _mm512_shuffle_f32x4(tmp15611, tmp15615, 136);
__m512 tmp15628 = _mm512_shuffle_f32x4(tmp15611, tmp15615, 221);
__m512 tmp15629 = _mm512_shuffle_f32x4(tmp15612, tmp15616, 136);
__m512 tmp15630 = _mm512_shuffle_f32x4(tmp15612, tmp15616, 221);
__m512 tmp15631 = _mm512_shuffle_f32x4(tmp15613, tmp15617, 136);
__m512 tmp15632 = _mm512_shuffle_f32x4(tmp15613, tmp15617, 221);
__m512 tmp15633 = _mm512_shuffle_f32x4(tmp15614, tmp15618, 136);
__m512 tmp15634 = _mm512_shuffle_f32x4(tmp15614, tmp15618, 221);
wt737 = _mm512_shuffle_f32x4(tmp15619, tmp15627, 136);
wt745 = _mm512_shuffle_f32x4(tmp15619, tmp15627, 221);
wt738 = _mm512_shuffle_f32x4(tmp15621, tmp15629, 136);
wt746 = _mm512_shuffle_f32x4(tmp15621, tmp15629, 221);
wt739 = _mm512_shuffle_f32x4(tmp15623, tmp15631, 136);
wt747 = _mm512_shuffle_f32x4(tmp15623, tmp15631, 221);
wt740 = _mm512_shuffle_f32x4(tmp15625, tmp15633, 136);
wt748 = _mm512_shuffle_f32x4(tmp15625, tmp15633, 221);
wt741 = _mm512_shuffle_f32x4(tmp15620, tmp15628, 136);
wt749 = _mm512_shuffle_f32x4(tmp15620, tmp15628, 221);
wt742 = _mm512_shuffle_f32x4(tmp15622, tmp15630, 136);
wt750 = _mm512_shuffle_f32x4(tmp15622, tmp15630, 221);
wt743 = _mm512_shuffle_f32x4(tmp15624, tmp15632, 136);
wt751 = _mm512_shuffle_f32x4(tmp15624, tmp15632, 221);
wt744 = _mm512_shuffle_f32x4(tmp15626, tmp15634, 136);
wt752 = _mm512_shuffle_f32x4(tmp15626, tmp15634, 221);
wt737 = _mm512_mul_ps(wt737, postMul64);
wt738 = _mm512_mul_ps(wt738, postMul64);
wt739 = _mm512_mul_ps(wt739, postMul64);
wt740 = _mm512_mul_ps(wt740, postMul64);
wt741 = _mm512_mul_ps(wt741, postMul64);
wt742 = _mm512_mul_ps(wt742, postMul64);
wt743 = _mm512_mul_ps(wt743, postMul64);
wt744 = _mm512_mul_ps(wt744, postMul64);
wt745 = _mm512_mul_ps(wt745, postMul64);
wt746 = _mm512_mul_ps(wt746, postMul64);
wt747 = _mm512_mul_ps(wt747, postMul64);
wt748 = _mm512_mul_ps(wt748, postMul64);
wt749 = _mm512_mul_ps(wt749, postMul64);
wt750 = _mm512_mul_ps(wt750, postMul64);
wt751 = _mm512_mul_ps(wt751, postMul64);
wt752 = _mm512_mul_ps(wt752, postMul64);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(1+16*c60)+(ptrdiff_t)0, 63>>cut27, wt737);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(2+16*c60)+(ptrdiff_t)0, 63>>cut27, wt738);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(3+16*c60)+(ptrdiff_t)0, 63>>cut27, wt739);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(4+16*c60)+(ptrdiff_t)0, 63>>cut27, wt740);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(5+16*c60)+(ptrdiff_t)0, 63>>cut27, wt741);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(6+16*c60)+(ptrdiff_t)0, 63>>cut27, wt742);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(7+16*c60)+(ptrdiff_t)0, 63>>cut27, wt743);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(8+16*c60)+(ptrdiff_t)0, 63>>cut27, wt744);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(9+16*c60)+(ptrdiff_t)0, 63>>cut27, wt745);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(10+16*c60)+(ptrdiff_t)0, 63>>cut27, wt746);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(11+16*c60)+(ptrdiff_t)0, 63>>cut27, wt747);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(12+16*c60)+(ptrdiff_t)0, 63>>cut27, wt748);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(13+16*c60)+(ptrdiff_t)0, 63>>cut27, wt749);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(14+16*c60)+(ptrdiff_t)0, 63>>cut27, wt750);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(15+16*c60)+(ptrdiff_t)0, 63>>cut27, wt751);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(16+16*c60)+(ptrdiff_t)0, 63>>cut27, wt752);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(1+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt737);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(2+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt738);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(3+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt739);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(4+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt740);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(5+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt741);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(6+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt742);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(7+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt743);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(8+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt744);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(9+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt745);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(10+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt746);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(11+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt747);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(12+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt748);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(13+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt749);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(14+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt750);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(15+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt751);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(16+16*c60)+(ptrdiff_t)24576, 4032>>cut27, wt752);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(1+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt737);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(2+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt738);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(3+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt739);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(4+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt740);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(5+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt741);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(6+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt742);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(7+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt743);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(8+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt744);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(9+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt745);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(10+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt746);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(11+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt747);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(12+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt748);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(13+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt749);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(14+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt750);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(15+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt751);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+24*(16+16*c60)+(ptrdiff_t)49152, 258048>>cut27, wt752);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(1+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt737);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(2+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt738);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(3+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt739);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(4+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt740);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(5+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt741);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(6+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt742);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(7+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt743);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(8+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt744);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(9+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt745);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(10+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt746);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(11+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt747);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(12+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt748);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(13+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt749);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(14+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt750);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(15+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt751);
_mm512_mask_storeu_ps(arranged23+8396800*i75+24600*l76+4*cut27+8*(16+16*c60)+(ptrdiff_t)73728, 65535-(262143>>cut27), wt752);
}
}
}
}
}

static void ResNeXt50OneArrangeWts12(ResNeXt50ThreaderTeam1* team76, char** tensors125) {
    // Hands ResNeXt50OneArrangeWts12Callee1 to the threader team with
    // tensors125 as its opaque argument. The task is declared with three
    // dimensions of extent 128, 1 and 1 (presumably one task per value of
    // the first coordinate -- see ResNeXt50ThreaderDo1 for exact semantics).
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    job.hull1[0] = 128;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = tensors125;
    job.callee1 = ResNeXt50OneArrangeWts12Callee1;
    ResNeXt50ThreaderDo1(team76, &job);
}

static void ResNeXt50OneArrangeDats12Callee1(ResNeXt50ThreaderTask1* task130, int64_t* pt70) {
    // Repacks a slice of 128 channels of the source tensor into the layout
    // read by the matching Apply kernel.
    //
    // task130->any1 is a char*[]: [0] is the source data, [1] the destination.
    // pt70[0] selects the slice: channels 128*pt70[0] .. 128*pt70[0]+127.
    //
    // Per channel (832 source bytes apart) seven rows are read, 112 bytes
    // apart, 13 floats each. Only the even-indexed floats of a row are kept
    // (7 values). Two such rows share one 16-lane vector: the first row in
    // lanes 0..6, the second in lanes 7..13. The seventh row sits alone in
    // the fourth vector, whose lanes 7..15 are left as the permute produced
    // them. Four vectors (256 bytes) are written per channel.
    char** bufs = task130->any1;
    const ptrdiff_t slice = pt70[0];
    char*restrict src = bufs[0];
    char*restrict dst = bufs[1];

    // Lane i <- element 2*i (indices wrap modulo 16 for the upper lanes).
    const __m512i evenToLow = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    // Lane 7+i <- element 2*i, for i = 0..8; lanes 0..6 are don't-care.
    const __m512i evenToHigh = _mm512_set_epi32(16, 14, 12, 10, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0);

    const ptrdiff_t chanStop = 128*slice+128;
    for (ptrdiff_t chan = 128*slice; chan < chanStop; ++chan) {
        const char* in = src+832*chan;
        char* out = dst+256*chan;
        __m512 packed[4];

        // Rows (0,2), (4,6), (8,10) of the source: merge each pair.
        for (int pair = 0; pair < 3; ++pair) {
            __m512 first = _mm512_maskz_loadu_ps(8191, in+224*pair);
            first = _mm512_permutexvar_ps(evenToLow, first);
            __m512 second = _mm512_maskz_loadu_ps(8191, in+224*pair+112);
            second = _mm512_permutexvar_ps(evenToHigh, second);
            // 16256 == bits 7..13: take the second row for those lanes.
            packed[pair] = _mm512_mask_mov_ps(first, 16256, second);
        }

        // Row 12 has no partner.
        __m512 last = _mm512_maskz_loadu_ps(8191, in+672);
        packed[3] = _mm512_permutexvar_ps(evenToLow, last);

        for (int v = 0; v < 4; ++v) {
            _mm512_storeu_ps(out+64*v, packed[v]);
        }
    }
}

static void ResNeXt50OneArrangeDats12(ResNeXt50ThreaderTeam1* team77, char** tensors127) {
    // Hands ResNeXt50OneArrangeDats12Callee1 to the threader team with
    // tensors127 as its opaque argument. The task is declared with four
    // dimensions of extent 8, 1, 1 and 1; the callee only reads the first
    // coordinate and processes 128 channels per value of it.
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[0] = 8;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.hull1[3] = 1;
    job.any1 = tensors127;
    job.callee1 = ResNeXt50OneArrangeDats12Callee1;
    ResNeXt50ThreaderDo1(team77, &job);
}

static void ResNeXt50OneApply12Callee1(ResNeXt50ThreaderTask1* task132, int64_t* pt71) {
    // Multiply-accumulate kernel for one tile of output channels.
    //
    // task132->any1 is a void*[] whose element 0 is a char*[]:
    //   [0] arranged weights, [1] arranged data, [2] output.
    // pt71[0] is the tile index. Every tile other than 341 produces six
    // output channels; tile 341 produces two.
    //
    // Weight layout per tile (24600 bytes apart): one float per channel of
    // the tile (used as the accumulator start value, presumably the bias),
    // followed by 1024 steps each holding one float per channel of the tile.
    // Data layout: 1024 steps of four 16-lane vectors (256 bytes per step).
    //
    // Output layout per channel (320 bytes apart): seven 7-float rows, 28
    // bytes apart. Rows 0..5 come from lanes 0..6 and 7..13 of the first
    // three accumulators; row 6 comes from lanes 0..6 of the fourth.
    //
    // All inner loops have compile-time-constant trip counts so that an
    // optimizing compiler can unroll them and keep the accumulators in
    // registers.
    void** bundle = task132->any1;
    char** bufs = bundle[0];
    const ptrdiff_t tile = pt71[0];
    char*restrict wts = bufs[0]+24600*tile;
    char*restrict dats = bufs[1];
    char*restrict outs = bufs[2]+1920*tile;

    if (tile != 341) {
        // Full tile: 6 channels x 4 data vectors.
        __m512 acc[6][4];
        for (int ch = 0; ch < 6; ++ch) {
            const __m512 start = _mm512_set1_ps(*(float*)(wts+4*ch));
            for (int v = 0; v < 4; ++v) {
                acc[ch][v] = start;
            }
        }
        for (ptrdiff_t step = 0; step < 1024; ++step) {
            __m512 in[4];
            for (int v = 0; v < 4; ++v) {
                in[v] = _mm512_loadu_ps(dats+256*step+64*v);
            }
            for (int ch = 0; ch < 6; ++ch) {
                const __m512 w = _mm512_set1_ps(*(float*)(wts+24*step+24+4*ch));
                for (int v = 0; v < 4; ++v) {
                    acc[ch][v] = _mm512_fmadd_ps(w, in[v], acc[ch][v]);
                }
            }
        }
        for (int ch = 0; ch < 6; ++ch) {
            char* dst = outs+320*ch;
            for (int v = 0; v < 4; ++v) {
                // Lanes 0..6 -> even-numbered row of this vector.
                _mm512_mask_storeu_ps(dst+56*v, 127, acc[ch][v]);
                if (v < 3) {
                    // Rotate lanes 7..13 down to 0..6 -> following row.
                    __m512i rot = _mm512_castps_si512(acc[ch][v]);
                    rot = _mm512_alignr_epi32(rot, rot, 7);
                    _mm512_mask_storeu_ps(dst+56*v+28, 127, _mm512_castsi512_ps(rot));
                }
            }
        }
        return;
    }

    // Last tile: 2 channels x 4 data vectors; weights are 8 bytes per step.
    __m512 tailAcc[2][4];
    for (int ch = 0; ch < 2; ++ch) {
        const __m512 start = _mm512_set1_ps(*(float*)(wts+4*ch));
        for (int v = 0; v < 4; ++v) {
            tailAcc[ch][v] = start;
        }
    }
    for (ptrdiff_t step = 0; step < 1024; ++step) {
        __m512 in[4];
        for (int v = 0; v < 4; ++v) {
            in[v] = _mm512_loadu_ps(dats+256*step+64*v);
        }
        for (int ch = 0; ch < 2; ++ch) {
            const __m512 w = _mm512_set1_ps(*(float*)(wts+8*step+8+4*ch));
            for (int v = 0; v < 4; ++v) {
                tailAcc[ch][v] = _mm512_fmadd_ps(w, in[v], tailAcc[ch][v]);
            }
        }
    }
    for (int ch = 0; ch < 2; ++ch) {
        char* dst = outs+320*ch;
        for (int v = 0; v < 4; ++v) {
            _mm512_mask_storeu_ps(dst+56*v, 127, tailAcc[ch][v]);
            if (v < 3) {
                __m512i rot = _mm512_castps_si512(tailAcc[ch][v]);
                rot = _mm512_alignr_epi32(rot, rot, 7);
                _mm512_mask_storeu_ps(dst+56*v+28, 127, _mm512_castsi512_ps(rot));
            }
        }
    }
}

static void ResNeXt50OneApply12(ResNeXt50ThreaderTeam1* team78, char** tensors129) {
    // Hands ResNeXt50OneApply12Callee1 to the threader team. The callee
    // receives a two-slot pointer array: slot 0 is tensors129, slot 1 is
    // null. The task is declared with three dimensions of extent 342, 1
    // and 1; the callee treats the first coordinate as its tile index.
    void* args[2];
    ResNeXt50ThreaderTask1 job;
    args[0] = tensors129;
    args[1] = 0;
    job.nd1 = 3;
    job.hull1[0] = 342;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.any1 = args;
    job.callee1 = ResNeXt50OneApply12Callee1;
    ResNeXt50ThreaderDo1(team78, &job);
}

static void ResNeXt50OneArrangeWts13Callee1(ResNeXt50ThreaderTask1* task134, int64_t* pt72) {
char** tensors132 = task134->any1;
ptrdiff_t b87 = pt72[0];
char*restrict wtPtr21 = tensors132[0]+(ptrdiff_t)3340*0+(ptrdiff_t)4194304*0;
char*restrict biasPtr21 = tensors132[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr22 = tensors132[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged25 = tensors132[3]+(ptrdiff_t)3424256*0+(ptrdiff_t)4198400*0;
ptrdiff_t ii58 = 1;
for (ptrdiff_t i78 = 0; i78 < ii58; ++i78) {
ptrdiff_t j69 = 1*b87;
ptrdiff_t jj62 = j69+1;
for (; j69 < jj62; ++j69) {
if (j69 < 63) {
ptrdiff_t k183 = 0+16*(j69-0);
ptrdiff_t l79 = (size_t)(0+k183)/6;
ptrdiff_t cut30 = (size_t)(0+k183)%6;
switch (cut30) {
case 0:;
case 2: {
__m512 sum664 = _mm512_maskz_loadu_ps(65535, biasPtr21+4096*i78+4*k183);
__m512i pmMul45 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd45 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo38 = _mm512_loadu_ps(bnPtr22+(ptrdiff_t)8*(k183+1024*i78));
__m512 masHi38 = _mm512_maskz_loadu_ps(65535, bnPtr22+(ptrdiff_t)8*(k183+1024*i78)+(ptrdiff_t)64);
__m512 postMul68 = _mm512_permutex2var_ps(masLo38, pmMul45, masHi38);
__m512 postAdd46 = _mm512_permutex2var_ps(masLo38, pmAdd45, masHi38);
sum664 = _mm512_fmadd_ps(sum664, postMul68, postAdd46);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum664);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum664);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)49152, 65535-(4095>>cut30), sum664);
ptrdiff_t c64 = 0;
for (; c64 != 64; ++c64) {
__m512 wt809 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)0);
__m512 wt810 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)4096);
__m512 wt811 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)8192);
__m512 wt812 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)12288);
__m512 wt813 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)16384);
__m512 wt814 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)20480);
__m512 wt815 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)24576);
__m512 wt816 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)28672);
__m512 wt817 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)32768);
__m512 wt818 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)36864);
__m512 wt819 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)40960);
__m512 wt820 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)45056);
__m512 wt821 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)49152);
__m512 wt822 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)53248);
__m512 wt823 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)57344);
__m512 wt824 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c64+(ptrdiff_t)61440);
__m512 tmp15635 = _mm512_unpacklo_ps(wt809, wt810);
__m512 tmp15636 = _mm512_unpackhi_ps(wt809, wt810);
__m512 tmp15637 = _mm512_unpacklo_ps(wt811, wt812);
__m512 tmp15638 = _mm512_unpackhi_ps(wt811, wt812);
__m512 tmp15639 = _mm512_unpacklo_ps(wt813, wt814);
__m512 tmp15640 = _mm512_unpackhi_ps(wt813, wt814);
__m512 tmp15641 = _mm512_unpacklo_ps(wt815, wt816);
__m512 tmp15642 = _mm512_unpackhi_ps(wt815, wt816);
__m512 tmp15643 = _mm512_unpacklo_ps(wt817, wt818);
__m512 tmp15644 = _mm512_unpackhi_ps(wt817, wt818);
__m512 tmp15645 = _mm512_unpacklo_ps(wt819, wt820);
__m512 tmp15646 = _mm512_unpackhi_ps(wt819, wt820);
__m512 tmp15647 = _mm512_unpacklo_ps(wt821, wt822);
__m512 tmp15648 = _mm512_unpackhi_ps(wt821, wt822);
__m512 tmp15649 = _mm512_unpacklo_ps(wt823, wt824);
__m512 tmp15650 = _mm512_unpackhi_ps(wt823, wt824);
__m512 tmp15651 = _mm512_shuffle_ps(tmp15635, tmp15637, 68);
__m512 tmp15652 = _mm512_shuffle_ps(tmp15635, tmp15637, 238);
__m512 tmp15653 = _mm512_shuffle_ps(tmp15636, tmp15638, 68);
__m512 tmp15654 = _mm512_shuffle_ps(tmp15636, tmp15638, 238);
__m512 tmp15655 = _mm512_shuffle_ps(tmp15639, tmp15641, 68);
__m512 tmp15656 = _mm512_shuffle_ps(tmp15639, tmp15641, 238);
__m512 tmp15657 = _mm512_shuffle_ps(tmp15640, tmp15642, 68);
__m512 tmp15658 = _mm512_shuffle_ps(tmp15640, tmp15642, 238);
__m512 tmp15659 = _mm512_shuffle_ps(tmp15643, tmp15645, 68);
__m512 tmp15660 = _mm512_shuffle_ps(tmp15643, tmp15645, 238);
__m512 tmp15661 = _mm512_shuffle_ps(tmp15644, tmp15646, 68);
__m512 tmp15662 = _mm512_shuffle_ps(tmp15644, tmp15646, 238);
__m512 tmp15663 = _mm512_shuffle_ps(tmp15647, tmp15649, 68);
__m512 tmp15664 = _mm512_shuffle_ps(tmp15647, tmp15649, 238);
__m512 tmp15665 = _mm512_shuffle_ps(tmp15648, tmp15650, 68);
__m512 tmp15666 = _mm512_shuffle_ps(tmp15648, tmp15650, 238);
__m512 tmp15667 = _mm512_shuffle_f32x4(tmp15651, tmp15655, 136);
__m512 tmp15668 = _mm512_shuffle_f32x4(tmp15651, tmp15655, 221);
__m512 tmp15669 = _mm512_shuffle_f32x4(tmp15652, tmp15656, 136);
__m512 tmp15670 = _mm512_shuffle_f32x4(tmp15652, tmp15656, 221);
__m512 tmp15671 = _mm512_shuffle_f32x4(tmp15653, tmp15657, 136);
__m512 tmp15672 = _mm512_shuffle_f32x4(tmp15653, tmp15657, 221);
__m512 tmp15673 = _mm512_shuffle_f32x4(tmp15654, tmp15658, 136);
__m512 tmp15674 = _mm512_shuffle_f32x4(tmp15654, tmp15658, 221);
__m512 tmp15675 = _mm512_shuffle_f32x4(tmp15659, tmp15663, 136);
__m512 tmp15676 = _mm512_shuffle_f32x4(tmp15659, tmp15663, 221);
__m512 tmp15677 = _mm512_shuffle_f32x4(tmp15660, tmp15664, 136);
__m512 tmp15678 = _mm512_shuffle_f32x4(tmp15660, tmp15664, 221);
__m512 tmp15679 = _mm512_shuffle_f32x4(tmp15661, tmp15665, 136);
__m512 tmp15680 = _mm512_shuffle_f32x4(tmp15661, tmp15665, 221);
__m512 tmp15681 = _mm512_shuffle_f32x4(tmp15662, tmp15666, 136);
__m512 tmp15682 = _mm512_shuffle_f32x4(tmp15662, tmp15666, 221);
wt809 = _mm512_shuffle_f32x4(tmp15667, tmp15675, 136);
wt817 = _mm512_shuffle_f32x4(tmp15667, tmp15675, 221);
wt810 = _mm512_shuffle_f32x4(tmp15669, tmp15677, 136);
wt818 = _mm512_shuffle_f32x4(tmp15669, tmp15677, 221);
wt811 = _mm512_shuffle_f32x4(tmp15671, tmp15679, 136);
wt819 = _mm512_shuffle_f32x4(tmp15671, tmp15679, 221);
wt812 = _mm512_shuffle_f32x4(tmp15673, tmp15681, 136);
wt820 = _mm512_shuffle_f32x4(tmp15673, tmp15681, 221);
wt813 = _mm512_shuffle_f32x4(tmp15668, tmp15676, 136);
wt821 = _mm512_shuffle_f32x4(tmp15668, tmp15676, 221);
wt814 = _mm512_shuffle_f32x4(tmp15670, tmp15678, 136);
wt822 = _mm512_shuffle_f32x4(tmp15670, tmp15678, 221);
wt815 = _mm512_shuffle_f32x4(tmp15672, tmp15680, 136);
wt823 = _mm512_shuffle_f32x4(tmp15672, tmp15680, 221);
wt816 = _mm512_shuffle_f32x4(tmp15674, tmp15682, 136);
wt824 = _mm512_shuffle_f32x4(tmp15674, tmp15682, 221);
wt809 = _mm512_mul_ps(wt809, postMul68);
wt810 = _mm512_mul_ps(wt810, postMul68);
wt811 = _mm512_mul_ps(wt811, postMul68);
wt812 = _mm512_mul_ps(wt812, postMul68);
wt813 = _mm512_mul_ps(wt813, postMul68);
wt814 = _mm512_mul_ps(wt814, postMul68);
wt815 = _mm512_mul_ps(wt815, postMul68);
wt816 = _mm512_mul_ps(wt816, postMul68);
wt817 = _mm512_mul_ps(wt817, postMul68);
wt818 = _mm512_mul_ps(wt818, postMul68);
wt819 = _mm512_mul_ps(wt819, postMul68);
wt820 = _mm512_mul_ps(wt820, postMul68);
wt821 = _mm512_mul_ps(wt821, postMul68);
wt822 = _mm512_mul_ps(wt822, postMul68);
wt823 = _mm512_mul_ps(wt823, postMul68);
wt824 = _mm512_mul_ps(wt824, postMul68);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c64)+(ptrdiff_t)0, 63>>cut30, wt809);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c64)+(ptrdiff_t)0, 63>>cut30, wt810);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c64)+(ptrdiff_t)0, 63>>cut30, wt811);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c64)+(ptrdiff_t)0, 63>>cut30, wt812);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c64)+(ptrdiff_t)0, 63>>cut30, wt813);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c64)+(ptrdiff_t)0, 63>>cut30, wt814);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c64)+(ptrdiff_t)0, 63>>cut30, wt815);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c64)+(ptrdiff_t)0, 63>>cut30, wt816);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c64)+(ptrdiff_t)0, 63>>cut30, wt817);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c64)+(ptrdiff_t)0, 63>>cut30, wt818);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c64)+(ptrdiff_t)0, 63>>cut30, wt819);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c64)+(ptrdiff_t)0, 63>>cut30, wt820);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c64)+(ptrdiff_t)0, 63>>cut30, wt821);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c64)+(ptrdiff_t)0, 63>>cut30, wt822);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c64)+(ptrdiff_t)0, 63>>cut30, wt823);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c64)+(ptrdiff_t)0, 63>>cut30, wt824);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt809);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt810);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt811);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt812);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt813);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt814);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt815);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt816);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt817);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt818);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt819);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt820);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt821);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt822);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt823);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c64)+(ptrdiff_t)24576, 4032>>cut30, wt824);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt809);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt810);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt811);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt812);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt813);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt814);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt815);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt816);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt817);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt818);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt819);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt820);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt821);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt822);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt823);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c64)+(ptrdiff_t)49152, 65535-(4095>>cut30), wt824);
}
break;
}
default: {
cut30 = 4;
__m512 sum665 = _mm512_maskz_loadu_ps(65535, biasPtr21+4096*i78+4*k183);
__m512i pmMul46 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd46 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo39 = _mm512_loadu_ps(bnPtr22+(ptrdiff_t)8*(k183+1024*i78));
__m512 masHi39 = _mm512_maskz_loadu_ps(65535, bnPtr22+(ptrdiff_t)8*(k183+1024*i78)+(ptrdiff_t)64);
__m512 postMul69 = _mm512_permutex2var_ps(masLo39, pmMul46, masHi39);
__m512 postAdd47 = _mm512_permutex2var_ps(masLo39, pmAdd46, masHi39);
sum665 = _mm512_fmadd_ps(sum665, postMul69, postAdd47);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)0, 63>>cut30, sum665);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)24576, 4032>>cut30, sum665);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)49152, 258048>>cut30, sum665);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*0+(ptrdiff_t)73728, 65535-(262143>>cut30), sum665);
ptrdiff_t c65 = 0;
for (; c65 != 64; ++c65) {
__m512 wt825 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)0);
__m512 wt826 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)4096);
__m512 wt827 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)8192);
__m512 wt828 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)12288);
__m512 wt829 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)16384);
__m512 wt830 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)20480);
__m512 wt831 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)24576);
__m512 wt832 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)28672);
__m512 wt833 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)32768);
__m512 wt834 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)36864);
__m512 wt835 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)40960);
__m512 wt836 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)45056);
__m512 wt837 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)49152);
__m512 wt838 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)53248);
__m512 wt839 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)57344);
__m512 wt840 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k183+64*c65+(ptrdiff_t)61440);
__m512 tmp15683 = _mm512_unpacklo_ps(wt825, wt826);
__m512 tmp15684 = _mm512_unpackhi_ps(wt825, wt826);
__m512 tmp15685 = _mm512_unpacklo_ps(wt827, wt828);
__m512 tmp15686 = _mm512_unpackhi_ps(wt827, wt828);
__m512 tmp15687 = _mm512_unpacklo_ps(wt829, wt830);
__m512 tmp15688 = _mm512_unpackhi_ps(wt829, wt830);
__m512 tmp15689 = _mm512_unpacklo_ps(wt831, wt832);
__m512 tmp15690 = _mm512_unpackhi_ps(wt831, wt832);
__m512 tmp15691 = _mm512_unpacklo_ps(wt833, wt834);
__m512 tmp15692 = _mm512_unpackhi_ps(wt833, wt834);
__m512 tmp15693 = _mm512_unpacklo_ps(wt835, wt836);
__m512 tmp15694 = _mm512_unpackhi_ps(wt835, wt836);
__m512 tmp15695 = _mm512_unpacklo_ps(wt837, wt838);
__m512 tmp15696 = _mm512_unpackhi_ps(wt837, wt838);
__m512 tmp15697 = _mm512_unpacklo_ps(wt839, wt840);
__m512 tmp15698 = _mm512_unpackhi_ps(wt839, wt840);
__m512 tmp15699 = _mm512_shuffle_ps(tmp15683, tmp15685, 68);
__m512 tmp15700 = _mm512_shuffle_ps(tmp15683, tmp15685, 238);
__m512 tmp15701 = _mm512_shuffle_ps(tmp15684, tmp15686, 68);
__m512 tmp15702 = _mm512_shuffle_ps(tmp15684, tmp15686, 238);
__m512 tmp15703 = _mm512_shuffle_ps(tmp15687, tmp15689, 68);
__m512 tmp15704 = _mm512_shuffle_ps(tmp15687, tmp15689, 238);
__m512 tmp15705 = _mm512_shuffle_ps(tmp15688, tmp15690, 68);
__m512 tmp15706 = _mm512_shuffle_ps(tmp15688, tmp15690, 238);
__m512 tmp15707 = _mm512_shuffle_ps(tmp15691, tmp15693, 68);
__m512 tmp15708 = _mm512_shuffle_ps(tmp15691, tmp15693, 238);
__m512 tmp15709 = _mm512_shuffle_ps(tmp15692, tmp15694, 68);
__m512 tmp15710 = _mm512_shuffle_ps(tmp15692, tmp15694, 238);
__m512 tmp15711 = _mm512_shuffle_ps(tmp15695, tmp15697, 68);
__m512 tmp15712 = _mm512_shuffle_ps(tmp15695, tmp15697, 238);
__m512 tmp15713 = _mm512_shuffle_ps(tmp15696, tmp15698, 68);
__m512 tmp15714 = _mm512_shuffle_ps(tmp15696, tmp15698, 238);
__m512 tmp15715 = _mm512_shuffle_f32x4(tmp15699, tmp15703, 136);
__m512 tmp15716 = _mm512_shuffle_f32x4(tmp15699, tmp15703, 221);
__m512 tmp15717 = _mm512_shuffle_f32x4(tmp15700, tmp15704, 136);
__m512 tmp15718 = _mm512_shuffle_f32x4(tmp15700, tmp15704, 221);
__m512 tmp15719 = _mm512_shuffle_f32x4(tmp15701, tmp15705, 136);
__m512 tmp15720 = _mm512_shuffle_f32x4(tmp15701, tmp15705, 221);
__m512 tmp15721 = _mm512_shuffle_f32x4(tmp15702, tmp15706, 136);
__m512 tmp15722 = _mm512_shuffle_f32x4(tmp15702, tmp15706, 221);
__m512 tmp15723 = _mm512_shuffle_f32x4(tmp15707, tmp15711, 136);
__m512 tmp15724 = _mm512_shuffle_f32x4(tmp15707, tmp15711, 221);
__m512 tmp15725 = _mm512_shuffle_f32x4(tmp15708, tmp15712, 136);
__m512 tmp15726 = _mm512_shuffle_f32x4(tmp15708, tmp15712, 221);
__m512 tmp15727 = _mm512_shuffle_f32x4(tmp15709, tmp15713, 136);
__m512 tmp15728 = _mm512_shuffle_f32x4(tmp15709, tmp15713, 221);
__m512 tmp15729 = _mm512_shuffle_f32x4(tmp15710, tmp15714, 136);
__m512 tmp15730 = _mm512_shuffle_f32x4(tmp15710, tmp15714, 221);
wt825 = _mm512_shuffle_f32x4(tmp15715, tmp15723, 136);
wt833 = _mm512_shuffle_f32x4(tmp15715, tmp15723, 221);
wt826 = _mm512_shuffle_f32x4(tmp15717, tmp15725, 136);
wt834 = _mm512_shuffle_f32x4(tmp15717, tmp15725, 221);
wt827 = _mm512_shuffle_f32x4(tmp15719, tmp15727, 136);
wt835 = _mm512_shuffle_f32x4(tmp15719, tmp15727, 221);
wt828 = _mm512_shuffle_f32x4(tmp15721, tmp15729, 136);
wt836 = _mm512_shuffle_f32x4(tmp15721, tmp15729, 221);
wt829 = _mm512_shuffle_f32x4(tmp15716, tmp15724, 136);
wt837 = _mm512_shuffle_f32x4(tmp15716, tmp15724, 221);
wt830 = _mm512_shuffle_f32x4(tmp15718, tmp15726, 136);
wt838 = _mm512_shuffle_f32x4(tmp15718, tmp15726, 221);
wt831 = _mm512_shuffle_f32x4(tmp15720, tmp15728, 136);
wt839 = _mm512_shuffle_f32x4(tmp15720, tmp15728, 221);
wt832 = _mm512_shuffle_f32x4(tmp15722, tmp15730, 136);
wt840 = _mm512_shuffle_f32x4(tmp15722, tmp15730, 221);
wt825 = _mm512_mul_ps(wt825, postMul69);
wt826 = _mm512_mul_ps(wt826, postMul69);
wt827 = _mm512_mul_ps(wt827, postMul69);
wt828 = _mm512_mul_ps(wt828, postMul69);
wt829 = _mm512_mul_ps(wt829, postMul69);
wt830 = _mm512_mul_ps(wt830, postMul69);
wt831 = _mm512_mul_ps(wt831, postMul69);
wt832 = _mm512_mul_ps(wt832, postMul69);
wt833 = _mm512_mul_ps(wt833, postMul69);
wt834 = _mm512_mul_ps(wt834, postMul69);
wt835 = _mm512_mul_ps(wt835, postMul69);
wt836 = _mm512_mul_ps(wt836, postMul69);
wt837 = _mm512_mul_ps(wt837, postMul69);
wt838 = _mm512_mul_ps(wt838, postMul69);
wt839 = _mm512_mul_ps(wt839, postMul69);
wt840 = _mm512_mul_ps(wt840, postMul69);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c65)+(ptrdiff_t)0, 63>>cut30, wt825);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c65)+(ptrdiff_t)0, 63>>cut30, wt826);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c65)+(ptrdiff_t)0, 63>>cut30, wt827);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c65)+(ptrdiff_t)0, 63>>cut30, wt828);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c65)+(ptrdiff_t)0, 63>>cut30, wt829);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c65)+(ptrdiff_t)0, 63>>cut30, wt830);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c65)+(ptrdiff_t)0, 63>>cut30, wt831);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c65)+(ptrdiff_t)0, 63>>cut30, wt832);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c65)+(ptrdiff_t)0, 63>>cut30, wt833);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c65)+(ptrdiff_t)0, 63>>cut30, wt834);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c65)+(ptrdiff_t)0, 63>>cut30, wt835);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c65)+(ptrdiff_t)0, 63>>cut30, wt836);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c65)+(ptrdiff_t)0, 63>>cut30, wt837);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c65)+(ptrdiff_t)0, 63>>cut30, wt838);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c65)+(ptrdiff_t)0, 63>>cut30, wt839);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c65)+(ptrdiff_t)0, 63>>cut30, wt840);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt825);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt826);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt827);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt828);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt829);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt830);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt831);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt832);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt833);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt834);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt835);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt836);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt837);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt838);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt839);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c65)+(ptrdiff_t)24576, 4032>>cut30, wt840);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt825);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt826);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt827);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt828);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt829);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt830);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt831);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt832);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt833);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt834);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt835);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt836);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt837);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt838);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt839);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c65)+(ptrdiff_t)49152, 258048>>cut30, wt840);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(1+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt825);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(2+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt826);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(3+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt827);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(4+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt828);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(5+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt829);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(6+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt830);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(7+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt831);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(8+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt832);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(9+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt833);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(10+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt834);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(11+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt835);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(12+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt836);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(13+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt837);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(14+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt838);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(15+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt839);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l79+4*cut30+24*(16+16*c65)+(ptrdiff_t)73728, 65535-(262143>>cut30), wt840);
}
}
}
} else {
ptrdiff_t k182 = 1008;
ptrdiff_t l78 = (size_t)(0+k182)/6;
ptrdiff_t cut29 = (size_t)(0+k182)%6;
__m512 sum663 = _mm512_maskz_loadu_ps(65535, biasPtr21+4096*i78+4*k182);
__m512i pmMul47 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd47 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo40 = _mm512_loadu_ps(bnPtr22+(ptrdiff_t)8*(k182+1024*i78));
__m512 masHi40 = _mm512_maskz_loadu_ps(65535, bnPtr22+(ptrdiff_t)8*(k182+1024*i78)+(ptrdiff_t)64);
__m512 postMul67 = _mm512_permutex2var_ps(masLo40, pmMul47, masHi40);
__m512 postAdd45 = _mm512_permutex2var_ps(masLo40, pmAdd47, masHi40);
sum663 = _mm512_fmadd_ps(sum663, postMul67, postAdd45);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*0+(ptrdiff_t)0, 63>>cut29, sum663);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*0+(ptrdiff_t)24576, 4032>>cut29, sum663);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*0+(ptrdiff_t)49152, 65535-(4095>>cut29), sum663);
ptrdiff_t c63 = 0;
for (; c63 != 64; ++c63) {
__m512 wt793 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)0);
__m512 wt794 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)4096);
__m512 wt795 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)8192);
__m512 wt796 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)12288);
__m512 wt797 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)16384);
__m512 wt798 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)20480);
__m512 wt799 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)24576);
__m512 wt800 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)28672);
__m512 wt801 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)32768);
__m512 wt802 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)36864);
__m512 wt803 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)40960);
__m512 wt804 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)45056);
__m512 wt805 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)49152);
__m512 wt806 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)53248);
__m512 wt807 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)57344);
__m512 wt808 = _mm512_maskz_loadu_ps(65535, wtPtr21+4194304*i78+4096*k182+64*c63+(ptrdiff_t)61440);
__m512 tmp15731 = _mm512_unpacklo_ps(wt793, wt794);
__m512 tmp15732 = _mm512_unpackhi_ps(wt793, wt794);
__m512 tmp15733 = _mm512_unpacklo_ps(wt795, wt796);
__m512 tmp15734 = _mm512_unpackhi_ps(wt795, wt796);
__m512 tmp15735 = _mm512_unpacklo_ps(wt797, wt798);
__m512 tmp15736 = _mm512_unpackhi_ps(wt797, wt798);
__m512 tmp15737 = _mm512_unpacklo_ps(wt799, wt800);
__m512 tmp15738 = _mm512_unpackhi_ps(wt799, wt800);
__m512 tmp15739 = _mm512_unpacklo_ps(wt801, wt802);
__m512 tmp15740 = _mm512_unpackhi_ps(wt801, wt802);
__m512 tmp15741 = _mm512_unpacklo_ps(wt803, wt804);
__m512 tmp15742 = _mm512_unpackhi_ps(wt803, wt804);
__m512 tmp15743 = _mm512_unpacklo_ps(wt805, wt806);
__m512 tmp15744 = _mm512_unpackhi_ps(wt805, wt806);
__m512 tmp15745 = _mm512_unpacklo_ps(wt807, wt808);
__m512 tmp15746 = _mm512_unpackhi_ps(wt807, wt808);
__m512 tmp15747 = _mm512_shuffle_ps(tmp15731, tmp15733, 68);
__m512 tmp15748 = _mm512_shuffle_ps(tmp15731, tmp15733, 238);
__m512 tmp15749 = _mm512_shuffle_ps(tmp15732, tmp15734, 68);
__m512 tmp15750 = _mm512_shuffle_ps(tmp15732, tmp15734, 238);
__m512 tmp15751 = _mm512_shuffle_ps(tmp15735, tmp15737, 68);
__m512 tmp15752 = _mm512_shuffle_ps(tmp15735, tmp15737, 238);
__m512 tmp15753 = _mm512_shuffle_ps(tmp15736, tmp15738, 68);
__m512 tmp15754 = _mm512_shuffle_ps(tmp15736, tmp15738, 238);
__m512 tmp15755 = _mm512_shuffle_ps(tmp15739, tmp15741, 68);
__m512 tmp15756 = _mm512_shuffle_ps(tmp15739, tmp15741, 238);
__m512 tmp15757 = _mm512_shuffle_ps(tmp15740, tmp15742, 68);
__m512 tmp15758 = _mm512_shuffle_ps(tmp15740, tmp15742, 238);
__m512 tmp15759 = _mm512_shuffle_ps(tmp15743, tmp15745, 68);
__m512 tmp15760 = _mm512_shuffle_ps(tmp15743, tmp15745, 238);
__m512 tmp15761 = _mm512_shuffle_ps(tmp15744, tmp15746, 68);
__m512 tmp15762 = _mm512_shuffle_ps(tmp15744, tmp15746, 238);
__m512 tmp15763 = _mm512_shuffle_f32x4(tmp15747, tmp15751, 136);
__m512 tmp15764 = _mm512_shuffle_f32x4(tmp15747, tmp15751, 221);
__m512 tmp15765 = _mm512_shuffle_f32x4(tmp15748, tmp15752, 136);
__m512 tmp15766 = _mm512_shuffle_f32x4(tmp15748, tmp15752, 221);
__m512 tmp15767 = _mm512_shuffle_f32x4(tmp15749, tmp15753, 136);
__m512 tmp15768 = _mm512_shuffle_f32x4(tmp15749, tmp15753, 221);
__m512 tmp15769 = _mm512_shuffle_f32x4(tmp15750, tmp15754, 136);
__m512 tmp15770 = _mm512_shuffle_f32x4(tmp15750, tmp15754, 221);
__m512 tmp15771 = _mm512_shuffle_f32x4(tmp15755, tmp15759, 136);
__m512 tmp15772 = _mm512_shuffle_f32x4(tmp15755, tmp15759, 221);
__m512 tmp15773 = _mm512_shuffle_f32x4(tmp15756, tmp15760, 136);
__m512 tmp15774 = _mm512_shuffle_f32x4(tmp15756, tmp15760, 221);
__m512 tmp15775 = _mm512_shuffle_f32x4(tmp15757, tmp15761, 136);
__m512 tmp15776 = _mm512_shuffle_f32x4(tmp15757, tmp15761, 221);
__m512 tmp15777 = _mm512_shuffle_f32x4(tmp15758, tmp15762, 136);
__m512 tmp15778 = _mm512_shuffle_f32x4(tmp15758, tmp15762, 221);
wt793 = _mm512_shuffle_f32x4(tmp15763, tmp15771, 136);
wt801 = _mm512_shuffle_f32x4(tmp15763, tmp15771, 221);
wt794 = _mm512_shuffle_f32x4(tmp15765, tmp15773, 136);
wt802 = _mm512_shuffle_f32x4(tmp15765, tmp15773, 221);
wt795 = _mm512_shuffle_f32x4(tmp15767, tmp15775, 136);
wt803 = _mm512_shuffle_f32x4(tmp15767, tmp15775, 221);
wt796 = _mm512_shuffle_f32x4(tmp15769, tmp15777, 136);
wt804 = _mm512_shuffle_f32x4(tmp15769, tmp15777, 221);
wt797 = _mm512_shuffle_f32x4(tmp15764, tmp15772, 136);
wt805 = _mm512_shuffle_f32x4(tmp15764, tmp15772, 221);
wt798 = _mm512_shuffle_f32x4(tmp15766, tmp15774, 136);
wt806 = _mm512_shuffle_f32x4(tmp15766, tmp15774, 221);
wt799 = _mm512_shuffle_f32x4(tmp15768, tmp15776, 136);
wt807 = _mm512_shuffle_f32x4(tmp15768, tmp15776, 221);
wt800 = _mm512_shuffle_f32x4(tmp15770, tmp15778, 136);
wt808 = _mm512_shuffle_f32x4(tmp15770, tmp15778, 221);
wt793 = _mm512_mul_ps(wt793, postMul67);
wt794 = _mm512_mul_ps(wt794, postMul67);
wt795 = _mm512_mul_ps(wt795, postMul67);
wt796 = _mm512_mul_ps(wt796, postMul67);
wt797 = _mm512_mul_ps(wt797, postMul67);
wt798 = _mm512_mul_ps(wt798, postMul67);
wt799 = _mm512_mul_ps(wt799, postMul67);
wt800 = _mm512_mul_ps(wt800, postMul67);
wt801 = _mm512_mul_ps(wt801, postMul67);
wt802 = _mm512_mul_ps(wt802, postMul67);
wt803 = _mm512_mul_ps(wt803, postMul67);
wt804 = _mm512_mul_ps(wt804, postMul67);
wt805 = _mm512_mul_ps(wt805, postMul67);
wt806 = _mm512_mul_ps(wt806, postMul67);
wt807 = _mm512_mul_ps(wt807, postMul67);
wt808 = _mm512_mul_ps(wt808, postMul67);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(1+16*c63)+(ptrdiff_t)0, 63>>cut29, wt793);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(2+16*c63)+(ptrdiff_t)0, 63>>cut29, wt794);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(3+16*c63)+(ptrdiff_t)0, 63>>cut29, wt795);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(4+16*c63)+(ptrdiff_t)0, 63>>cut29, wt796);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(5+16*c63)+(ptrdiff_t)0, 63>>cut29, wt797);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(6+16*c63)+(ptrdiff_t)0, 63>>cut29, wt798);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(7+16*c63)+(ptrdiff_t)0, 63>>cut29, wt799);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(8+16*c63)+(ptrdiff_t)0, 63>>cut29, wt800);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(9+16*c63)+(ptrdiff_t)0, 63>>cut29, wt801);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(10+16*c63)+(ptrdiff_t)0, 63>>cut29, wt802);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(11+16*c63)+(ptrdiff_t)0, 63>>cut29, wt803);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(12+16*c63)+(ptrdiff_t)0, 63>>cut29, wt804);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(13+16*c63)+(ptrdiff_t)0, 63>>cut29, wt805);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(14+16*c63)+(ptrdiff_t)0, 63>>cut29, wt806);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(15+16*c63)+(ptrdiff_t)0, 63>>cut29, wt807);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(16+16*c63)+(ptrdiff_t)0, 63>>cut29, wt808);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(1+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt793);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(2+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt794);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(3+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt795);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(4+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt796);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(5+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt797);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(6+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt798);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(7+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt799);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(8+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt800);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(9+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt801);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(10+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt802);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(11+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt803);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(12+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt804);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(13+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt805);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(14+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt806);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(15+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt807);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+24*(16+16*c63)+(ptrdiff_t)24576, 4032>>cut29, wt808);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(1+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt793);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(2+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt794);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(3+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt795);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(4+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt796);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(5+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt797);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(6+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt798);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(7+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt799);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(8+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt800);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(9+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt801);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(10+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt802);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(11+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt803);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(12+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt804);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(13+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt805);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(14+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt806);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(15+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt807);
_mm512_mask_storeu_ps(arranged25+4198400*i78+24600*l78+4*cut29+16*(16+16*c63)+(ptrdiff_t)49152, 65535-(4095>>cut29), wt808);
}
}
}
}
}

// Launches the weight-arrangement pass for layer 13.
//
// Builds a task descriptor that points at ResNeXt50OneArrangeWts13Callee1,
// carries the tensor pointer table unchanged, and declares a 3-dimensional
// hull of 64 x 1 x 1 before handing it to ResNeXt50ThreaderDo1.
//
// team79:     threader team passed straight through to ResNeXt50ThreaderDo1.
// tensors131: tensor pointer table; forwarded to the callee through any1.
//
// NOTE(review): presumably the threader invokes the callee once per hull
// point (64 invocations) -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50OneArrangeWts13(ResNeXt50ThreaderTeam1* team79, char** tensors131) {
	ResNeXt50ThreaderTask1 job;
	// Iteration space first ...
	job.nd1 = 3;
	job.hull1[0] = 64;
	job.hull1[1] = 1;
	job.hull1[2] = 1;
	// ... then the payload and the worker entry point.
	job.any1 = tensors131;
	job.callee1 = ResNeXt50OneArrangeWts13Callee1;
	ResNeXt50ThreaderDo1(team79, &job);
}

// Worker for the data-arrangement pass of layer 13.
//
// Copies one tile of the source tensor (tensors[0]) into the arranged buffer
// (tensors[1]), changing only the address layout:
//   source   : 832 bytes per row, 256 bytes per column block
//   arranged : 262144 bytes per column block, then 256 bytes per row for
//              column blocks other than 3, or 64 bytes per row for block 3
//
// task136: task descriptor; any1 holds the tensor pointer table.
// pt73:    hull coordinate. pt73[0] picks a run of 128 consecutive rows,
//          pt73[1] picks the column block.
//
// Column block 3 is the narrow one: only the low 4 float lanes (mask 15) are
// read and written. Every other column block moves four full 16-lane vectors.
//
// NOTE(review): presumably rows are channels and column blocks are 64-float
// spans of the flattened spatial plane -- confirm against the generator.
static void ResNeXt50OneArrangeDats13Callee1(ResNeXt50ThreaderTask1* task136, int64_t* pt73) {
	char** bufs = task136->any1;
	const ptrdiff_t slab = pt73[0];
	const ptrdiff_t col = pt73[1];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t rowBegin = 128*slab;
	const ptrdiff_t rowEnd = rowBegin+128;
	if (col == 3) {
		// Narrow trailing column block: 4 lanes per row, 64-byte row pitch in the arranged buffer.
		for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
			__m512 tail = _mm512_maskz_loadu_ps(15, src+256*col+832*row+(ptrdiff_t)0);
			_mm512_mask_storeu_ps(dst+262144*col+64*row+(ptrdiff_t)0, 15, tail);
		}
		return;
	}
	// Full-width column block: four 64-byte vectors per row, loaded together and then stored together.
	for (ptrdiff_t row = rowBegin; row < rowEnd; ++row) {
		const char* in = src+256*col+832*row;
		char* out = dst+262144*col+256*row;
		__m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
		__m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
		__m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
		__m512 v3 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)192);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
		_mm512_mask_storeu_ps(out+(ptrdiff_t)192, 65535, v3);
	}
}

// Launches the data-arrangement pass for layer 13.
//
// Builds a task descriptor that points at ResNeXt50OneArrangeDats13Callee1,
// carries the tensor pointer table unchanged, and declares a 4-dimensional
// hull of 8 x 4 x 1 x 1 before handing it to ResNeXt50ThreaderDo1.
//
// team80:     threader team passed straight through to ResNeXt50ThreaderDo1.
// tensors133: tensor pointer table; forwarded to the callee through any1.
//
// The callee reads hull coordinate 0 (range 8) as its 128-row slab index and
// coordinate 1 (range 4) as its column-block index.
static void ResNeXt50OneArrangeDats13(ResNeXt50ThreaderTeam1* team80, char** tensors133) {
	ResNeXt50ThreaderTask1 job;
	// Iteration space first ...
	job.nd1 = 4;
	job.hull1[0] = 8;
	job.hull1[1] = 4;
	job.hull1[2] = 1;
	job.hull1[3] = 1;
	// ... then the payload and the worker entry point.
	job.any1 = tensors133;
	job.callee1 = ResNeXt50OneArrangeDats13Callee1;
	ResNeXt50ThreaderDo1(team80, &job);
}

// Worker for the apply (multiply-accumulate) pass of layer 13.
//
// For one hull point it computes a tile of the output: a small group of output
// rows (6, or 4 for the last group) times one column block (64 floats, or
// 4 floats for column block 3). Each output value is
//   max(0, seed + sum over s in [0,1024) of weight[s] * data[s])
// where seed and weights come from the arranged-weights buffer and data comes
// from the arranged-data buffer.
//
// task138: task descriptor; any1 is a two-slot pointer array whose slot 0 is
//          the tensor pointer table (slot 1 is not read here).
// pt74:    hull coordinate. pt74[0] is the output-row group index (w75),
//          pt74[1] is the column-block index (d27).
//
// Tensor table layout as used below:
//   [0] arranged weights, [1] arranged data, [2] output tensor.
//
// Control flow note: the nested for-loops are scaffolding. Each one either
// runs its body once and then hits a `return`, or is skipped entirely so that
// the "remainder" code after it runs. The four resulting cases are:
//   d27 != 3, w75 != 170 : 6 rows x 4 vectors   (first k186 loop body)
//   d27 != 3, w75 == 170 : 4 rows x 4 vectors   (code after the k186 loop)
//   d27 == 3, w75 != 170 : 6 rows x 4 lanes     (k187 loop body)
//   d27 == 3, w75 == 170 : 4 rows x 4 lanes     (code after the k187 loop)
//
// NOTE(review): presumably rows are output channels (170*6+4 = 1024), s walks
// the 1024 input channels, and the seed is the bias (possibly with batch-norm
// folded in) -- confirm against the weight-arrangement pass.
static void ResNeXt50OneApply13Callee1(ResNeXt50ThreaderTask1* task138, int64_t* pt74) {
void** pair34 = task138->any1;
char** tensors136 = pair34[0];
// e38 and g42 are fixed at zero, so the base-pointer offsets below vanish.
ptrdiff_t e38 = 0;
ptrdiff_t g42 = 0;
ptrdiff_t d27 = pt74[1];
ptrdiff_t w75 = pt74[0];
char*restrict arrangedWts13 = tensors136[0]+3424256*e38+(ptrdiff_t)4198400*1*g42;
char*restrict arrangedDats13 = tensors136[1]+694720*e38+(ptrdiff_t)851968*1*g42;
char*restrict datPtr43 = tensors136[2]+(ptrdiff_t)851968*1*g42;
// Single-iteration outer loop (ii60 == 1).
ptrdiff_t ii60 = 1;
for (ptrdiff_t i80 = 0; i80 < ii60; ++i80) {
// j71 is the column block. The loop body handles full-width blocks (j71 != 3)
// and always returns before a second iteration.
ptrdiff_t j71 = 1*d27;
ptrdiff_t jj64 = j71+0;
for (; j71 != 3; ++j71) {
// k186 is the output-row group. Groups other than 170 hold 6 rows.
ptrdiff_t k186 = 1*w75;
ptrdiff_t kk66 = k186+0;
for (; k186 != 170; ++k186) {
// With s78 == -1 the offset 24*s78+24 is 0: the first six floats of this
// group's 24600-byte weight record seed the six row accumulators.
ptrdiff_t s78 = -1;
__m512 sum666 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)24));
__m512 sum670 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)28));
__m512 sum674 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)32));
__m512 sum678 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)36));
__m512 sum682 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)40));
__m512 sum686 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)44));
// Each row has four accumulators (one per 16-float vector of the column
// block); all four start from that row's seed.
__m512 sum667 = sum666;
__m512 sum668 = sum666;
__m512 sum669 = sum666;
__m512 sum671 = sum670;
__m512 sum672 = sum670;
__m512 sum673 = sum670;
__m512 sum675 = sum674;
__m512 sum676 = sum674;
__m512 sum677 = sum674;
__m512 sum679 = sum678;
__m512 sum680 = sum678;
__m512 sum681 = sum678;
__m512 sum683 = sum682;
__m512 sum684 = sum682;
__m512 sum685 = sum682;
__m512 sum687 = sum686;
__m512 sum688 = sum686;
__m512 sum689 = sum686;
// Reduction over s78: load four data vectors once, then for each of the six
// rows broadcast its scalar weight and fused-multiply-add into its accumulators.
for (s78 = 0; s78 < 1024; ++s78) {
__m512 dat2628 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s78+(ptrdiff_t)0);
__m512 dat2629 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s78+(ptrdiff_t)64);
__m512 dat2630 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s78+(ptrdiff_t)128);
__m512 dat2631 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s78+(ptrdiff_t)192);
__m512 wt841 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)24));
sum666 = _mm512_fmadd_ps(wt841, dat2628, sum666);
sum667 = _mm512_fmadd_ps(wt841, dat2629, sum667);
sum668 = _mm512_fmadd_ps(wt841, dat2630, sum668);
sum669 = _mm512_fmadd_ps(wt841, dat2631, sum669);
__m512 wt842 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)28));
sum670 = _mm512_fmadd_ps(wt842, dat2628, sum670);
sum671 = _mm512_fmadd_ps(wt842, dat2629, sum671);
sum672 = _mm512_fmadd_ps(wt842, dat2630, sum672);
sum673 = _mm512_fmadd_ps(wt842, dat2631, sum673);
__m512 wt843 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)32));
sum674 = _mm512_fmadd_ps(wt843, dat2628, sum674);
sum675 = _mm512_fmadd_ps(wt843, dat2629, sum675);
sum676 = _mm512_fmadd_ps(wt843, dat2630, sum676);
sum677 = _mm512_fmadd_ps(wt843, dat2631, sum677);
__m512 wt844 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)36));
sum678 = _mm512_fmadd_ps(wt844, dat2628, sum678);
sum679 = _mm512_fmadd_ps(wt844, dat2629, sum679);
sum680 = _mm512_fmadd_ps(wt844, dat2630, sum680);
sum681 = _mm512_fmadd_ps(wt844, dat2631, sum681);
__m512 wt845 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)40));
sum682 = _mm512_fmadd_ps(wt845, dat2628, sum682);
sum683 = _mm512_fmadd_ps(wt845, dat2629, sum683);
sum684 = _mm512_fmadd_ps(wt845, dat2630, sum684);
sum685 = _mm512_fmadd_ps(wt845, dat2631, sum685);
__m512 wt846 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+24*s78+(ptrdiff_t)44));
sum686 = _mm512_fmadd_ps(wt846, dat2628, sum686);
sum687 = _mm512_fmadd_ps(wt846, dat2629, sum687);
sum688 = _mm512_fmadd_ps(wt846, dat2630, sum688);
sum689 = _mm512_fmadd_ps(wt846, dat2631, sum689);
}
// Clamp negatives to zero (ReLU) and store. Output rows are 832 bytes apart,
// so the six rows of a group span 4992 bytes; mask 65535 writes all 16 lanes.
sum666 = _mm512_max_ps(_mm512_setzero_ps(), sum666);
sum667 = _mm512_max_ps(_mm512_setzero_ps(), sum667);
sum668 = _mm512_max_ps(_mm512_setzero_ps(), sum668);
sum669 = _mm512_max_ps(_mm512_setzero_ps(), sum669);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)0, 65535, sum666);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)64, 65535, sum667);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)128, 65535, sum668);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)192, 65535, sum669);
sum670 = _mm512_max_ps(_mm512_setzero_ps(), sum670);
sum671 = _mm512_max_ps(_mm512_setzero_ps(), sum671);
sum672 = _mm512_max_ps(_mm512_setzero_ps(), sum672);
sum673 = _mm512_max_ps(_mm512_setzero_ps(), sum673);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)832, 65535, sum670);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)896, 65535, sum671);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)960, 65535, sum672);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1024, 65535, sum673);
sum674 = _mm512_max_ps(_mm512_setzero_ps(), sum674);
sum675 = _mm512_max_ps(_mm512_setzero_ps(), sum675);
sum676 = _mm512_max_ps(_mm512_setzero_ps(), sum676);
sum677 = _mm512_max_ps(_mm512_setzero_ps(), sum677);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1664, 65535, sum674);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1728, 65535, sum675);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1792, 65535, sum676);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1856, 65535, sum677);
sum678 = _mm512_max_ps(_mm512_setzero_ps(), sum678);
sum679 = _mm512_max_ps(_mm512_setzero_ps(), sum679);
sum680 = _mm512_max_ps(_mm512_setzero_ps(), sum680);
sum681 = _mm512_max_ps(_mm512_setzero_ps(), sum681);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2496, 65535, sum678);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2560, 65535, sum679);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2624, 65535, sum680);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2688, 65535, sum681);
sum682 = _mm512_max_ps(_mm512_setzero_ps(), sum682);
sum683 = _mm512_max_ps(_mm512_setzero_ps(), sum683);
sum684 = _mm512_max_ps(_mm512_setzero_ps(), sum684);
sum685 = _mm512_max_ps(_mm512_setzero_ps(), sum685);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)3328, 65535, sum682);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)3392, 65535, sum683);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)3456, 65535, sum684);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)3520, 65535, sum685);
sum686 = _mm512_max_ps(_mm512_setzero_ps(), sum686);
sum687 = _mm512_max_ps(_mm512_setzero_ps(), sum687);
sum688 = _mm512_max_ps(_mm512_setzero_ps(), sum688);
sum689 = _mm512_max_ps(_mm512_setzero_ps(), sum689);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)4160, 65535, sum686);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)4224, 65535, sum687);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)4288, 65535, sum688);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)4352, 65535, sum689);
// kk66 equals the starting k186, so this always fires: one group per call.
if (k186 >= kk66) return;
}
// Reached only when k186 == 170 (the k186 loop was skipped): the last row
// group holds 4 rows, so its weight record uses a 16-byte step per s79
// instead of 24, and the seeds are the first four floats of the record.
ptrdiff_t s79 = -1;
__m512 sum690 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)16));
__m512 sum694 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)20));
__m512 sum698 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)24));
__m512 sum702 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)28));
__m512 sum691 = sum690;
__m512 sum692 = sum690;
__m512 sum693 = sum690;
__m512 sum695 = sum694;
__m512 sum696 = sum694;
__m512 sum697 = sum694;
__m512 sum699 = sum698;
__m512 sum700 = sum698;
__m512 sum701 = sum698;
__m512 sum703 = sum702;
__m512 sum704 = sum702;
__m512 sum705 = sum702;
// Same reduction as above, for four rows.
for (s79 = 0; s79 < 1024; ++s79) {
__m512 dat2632 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s79+(ptrdiff_t)0);
__m512 dat2633 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s79+(ptrdiff_t)64);
__m512 dat2634 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s79+(ptrdiff_t)128);
__m512 dat2635 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+256*s79+(ptrdiff_t)192);
__m512 wt847 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)16));
sum690 = _mm512_fmadd_ps(wt847, dat2632, sum690);
sum691 = _mm512_fmadd_ps(wt847, dat2633, sum691);
sum692 = _mm512_fmadd_ps(wt847, dat2634, sum692);
sum693 = _mm512_fmadd_ps(wt847, dat2635, sum693);
__m512 wt848 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)20));
sum694 = _mm512_fmadd_ps(wt848, dat2632, sum694);
sum695 = _mm512_fmadd_ps(wt848, dat2633, sum695);
sum696 = _mm512_fmadd_ps(wt848, dat2634, sum696);
sum697 = _mm512_fmadd_ps(wt848, dat2635, sum697);
__m512 wt849 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)24));
sum698 = _mm512_fmadd_ps(wt849, dat2632, sum698);
sum699 = _mm512_fmadd_ps(wt849, dat2633, sum699);
sum700 = _mm512_fmadd_ps(wt849, dat2634, sum700);
sum701 = _mm512_fmadd_ps(wt849, dat2635, sum701);
__m512 wt850 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k186+16*s79+(ptrdiff_t)28));
sum702 = _mm512_fmadd_ps(wt850, dat2632, sum702);
sum703 = _mm512_fmadd_ps(wt850, dat2633, sum703);
sum704 = _mm512_fmadd_ps(wt850, dat2634, sum704);
sum705 = _mm512_fmadd_ps(wt850, dat2635, sum705);
}
// ReLU and store for the four rows of the last group.
sum690 = _mm512_max_ps(_mm512_setzero_ps(), sum690);
sum691 = _mm512_max_ps(_mm512_setzero_ps(), sum691);
sum692 = _mm512_max_ps(_mm512_setzero_ps(), sum692);
sum693 = _mm512_max_ps(_mm512_setzero_ps(), sum693);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)0, 65535, sum690);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)64, 65535, sum691);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)128, 65535, sum692);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)192, 65535, sum693);
sum694 = _mm512_max_ps(_mm512_setzero_ps(), sum694);
sum695 = _mm512_max_ps(_mm512_setzero_ps(), sum695);
sum696 = _mm512_max_ps(_mm512_setzero_ps(), sum696);
sum697 = _mm512_max_ps(_mm512_setzero_ps(), sum697);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)832, 65535, sum694);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)896, 65535, sum695);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)960, 65535, sum696);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1024, 65535, sum697);
sum698 = _mm512_max_ps(_mm512_setzero_ps(), sum698);
sum699 = _mm512_max_ps(_mm512_setzero_ps(), sum699);
sum700 = _mm512_max_ps(_mm512_setzero_ps(), sum700);
sum701 = _mm512_max_ps(_mm512_setzero_ps(), sum701);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1664, 65535, sum698);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1728, 65535, sum699);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1792, 65535, sum700);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)1856, 65535, sum701);
sum702 = _mm512_max_ps(_mm512_setzero_ps(), sum702);
sum703 = _mm512_max_ps(_mm512_setzero_ps(), sum703);
sum704 = _mm512_max_ps(_mm512_setzero_ps(), sum704);
sum705 = _mm512_max_ps(_mm512_setzero_ps(), sum705);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2496, 65535, sum702);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2560, 65535, sum703);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2624, 65535, sum704);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k186+(ptrdiff_t)2688, 65535, sum705);
// jj64 equals the starting j71, so this always fires: one column block per call.
if (j71 >= jj64) return;
}
// Reached only when j71 == 3 (the j71 loop was skipped): the narrow column
// block. The arranged data uses a 64-byte step per s here, one vector is
// loaded per step, and only the low 4 lanes (mask 15) are stored.
ptrdiff_t k187 = 1*w75;
ptrdiff_t kk67 = k187+0;
for (; k187 != 170; ++k187) {
// Six-row group: seeds from the first six floats of the weight record.
ptrdiff_t s80 = -1;
__m512 sum706 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)24));
__m512 sum707 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)28));
__m512 sum708 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)32));
__m512 sum709 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)36));
__m512 sum710 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)40));
__m512 sum711 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)44));
for (s80 = 0; s80 < 1024; ++s80) {
__m512 dat2636 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+64*s80+(ptrdiff_t)0);
__m512 wt851 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)24));
sum706 = _mm512_fmadd_ps(wt851, dat2636, sum706);
__m512 wt852 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)28));
sum707 = _mm512_fmadd_ps(wt852, dat2636, sum707);
__m512 wt853 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)32));
sum708 = _mm512_fmadd_ps(wt853, dat2636, sum708);
__m512 wt854 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)36));
sum709 = _mm512_fmadd_ps(wt854, dat2636, sum709);
__m512 wt855 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)40));
sum710 = _mm512_fmadd_ps(wt855, dat2636, sum710);
__m512 wt856 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+24*s80+(ptrdiff_t)44));
sum711 = _mm512_fmadd_ps(wt856, dat2636, sum711);
}
// ReLU, then store the low 4 lanes of each row.
sum706 = _mm512_max_ps(_mm512_setzero_ps(), sum706);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)0, 15, sum706);
sum707 = _mm512_max_ps(_mm512_setzero_ps(), sum707);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)832, 15, sum707);
sum708 = _mm512_max_ps(_mm512_setzero_ps(), sum708);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)1664, 15, sum708);
sum709 = _mm512_max_ps(_mm512_setzero_ps(), sum709);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)2496, 15, sum709);
sum710 = _mm512_max_ps(_mm512_setzero_ps(), sum710);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)3328, 15, sum710);
sum711 = _mm512_max_ps(_mm512_setzero_ps(), sum711);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)4160, 15, sum711);
// kk67 equals the starting k187, so this always fires.
if (k187 >= kk67) return;
}
// Reached only when j71 == 3 and k187 == 170: last (4-row) group on the
// narrow column block.
ptrdiff_t s81 = -1;
__m512 sum712 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)16));
__m512 sum713 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)20));
__m512 sum714 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)24));
__m512 sum715 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)28));
for (s81 = 0; s81 < 1024; ++s81) {
__m512 dat2637 = _mm512_loadu_ps(arrangedDats13+851968*i80+262144*j71+64*s81+(ptrdiff_t)0);
__m512 wt857 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)16));
sum712 = _mm512_fmadd_ps(wt857, dat2637, sum712);
__m512 wt858 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)20));
sum713 = _mm512_fmadd_ps(wt858, dat2637, sum713);
__m512 wt859 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)24));
sum714 = _mm512_fmadd_ps(wt859, dat2637, sum714);
__m512 wt860 = _mm512_set1_ps(*(float*)(arrangedWts13+4198400*i80+24600*k187+16*s81+(ptrdiff_t)28));
sum715 = _mm512_fmadd_ps(wt860, dat2637, sum715);
}
sum712 = _mm512_max_ps(_mm512_setzero_ps(), sum712);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)0, 15, sum712);
sum713 = _mm512_max_ps(_mm512_setzero_ps(), sum713);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)832, 15, sum713);
sum714 = _mm512_max_ps(_mm512_setzero_ps(), sum714);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)1664, 15, sum714);
sum715 = _mm512_max_ps(_mm512_setzero_ps(), sum715);
_mm512_mask_storeu_ps(datPtr43+851968*i80+256*j71+4992*k187+(ptrdiff_t)2496, 15, sum715);
}
}

// Launches the apply (multiply-accumulate) pass for layer 13.
//
// Wraps the tensor pointer table in a two-slot pointer array (slot 1 is a
// null pointer), points a task descriptor at ResNeXt50OneApply13Callee1, and
// declares a 3-dimensional hull of 171 x 4 x 1 before handing it to
// ResNeXt50ThreaderDo1.
//
// team81:     threader team passed straight through to ResNeXt50ThreaderDo1.
// tensors135: tensor pointer table; the callee reads it from slot 0 of any1.
//
// The callee reads hull coordinate 0 (range 171) as its output-row group
// index and coordinate 1 (range 4) as its column-block index.
static void ResNeXt50OneApply13(ResNeXt50ThreaderTeam1* team81, char** tensors135) {
	void* ctx[2];
	ctx[0] = tensors135;
	ctx[1] = 0;
	ResNeXt50ThreaderTask1 job;
	// Iteration space first ...
	job.nd1 = 3;
	job.hull1[0] = 171;
	job.hull1[1] = 4;
	job.hull1[2] = 1;
	// ... then the payload and the worker entry point.
	job.any1 = ctx;
	job.callee1 = ResNeXt50OneApply13Callee1;
	ResNeXt50ThreaderDo1(team81, &job);
}

static void ResNeXt50OneArrangeWts14Callee1(ResNeXt50ThreaderTask1* task148, int64_t* pt79) {
char** tensors146 = task148->any1;
ptrdiff_t b90 = pt79[0];
char*restrict wtPtr23 = tensors146[0]+(ptrdiff_t)3340*0+(ptrdiff_t)8388608*0;
char*restrict biasPtr23 = tensors146[1]+(ptrdiff_t)8192*0;
char*restrict bnPtr24 = tensors146[2]+(ptrdiff_t)8*2048*0;
char*restrict arranged27 = tensors146[3]+(ptrdiff_t)6848512*0+(ptrdiff_t)8396800*0;
ptrdiff_t ii63 = 1;
for (ptrdiff_t i86 = 0; i86 < ii63; ++i86) {
ptrdiff_t j77 = 1*b90;
ptrdiff_t jj68 = j77+1;
for (; j77 < jj68; ++j77) {
if (j77 < 127) {
ptrdiff_t k196 = 0+16*(j77-0);
ptrdiff_t l85 = (size_t)(0+k196)/6;
ptrdiff_t cut32 = (size_t)(0+k196)%6;
switch (cut32) {
case 0:;
case 2: {
__m512 sum717 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i86+4*k196);
__m512i pmMul49 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd49 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo41 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k196+2048*i86));
__m512 masHi41 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k196+2048*i86)+(ptrdiff_t)64);
__m512 postMul74 = _mm512_permutex2var_ps(masLo41, pmMul49, masHi41);
__m512 postAdd50 = _mm512_permutex2var_ps(masLo41, pmAdd49, masHi41);
sum717 = _mm512_fmadd_ps(sum717, postMul74, postAdd50);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)0, 63>>cut32, sum717);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)24576, 4032>>cut32, sum717);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)49152, 65535-(4095>>cut32), sum717);
ptrdiff_t c71 = 0;
for (; c71 != 64; ++c71) {
__m512 wt883 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)0);
__m512 wt884 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)4096);
__m512 wt885 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)8192);
__m512 wt886 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)12288);
__m512 wt887 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)16384);
__m512 wt888 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)20480);
__m512 wt889 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)24576);
__m512 wt890 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)28672);
__m512 wt891 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)32768);
__m512 wt892 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)36864);
__m512 wt893 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)40960);
__m512 wt894 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)45056);
__m512 wt895 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)49152);
__m512 wt896 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)53248);
__m512 wt897 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)57344);
__m512 wt898 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c71+(ptrdiff_t)61440);
__m512 tmp15779 = _mm512_unpacklo_ps(wt883, wt884);
__m512 tmp15780 = _mm512_unpackhi_ps(wt883, wt884);
__m512 tmp15781 = _mm512_unpacklo_ps(wt885, wt886);
__m512 tmp15782 = _mm512_unpackhi_ps(wt885, wt886);
__m512 tmp15783 = _mm512_unpacklo_ps(wt887, wt888);
__m512 tmp15784 = _mm512_unpackhi_ps(wt887, wt888);
__m512 tmp15785 = _mm512_unpacklo_ps(wt889, wt890);
__m512 tmp15786 = _mm512_unpackhi_ps(wt889, wt890);
__m512 tmp15787 = _mm512_unpacklo_ps(wt891, wt892);
__m512 tmp15788 = _mm512_unpackhi_ps(wt891, wt892);
__m512 tmp15789 = _mm512_unpacklo_ps(wt893, wt894);
__m512 tmp15790 = _mm512_unpackhi_ps(wt893, wt894);
__m512 tmp15791 = _mm512_unpacklo_ps(wt895, wt896);
__m512 tmp15792 = _mm512_unpackhi_ps(wt895, wt896);
__m512 tmp15793 = _mm512_unpacklo_ps(wt897, wt898);
__m512 tmp15794 = _mm512_unpackhi_ps(wt897, wt898);
__m512 tmp15795 = _mm512_shuffle_ps(tmp15779, tmp15781, 68);
__m512 tmp15796 = _mm512_shuffle_ps(tmp15779, tmp15781, 238);
__m512 tmp15797 = _mm512_shuffle_ps(tmp15780, tmp15782, 68);
__m512 tmp15798 = _mm512_shuffle_ps(tmp15780, tmp15782, 238);
__m512 tmp15799 = _mm512_shuffle_ps(tmp15783, tmp15785, 68);
__m512 tmp15800 = _mm512_shuffle_ps(tmp15783, tmp15785, 238);
__m512 tmp15801 = _mm512_shuffle_ps(tmp15784, tmp15786, 68);
__m512 tmp15802 = _mm512_shuffle_ps(tmp15784, tmp15786, 238);
__m512 tmp15803 = _mm512_shuffle_ps(tmp15787, tmp15789, 68);
__m512 tmp15804 = _mm512_shuffle_ps(tmp15787, tmp15789, 238);
__m512 tmp15805 = _mm512_shuffle_ps(tmp15788, tmp15790, 68);
__m512 tmp15806 = _mm512_shuffle_ps(tmp15788, tmp15790, 238);
__m512 tmp15807 = _mm512_shuffle_ps(tmp15791, tmp15793, 68);
__m512 tmp15808 = _mm512_shuffle_ps(tmp15791, tmp15793, 238);
__m512 tmp15809 = _mm512_shuffle_ps(tmp15792, tmp15794, 68);
__m512 tmp15810 = _mm512_shuffle_ps(tmp15792, tmp15794, 238);
__m512 tmp15811 = _mm512_shuffle_f32x4(tmp15795, tmp15799, 136);
__m512 tmp15812 = _mm512_shuffle_f32x4(tmp15795, tmp15799, 221);
__m512 tmp15813 = _mm512_shuffle_f32x4(tmp15796, tmp15800, 136);
__m512 tmp15814 = _mm512_shuffle_f32x4(tmp15796, tmp15800, 221);
__m512 tmp15815 = _mm512_shuffle_f32x4(tmp15797, tmp15801, 136);
__m512 tmp15816 = _mm512_shuffle_f32x4(tmp15797, tmp15801, 221);
__m512 tmp15817 = _mm512_shuffle_f32x4(tmp15798, tmp15802, 136);
__m512 tmp15818 = _mm512_shuffle_f32x4(tmp15798, tmp15802, 221);
__m512 tmp15819 = _mm512_shuffle_f32x4(tmp15803, tmp15807, 136);
__m512 tmp15820 = _mm512_shuffle_f32x4(tmp15803, tmp15807, 221);
__m512 tmp15821 = _mm512_shuffle_f32x4(tmp15804, tmp15808, 136);
__m512 tmp15822 = _mm512_shuffle_f32x4(tmp15804, tmp15808, 221);
__m512 tmp15823 = _mm512_shuffle_f32x4(tmp15805, tmp15809, 136);
__m512 tmp15824 = _mm512_shuffle_f32x4(tmp15805, tmp15809, 221);
__m512 tmp15825 = _mm512_shuffle_f32x4(tmp15806, tmp15810, 136);
__m512 tmp15826 = _mm512_shuffle_f32x4(tmp15806, tmp15810, 221);
wt883 = _mm512_shuffle_f32x4(tmp15811, tmp15819, 136);
wt891 = _mm512_shuffle_f32x4(tmp15811, tmp15819, 221);
wt884 = _mm512_shuffle_f32x4(tmp15813, tmp15821, 136);
wt892 = _mm512_shuffle_f32x4(tmp15813, tmp15821, 221);
wt885 = _mm512_shuffle_f32x4(tmp15815, tmp15823, 136);
wt893 = _mm512_shuffle_f32x4(tmp15815, tmp15823, 221);
wt886 = _mm512_shuffle_f32x4(tmp15817, tmp15825, 136);
wt894 = _mm512_shuffle_f32x4(tmp15817, tmp15825, 221);
wt887 = _mm512_shuffle_f32x4(tmp15812, tmp15820, 136);
wt895 = _mm512_shuffle_f32x4(tmp15812, tmp15820, 221);
wt888 = _mm512_shuffle_f32x4(tmp15814, tmp15822, 136);
wt896 = _mm512_shuffle_f32x4(tmp15814, tmp15822, 221);
wt889 = _mm512_shuffle_f32x4(tmp15816, tmp15824, 136);
wt897 = _mm512_shuffle_f32x4(tmp15816, tmp15824, 221);
wt890 = _mm512_shuffle_f32x4(tmp15818, tmp15826, 136);
wt898 = _mm512_shuffle_f32x4(tmp15818, tmp15826, 221);
wt883 = _mm512_mul_ps(wt883, postMul74);
wt884 = _mm512_mul_ps(wt884, postMul74);
wt885 = _mm512_mul_ps(wt885, postMul74);
wt886 = _mm512_mul_ps(wt886, postMul74);
wt887 = _mm512_mul_ps(wt887, postMul74);
wt888 = _mm512_mul_ps(wt888, postMul74);
wt889 = _mm512_mul_ps(wt889, postMul74);
wt890 = _mm512_mul_ps(wt890, postMul74);
wt891 = _mm512_mul_ps(wt891, postMul74);
wt892 = _mm512_mul_ps(wt892, postMul74);
wt893 = _mm512_mul_ps(wt893, postMul74);
wt894 = _mm512_mul_ps(wt894, postMul74);
wt895 = _mm512_mul_ps(wt895, postMul74);
wt896 = _mm512_mul_ps(wt896, postMul74);
wt897 = _mm512_mul_ps(wt897, postMul74);
wt898 = _mm512_mul_ps(wt898, postMul74);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c71)+(ptrdiff_t)0, 63>>cut32, wt883);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c71)+(ptrdiff_t)0, 63>>cut32, wt884);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c71)+(ptrdiff_t)0, 63>>cut32, wt885);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c71)+(ptrdiff_t)0, 63>>cut32, wt886);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c71)+(ptrdiff_t)0, 63>>cut32, wt887);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c71)+(ptrdiff_t)0, 63>>cut32, wt888);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c71)+(ptrdiff_t)0, 63>>cut32, wt889);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c71)+(ptrdiff_t)0, 63>>cut32, wt890);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c71)+(ptrdiff_t)0, 63>>cut32, wt891);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c71)+(ptrdiff_t)0, 63>>cut32, wt892);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c71)+(ptrdiff_t)0, 63>>cut32, wt893);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c71)+(ptrdiff_t)0, 63>>cut32, wt894);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c71)+(ptrdiff_t)0, 63>>cut32, wt895);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c71)+(ptrdiff_t)0, 63>>cut32, wt896);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c71)+(ptrdiff_t)0, 63>>cut32, wt897);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c71)+(ptrdiff_t)0, 63>>cut32, wt898);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt883);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt884);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt885);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt886);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt887);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt888);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt889);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt890);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt891);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt892);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt893);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt894);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt895);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt896);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt897);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c71)+(ptrdiff_t)24576, 4032>>cut32, wt898);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt883);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt884);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt885);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt886);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt887);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt888);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt889);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt890);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt891);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt892);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt893);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt894);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt895);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt896);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt897);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c71)+(ptrdiff_t)49152, 65535-(4095>>cut32), wt898);
}
break;
}
default: {
cut32 = 4;
__m512 sum718 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i86+4*k196);
__m512i pmMul50 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd50 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo42 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k196+2048*i86));
__m512 masHi42 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k196+2048*i86)+(ptrdiff_t)64);
__m512 postMul75 = _mm512_permutex2var_ps(masLo42, pmMul50, masHi42);
__m512 postAdd51 = _mm512_permutex2var_ps(masLo42, pmAdd50, masHi42);
sum718 = _mm512_fmadd_ps(sum718, postMul75, postAdd51);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)0, 63>>cut32, sum718);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)24576, 4032>>cut32, sum718);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)49152, 258048>>cut32, sum718);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*0+(ptrdiff_t)73728, 65535-(262143>>cut32), sum718);
ptrdiff_t c72 = 0;
for (; c72 != 64; ++c72) {
__m512 wt899 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)0);
__m512 wt900 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)4096);
__m512 wt901 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)8192);
__m512 wt902 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)12288);
__m512 wt903 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)16384);
__m512 wt904 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)20480);
__m512 wt905 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)24576);
__m512 wt906 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)28672);
__m512 wt907 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)32768);
__m512 wt908 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)36864);
__m512 wt909 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)40960);
__m512 wt910 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)45056);
__m512 wt911 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)49152);
__m512 wt912 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)53248);
__m512 wt913 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)57344);
__m512 wt914 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k196+64*c72+(ptrdiff_t)61440);
__m512 tmp15827 = _mm512_unpacklo_ps(wt899, wt900);
__m512 tmp15828 = _mm512_unpackhi_ps(wt899, wt900);
__m512 tmp15829 = _mm512_unpacklo_ps(wt901, wt902);
__m512 tmp15830 = _mm512_unpackhi_ps(wt901, wt902);
__m512 tmp15831 = _mm512_unpacklo_ps(wt903, wt904);
__m512 tmp15832 = _mm512_unpackhi_ps(wt903, wt904);
__m512 tmp15833 = _mm512_unpacklo_ps(wt905, wt906);
__m512 tmp15834 = _mm512_unpackhi_ps(wt905, wt906);
__m512 tmp15835 = _mm512_unpacklo_ps(wt907, wt908);
__m512 tmp15836 = _mm512_unpackhi_ps(wt907, wt908);
__m512 tmp15837 = _mm512_unpacklo_ps(wt909, wt910);
__m512 tmp15838 = _mm512_unpackhi_ps(wt909, wt910);
__m512 tmp15839 = _mm512_unpacklo_ps(wt911, wt912);
__m512 tmp15840 = _mm512_unpackhi_ps(wt911, wt912);
__m512 tmp15841 = _mm512_unpacklo_ps(wt913, wt914);
__m512 tmp15842 = _mm512_unpackhi_ps(wt913, wt914);
__m512 tmp15843 = _mm512_shuffle_ps(tmp15827, tmp15829, 68);
__m512 tmp15844 = _mm512_shuffle_ps(tmp15827, tmp15829, 238);
__m512 tmp15845 = _mm512_shuffle_ps(tmp15828, tmp15830, 68);
__m512 tmp15846 = _mm512_shuffle_ps(tmp15828, tmp15830, 238);
__m512 tmp15847 = _mm512_shuffle_ps(tmp15831, tmp15833, 68);
__m512 tmp15848 = _mm512_shuffle_ps(tmp15831, tmp15833, 238);
__m512 tmp15849 = _mm512_shuffle_ps(tmp15832, tmp15834, 68);
__m512 tmp15850 = _mm512_shuffle_ps(tmp15832, tmp15834, 238);
__m512 tmp15851 = _mm512_shuffle_ps(tmp15835, tmp15837, 68);
__m512 tmp15852 = _mm512_shuffle_ps(tmp15835, tmp15837, 238);
__m512 tmp15853 = _mm512_shuffle_ps(tmp15836, tmp15838, 68);
__m512 tmp15854 = _mm512_shuffle_ps(tmp15836, tmp15838, 238);
__m512 tmp15855 = _mm512_shuffle_ps(tmp15839, tmp15841, 68);
__m512 tmp15856 = _mm512_shuffle_ps(tmp15839, tmp15841, 238);
__m512 tmp15857 = _mm512_shuffle_ps(tmp15840, tmp15842, 68);
__m512 tmp15858 = _mm512_shuffle_ps(tmp15840, tmp15842, 238);
__m512 tmp15859 = _mm512_shuffle_f32x4(tmp15843, tmp15847, 136);
__m512 tmp15860 = _mm512_shuffle_f32x4(tmp15843, tmp15847, 221);
__m512 tmp15861 = _mm512_shuffle_f32x4(tmp15844, tmp15848, 136);
__m512 tmp15862 = _mm512_shuffle_f32x4(tmp15844, tmp15848, 221);
__m512 tmp15863 = _mm512_shuffle_f32x4(tmp15845, tmp15849, 136);
__m512 tmp15864 = _mm512_shuffle_f32x4(tmp15845, tmp15849, 221);
__m512 tmp15865 = _mm512_shuffle_f32x4(tmp15846, tmp15850, 136);
__m512 tmp15866 = _mm512_shuffle_f32x4(tmp15846, tmp15850, 221);
__m512 tmp15867 = _mm512_shuffle_f32x4(tmp15851, tmp15855, 136);
__m512 tmp15868 = _mm512_shuffle_f32x4(tmp15851, tmp15855, 221);
__m512 tmp15869 = _mm512_shuffle_f32x4(tmp15852, tmp15856, 136);
__m512 tmp15870 = _mm512_shuffle_f32x4(tmp15852, tmp15856, 221);
__m512 tmp15871 = _mm512_shuffle_f32x4(tmp15853, tmp15857, 136);
__m512 tmp15872 = _mm512_shuffle_f32x4(tmp15853, tmp15857, 221);
__m512 tmp15873 = _mm512_shuffle_f32x4(tmp15854, tmp15858, 136);
__m512 tmp15874 = _mm512_shuffle_f32x4(tmp15854, tmp15858, 221);
wt899 = _mm512_shuffle_f32x4(tmp15859, tmp15867, 136);
wt907 = _mm512_shuffle_f32x4(tmp15859, tmp15867, 221);
wt900 = _mm512_shuffle_f32x4(tmp15861, tmp15869, 136);
wt908 = _mm512_shuffle_f32x4(tmp15861, tmp15869, 221);
wt901 = _mm512_shuffle_f32x4(tmp15863, tmp15871, 136);
wt909 = _mm512_shuffle_f32x4(tmp15863, tmp15871, 221);
wt902 = _mm512_shuffle_f32x4(tmp15865, tmp15873, 136);
wt910 = _mm512_shuffle_f32x4(tmp15865, tmp15873, 221);
wt903 = _mm512_shuffle_f32x4(tmp15860, tmp15868, 136);
wt911 = _mm512_shuffle_f32x4(tmp15860, tmp15868, 221);
wt904 = _mm512_shuffle_f32x4(tmp15862, tmp15870, 136);
wt912 = _mm512_shuffle_f32x4(tmp15862, tmp15870, 221);
wt905 = _mm512_shuffle_f32x4(tmp15864, tmp15872, 136);
wt913 = _mm512_shuffle_f32x4(tmp15864, tmp15872, 221);
wt906 = _mm512_shuffle_f32x4(tmp15866, tmp15874, 136);
wt914 = _mm512_shuffle_f32x4(tmp15866, tmp15874, 221);
wt899 = _mm512_mul_ps(wt899, postMul75);
wt900 = _mm512_mul_ps(wt900, postMul75);
wt901 = _mm512_mul_ps(wt901, postMul75);
wt902 = _mm512_mul_ps(wt902, postMul75);
wt903 = _mm512_mul_ps(wt903, postMul75);
wt904 = _mm512_mul_ps(wt904, postMul75);
wt905 = _mm512_mul_ps(wt905, postMul75);
wt906 = _mm512_mul_ps(wt906, postMul75);
wt907 = _mm512_mul_ps(wt907, postMul75);
wt908 = _mm512_mul_ps(wt908, postMul75);
wt909 = _mm512_mul_ps(wt909, postMul75);
wt910 = _mm512_mul_ps(wt910, postMul75);
wt911 = _mm512_mul_ps(wt911, postMul75);
wt912 = _mm512_mul_ps(wt912, postMul75);
wt913 = _mm512_mul_ps(wt913, postMul75);
wt914 = _mm512_mul_ps(wt914, postMul75);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c72)+(ptrdiff_t)0, 63>>cut32, wt899);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c72)+(ptrdiff_t)0, 63>>cut32, wt900);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c72)+(ptrdiff_t)0, 63>>cut32, wt901);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c72)+(ptrdiff_t)0, 63>>cut32, wt902);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c72)+(ptrdiff_t)0, 63>>cut32, wt903);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c72)+(ptrdiff_t)0, 63>>cut32, wt904);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c72)+(ptrdiff_t)0, 63>>cut32, wt905);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c72)+(ptrdiff_t)0, 63>>cut32, wt906);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c72)+(ptrdiff_t)0, 63>>cut32, wt907);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c72)+(ptrdiff_t)0, 63>>cut32, wt908);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c72)+(ptrdiff_t)0, 63>>cut32, wt909);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c72)+(ptrdiff_t)0, 63>>cut32, wt910);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c72)+(ptrdiff_t)0, 63>>cut32, wt911);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c72)+(ptrdiff_t)0, 63>>cut32, wt912);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c72)+(ptrdiff_t)0, 63>>cut32, wt913);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c72)+(ptrdiff_t)0, 63>>cut32, wt914);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt899);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt900);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt901);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt902);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt903);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt904);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt905);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt906);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt907);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt908);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt909);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt910);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt911);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt912);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt913);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c72)+(ptrdiff_t)24576, 4032>>cut32, wt914);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt899);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt900);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt901);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt902);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt903);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt904);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt905);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt906);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt907);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt908);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt909);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt910);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt911);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt912);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt913);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c72)+(ptrdiff_t)49152, 258048>>cut32, wt914);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(1+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt899);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(2+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt900);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(3+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt901);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(4+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt902);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(5+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt903);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(6+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt904);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(7+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt905);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(8+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt906);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(9+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt907);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(10+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt908);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(11+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt909);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(12+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt910);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(13+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt911);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(14+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt912);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(15+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt913);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l85+4*cut32+24*(16+16*c72)+(ptrdiff_t)73728, 65535-(262143>>cut32), wt914);
}
}
}
} else {
ptrdiff_t k195 = 2032;
ptrdiff_t l84 = (size_t)(0+k195)/6;
ptrdiff_t cut31 = (size_t)(0+k195)%6;
__m512 sum716 = _mm512_maskz_loadu_ps(65535, biasPtr23+8192*i86+4*k195);
__m512i pmMul51 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd51 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo43 = _mm512_loadu_ps(bnPtr24+(ptrdiff_t)8*(k195+2048*i86));
__m512 masHi43 = _mm512_maskz_loadu_ps(65535, bnPtr24+(ptrdiff_t)8*(k195+2048*i86)+(ptrdiff_t)64);
__m512 postMul73 = _mm512_permutex2var_ps(masLo43, pmMul51, masHi43);
__m512 postAdd49 = _mm512_permutex2var_ps(masLo43, pmAdd51, masHi43);
sum716 = _mm512_fmadd_ps(sum716, postMul73, postAdd49);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*0+(ptrdiff_t)0, 63>>cut31, sum716);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*0+(ptrdiff_t)24576, 4032>>cut31, sum716);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*0+(ptrdiff_t)49152, 258048>>cut31, sum716);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*0+(ptrdiff_t)73728, 65535-(262143>>cut31), sum716);
ptrdiff_t c70 = 0;
for (; c70 != 64; ++c70) {
__m512 wt867 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)0);
__m512 wt868 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)4096);
__m512 wt869 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)8192);
__m512 wt870 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)12288);
__m512 wt871 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)16384);
__m512 wt872 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)20480);
__m512 wt873 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)24576);
__m512 wt874 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)28672);
__m512 wt875 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)32768);
__m512 wt876 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)36864);
__m512 wt877 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)40960);
__m512 wt878 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)45056);
__m512 wt879 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)49152);
__m512 wt880 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)53248);
__m512 wt881 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)57344);
__m512 wt882 = _mm512_maskz_loadu_ps(65535, wtPtr23+8388608*i86+4096*k195+64*c70+(ptrdiff_t)61440);
__m512 tmp15875 = _mm512_unpacklo_ps(wt867, wt868);
__m512 tmp15876 = _mm512_unpackhi_ps(wt867, wt868);
__m512 tmp15877 = _mm512_unpacklo_ps(wt869, wt870);
__m512 tmp15878 = _mm512_unpackhi_ps(wt869, wt870);
__m512 tmp15879 = _mm512_unpacklo_ps(wt871, wt872);
__m512 tmp15880 = _mm512_unpackhi_ps(wt871, wt872);
__m512 tmp15881 = _mm512_unpacklo_ps(wt873, wt874);
__m512 tmp15882 = _mm512_unpackhi_ps(wt873, wt874);
__m512 tmp15883 = _mm512_unpacklo_ps(wt875, wt876);
__m512 tmp15884 = _mm512_unpackhi_ps(wt875, wt876);
__m512 tmp15885 = _mm512_unpacklo_ps(wt877, wt878);
__m512 tmp15886 = _mm512_unpackhi_ps(wt877, wt878);
__m512 tmp15887 = _mm512_unpacklo_ps(wt879, wt880);
__m512 tmp15888 = _mm512_unpackhi_ps(wt879, wt880);
__m512 tmp15889 = _mm512_unpacklo_ps(wt881, wt882);
__m512 tmp15890 = _mm512_unpackhi_ps(wt881, wt882);
__m512 tmp15891 = _mm512_shuffle_ps(tmp15875, tmp15877, 68);
__m512 tmp15892 = _mm512_shuffle_ps(tmp15875, tmp15877, 238);
__m512 tmp15893 = _mm512_shuffle_ps(tmp15876, tmp15878, 68);
__m512 tmp15894 = _mm512_shuffle_ps(tmp15876, tmp15878, 238);
__m512 tmp15895 = _mm512_shuffle_ps(tmp15879, tmp15881, 68);
__m512 tmp15896 = _mm512_shuffle_ps(tmp15879, tmp15881, 238);
__m512 tmp15897 = _mm512_shuffle_ps(tmp15880, tmp15882, 68);
__m512 tmp15898 = _mm512_shuffle_ps(tmp15880, tmp15882, 238);
__m512 tmp15899 = _mm512_shuffle_ps(tmp15883, tmp15885, 68);
__m512 tmp15900 = _mm512_shuffle_ps(tmp15883, tmp15885, 238);
__m512 tmp15901 = _mm512_shuffle_ps(tmp15884, tmp15886, 68);
__m512 tmp15902 = _mm512_shuffle_ps(tmp15884, tmp15886, 238);
__m512 tmp15903 = _mm512_shuffle_ps(tmp15887, tmp15889, 68);
__m512 tmp15904 = _mm512_shuffle_ps(tmp15887, tmp15889, 238);
__m512 tmp15905 = _mm512_shuffle_ps(tmp15888, tmp15890, 68);
__m512 tmp15906 = _mm512_shuffle_ps(tmp15888, tmp15890, 238);
__m512 tmp15907 = _mm512_shuffle_f32x4(tmp15891, tmp15895, 136);
__m512 tmp15908 = _mm512_shuffle_f32x4(tmp15891, tmp15895, 221);
__m512 tmp15909 = _mm512_shuffle_f32x4(tmp15892, tmp15896, 136);
__m512 tmp15910 = _mm512_shuffle_f32x4(tmp15892, tmp15896, 221);
__m512 tmp15911 = _mm512_shuffle_f32x4(tmp15893, tmp15897, 136);
__m512 tmp15912 = _mm512_shuffle_f32x4(tmp15893, tmp15897, 221);
__m512 tmp15913 = _mm512_shuffle_f32x4(tmp15894, tmp15898, 136);
__m512 tmp15914 = _mm512_shuffle_f32x4(tmp15894, tmp15898, 221);
__m512 tmp15915 = _mm512_shuffle_f32x4(tmp15899, tmp15903, 136);
__m512 tmp15916 = _mm512_shuffle_f32x4(tmp15899, tmp15903, 221);
__m512 tmp15917 = _mm512_shuffle_f32x4(tmp15900, tmp15904, 136);
__m512 tmp15918 = _mm512_shuffle_f32x4(tmp15900, tmp15904, 221);
__m512 tmp15919 = _mm512_shuffle_f32x4(tmp15901, tmp15905, 136);
__m512 tmp15920 = _mm512_shuffle_f32x4(tmp15901, tmp15905, 221);
__m512 tmp15921 = _mm512_shuffle_f32x4(tmp15902, tmp15906, 136);
__m512 tmp15922 = _mm512_shuffle_f32x4(tmp15902, tmp15906, 221);
wt867 = _mm512_shuffle_f32x4(tmp15907, tmp15915, 136);
wt875 = _mm512_shuffle_f32x4(tmp15907, tmp15915, 221);
wt868 = _mm512_shuffle_f32x4(tmp15909, tmp15917, 136);
wt876 = _mm512_shuffle_f32x4(tmp15909, tmp15917, 221);
wt869 = _mm512_shuffle_f32x4(tmp15911, tmp15919, 136);
wt877 = _mm512_shuffle_f32x4(tmp15911, tmp15919, 221);
wt870 = _mm512_shuffle_f32x4(tmp15913, tmp15921, 136);
wt878 = _mm512_shuffle_f32x4(tmp15913, tmp15921, 221);
wt871 = _mm512_shuffle_f32x4(tmp15908, tmp15916, 136);
wt879 = _mm512_shuffle_f32x4(tmp15908, tmp15916, 221);
wt872 = _mm512_shuffle_f32x4(tmp15910, tmp15918, 136);
wt880 = _mm512_shuffle_f32x4(tmp15910, tmp15918, 221);
wt873 = _mm512_shuffle_f32x4(tmp15912, tmp15920, 136);
wt881 = _mm512_shuffle_f32x4(tmp15912, tmp15920, 221);
wt874 = _mm512_shuffle_f32x4(tmp15914, tmp15922, 136);
wt882 = _mm512_shuffle_f32x4(tmp15914, tmp15922, 221);
wt867 = _mm512_mul_ps(wt867, postMul73);
wt868 = _mm512_mul_ps(wt868, postMul73);
wt869 = _mm512_mul_ps(wt869, postMul73);
wt870 = _mm512_mul_ps(wt870, postMul73);
wt871 = _mm512_mul_ps(wt871, postMul73);
wt872 = _mm512_mul_ps(wt872, postMul73);
wt873 = _mm512_mul_ps(wt873, postMul73);
wt874 = _mm512_mul_ps(wt874, postMul73);
wt875 = _mm512_mul_ps(wt875, postMul73);
wt876 = _mm512_mul_ps(wt876, postMul73);
wt877 = _mm512_mul_ps(wt877, postMul73);
wt878 = _mm512_mul_ps(wt878, postMul73);
wt879 = _mm512_mul_ps(wt879, postMul73);
wt880 = _mm512_mul_ps(wt880, postMul73);
wt881 = _mm512_mul_ps(wt881, postMul73);
wt882 = _mm512_mul_ps(wt882, postMul73);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(1+16*c70)+(ptrdiff_t)0, 63>>cut31, wt867);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(2+16*c70)+(ptrdiff_t)0, 63>>cut31, wt868);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(3+16*c70)+(ptrdiff_t)0, 63>>cut31, wt869);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(4+16*c70)+(ptrdiff_t)0, 63>>cut31, wt870);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(5+16*c70)+(ptrdiff_t)0, 63>>cut31, wt871);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(6+16*c70)+(ptrdiff_t)0, 63>>cut31, wt872);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(7+16*c70)+(ptrdiff_t)0, 63>>cut31, wt873);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(8+16*c70)+(ptrdiff_t)0, 63>>cut31, wt874);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(9+16*c70)+(ptrdiff_t)0, 63>>cut31, wt875);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(10+16*c70)+(ptrdiff_t)0, 63>>cut31, wt876);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(11+16*c70)+(ptrdiff_t)0, 63>>cut31, wt877);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(12+16*c70)+(ptrdiff_t)0, 63>>cut31, wt878);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(13+16*c70)+(ptrdiff_t)0, 63>>cut31, wt879);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(14+16*c70)+(ptrdiff_t)0, 63>>cut31, wt880);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(15+16*c70)+(ptrdiff_t)0, 63>>cut31, wt881);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(16+16*c70)+(ptrdiff_t)0, 63>>cut31, wt882);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(1+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt867);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(2+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt868);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(3+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt869);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(4+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt870);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(5+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt871);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(6+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt872);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(7+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt873);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(8+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt874);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(9+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt875);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(10+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt876);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(11+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt877);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(12+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt878);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(13+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt879);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(14+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt880);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(15+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt881);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(16+16*c70)+(ptrdiff_t)24576, 4032>>cut31, wt882);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(1+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt867);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(2+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt868);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(3+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt869);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(4+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt870);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(5+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt871);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(6+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt872);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(7+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt873);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(8+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt874);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(9+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt875);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(10+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt876);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(11+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt877);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(12+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt878);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(13+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt879);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(14+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt880);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(15+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt881);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+24*(16+16*c70)+(ptrdiff_t)49152, 258048>>cut31, wt882);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(1+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt867);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(2+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt868);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(3+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt869);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(4+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt870);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(5+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt871);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(6+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt872);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(7+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt873);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(8+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt874);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(9+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt875);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(10+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt876);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(11+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt877);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(12+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt878);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(13+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt879);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(14+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt880);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(15+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt881);
_mm512_mask_storeu_ps(arranged27+8396800*i86+24600*l84+4*cut31+8*(16+16*c70)+(ptrdiff_t)73728, 65535-(262143>>cut31), wt882);
}
}
}
}
}

/*
 * Launches the weight-arrangement worker for step 14.
 *
 * Builds a threader task whose callee is ResNeXt50OneArrangeWts14Callee1,
 * whose opaque argument is the caller's tensor pointer table, and whose
 * iteration hull is 3-dimensional with extents 128 x 1 x 1, then hands it
 * to ResNeXt50ThreaderDo1 on the given team.
 *
 * team86     - threader team passed through to ResNeXt50ThreaderDo1.
 * tensors145 - tensor pointer table forwarded untouched to the callee.
 */
static void ResNeXt50OneArrangeWts14(ResNeXt50ThreaderTeam1* team86, char** tensors145) {
	enum { rank = 3 };
	static const int extents[rank] = {128, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50OneArrangeWts14Callee1;
	job.any1 = tensors145;
	job.nd1 = rank;
	for (int axis = 0; axis < rank; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	ResNeXt50ThreaderDo1(team86, &job);
}

/*
 * Worker that repacks input rows for step 14.
 *
 * For task coordinate pt80[0] = s it handles rows 128*s .. 128*s+127.
 * Each source row starts every 320 bytes in tensors[0]; each destination
 * row starts every 256 bytes in tensors[1]. Per row, 49 floats are moved:
 * three full 16-lane vectors plus lane 0 of a fourth (mask 1).
 * NOTE(review): 49 floats per row presumably corresponds to a 7x7 feature
 * map -- confirm against the graph description.
 *
 * task150 - threader task; its any1 field is the tensor pointer table.
 * pt80    - task coordinates; only pt80[0] is read.
 */
static void ResNeXt50OneArrangeDats14Callee1(ResNeXt50ThreaderTask1* task150, int64_t* pt80) {
	char** bufs = task150->any1;
	const ptrdiff_t slice = pt80[0];
	char*restrict src = bufs[0];
	char*restrict dst = bufs[1];
	const ptrdiff_t firstRow = 128*slice;
	const ptrdiff_t endRow = firstRow+128;
	for (ptrdiff_t row = firstRow; row < endRow; ++row) {
		char* in = src+320*row;
		char* out = dst+256*row;
		__m512 lanes[4];
		/* Read the whole row first, then write it, as the original does. */
		for (int v = 0; v < 3; ++v) {
			lanes[v] = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64*v);
		}
		lanes[3] = _mm512_maskz_loadu_ps(1, in+(ptrdiff_t)192);
		for (int v = 0; v < 3; ++v) {
			_mm512_mask_storeu_ps(out+(ptrdiff_t)64*v, 65535, lanes[v]);
		}
		_mm512_mask_storeu_ps(out+(ptrdiff_t)192, 1, lanes[3]);
	}
}

/*
 * Launches the data-arrangement worker for step 14.
 *
 * The task uses a 4-dimensional hull with extents 8 x 1 x 1 x 1. The callee
 * (ResNeXt50OneArrangeDats14Callee1) reads only the first coordinate and
 * repacks 128 rows per coordinate value, so the eight values of that
 * coordinate cover rows 0..1023.
 *
 * team87     - threader team passed through to ResNeXt50ThreaderDo1.
 * tensors147 - tensor pointer table forwarded untouched to the callee.
 */
static void ResNeXt50OneArrangeDats14(ResNeXt50ThreaderTeam1* team87, char** tensors147) {
	enum { rank = 4 };
	static const int extents[rank] = {8, 1, 1, 1};
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50OneArrangeDats14Callee1;
	job.any1 = tensors147;
	job.nd1 = rank;
	for (int axis = 0; axis < rank; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	ResNeXt50ThreaderDo1(team87, &job);
}

// Worker for the step-14 "apply" phase: a broadcast-weight x data-vector
// multiply-accumulate over 1024 input rows, followed by an element-wise add
// of a second tensor, a clamp at zero, and a masked store.
//
// task152->any1 is a pointer pair; element 0 is the tensor table:
//   tensors150[0] = arranged weights, tensors150[1] = arranged data,
//   tensors150[2] = tensor added to the sums, tensors150[3] = output.
// pt81[0] selects the block of output rows. Values other than 341 compute
// one block of six rows; value 341 computes the final block of two rows.
// (Rows are presumably output channels -- TODO confirm against the graph.)
static void ResNeXt50OneApply14Callee1(ResNeXt50ThreaderTask1* task152, int64_t* pt81) {
void** pair36 = task152->any1;
char** tensors150 = pair36[0];
// e43, g47 and d30 are fixed at zero, so the e43/g47 terms in the base
// pointers below contribute nothing and j79 is always 0.
ptrdiff_t e43 = 0;
ptrdiff_t g47 = 0;
ptrdiff_t d30 = 0;
ptrdiff_t w79 = pt81[0];
char*restrict arrangedWts14 = tensors150[0]+6848512*e43+(ptrdiff_t)8396800*1*g47;
char*restrict arrangedDats14 = tensors150[1]+213760*e43+(ptrdiff_t)262144*1*g47;
char*restrict datPtr47 = tensors150[2]+(ptrdiff_t)655360*1*g47;
char*restrict datPtr48 = tensors150[3]+(ptrdiff_t)655360*1*g47;
ptrdiff_t ii65 = 1;
for (ptrdiff_t i88 = 0; i88 < ii65; ++i88) {
ptrdiff_t j79 = 1*d30;
ptrdiff_t k198 = 1*w79;
// kk70 equals the starting k198, so the check at the bottom of the loop body
// returns after the first pass: one six-row block per invocation.
ptrdiff_t kk70 = k198+0;
// Skipped entirely when k198 == 341; that case falls through to the two-row tail.
for (; k198 != 341; ++k198) {
// With s88 = -1 the expression 24*s88+24 is 0, so the six accumulator seeds
// are the first six floats of this block's 24600-byte weight record
// (24600 = 6 rows * (1 + 1024) floats * 4 bytes).
// NOTE(review): the seeds are presumably the bias terms; the code that
// writes them is outside this block.
ptrdiff_t s88 = -1;
__m512 sum719 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)24));
__m512 sum723 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)28));
__m512 sum727 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)32));
__m512 sum731 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)36));
__m512 sum735 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)40));
__m512 sum739 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)44));
// Each output row uses four accumulators (one per 16-lane data vector);
// all four start from that row's broadcast seed.
__m512 sum720 = sum719;
__m512 sum721 = sum719;
__m512 sum722 = sum719;
__m512 sum724 = sum723;
__m512 sum725 = sum723;
__m512 sum726 = sum723;
__m512 sum728 = sum727;
__m512 sum729 = sum727;
__m512 sum730 = sum727;
__m512 sum732 = sum731;
__m512 sum733 = sum731;
__m512 sum734 = sum731;
__m512 sum736 = sum735;
__m512 sum737 = sum735;
__m512 sum738 = sum735;
__m512 sum740 = sum739;
__m512 sum741 = sum739;
__m512 sum742 = sum739;
// Reduction over the 1024 arranged data rows: load four data vectors once,
// then for each of the six output rows broadcast one weight and FMA it into
// that row's four accumulators.
for (s88 = 0; s88 < 1024; ++s88) {
__m512 dat2663 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s88+(ptrdiff_t)0);
__m512 dat2664 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s88+(ptrdiff_t)64);
__m512 dat2665 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s88+(ptrdiff_t)128);
// Full 16-lane load; only lane 0 of the matching accumulators is stored below.
__m512 dat2666 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s88+(ptrdiff_t)192);
__m512 wt915 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)24));
sum719 = _mm512_fmadd_ps(wt915, dat2663, sum719);
sum720 = _mm512_fmadd_ps(wt915, dat2664, sum720);
sum721 = _mm512_fmadd_ps(wt915, dat2665, sum721);
sum722 = _mm512_fmadd_ps(wt915, dat2666, sum722);
__m512 wt916 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)28));
sum723 = _mm512_fmadd_ps(wt916, dat2663, sum723);
sum724 = _mm512_fmadd_ps(wt916, dat2664, sum724);
sum725 = _mm512_fmadd_ps(wt916, dat2665, sum725);
sum726 = _mm512_fmadd_ps(wt916, dat2666, sum726);
__m512 wt917 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)32));
sum727 = _mm512_fmadd_ps(wt917, dat2663, sum727);
sum728 = _mm512_fmadd_ps(wt917, dat2664, sum728);
sum729 = _mm512_fmadd_ps(wt917, dat2665, sum729);
sum730 = _mm512_fmadd_ps(wt917, dat2666, sum730);
__m512 wt918 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)36));
sum731 = _mm512_fmadd_ps(wt918, dat2663, sum731);
sum732 = _mm512_fmadd_ps(wt918, dat2664, sum732);
sum733 = _mm512_fmadd_ps(wt918, dat2665, sum733);
sum734 = _mm512_fmadd_ps(wt918, dat2666, sum734);
__m512 wt919 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)40));
sum735 = _mm512_fmadd_ps(wt919, dat2663, sum735);
sum736 = _mm512_fmadd_ps(wt919, dat2664, sum736);
sum737 = _mm512_fmadd_ps(wt919, dat2665, sum737);
sum738 = _mm512_fmadd_ps(wt919, dat2666, sum738);
__m512 wt920 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+24*s88+(ptrdiff_t)44));
sum739 = _mm512_fmadd_ps(wt920, dat2663, sum739);
sum740 = _mm512_fmadd_ps(wt920, dat2664, sum740);
sum741 = _mm512_fmadd_ps(wt920, dat2665, sum741);
sum742 = _mm512_fmadd_ps(wt920, dat2666, sum742);
}
// Epilogue, repeated for each of the six rows (320 bytes apart, 1920 bytes
// per block): add the matching 49 floats from datPtr47, clamp negatives to
// zero, and store 49 floats to datPtr48 (three full vectors + lane 0).
// Row 0 of the block.
sum719 = _mm512_add_ps(sum719, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)0));
sum720 = _mm512_add_ps(sum720, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)64));
sum721 = _mm512_add_ps(sum721, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)128));
sum722 = _mm512_add_ps(sum722, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)192));
sum719 = _mm512_max_ps(_mm512_setzero_ps(), sum719);
sum720 = _mm512_max_ps(_mm512_setzero_ps(), sum720);
sum721 = _mm512_max_ps(_mm512_setzero_ps(), sum721);
sum722 = _mm512_max_ps(_mm512_setzero_ps(), sum722);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)0, 65535, sum719);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)64, 65535, sum720);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)128, 65535, sum721);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)192, 1, sum722);
// Row 1 of the block.
sum723 = _mm512_add_ps(sum723, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)320));
sum724 = _mm512_add_ps(sum724, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)384));
sum725 = _mm512_add_ps(sum725, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)448));
sum726 = _mm512_add_ps(sum726, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)512));
sum723 = _mm512_max_ps(_mm512_setzero_ps(), sum723);
sum724 = _mm512_max_ps(_mm512_setzero_ps(), sum724);
sum725 = _mm512_max_ps(_mm512_setzero_ps(), sum725);
sum726 = _mm512_max_ps(_mm512_setzero_ps(), sum726);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)320, 65535, sum723);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)384, 65535, sum724);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)448, 65535, sum725);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)512, 1, sum726);
// Row 2 of the block.
sum727 = _mm512_add_ps(sum727, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)640));
sum728 = _mm512_add_ps(sum728, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)704));
sum729 = _mm512_add_ps(sum729, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)768));
sum730 = _mm512_add_ps(sum730, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)832));
sum727 = _mm512_max_ps(_mm512_setzero_ps(), sum727);
sum728 = _mm512_max_ps(_mm512_setzero_ps(), sum728);
sum729 = _mm512_max_ps(_mm512_setzero_ps(), sum729);
sum730 = _mm512_max_ps(_mm512_setzero_ps(), sum730);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)640, 65535, sum727);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)704, 65535, sum728);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)768, 65535, sum729);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)832, 1, sum730);
// Row 3 of the block.
sum731 = _mm512_add_ps(sum731, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)960));
sum732 = _mm512_add_ps(sum732, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1024));
sum733 = _mm512_add_ps(sum733, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1088));
sum734 = _mm512_add_ps(sum734, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1152));
sum731 = _mm512_max_ps(_mm512_setzero_ps(), sum731);
sum732 = _mm512_max_ps(_mm512_setzero_ps(), sum732);
sum733 = _mm512_max_ps(_mm512_setzero_ps(), sum733);
sum734 = _mm512_max_ps(_mm512_setzero_ps(), sum734);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)960, 65535, sum731);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1024, 65535, sum732);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1088, 65535, sum733);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1152, 1, sum734);
// Row 4 of the block.
sum735 = _mm512_add_ps(sum735, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1280));
sum736 = _mm512_add_ps(sum736, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1344));
sum737 = _mm512_add_ps(sum737, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1408));
sum738 = _mm512_add_ps(sum738, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1472));
sum735 = _mm512_max_ps(_mm512_setzero_ps(), sum735);
sum736 = _mm512_max_ps(_mm512_setzero_ps(), sum736);
sum737 = _mm512_max_ps(_mm512_setzero_ps(), sum737);
sum738 = _mm512_max_ps(_mm512_setzero_ps(), sum738);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1280, 65535, sum735);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1344, 65535, sum736);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1408, 65535, sum737);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1472, 1, sum738);
// Row 5 of the block.
sum739 = _mm512_add_ps(sum739, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1600));
sum740 = _mm512_add_ps(sum740, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1664));
sum741 = _mm512_add_ps(sum741, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1728));
sum742 = _mm512_add_ps(sum742, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)1792));
sum739 = _mm512_max_ps(_mm512_setzero_ps(), sum739);
sum740 = _mm512_max_ps(_mm512_setzero_ps(), sum740);
sum741 = _mm512_max_ps(_mm512_setzero_ps(), sum741);
sum742 = _mm512_max_ps(_mm512_setzero_ps(), sum742);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1600, 65535, sum739);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1664, 65535, sum740);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1728, 65535, sum741);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)1792, 1, sum742);
// Always true on the first pass (k198 still equals kk70): this block is done.
if (k198 >= kk70) return;
}
// Tail block, reached only when k198 == 341: two output rows. The weight
// record here is interleaved in pairs (8 bytes per step), and with s89 = -1
// the expression 8*s89+8 is 0, so the two seeds are the record's first two floats.
ptrdiff_t s89 = -1;
__m512 sum743 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+8*s89+(ptrdiff_t)8));
__m512 sum747 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+8*s89+(ptrdiff_t)12));
__m512 sum744 = sum743;
__m512 sum745 = sum743;
__m512 sum746 = sum743;
__m512 sum748 = sum747;
__m512 sum749 = sum747;
__m512 sum750 = sum747;
// Same reduction as above, with two broadcast weights per data row.
for (s89 = 0; s89 < 1024; ++s89) {
__m512 dat2667 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s89+(ptrdiff_t)0);
__m512 dat2668 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s89+(ptrdiff_t)64);
__m512 dat2669 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s89+(ptrdiff_t)128);
__m512 dat2670 = _mm512_loadu_ps(arrangedDats14+262144*i88+262144*j79+256*s89+(ptrdiff_t)192);
__m512 wt921 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+8*s89+(ptrdiff_t)8));
sum743 = _mm512_fmadd_ps(wt921, dat2667, sum743);
sum744 = _mm512_fmadd_ps(wt921, dat2668, sum744);
sum745 = _mm512_fmadd_ps(wt921, dat2669, sum745);
sum746 = _mm512_fmadd_ps(wt921, dat2670, sum746);
__m512 wt922 = _mm512_set1_ps(*(float*)(arrangedWts14+8396800*i88+24600*k198+8*s89+(ptrdiff_t)12));
sum747 = _mm512_fmadd_ps(wt922, dat2667, sum747);
sum748 = _mm512_fmadd_ps(wt922, dat2668, sum748);
sum749 = _mm512_fmadd_ps(wt922, dat2669, sum749);
sum750 = _mm512_fmadd_ps(wt922, dat2670, sum750);
}
// Tail epilogue, row 0: add from datPtr47, clamp at zero, store 49 floats.
sum743 = _mm512_add_ps(sum743, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)0));
sum744 = _mm512_add_ps(sum744, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)64));
sum745 = _mm512_add_ps(sum745, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)128));
sum746 = _mm512_add_ps(sum746, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)192));
sum743 = _mm512_max_ps(_mm512_setzero_ps(), sum743);
sum744 = _mm512_max_ps(_mm512_setzero_ps(), sum744);
sum745 = _mm512_max_ps(_mm512_setzero_ps(), sum745);
sum746 = _mm512_max_ps(_mm512_setzero_ps(), sum746);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)0, 65535, sum743);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)64, 65535, sum744);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)128, 65535, sum745);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)192, 1, sum746);
// Tail epilogue, row 1.
sum747 = _mm512_add_ps(sum747, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)320));
sum748 = _mm512_add_ps(sum748, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)384));
sum749 = _mm512_add_ps(sum749, _mm512_maskz_loadu_ps(65535, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)448));
sum750 = _mm512_add_ps(sum750, _mm512_maskz_loadu_ps(1, datPtr47+655360*i88+256*j79+1920*k198+(ptrdiff_t)512));
sum747 = _mm512_max_ps(_mm512_setzero_ps(), sum747);
sum748 = _mm512_max_ps(_mm512_setzero_ps(), sum748);
sum749 = _mm512_max_ps(_mm512_setzero_ps(), sum749);
sum750 = _mm512_max_ps(_mm512_setzero_ps(), sum750);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)320, 65535, sum747);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)384, 65535, sum748);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)448, 65535, sum749);
_mm512_mask_storeu_ps(datPtr48+655360*i88+256*j79+1920*k198+(ptrdiff_t)512, 1, sum750);
}
}

/*
 * Launches the step-14 apply worker.
 *
 * The callee receives a two-slot pointer pair whose first slot is the tensor
 * pointer table and whose second slot is null (the callee reads only the
 * first slot). The hull is 3-dimensional with extents 342 x 1 x 1: the
 * callee treats first-coordinate values 0..340 as six-row blocks and value
 * 341 as the final two-row block.
 *
 * team88     - threader team passed through to ResNeXt50ThreaderDo1.
 * tensors149 - tensor pointer table placed in slot 0 of the pair.
 */
static void ResNeXt50OneApply14(ResNeXt50ThreaderTeam1* team88, char** tensors149) {
	enum { rank = 3 };
	static const int extents[rank] = {342, 1, 1};
	void* bundle[] = {tensors149, 0};
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50OneApply14Callee1;
	job.any1 = bundle;
	job.nd1 = rank;
	for (int axis = 0; axis < rank; ++axis) {
		job.hull1[axis] = extents[axis];
	}
	ResNeXt50ThreaderDo1(team88, &job);
}

static void ResNeXt50OneArrangeWts15Callee1(ResNeXt50ThreaderTask1* task154, int64_t* pt82) {
char** tensors152 = task154->any1;
ptrdiff_t b91 = pt82[0];
ptrdiff_t e44 = pt82[2];
if (e44 < 2) {
char*restrict wtPtr24 = tensors152[0]+(ptrdiff_t)3340*e44+(ptrdiff_t)8388608*0;
char*restrict biasPtr24 = tensors152[1]+(ptrdiff_t)4096*0;
char*restrict bnPtr25 = tensors152[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged29 = tensors152[3]+(ptrdiff_t)3424256*e44+(ptrdiff_t)3424256*0;
ptrdiff_t ii66 = 1;
for (ptrdiff_t i89 = 0; i89 < ii66; ++i89) {
ptrdiff_t j80 = 1*b91;
ptrdiff_t jj69 = j80+1;
for (; j80 < jj69; ++j80) {
if (j80 < 63) {
ptrdiff_t k200 = 0+16*(j80-0);
ptrdiff_t l87 = (size_t)(0+k200)/6;
ptrdiff_t cut34 = (size_t)(0+k200)%6;
switch (cut34) {
case 0:;
case 2: {
__m512 sum752;
if (!e44) {
sum752 = _mm512_maskz_loadu_ps(65535, biasPtr24+4096*i89+4*k200);
} else {
sum752 = _mm512_setzero_ps();
}
__m512i pmMul52 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd52 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo44 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k200+1024*i89));
__m512 masHi44 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k200+1024*i89)+(ptrdiff_t)64);
__m512 postMul77 = _mm512_permutex2var_ps(masLo44, pmMul52, masHi44);
__m512 postAdd53 = _mm512_permutex2var_ps(masLo44, pmAdd52, masHi44);
if (!e44) sum752 = _mm512_fmadd_ps(sum752, postMul77, postAdd53);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)0, 63>>cut34, sum752);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)20040, 4032>>cut34, sum752);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)40080, 65535-(4095>>cut34), sum752);
ptrdiff_t c74 = 0;
for (; c74 != 52; ++c74) {
__m512 wt955 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)0);
__m512 wt956 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)8192);
__m512 wt957 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)16384);
__m512 wt958 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)24576);
__m512 wt959 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)32768);
__m512 wt960 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)40960);
__m512 wt961 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)49152);
__m512 wt962 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)57344);
__m512 wt963 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)65536);
__m512 wt964 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)73728);
__m512 wt965 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)81920);
__m512 wt966 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)90112);
__m512 wt967 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)98304);
__m512 wt968 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)106496);
__m512 wt969 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)114688);
__m512 wt970 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)122880);
__m512 tmp15923 = _mm512_unpacklo_ps(wt955, wt956);
__m512 tmp15924 = _mm512_unpackhi_ps(wt955, wt956);
__m512 tmp15925 = _mm512_unpacklo_ps(wt957, wt958);
__m512 tmp15926 = _mm512_unpackhi_ps(wt957, wt958);
__m512 tmp15927 = _mm512_unpacklo_ps(wt959, wt960);
__m512 tmp15928 = _mm512_unpackhi_ps(wt959, wt960);
__m512 tmp15929 = _mm512_unpacklo_ps(wt961, wt962);
__m512 tmp15930 = _mm512_unpackhi_ps(wt961, wt962);
__m512 tmp15931 = _mm512_unpacklo_ps(wt963, wt964);
__m512 tmp15932 = _mm512_unpackhi_ps(wt963, wt964);
__m512 tmp15933 = _mm512_unpacklo_ps(wt965, wt966);
__m512 tmp15934 = _mm512_unpackhi_ps(wt965, wt966);
__m512 tmp15935 = _mm512_unpacklo_ps(wt967, wt968);
__m512 tmp15936 = _mm512_unpackhi_ps(wt967, wt968);
__m512 tmp15937 = _mm512_unpacklo_ps(wt969, wt970);
__m512 tmp15938 = _mm512_unpackhi_ps(wt969, wt970);
__m512 tmp15939 = _mm512_shuffle_ps(tmp15923, tmp15925, 68);
__m512 tmp15940 = _mm512_shuffle_ps(tmp15923, tmp15925, 238);
__m512 tmp15941 = _mm512_shuffle_ps(tmp15924, tmp15926, 68);
__m512 tmp15942 = _mm512_shuffle_ps(tmp15924, tmp15926, 238);
__m512 tmp15943 = _mm512_shuffle_ps(tmp15927, tmp15929, 68);
__m512 tmp15944 = _mm512_shuffle_ps(tmp15927, tmp15929, 238);
__m512 tmp15945 = _mm512_shuffle_ps(tmp15928, tmp15930, 68);
__m512 tmp15946 = _mm512_shuffle_ps(tmp15928, tmp15930, 238);
__m512 tmp15947 = _mm512_shuffle_ps(tmp15931, tmp15933, 68);
__m512 tmp15948 = _mm512_shuffle_ps(tmp15931, tmp15933, 238);
__m512 tmp15949 = _mm512_shuffle_ps(tmp15932, tmp15934, 68);
__m512 tmp15950 = _mm512_shuffle_ps(tmp15932, tmp15934, 238);
__m512 tmp15951 = _mm512_shuffle_ps(tmp15935, tmp15937, 68);
__m512 tmp15952 = _mm512_shuffle_ps(tmp15935, tmp15937, 238);
__m512 tmp15953 = _mm512_shuffle_ps(tmp15936, tmp15938, 68);
__m512 tmp15954 = _mm512_shuffle_ps(tmp15936, tmp15938, 238);
__m512 tmp15955 = _mm512_shuffle_f32x4(tmp15939, tmp15943, 136);
__m512 tmp15956 = _mm512_shuffle_f32x4(tmp15939, tmp15943, 221);
__m512 tmp15957 = _mm512_shuffle_f32x4(tmp15940, tmp15944, 136);
__m512 tmp15958 = _mm512_shuffle_f32x4(tmp15940, tmp15944, 221);
__m512 tmp15959 = _mm512_shuffle_f32x4(tmp15941, tmp15945, 136);
__m512 tmp15960 = _mm512_shuffle_f32x4(tmp15941, tmp15945, 221);
__m512 tmp15961 = _mm512_shuffle_f32x4(tmp15942, tmp15946, 136);
__m512 tmp15962 = _mm512_shuffle_f32x4(tmp15942, tmp15946, 221);
__m512 tmp15963 = _mm512_shuffle_f32x4(tmp15947, tmp15951, 136);
__m512 tmp15964 = _mm512_shuffle_f32x4(tmp15947, tmp15951, 221);
__m512 tmp15965 = _mm512_shuffle_f32x4(tmp15948, tmp15952, 136);
__m512 tmp15966 = _mm512_shuffle_f32x4(tmp15948, tmp15952, 221);
__m512 tmp15967 = _mm512_shuffle_f32x4(tmp15949, tmp15953, 136);
__m512 tmp15968 = _mm512_shuffle_f32x4(tmp15949, tmp15953, 221);
__m512 tmp15969 = _mm512_shuffle_f32x4(tmp15950, tmp15954, 136);
__m512 tmp15970 = _mm512_shuffle_f32x4(tmp15950, tmp15954, 221);
wt955 = _mm512_shuffle_f32x4(tmp15955, tmp15963, 136);
wt963 = _mm512_shuffle_f32x4(tmp15955, tmp15963, 221);
wt956 = _mm512_shuffle_f32x4(tmp15957, tmp15965, 136);
wt964 = _mm512_shuffle_f32x4(tmp15957, tmp15965, 221);
wt957 = _mm512_shuffle_f32x4(tmp15959, tmp15967, 136);
wt965 = _mm512_shuffle_f32x4(tmp15959, tmp15967, 221);
wt958 = _mm512_shuffle_f32x4(tmp15961, tmp15969, 136);
wt966 = _mm512_shuffle_f32x4(tmp15961, tmp15969, 221);
wt959 = _mm512_shuffle_f32x4(tmp15956, tmp15964, 136);
wt967 = _mm512_shuffle_f32x4(tmp15956, tmp15964, 221);
wt960 = _mm512_shuffle_f32x4(tmp15958, tmp15966, 136);
wt968 = _mm512_shuffle_f32x4(tmp15958, tmp15966, 221);
wt961 = _mm512_shuffle_f32x4(tmp15960, tmp15968, 136);
wt969 = _mm512_shuffle_f32x4(tmp15960, tmp15968, 221);
wt962 = _mm512_shuffle_f32x4(tmp15962, tmp15970, 136);
wt970 = _mm512_shuffle_f32x4(tmp15962, tmp15970, 221);
wt955 = _mm512_mul_ps(wt955, postMul77);
wt956 = _mm512_mul_ps(wt956, postMul77);
wt957 = _mm512_mul_ps(wt957, postMul77);
wt958 = _mm512_mul_ps(wt958, postMul77);
wt959 = _mm512_mul_ps(wt959, postMul77);
wt960 = _mm512_mul_ps(wt960, postMul77);
wt961 = _mm512_mul_ps(wt961, postMul77);
wt962 = _mm512_mul_ps(wt962, postMul77);
wt963 = _mm512_mul_ps(wt963, postMul77);
wt964 = _mm512_mul_ps(wt964, postMul77);
wt965 = _mm512_mul_ps(wt965, postMul77);
wt966 = _mm512_mul_ps(wt966, postMul77);
wt967 = _mm512_mul_ps(wt967, postMul77);
wt968 = _mm512_mul_ps(wt968, postMul77);
wt969 = _mm512_mul_ps(wt969, postMul77);
wt970 = _mm512_mul_ps(wt970, postMul77);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)0, 63>>cut34, wt955);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)0, 63>>cut34, wt956);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)0, 63>>cut34, wt957);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c74)+(ptrdiff_t)0, 63>>cut34, wt958);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c74)+(ptrdiff_t)0, 63>>cut34, wt959);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c74)+(ptrdiff_t)0, 63>>cut34, wt960);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c74)+(ptrdiff_t)0, 63>>cut34, wt961);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c74)+(ptrdiff_t)0, 63>>cut34, wt962);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c74)+(ptrdiff_t)0, 63>>cut34, wt963);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c74)+(ptrdiff_t)0, 63>>cut34, wt964);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c74)+(ptrdiff_t)0, 63>>cut34, wt965);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c74)+(ptrdiff_t)0, 63>>cut34, wt966);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c74)+(ptrdiff_t)0, 63>>cut34, wt967);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c74)+(ptrdiff_t)0, 63>>cut34, wt968);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c74)+(ptrdiff_t)0, 63>>cut34, wt969);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c74)+(ptrdiff_t)0, 63>>cut34, wt970);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt955);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt956);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt957);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt958);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt959);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt960);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt961);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt962);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt963);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt964);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt965);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt966);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt967);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt968);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt969);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt970);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt955);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt956);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt957);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt958);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt959);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt960);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt961);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt962);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt963);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt964);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt965);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt966);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt967);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt968);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt969);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt970);
}
__m512 wt971 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)0);
__m512 wt972 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)8192);
__m512 wt973 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)16384);
__m512 wt974 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)24576);
__m512 wt975 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)32768);
__m512 wt976 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)40960);
__m512 wt977 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)49152);
__m512 wt978 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)57344);
__m512 wt979 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)65536);
__m512 wt980 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)73728);
__m512 wt981 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)81920);
__m512 wt982 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)90112);
__m512 wt983 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)98304);
__m512 wt984 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)106496);
__m512 wt985 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)114688);
__m512 wt986 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c74+(ptrdiff_t)122880);
__m512 tmp15971 = _mm512_unpacklo_ps(wt971, wt972);
__m512 tmp15972 = _mm512_unpackhi_ps(wt971, wt972);
__m512 tmp15973 = _mm512_unpacklo_ps(wt973, wt974);
__m512 tmp15974 = _mm512_unpackhi_ps(wt973, wt974);
__m512 tmp15975 = _mm512_unpacklo_ps(wt975, wt976);
__m512 tmp15976 = _mm512_unpackhi_ps(wt975, wt976);
__m512 tmp15977 = _mm512_unpacklo_ps(wt977, wt978);
__m512 tmp15978 = _mm512_unpackhi_ps(wt977, wt978);
__m512 tmp15979 = _mm512_unpacklo_ps(wt979, wt980);
__m512 tmp15980 = _mm512_unpackhi_ps(wt979, wt980);
__m512 tmp15981 = _mm512_unpacklo_ps(wt981, wt982);
__m512 tmp15982 = _mm512_unpackhi_ps(wt981, wt982);
__m512 tmp15983 = _mm512_unpacklo_ps(wt983, wt984);
__m512 tmp15984 = _mm512_unpackhi_ps(wt983, wt984);
__m512 tmp15985 = _mm512_unpacklo_ps(wt985, wt986);
__m512 tmp15986 = _mm512_unpackhi_ps(wt985, wt986);
__m512 tmp15987 = _mm512_shuffle_ps(tmp15971, tmp15973, 68);
__m512 tmp15988 = _mm512_shuffle_ps(tmp15971, tmp15973, 238);
__m512 tmp15989 = _mm512_shuffle_ps(tmp15972, tmp15974, 68);
__m512 tmp15990 = _mm512_shuffle_ps(tmp15975, tmp15977, 68);
__m512 tmp15991 = _mm512_shuffle_ps(tmp15975, tmp15977, 238);
__m512 tmp15992 = _mm512_shuffle_ps(tmp15976, tmp15978, 68);
__m512 tmp15993 = _mm512_shuffle_ps(tmp15979, tmp15981, 68);
__m512 tmp15994 = _mm512_shuffle_ps(tmp15979, tmp15981, 238);
__m512 tmp15995 = _mm512_shuffle_ps(tmp15980, tmp15982, 68);
__m512 tmp15996 = _mm512_shuffle_ps(tmp15983, tmp15985, 68);
__m512 tmp15997 = _mm512_shuffle_ps(tmp15983, tmp15985, 238);
__m512 tmp15998 = _mm512_shuffle_ps(tmp15984, tmp15986, 68);
__m512 tmp15999 = _mm512_shuffle_f32x4(tmp15987, tmp15990, 136);
__m512 tmp16000 = _mm512_shuffle_f32x4(tmp15988, tmp15991, 136);
__m512 tmp16001 = _mm512_shuffle_f32x4(tmp15989, tmp15992, 136);
__m512 tmp16002 = _mm512_shuffle_f32x4(tmp15993, tmp15996, 136);
__m512 tmp16003 = _mm512_shuffle_f32x4(tmp15994, tmp15997, 136);
__m512 tmp16004 = _mm512_shuffle_f32x4(tmp15995, tmp15998, 136);
wt971 = _mm512_shuffle_f32x4(tmp15999, tmp16002, 136);
wt972 = _mm512_shuffle_f32x4(tmp16000, tmp16003, 136);
wt973 = _mm512_shuffle_f32x4(tmp16001, tmp16004, 136);
wt971 = _mm512_mul_ps(wt971, postMul77);
wt972 = _mm512_mul_ps(wt972, postMul77);
wt973 = _mm512_mul_ps(wt973, postMul77);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)0, 63>>cut34, wt971);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)0, 63>>cut34, wt972);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)0, 63>>cut34, wt973);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt971);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt972);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)20040, 4032>>cut34, wt973);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt971);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt972);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c74)+(ptrdiff_t)40080, 65535-(4095>>cut34), wt973);
break;
}
default: {
cut34 = 4;
__m512 sum753;
if (!e44) {
sum753 = _mm512_maskz_loadu_ps(65535, biasPtr24+4096*i89+4*k200);
} else {
sum753 = _mm512_setzero_ps();
}
__m512i pmMul53 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd53 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo45 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k200+1024*i89));
__m512 masHi45 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k200+1024*i89)+(ptrdiff_t)64);
__m512 postMul78 = _mm512_permutex2var_ps(masLo45, pmMul53, masHi45);
__m512 postAdd54 = _mm512_permutex2var_ps(masLo45, pmAdd53, masHi45);
if (!e44) sum753 = _mm512_fmadd_ps(sum753, postMul78, postAdd54);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)0, 63>>cut34, sum753);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)20040, 4032>>cut34, sum753);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)40080, 258048>>cut34, sum753);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*0+(ptrdiff_t)60120, 65535-(262143>>cut34), sum753);
ptrdiff_t c75 = 0;
for (; c75 != 52; ++c75) {
__m512 wt987 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)0);
__m512 wt988 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)8192);
__m512 wt989 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)16384);
__m512 wt990 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)24576);
__m512 wt991 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)32768);
__m512 wt992 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)40960);
__m512 wt993 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)49152);
__m512 wt994 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)57344);
__m512 wt995 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)65536);
__m512 wt996 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)73728);
__m512 wt997 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)81920);
__m512 wt998 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)90112);
__m512 wt999 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)98304);
__m512 wt1000 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)106496);
__m512 wt1001 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)114688);
__m512 wt1002 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)122880);
__m512 tmp16005 = _mm512_unpacklo_ps(wt987, wt988);
__m512 tmp16006 = _mm512_unpackhi_ps(wt987, wt988);
__m512 tmp16007 = _mm512_unpacklo_ps(wt989, wt990);
__m512 tmp16008 = _mm512_unpackhi_ps(wt989, wt990);
__m512 tmp16009 = _mm512_unpacklo_ps(wt991, wt992);
__m512 tmp16010 = _mm512_unpackhi_ps(wt991, wt992);
__m512 tmp16011 = _mm512_unpacklo_ps(wt993, wt994);
__m512 tmp16012 = _mm512_unpackhi_ps(wt993, wt994);
__m512 tmp16013 = _mm512_unpacklo_ps(wt995, wt996);
__m512 tmp16014 = _mm512_unpackhi_ps(wt995, wt996);
__m512 tmp16015 = _mm512_unpacklo_ps(wt997, wt998);
__m512 tmp16016 = _mm512_unpackhi_ps(wt997, wt998);
__m512 tmp16017 = _mm512_unpacklo_ps(wt999, wt1000);
__m512 tmp16018 = _mm512_unpackhi_ps(wt999, wt1000);
__m512 tmp16019 = _mm512_unpacklo_ps(wt1001, wt1002);
__m512 tmp16020 = _mm512_unpackhi_ps(wt1001, wt1002);
__m512 tmp16021 = _mm512_shuffle_ps(tmp16005, tmp16007, 68);
__m512 tmp16022 = _mm512_shuffle_ps(tmp16005, tmp16007, 238);
__m512 tmp16023 = _mm512_shuffle_ps(tmp16006, tmp16008, 68);
__m512 tmp16024 = _mm512_shuffle_ps(tmp16006, tmp16008, 238);
__m512 tmp16025 = _mm512_shuffle_ps(tmp16009, tmp16011, 68);
__m512 tmp16026 = _mm512_shuffle_ps(tmp16009, tmp16011, 238);
__m512 tmp16027 = _mm512_shuffle_ps(tmp16010, tmp16012, 68);
__m512 tmp16028 = _mm512_shuffle_ps(tmp16010, tmp16012, 238);
__m512 tmp16029 = _mm512_shuffle_ps(tmp16013, tmp16015, 68);
__m512 tmp16030 = _mm512_shuffle_ps(tmp16013, tmp16015, 238);
__m512 tmp16031 = _mm512_shuffle_ps(tmp16014, tmp16016, 68);
__m512 tmp16032 = _mm512_shuffle_ps(tmp16014, tmp16016, 238);
__m512 tmp16033 = _mm512_shuffle_ps(tmp16017, tmp16019, 68);
__m512 tmp16034 = _mm512_shuffle_ps(tmp16017, tmp16019, 238);
__m512 tmp16035 = _mm512_shuffle_ps(tmp16018, tmp16020, 68);
__m512 tmp16036 = _mm512_shuffle_ps(tmp16018, tmp16020, 238);
__m512 tmp16037 = _mm512_shuffle_f32x4(tmp16021, tmp16025, 136);
__m512 tmp16038 = _mm512_shuffle_f32x4(tmp16021, tmp16025, 221);
__m512 tmp16039 = _mm512_shuffle_f32x4(tmp16022, tmp16026, 136);
__m512 tmp16040 = _mm512_shuffle_f32x4(tmp16022, tmp16026, 221);
__m512 tmp16041 = _mm512_shuffle_f32x4(tmp16023, tmp16027, 136);
__m512 tmp16042 = _mm512_shuffle_f32x4(tmp16023, tmp16027, 221);
__m512 tmp16043 = _mm512_shuffle_f32x4(tmp16024, tmp16028, 136);
__m512 tmp16044 = _mm512_shuffle_f32x4(tmp16024, tmp16028, 221);
__m512 tmp16045 = _mm512_shuffle_f32x4(tmp16029, tmp16033, 136);
__m512 tmp16046 = _mm512_shuffle_f32x4(tmp16029, tmp16033, 221);
__m512 tmp16047 = _mm512_shuffle_f32x4(tmp16030, tmp16034, 136);
__m512 tmp16048 = _mm512_shuffle_f32x4(tmp16030, tmp16034, 221);
__m512 tmp16049 = _mm512_shuffle_f32x4(tmp16031, tmp16035, 136);
__m512 tmp16050 = _mm512_shuffle_f32x4(tmp16031, tmp16035, 221);
__m512 tmp16051 = _mm512_shuffle_f32x4(tmp16032, tmp16036, 136);
__m512 tmp16052 = _mm512_shuffle_f32x4(tmp16032, tmp16036, 221);
wt987 = _mm512_shuffle_f32x4(tmp16037, tmp16045, 136);
wt995 = _mm512_shuffle_f32x4(tmp16037, tmp16045, 221);
wt988 = _mm512_shuffle_f32x4(tmp16039, tmp16047, 136);
wt996 = _mm512_shuffle_f32x4(tmp16039, tmp16047, 221);
wt989 = _mm512_shuffle_f32x4(tmp16041, tmp16049, 136);
wt997 = _mm512_shuffle_f32x4(tmp16041, tmp16049, 221);
wt990 = _mm512_shuffle_f32x4(tmp16043, tmp16051, 136);
wt998 = _mm512_shuffle_f32x4(tmp16043, tmp16051, 221);
wt991 = _mm512_shuffle_f32x4(tmp16038, tmp16046, 136);
wt999 = _mm512_shuffle_f32x4(tmp16038, tmp16046, 221);
wt992 = _mm512_shuffle_f32x4(tmp16040, tmp16048, 136);
wt1000 = _mm512_shuffle_f32x4(tmp16040, tmp16048, 221);
wt993 = _mm512_shuffle_f32x4(tmp16042, tmp16050, 136);
wt1001 = _mm512_shuffle_f32x4(tmp16042, tmp16050, 221);
wt994 = _mm512_shuffle_f32x4(tmp16044, tmp16052, 136);
wt1002 = _mm512_shuffle_f32x4(tmp16044, tmp16052, 221);
wt987 = _mm512_mul_ps(wt987, postMul78);
wt988 = _mm512_mul_ps(wt988, postMul78);
wt989 = _mm512_mul_ps(wt989, postMul78);
wt990 = _mm512_mul_ps(wt990, postMul78);
wt991 = _mm512_mul_ps(wt991, postMul78);
wt992 = _mm512_mul_ps(wt992, postMul78);
wt993 = _mm512_mul_ps(wt993, postMul78);
wt994 = _mm512_mul_ps(wt994, postMul78);
wt995 = _mm512_mul_ps(wt995, postMul78);
wt996 = _mm512_mul_ps(wt996, postMul78);
wt997 = _mm512_mul_ps(wt997, postMul78);
wt998 = _mm512_mul_ps(wt998, postMul78);
wt999 = _mm512_mul_ps(wt999, postMul78);
wt1000 = _mm512_mul_ps(wt1000, postMul78);
wt1001 = _mm512_mul_ps(wt1001, postMul78);
wt1002 = _mm512_mul_ps(wt1002, postMul78);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)0, 63>>cut34, wt987);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)0, 63>>cut34, wt988);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)0, 63>>cut34, wt989);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c75)+(ptrdiff_t)0, 63>>cut34, wt990);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c75)+(ptrdiff_t)0, 63>>cut34, wt991);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c75)+(ptrdiff_t)0, 63>>cut34, wt992);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c75)+(ptrdiff_t)0, 63>>cut34, wt993);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c75)+(ptrdiff_t)0, 63>>cut34, wt994);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c75)+(ptrdiff_t)0, 63>>cut34, wt995);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c75)+(ptrdiff_t)0, 63>>cut34, wt996);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c75)+(ptrdiff_t)0, 63>>cut34, wt997);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c75)+(ptrdiff_t)0, 63>>cut34, wt998);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c75)+(ptrdiff_t)0, 63>>cut34, wt999);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1000);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1001);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1002);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt987);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt988);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt989);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt990);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt991);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt992);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt993);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt994);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt995);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt996);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt997);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt998);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt999);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1000);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1001);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1002);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt987);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt988);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt989);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt990);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt991);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt992);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt993);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt994);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt995);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt996);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt997);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt998);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt999);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1000);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1001);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1002);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt987);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt988);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt989);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(4+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt990);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(5+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt991);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(6+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt992);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(7+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt993);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(8+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt994);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(9+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt995);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(10+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt996);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(11+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt997);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(12+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt998);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(13+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt999);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(14+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1000);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(15+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1001);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(16+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1002);
}
__m512 wt1003 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)0);
__m512 wt1004 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)8192);
__m512 wt1005 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)16384);
__m512 wt1006 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)24576);
__m512 wt1007 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)32768);
__m512 wt1008 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)40960);
__m512 wt1009 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)49152);
__m512 wt1010 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)57344);
__m512 wt1011 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)65536);
__m512 wt1012 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)73728);
__m512 wt1013 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)81920);
__m512 wt1014 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)90112);
__m512 wt1015 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)98304);
__m512 wt1016 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)106496);
__m512 wt1017 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)114688);
__m512 wt1018 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k200+64*c75+(ptrdiff_t)122880);
__m512 tmp16053 = _mm512_unpacklo_ps(wt1003, wt1004);
__m512 tmp16054 = _mm512_unpackhi_ps(wt1003, wt1004);
__m512 tmp16055 = _mm512_unpacklo_ps(wt1005, wt1006);
__m512 tmp16056 = _mm512_unpackhi_ps(wt1005, wt1006);
__m512 tmp16057 = _mm512_unpacklo_ps(wt1007, wt1008);
__m512 tmp16058 = _mm512_unpackhi_ps(wt1007, wt1008);
__m512 tmp16059 = _mm512_unpacklo_ps(wt1009, wt1010);
__m512 tmp16060 = _mm512_unpackhi_ps(wt1009, wt1010);
__m512 tmp16061 = _mm512_unpacklo_ps(wt1011, wt1012);
__m512 tmp16062 = _mm512_unpackhi_ps(wt1011, wt1012);
__m512 tmp16063 = _mm512_unpacklo_ps(wt1013, wt1014);
__m512 tmp16064 = _mm512_unpackhi_ps(wt1013, wt1014);
__m512 tmp16065 = _mm512_unpacklo_ps(wt1015, wt1016);
__m512 tmp16066 = _mm512_unpackhi_ps(wt1015, wt1016);
__m512 tmp16067 = _mm512_unpacklo_ps(wt1017, wt1018);
__m512 tmp16068 = _mm512_unpackhi_ps(wt1017, wt1018);
__m512 tmp16069 = _mm512_shuffle_ps(tmp16053, tmp16055, 68);
__m512 tmp16070 = _mm512_shuffle_ps(tmp16053, tmp16055, 238);
__m512 tmp16071 = _mm512_shuffle_ps(tmp16054, tmp16056, 68);
__m512 tmp16072 = _mm512_shuffle_ps(tmp16057, tmp16059, 68);
__m512 tmp16073 = _mm512_shuffle_ps(tmp16057, tmp16059, 238);
__m512 tmp16074 = _mm512_shuffle_ps(tmp16058, tmp16060, 68);
__m512 tmp16075 = _mm512_shuffle_ps(tmp16061, tmp16063, 68);
__m512 tmp16076 = _mm512_shuffle_ps(tmp16061, tmp16063, 238);
__m512 tmp16077 = _mm512_shuffle_ps(tmp16062, tmp16064, 68);
__m512 tmp16078 = _mm512_shuffle_ps(tmp16065, tmp16067, 68);
__m512 tmp16079 = _mm512_shuffle_ps(tmp16065, tmp16067, 238);
__m512 tmp16080 = _mm512_shuffle_ps(tmp16066, tmp16068, 68);
__m512 tmp16081 = _mm512_shuffle_f32x4(tmp16069, tmp16072, 136);
__m512 tmp16082 = _mm512_shuffle_f32x4(tmp16070, tmp16073, 136);
__m512 tmp16083 = _mm512_shuffle_f32x4(tmp16071, tmp16074, 136);
__m512 tmp16084 = _mm512_shuffle_f32x4(tmp16075, tmp16078, 136);
__m512 tmp16085 = _mm512_shuffle_f32x4(tmp16076, tmp16079, 136);
__m512 tmp16086 = _mm512_shuffle_f32x4(tmp16077, tmp16080, 136);
wt1003 = _mm512_shuffle_f32x4(tmp16081, tmp16084, 136);
wt1004 = _mm512_shuffle_f32x4(tmp16082, tmp16085, 136);
wt1005 = _mm512_shuffle_f32x4(tmp16083, tmp16086, 136);
wt1003 = _mm512_mul_ps(wt1003, postMul78);
wt1004 = _mm512_mul_ps(wt1004, postMul78);
wt1005 = _mm512_mul_ps(wt1005, postMul78);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1003);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1004);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)0, 63>>cut34, wt1005);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1003);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1004);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)20040, 4032>>cut34, wt1005);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1003);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1004);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)40080, 258048>>cut34, wt1005);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(1+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1003);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(2+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1004);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l87+4*cut34+24*(3+16*c75)+(ptrdiff_t)60120, 65535-(262143>>cut34), wt1005);
}
}
} else {
ptrdiff_t k199 = 1008;
ptrdiff_t l86 = (size_t)(0+k199)/6;
ptrdiff_t cut33 = (size_t)(0+k199)%6;
__m512 sum751;
if (!e44) {
sum751 = _mm512_maskz_loadu_ps(65535, biasPtr24+4096*i89+4*k199);
} else {
sum751 = _mm512_setzero_ps();
}
__m512i pmMul54 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd54 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo46 = _mm512_loadu_ps(bnPtr25+(ptrdiff_t)8*(k199+1024*i89));
__m512 masHi46 = _mm512_maskz_loadu_ps(65535, bnPtr25+(ptrdiff_t)8*(k199+1024*i89)+(ptrdiff_t)64);
__m512 postMul76 = _mm512_permutex2var_ps(masLo46, pmMul54, masHi46);
__m512 postAdd52 = _mm512_permutex2var_ps(masLo46, pmAdd54, masHi46);
if (!e44) sum751 = _mm512_fmadd_ps(sum751, postMul76, postAdd52);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*0+(ptrdiff_t)0, 63>>cut33, sum751);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*0+(ptrdiff_t)20040, 4032>>cut33, sum751);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*0+(ptrdiff_t)40080, 65535-(4095>>cut33), sum751);
ptrdiff_t c73 = 0;
for (; c73 != 52; ++c73) {
__m512 wt923 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)0);
__m512 wt924 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)8192);
__m512 wt925 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)16384);
__m512 wt926 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)24576);
__m512 wt927 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)32768);
__m512 wt928 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)40960);
__m512 wt929 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)49152);
__m512 wt930 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)57344);
__m512 wt931 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)65536);
__m512 wt932 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)73728);
__m512 wt933 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)81920);
__m512 wt934 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)90112);
__m512 wt935 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)98304);
__m512 wt936 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)106496);
__m512 wt937 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)114688);
__m512 wt938 = _mm512_maskz_loadu_ps(65535, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)122880);
__m512 tmp16087 = _mm512_unpacklo_ps(wt923, wt924);
__m512 tmp16088 = _mm512_unpackhi_ps(wt923, wt924);
__m512 tmp16089 = _mm512_unpacklo_ps(wt925, wt926);
__m512 tmp16090 = _mm512_unpackhi_ps(wt925, wt926);
__m512 tmp16091 = _mm512_unpacklo_ps(wt927, wt928);
__m512 tmp16092 = _mm512_unpackhi_ps(wt927, wt928);
__m512 tmp16093 = _mm512_unpacklo_ps(wt929, wt930);
__m512 tmp16094 = _mm512_unpackhi_ps(wt929, wt930);
__m512 tmp16095 = _mm512_unpacklo_ps(wt931, wt932);
__m512 tmp16096 = _mm512_unpackhi_ps(wt931, wt932);
__m512 tmp16097 = _mm512_unpacklo_ps(wt933, wt934);
__m512 tmp16098 = _mm512_unpackhi_ps(wt933, wt934);
__m512 tmp16099 = _mm512_unpacklo_ps(wt935, wt936);
__m512 tmp16100 = _mm512_unpackhi_ps(wt935, wt936);
__m512 tmp16101 = _mm512_unpacklo_ps(wt937, wt938);
__m512 tmp16102 = _mm512_unpackhi_ps(wt937, wt938);
__m512 tmp16103 = _mm512_shuffle_ps(tmp16087, tmp16089, 68);
__m512 tmp16104 = _mm512_shuffle_ps(tmp16087, tmp16089, 238);
__m512 tmp16105 = _mm512_shuffle_ps(tmp16088, tmp16090, 68);
__m512 tmp16106 = _mm512_shuffle_ps(tmp16088, tmp16090, 238);
__m512 tmp16107 = _mm512_shuffle_ps(tmp16091, tmp16093, 68);
__m512 tmp16108 = _mm512_shuffle_ps(tmp16091, tmp16093, 238);
__m512 tmp16109 = _mm512_shuffle_ps(tmp16092, tmp16094, 68);
__m512 tmp16110 = _mm512_shuffle_ps(tmp16092, tmp16094, 238);
__m512 tmp16111 = _mm512_shuffle_ps(tmp16095, tmp16097, 68);
__m512 tmp16112 = _mm512_shuffle_ps(tmp16095, tmp16097, 238);
__m512 tmp16113 = _mm512_shuffle_ps(tmp16096, tmp16098, 68);
__m512 tmp16114 = _mm512_shuffle_ps(tmp16096, tmp16098, 238);
__m512 tmp16115 = _mm512_shuffle_ps(tmp16099, tmp16101, 68);
__m512 tmp16116 = _mm512_shuffle_ps(tmp16099, tmp16101, 238);
__m512 tmp16117 = _mm512_shuffle_ps(tmp16100, tmp16102, 68);
__m512 tmp16118 = _mm512_shuffle_ps(tmp16100, tmp16102, 238);
__m512 tmp16119 = _mm512_shuffle_f32x4(tmp16103, tmp16107, 136);
__m512 tmp16120 = _mm512_shuffle_f32x4(tmp16103, tmp16107, 221);
__m512 tmp16121 = _mm512_shuffle_f32x4(tmp16104, tmp16108, 136);
__m512 tmp16122 = _mm512_shuffle_f32x4(tmp16104, tmp16108, 221);
__m512 tmp16123 = _mm512_shuffle_f32x4(tmp16105, tmp16109, 136);
__m512 tmp16124 = _mm512_shuffle_f32x4(tmp16105, tmp16109, 221);
__m512 tmp16125 = _mm512_shuffle_f32x4(tmp16106, tmp16110, 136);
__m512 tmp16126 = _mm512_shuffle_f32x4(tmp16106, tmp16110, 221);
__m512 tmp16127 = _mm512_shuffle_f32x4(tmp16111, tmp16115, 136);
__m512 tmp16128 = _mm512_shuffle_f32x4(tmp16111, tmp16115, 221);
__m512 tmp16129 = _mm512_shuffle_f32x4(tmp16112, tmp16116, 136);
__m512 tmp16130 = _mm512_shuffle_f32x4(tmp16112, tmp16116, 221);
__m512 tmp16131 = _mm512_shuffle_f32x4(tmp16113, tmp16117, 136);
__m512 tmp16132 = _mm512_shuffle_f32x4(tmp16113, tmp16117, 221);
__m512 tmp16133 = _mm512_shuffle_f32x4(tmp16114, tmp16118, 136);
__m512 tmp16134 = _mm512_shuffle_f32x4(tmp16114, tmp16118, 221);
wt923 = _mm512_shuffle_f32x4(tmp16119, tmp16127, 136);
wt931 = _mm512_shuffle_f32x4(tmp16119, tmp16127, 221);
wt924 = _mm512_shuffle_f32x4(tmp16121, tmp16129, 136);
wt932 = _mm512_shuffle_f32x4(tmp16121, tmp16129, 221);
wt925 = _mm512_shuffle_f32x4(tmp16123, tmp16131, 136);
wt933 = _mm512_shuffle_f32x4(tmp16123, tmp16131, 221);
wt926 = _mm512_shuffle_f32x4(tmp16125, tmp16133, 136);
wt934 = _mm512_shuffle_f32x4(tmp16125, tmp16133, 221);
wt927 = _mm512_shuffle_f32x4(tmp16120, tmp16128, 136);
wt935 = _mm512_shuffle_f32x4(tmp16120, tmp16128, 221);
wt928 = _mm512_shuffle_f32x4(tmp16122, tmp16130, 136);
wt936 = _mm512_shuffle_f32x4(tmp16122, tmp16130, 221);
wt929 = _mm512_shuffle_f32x4(tmp16124, tmp16132, 136);
wt937 = _mm512_shuffle_f32x4(tmp16124, tmp16132, 221);
wt930 = _mm512_shuffle_f32x4(tmp16126, tmp16134, 136);
wt938 = _mm512_shuffle_f32x4(tmp16126, tmp16134, 221);
wt923 = _mm512_mul_ps(wt923, postMul76);
wt924 = _mm512_mul_ps(wt924, postMul76);
wt925 = _mm512_mul_ps(wt925, postMul76);
wt926 = _mm512_mul_ps(wt926, postMul76);
wt927 = _mm512_mul_ps(wt927, postMul76);
wt928 = _mm512_mul_ps(wt928, postMul76);
wt929 = _mm512_mul_ps(wt929, postMul76);
wt930 = _mm512_mul_ps(wt930, postMul76);
wt931 = _mm512_mul_ps(wt931, postMul76);
wt932 = _mm512_mul_ps(wt932, postMul76);
wt933 = _mm512_mul_ps(wt933, postMul76);
wt934 = _mm512_mul_ps(wt934, postMul76);
wt935 = _mm512_mul_ps(wt935, postMul76);
wt936 = _mm512_mul_ps(wt936, postMul76);
wt937 = _mm512_mul_ps(wt937, postMul76);
wt938 = _mm512_mul_ps(wt938, postMul76);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(1+16*c73)+(ptrdiff_t)0, 63>>cut33, wt923);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(2+16*c73)+(ptrdiff_t)0, 63>>cut33, wt924);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(3+16*c73)+(ptrdiff_t)0, 63>>cut33, wt925);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(4+16*c73)+(ptrdiff_t)0, 63>>cut33, wt926);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(5+16*c73)+(ptrdiff_t)0, 63>>cut33, wt927);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(6+16*c73)+(ptrdiff_t)0, 63>>cut33, wt928);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(7+16*c73)+(ptrdiff_t)0, 63>>cut33, wt929);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(8+16*c73)+(ptrdiff_t)0, 63>>cut33, wt930);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(9+16*c73)+(ptrdiff_t)0, 63>>cut33, wt931);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(10+16*c73)+(ptrdiff_t)0, 63>>cut33, wt932);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(11+16*c73)+(ptrdiff_t)0, 63>>cut33, wt933);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(12+16*c73)+(ptrdiff_t)0, 63>>cut33, wt934);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(13+16*c73)+(ptrdiff_t)0, 63>>cut33, wt935);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(14+16*c73)+(ptrdiff_t)0, 63>>cut33, wt936);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(15+16*c73)+(ptrdiff_t)0, 63>>cut33, wt937);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(16+16*c73)+(ptrdiff_t)0, 63>>cut33, wt938);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(1+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt923);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(2+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt924);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(3+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt925);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(4+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt926);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(5+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt927);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(6+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt928);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(7+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt929);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(8+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt930);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(9+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt931);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(10+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt932);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(11+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt933);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(12+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt934);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(13+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt935);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(14+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt936);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(15+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt937);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(16+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt938);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(1+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt923);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(2+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt924);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(3+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt925);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(4+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt926);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(5+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt927);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(6+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt928);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(7+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt929);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(8+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt930);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(9+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt931);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(10+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt932);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(11+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt933);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(12+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt934);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(13+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt935);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(14+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt936);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(15+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt937);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(16+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt938);
}
__m512 wt939 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)0);
__m512 wt940 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)8192);
__m512 wt941 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)16384);
__m512 wt942 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)24576);
__m512 wt943 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)32768);
__m512 wt944 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)40960);
__m512 wt945 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)49152);
__m512 wt946 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)57344);
__m512 wt947 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)65536);
__m512 wt948 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)73728);
__m512 wt949 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)81920);
__m512 wt950 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)90112);
__m512 wt951 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)98304);
__m512 wt952 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)106496);
__m512 wt953 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)114688);
__m512 wt954 = _mm512_maskz_loadu_ps(7, wtPtr24+8388608*i89+8192*k199+64*c73+(ptrdiff_t)122880);
__m512 tmp16135 = _mm512_unpacklo_ps(wt939, wt940);
__m512 tmp16136 = _mm512_unpackhi_ps(wt939, wt940);
__m512 tmp16137 = _mm512_unpacklo_ps(wt941, wt942);
__m512 tmp16138 = _mm512_unpackhi_ps(wt941, wt942);
__m512 tmp16139 = _mm512_unpacklo_ps(wt943, wt944);
__m512 tmp16140 = _mm512_unpackhi_ps(wt943, wt944);
__m512 tmp16141 = _mm512_unpacklo_ps(wt945, wt946);
__m512 tmp16142 = _mm512_unpackhi_ps(wt945, wt946);
__m512 tmp16143 = _mm512_unpacklo_ps(wt947, wt948);
__m512 tmp16144 = _mm512_unpackhi_ps(wt947, wt948);
__m512 tmp16145 = _mm512_unpacklo_ps(wt949, wt950);
__m512 tmp16146 = _mm512_unpackhi_ps(wt949, wt950);
__m512 tmp16147 = _mm512_unpacklo_ps(wt951, wt952);
__m512 tmp16148 = _mm512_unpackhi_ps(wt951, wt952);
__m512 tmp16149 = _mm512_unpacklo_ps(wt953, wt954);
__m512 tmp16150 = _mm512_unpackhi_ps(wt953, wt954);
__m512 tmp16151 = _mm512_shuffle_ps(tmp16135, tmp16137, 68);
__m512 tmp16152 = _mm512_shuffle_ps(tmp16135, tmp16137, 238);
__m512 tmp16153 = _mm512_shuffle_ps(tmp16136, tmp16138, 68);
__m512 tmp16154 = _mm512_shuffle_ps(tmp16139, tmp16141, 68);
__m512 tmp16155 = _mm512_shuffle_ps(tmp16139, tmp16141, 238);
__m512 tmp16156 = _mm512_shuffle_ps(tmp16140, tmp16142, 68);
__m512 tmp16157 = _mm512_shuffle_ps(tmp16143, tmp16145, 68);
__m512 tmp16158 = _mm512_shuffle_ps(tmp16143, tmp16145, 238);
__m512 tmp16159 = _mm512_shuffle_ps(tmp16144, tmp16146, 68);
__m512 tmp16160 = _mm512_shuffle_ps(tmp16147, tmp16149, 68);
__m512 tmp16161 = _mm512_shuffle_ps(tmp16147, tmp16149, 238);
__m512 tmp16162 = _mm512_shuffle_ps(tmp16148, tmp16150, 68);
__m512 tmp16163 = _mm512_shuffle_f32x4(tmp16151, tmp16154, 136);
__m512 tmp16164 = _mm512_shuffle_f32x4(tmp16152, tmp16155, 136);
__m512 tmp16165 = _mm512_shuffle_f32x4(tmp16153, tmp16156, 136);
__m512 tmp16166 = _mm512_shuffle_f32x4(tmp16157, tmp16160, 136);
__m512 tmp16167 = _mm512_shuffle_f32x4(tmp16158, tmp16161, 136);
__m512 tmp16168 = _mm512_shuffle_f32x4(tmp16159, tmp16162, 136);
wt939 = _mm512_shuffle_f32x4(tmp16163, tmp16166, 136);
wt940 = _mm512_shuffle_f32x4(tmp16164, tmp16167, 136);
wt941 = _mm512_shuffle_f32x4(tmp16165, tmp16168, 136);
wt939 = _mm512_mul_ps(wt939, postMul76);
wt940 = _mm512_mul_ps(wt940, postMul76);
wt941 = _mm512_mul_ps(wt941, postMul76);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(1+16*c73)+(ptrdiff_t)0, 63>>cut33, wt939);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(2+16*c73)+(ptrdiff_t)0, 63>>cut33, wt940);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(3+16*c73)+(ptrdiff_t)0, 63>>cut33, wt941);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(1+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt939);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(2+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt940);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+24*(3+16*c73)+(ptrdiff_t)20040, 4032>>cut33, wt941);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(1+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt939);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(2+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt940);
_mm512_mask_storeu_ps(arranged29+3424256*i89+20064*l86+4*cut33+16*(3+16*c73)+(ptrdiff_t)40080, 65535-(4095>>cut33), wt941);
}
}
}
return;
}
char*restrict wtPtr25 = tensors152[0]+(ptrdiff_t)3340*2+(ptrdiff_t)8388608*0;
char*restrict bnPtr26 = tensors152[2]+(ptrdiff_t)8*1024*0;
char*restrict arranged30 = tensors152[3]+(ptrdiff_t)3424256*2+(ptrdiff_t)1552384*0;
ptrdiff_t ii67 = 1;
for (ptrdiff_t i90 = 0; i90 < ii67; ++i90) {
ptrdiff_t j81 = 1*b91;
ptrdiff_t jj70 = j81+1;
for (; j81 < jj70; ++j81) {
if (j81 < 63) {
ptrdiff_t k202 = 0+16*(j81-0);
ptrdiff_t l89 = (size_t)(0+k202)/6;
ptrdiff_t cut36 = (size_t)(0+k202)%6;
switch (cut36) {
case 0:;
case 2: {
__m512 sum755 = _mm512_setzero_ps();
__m512i pmMul55 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd55 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo47 = _mm512_loadu_ps(bnPtr26+(ptrdiff_t)8*(k202+1024*i90));
__m512 masHi47 = _mm512_maskz_loadu_ps(65535, bnPtr26+(ptrdiff_t)8*(k202+1024*i90)+(ptrdiff_t)64);
__m512 postMul80 = _mm512_permutex2var_ps(masLo47, pmMul55, masHi47);
__m512 postAdd56 = _mm512_permutex2var_ps(masLo47, pmAdd55, masHi47);
(void)postAdd56;
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)0, 63>>cut36, sum755);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)9072, 4032>>cut36, sum755);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)18144, 65535-(4095>>cut36), sum755);
ptrdiff_t c77 = 0;
for (; c77 != 23; ++c77) {
__m512 wt1051 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)0);
__m512 wt1052 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)8192);
__m512 wt1053 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)16384);
__m512 wt1054 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)24576);
__m512 wt1055 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)32768);
__m512 wt1056 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)40960);
__m512 wt1057 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)49152);
__m512 wt1058 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)57344);
__m512 wt1059 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)65536);
__m512 wt1060 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)73728);
__m512 wt1061 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)81920);
__m512 wt1062 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)90112);
__m512 wt1063 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)98304);
__m512 wt1064 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)106496);
__m512 wt1065 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)114688);
__m512 wt1066 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)122880);
__m512 tmp16169 = _mm512_unpacklo_ps(wt1051, wt1052);
__m512 tmp16170 = _mm512_unpackhi_ps(wt1051, wt1052);
__m512 tmp16171 = _mm512_unpacklo_ps(wt1053, wt1054);
__m512 tmp16172 = _mm512_unpackhi_ps(wt1053, wt1054);
__m512 tmp16173 = _mm512_unpacklo_ps(wt1055, wt1056);
__m512 tmp16174 = _mm512_unpackhi_ps(wt1055, wt1056);
__m512 tmp16175 = _mm512_unpacklo_ps(wt1057, wt1058);
__m512 tmp16176 = _mm512_unpackhi_ps(wt1057, wt1058);
__m512 tmp16177 = _mm512_unpacklo_ps(wt1059, wt1060);
__m512 tmp16178 = _mm512_unpackhi_ps(wt1059, wt1060);
__m512 tmp16179 = _mm512_unpacklo_ps(wt1061, wt1062);
__m512 tmp16180 = _mm512_unpackhi_ps(wt1061, wt1062);
__m512 tmp16181 = _mm512_unpacklo_ps(wt1063, wt1064);
__m512 tmp16182 = _mm512_unpackhi_ps(wt1063, wt1064);
__m512 tmp16183 = _mm512_unpacklo_ps(wt1065, wt1066);
__m512 tmp16184 = _mm512_unpackhi_ps(wt1065, wt1066);
__m512 tmp16185 = _mm512_shuffle_ps(tmp16169, tmp16171, 68);
__m512 tmp16186 = _mm512_shuffle_ps(tmp16169, tmp16171, 238);
__m512 tmp16187 = _mm512_shuffle_ps(tmp16170, tmp16172, 68);
__m512 tmp16188 = _mm512_shuffle_ps(tmp16170, tmp16172, 238);
__m512 tmp16189 = _mm512_shuffle_ps(tmp16173, tmp16175, 68);
__m512 tmp16190 = _mm512_shuffle_ps(tmp16173, tmp16175, 238);
__m512 tmp16191 = _mm512_shuffle_ps(tmp16174, tmp16176, 68);
__m512 tmp16192 = _mm512_shuffle_ps(tmp16174, tmp16176, 238);
__m512 tmp16193 = _mm512_shuffle_ps(tmp16177, tmp16179, 68);
__m512 tmp16194 = _mm512_shuffle_ps(tmp16177, tmp16179, 238);
__m512 tmp16195 = _mm512_shuffle_ps(tmp16178, tmp16180, 68);
__m512 tmp16196 = _mm512_shuffle_ps(tmp16178, tmp16180, 238);
__m512 tmp16197 = _mm512_shuffle_ps(tmp16181, tmp16183, 68);
__m512 tmp16198 = _mm512_shuffle_ps(tmp16181, tmp16183, 238);
__m512 tmp16199 = _mm512_shuffle_ps(tmp16182, tmp16184, 68);
__m512 tmp16200 = _mm512_shuffle_ps(tmp16182, tmp16184, 238);
__m512 tmp16201 = _mm512_shuffle_f32x4(tmp16185, tmp16189, 136);
__m512 tmp16202 = _mm512_shuffle_f32x4(tmp16185, tmp16189, 221);
__m512 tmp16203 = _mm512_shuffle_f32x4(tmp16186, tmp16190, 136);
__m512 tmp16204 = _mm512_shuffle_f32x4(tmp16186, tmp16190, 221);
__m512 tmp16205 = _mm512_shuffle_f32x4(tmp16187, tmp16191, 136);
__m512 tmp16206 = _mm512_shuffle_f32x4(tmp16187, tmp16191, 221);
__m512 tmp16207 = _mm512_shuffle_f32x4(tmp16188, tmp16192, 136);
__m512 tmp16208 = _mm512_shuffle_f32x4(tmp16188, tmp16192, 221);
__m512 tmp16209 = _mm512_shuffle_f32x4(tmp16193, tmp16197, 136);
__m512 tmp16210 = _mm512_shuffle_f32x4(tmp16193, tmp16197, 221);
__m512 tmp16211 = _mm512_shuffle_f32x4(tmp16194, tmp16198, 136);
__m512 tmp16212 = _mm512_shuffle_f32x4(tmp16194, tmp16198, 221);
__m512 tmp16213 = _mm512_shuffle_f32x4(tmp16195, tmp16199, 136);
__m512 tmp16214 = _mm512_shuffle_f32x4(tmp16195, tmp16199, 221);
__m512 tmp16215 = _mm512_shuffle_f32x4(tmp16196, tmp16200, 136);
__m512 tmp16216 = _mm512_shuffle_f32x4(tmp16196, tmp16200, 221);
wt1051 = _mm512_shuffle_f32x4(tmp16201, tmp16209, 136);
wt1059 = _mm512_shuffle_f32x4(tmp16201, tmp16209, 221);
wt1052 = _mm512_shuffle_f32x4(tmp16203, tmp16211, 136);
wt1060 = _mm512_shuffle_f32x4(tmp16203, tmp16211, 221);
wt1053 = _mm512_shuffle_f32x4(tmp16205, tmp16213, 136);
wt1061 = _mm512_shuffle_f32x4(tmp16205, tmp16213, 221);
wt1054 = _mm512_shuffle_f32x4(tmp16207, tmp16215, 136);
wt1062 = _mm512_shuffle_f32x4(tmp16207, tmp16215, 221);
wt1055 = _mm512_shuffle_f32x4(tmp16202, tmp16210, 136);
wt1063 = _mm512_shuffle_f32x4(tmp16202, tmp16210, 221);
wt1056 = _mm512_shuffle_f32x4(tmp16204, tmp16212, 136);
wt1064 = _mm512_shuffle_f32x4(tmp16204, tmp16212, 221);
wt1057 = _mm512_shuffle_f32x4(tmp16206, tmp16214, 136);
wt1065 = _mm512_shuffle_f32x4(tmp16206, tmp16214, 221);
wt1058 = _mm512_shuffle_f32x4(tmp16208, tmp16216, 136);
wt1066 = _mm512_shuffle_f32x4(tmp16208, tmp16216, 221);
wt1051 = _mm512_mul_ps(wt1051, postMul80);
wt1052 = _mm512_mul_ps(wt1052, postMul80);
wt1053 = _mm512_mul_ps(wt1053, postMul80);
wt1054 = _mm512_mul_ps(wt1054, postMul80);
wt1055 = _mm512_mul_ps(wt1055, postMul80);
wt1056 = _mm512_mul_ps(wt1056, postMul80);
wt1057 = _mm512_mul_ps(wt1057, postMul80);
wt1058 = _mm512_mul_ps(wt1058, postMul80);
wt1059 = _mm512_mul_ps(wt1059, postMul80);
wt1060 = _mm512_mul_ps(wt1060, postMul80);
wt1061 = _mm512_mul_ps(wt1061, postMul80);
wt1062 = _mm512_mul_ps(wt1062, postMul80);
wt1063 = _mm512_mul_ps(wt1063, postMul80);
wt1064 = _mm512_mul_ps(wt1064, postMul80);
wt1065 = _mm512_mul_ps(wt1065, postMul80);
wt1066 = _mm512_mul_ps(wt1066, postMul80);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1051);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1052);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1053);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1054);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1055);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1056);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1057);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1058);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1059);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1060);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1061);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1062);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1063);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1064);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1065);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1066);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1051);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1052);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1053);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1054);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1055);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1056);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1057);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1058);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1059);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1060);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1061);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1062);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1063);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1064);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1065);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1066);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1051);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1052);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1053);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1054);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1055);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1056);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1057);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1058);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1059);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1060);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1061);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1062);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1063);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1064);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1065);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1066);
}
// Remainder step of this case: mask 1023 (0x3FF) loads only lanes 0..9 of each of
// the 16 source rows and zeroes lanes 10..15. Rows sit 8192 apart and the column
// offset is 64*c77, in units of wtPtr25's element type (declared outside this view).
// c77 is the counter of the loop that closes just above.
__m512 wt1067 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)0);
__m512 wt1068 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)8192);
__m512 wt1069 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)16384);
__m512 wt1070 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)24576);
__m512 wt1071 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)32768);
__m512 wt1072 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)40960);
__m512 wt1073 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)49152);
__m512 wt1074 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)57344);
__m512 wt1075 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)65536);
__m512 wt1076 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)73728);
__m512 wt1077 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)81920);
__m512 wt1078 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)90112);
__m512 wt1079 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)98304);
__m512 wt1080 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)106496);
__m512 wt1081 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)114688);
__m512 wt1082 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c77+(ptrdiff_t)122880);
// 16x16 transpose, stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp16217 = _mm512_unpacklo_ps(wt1067, wt1068);
__m512 tmp16218 = _mm512_unpackhi_ps(wt1067, wt1068);
__m512 tmp16219 = _mm512_unpacklo_ps(wt1069, wt1070);
__m512 tmp16220 = _mm512_unpackhi_ps(wt1069, wt1070);
__m512 tmp16221 = _mm512_unpacklo_ps(wt1071, wt1072);
__m512 tmp16222 = _mm512_unpackhi_ps(wt1071, wt1072);
__m512 tmp16223 = _mm512_unpacklo_ps(wt1073, wt1074);
__m512 tmp16224 = _mm512_unpackhi_ps(wt1073, wt1074);
__m512 tmp16225 = _mm512_unpacklo_ps(wt1075, wt1076);
__m512 tmp16226 = _mm512_unpackhi_ps(wt1075, wt1076);
__m512 tmp16227 = _mm512_unpacklo_ps(wt1077, wt1078);
__m512 tmp16228 = _mm512_unpackhi_ps(wt1077, wt1078);
__m512 tmp16229 = _mm512_unpacklo_ps(wt1079, wt1080);
__m512 tmp16230 = _mm512_unpackhi_ps(wt1079, wt1080);
__m512 tmp16231 = _mm512_unpacklo_ps(wt1081, wt1082);
__m512 tmp16232 = _mm512_unpackhi_ps(wt1081, wt1082);
// Stage 2: selector 68 takes the low pair of both operands, 238 the high pair, so
// each 128-bit lane now holds one source column for a group of four rows.
__m512 tmp16233 = _mm512_shuffle_ps(tmp16217, tmp16219, 68);
__m512 tmp16234 = _mm512_shuffle_ps(tmp16217, tmp16219, 238);
__m512 tmp16235 = _mm512_shuffle_ps(tmp16218, tmp16220, 68);
__m512 tmp16236 = _mm512_shuffle_ps(tmp16218, tmp16220, 238);
__m512 tmp16237 = _mm512_shuffle_ps(tmp16221, tmp16223, 68);
__m512 tmp16238 = _mm512_shuffle_ps(tmp16221, tmp16223, 238);
__m512 tmp16239 = _mm512_shuffle_ps(tmp16222, tmp16224, 68);
__m512 tmp16240 = _mm512_shuffle_ps(tmp16222, tmp16224, 238);
__m512 tmp16241 = _mm512_shuffle_ps(tmp16225, tmp16227, 68);
__m512 tmp16242 = _mm512_shuffle_ps(tmp16225, tmp16227, 238);
__m512 tmp16243 = _mm512_shuffle_ps(tmp16226, tmp16228, 68);
__m512 tmp16244 = _mm512_shuffle_ps(tmp16226, tmp16228, 238);
__m512 tmp16245 = _mm512_shuffle_ps(tmp16229, tmp16231, 68);
__m512 tmp16246 = _mm512_shuffle_ps(tmp16229, tmp16231, 238);
__m512 tmp16247 = _mm512_shuffle_ps(tmp16230, tmp16232, 68);
__m512 tmp16248 = _mm512_shuffle_ps(tmp16230, tmp16232, 238);
// Stage 3: regroup whole 128-bit lanes. Selector 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3.
__m512 tmp16249 = _mm512_shuffle_f32x4(tmp16233, tmp16237, 136);
__m512 tmp16250 = _mm512_shuffle_f32x4(tmp16233, tmp16237, 221);
__m512 tmp16251 = _mm512_shuffle_f32x4(tmp16234, tmp16238, 136);
__m512 tmp16252 = _mm512_shuffle_f32x4(tmp16234, tmp16238, 221);
__m512 tmp16253 = _mm512_shuffle_f32x4(tmp16235, tmp16239, 136);
__m512 tmp16254 = _mm512_shuffle_f32x4(tmp16235, tmp16239, 221);
__m512 tmp16255 = _mm512_shuffle_f32x4(tmp16236, tmp16240, 136);
__m512 tmp16256 = _mm512_shuffle_f32x4(tmp16236, tmp16240, 221);
__m512 tmp16257 = _mm512_shuffle_f32x4(tmp16241, tmp16245, 136);
__m512 tmp16258 = _mm512_shuffle_f32x4(tmp16241, tmp16245, 221);
__m512 tmp16259 = _mm512_shuffle_f32x4(tmp16242, tmp16246, 136);
__m512 tmp16260 = _mm512_shuffle_f32x4(tmp16242, tmp16246, 221);
__m512 tmp16261 = _mm512_shuffle_f32x4(tmp16243, tmp16247, 136);
__m512 tmp16262 = _mm512_shuffle_f32x4(tmp16243, tmp16247, 221);
__m512 tmp16263 = _mm512_shuffle_f32x4(tmp16244, tmp16248, 136);
__m512 tmp16264 = _mm512_shuffle_f32x4(tmp16244, tmp16248, 221);
// Stage 4: final lane regroup. Only source columns 0..9 are materialised, because
// columns 10..15 were zeroed by the masked loads and are never stored. Result:
// wt1067..wt1074 hold columns 0..7, wt1075 column 8, wt1076 column 9, each with one
// element per source row.
wt1067 = _mm512_shuffle_f32x4(tmp16249, tmp16257, 136);
wt1075 = _mm512_shuffle_f32x4(tmp16249, tmp16257, 221);
wt1068 = _mm512_shuffle_f32x4(tmp16251, tmp16259, 136);
wt1076 = _mm512_shuffle_f32x4(tmp16251, tmp16259, 221);
wt1069 = _mm512_shuffle_f32x4(tmp16253, tmp16261, 136);
wt1070 = _mm512_shuffle_f32x4(tmp16255, tmp16263, 136);
wt1071 = _mm512_shuffle_f32x4(tmp16250, tmp16258, 136);
wt1072 = _mm512_shuffle_f32x4(tmp16252, tmp16260, 136);
wt1073 = _mm512_shuffle_f32x4(tmp16254, tmp16262, 136);
wt1074 = _mm512_shuffle_f32x4(tmp16256, tmp16264, 136);
// Lane-wise scaling by postMul80 (computed outside this view). After the transpose,
// lane r of every vector belongs to source row r, so each row gets its own factor.
wt1067 = _mm512_mul_ps(wt1067, postMul80);
wt1068 = _mm512_mul_ps(wt1068, postMul80);
wt1069 = _mm512_mul_ps(wt1069, postMul80);
wt1070 = _mm512_mul_ps(wt1070, postMul80);
wt1071 = _mm512_mul_ps(wt1071, postMul80);
wt1072 = _mm512_mul_ps(wt1072, postMul80);
wt1073 = _mm512_mul_ps(wt1073, postMul80);
wt1074 = _mm512_mul_ps(wt1074, postMul80);
wt1075 = _mm512_mul_ps(wt1075, postMul80);
wt1076 = _mm512_mul_ps(wt1076, postMul80);
// Each vector is written three times, at extra offsets 0, 9072 and 18144, into slot
// 24*(j+16*c77) for j = 1..10. The masks split the 16 lanes between the three
// destinations: 63>>cut36 keeps the lowest lanes, 4032>>cut36 (0xFC0 shifted) the
// next six, and 65535-(4095>>cut36) all remaining high lanes. For a shift of 0..6
// the three masks are disjoint and together cover every lane. cut36 is assigned for
// this case outside this view.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1067);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1068);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1069);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1070);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1071);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1072);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1073);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1074);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1075);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)0, 63>>cut36, wt1076);
// Second destination (+9072): middle lane range.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1067);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1068);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1069);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1070);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1071);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1072);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1073);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1074);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1075);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)9072, 4032>>cut36, wt1076);
// Third destination (+18144): all remaining high lanes.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1067);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1068);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1069);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1070);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1071);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1072);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1073);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1074);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1075);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c77)+(ptrdiff_t)18144, 65535-(4095>>cut36), wt1076);
break;
}
default: {
// Default case setup. With cut36 == 4 the four store masks used in this case
// evaluate to 3 (lanes 0..1), 252 (lanes 2..7), 16128 (lanes 8..13) and
// 49152 (lanes 14..15).
cut36 = 4;
__m512 sum756 = _mm512_setzero_ps();
// _mm512_set_epi32 lists its arguments from lane 15 down to lane 0, so lane i of
// pmMul56 holds 2*i and lane i of pmAdd56 holds 2*i+1.
__m512i pmMul56 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd56 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Load 32 consecutive floats from bnPtr26 (mask 65535 enables all 16 lanes of the
// second load). NOTE(review): the even/odd split below suggests interleaved
// (multiplier, addend) pairs; confirm against the code that fills bnPtr26.
__m512 masLo48 = _mm512_loadu_ps(bnPtr26+(ptrdiff_t)8*(k202+1024*i90));
__m512 masHi48 = _mm512_maskz_loadu_ps(65535, bnPtr26+(ptrdiff_t)8*(k202+1024*i90)+(ptrdiff_t)64);
// The two-source permute indexes the 32 floats of masLo48:masHi48.
// postMul81 takes the even-indexed elements, postAdd57 the odd-indexed ones.
__m512 postMul81 = _mm512_permutex2var_ps(masLo48, pmMul56, masHi48);
__m512 postAdd57 = _mm512_permutex2var_ps(masLo48, pmAdd56, masHi48);
// postAdd57 is not used in the visible part of this case; the cast marks it as
// intentionally unused.
(void)postAdd57;
// Write zeros into slot 0 (24*0) of the four destinations at extra offsets 0, 9072,
// 18144 and 27216, using the same lane split as the data stores that follow.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)0, 63>>cut36, sum756);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)9072, 4032>>cut36, sum756);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)18144, 258048>>cut36, sum756);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*0+(ptrdiff_t)27216, 65535-(262143>>cut36), sum756);
// Declared outside the loop because the remainder code after the loop reads the
// final value of c78.
ptrdiff_t c78 = 0;
// Main loop of the default case: 23 iterations, each handling a 16x16 tile of floats.
// Per iteration: load 16 rows, transpose, scale by postMul81, then scatter each
// transposed vector across four destinations with masked stores.
for (; c78 != 23; ++c78) {
// Full-width loads (mask 65535 enables all 16 lanes). Rows sit 8192 apart and the
// column offset advances by 64 per iteration, in units of wtPtr25's element type
// (declared outside this view).
__m512 wt1083 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)0);
__m512 wt1084 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)8192);
__m512 wt1085 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)16384);
__m512 wt1086 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)24576);
__m512 wt1087 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)32768);
__m512 wt1088 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)40960);
__m512 wt1089 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)49152);
__m512 wt1090 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)57344);
__m512 wt1091 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)65536);
__m512 wt1092 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)73728);
__m512 wt1093 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)81920);
__m512 wt1094 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)90112);
__m512 wt1095 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)98304);
__m512 wt1096 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)106496);
__m512 wt1097 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)114688);
__m512 wt1098 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)122880);
// 16x16 transpose, stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp16265 = _mm512_unpacklo_ps(wt1083, wt1084);
__m512 tmp16266 = _mm512_unpackhi_ps(wt1083, wt1084);
__m512 tmp16267 = _mm512_unpacklo_ps(wt1085, wt1086);
__m512 tmp16268 = _mm512_unpackhi_ps(wt1085, wt1086);
__m512 tmp16269 = _mm512_unpacklo_ps(wt1087, wt1088);
__m512 tmp16270 = _mm512_unpackhi_ps(wt1087, wt1088);
__m512 tmp16271 = _mm512_unpacklo_ps(wt1089, wt1090);
__m512 tmp16272 = _mm512_unpackhi_ps(wt1089, wt1090);
__m512 tmp16273 = _mm512_unpacklo_ps(wt1091, wt1092);
__m512 tmp16274 = _mm512_unpackhi_ps(wt1091, wt1092);
__m512 tmp16275 = _mm512_unpacklo_ps(wt1093, wt1094);
__m512 tmp16276 = _mm512_unpackhi_ps(wt1093, wt1094);
__m512 tmp16277 = _mm512_unpacklo_ps(wt1095, wt1096);
__m512 tmp16278 = _mm512_unpackhi_ps(wt1095, wt1096);
__m512 tmp16279 = _mm512_unpacklo_ps(wt1097, wt1098);
__m512 tmp16280 = _mm512_unpackhi_ps(wt1097, wt1098);
// Stage 2: selector 68 takes the low pair of both operands, 238 the high pair, so
// each 128-bit lane now holds one source column for a group of four rows.
__m512 tmp16281 = _mm512_shuffle_ps(tmp16265, tmp16267, 68);
__m512 tmp16282 = _mm512_shuffle_ps(tmp16265, tmp16267, 238);
__m512 tmp16283 = _mm512_shuffle_ps(tmp16266, tmp16268, 68);
__m512 tmp16284 = _mm512_shuffle_ps(tmp16266, tmp16268, 238);
__m512 tmp16285 = _mm512_shuffle_ps(tmp16269, tmp16271, 68);
__m512 tmp16286 = _mm512_shuffle_ps(tmp16269, tmp16271, 238);
__m512 tmp16287 = _mm512_shuffle_ps(tmp16270, tmp16272, 68);
__m512 tmp16288 = _mm512_shuffle_ps(tmp16270, tmp16272, 238);
__m512 tmp16289 = _mm512_shuffle_ps(tmp16273, tmp16275, 68);
__m512 tmp16290 = _mm512_shuffle_ps(tmp16273, tmp16275, 238);
__m512 tmp16291 = _mm512_shuffle_ps(tmp16274, tmp16276, 68);
__m512 tmp16292 = _mm512_shuffle_ps(tmp16274, tmp16276, 238);
__m512 tmp16293 = _mm512_shuffle_ps(tmp16277, tmp16279, 68);
__m512 tmp16294 = _mm512_shuffle_ps(tmp16277, tmp16279, 238);
__m512 tmp16295 = _mm512_shuffle_ps(tmp16278, tmp16280, 68);
__m512 tmp16296 = _mm512_shuffle_ps(tmp16278, tmp16280, 238);
// Stage 3: regroup whole 128-bit lanes. Selector 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3.
__m512 tmp16297 = _mm512_shuffle_f32x4(tmp16281, tmp16285, 136);
__m512 tmp16298 = _mm512_shuffle_f32x4(tmp16281, tmp16285, 221);
__m512 tmp16299 = _mm512_shuffle_f32x4(tmp16282, tmp16286, 136);
__m512 tmp16300 = _mm512_shuffle_f32x4(tmp16282, tmp16286, 221);
__m512 tmp16301 = _mm512_shuffle_f32x4(tmp16283, tmp16287, 136);
__m512 tmp16302 = _mm512_shuffle_f32x4(tmp16283, tmp16287, 221);
__m512 tmp16303 = _mm512_shuffle_f32x4(tmp16284, tmp16288, 136);
__m512 tmp16304 = _mm512_shuffle_f32x4(tmp16284, tmp16288, 221);
__m512 tmp16305 = _mm512_shuffle_f32x4(tmp16289, tmp16293, 136);
__m512 tmp16306 = _mm512_shuffle_f32x4(tmp16289, tmp16293, 221);
__m512 tmp16307 = _mm512_shuffle_f32x4(tmp16290, tmp16294, 136);
__m512 tmp16308 = _mm512_shuffle_f32x4(tmp16290, tmp16294, 221);
__m512 tmp16309 = _mm512_shuffle_f32x4(tmp16291, tmp16295, 136);
__m512 tmp16310 = _mm512_shuffle_f32x4(tmp16291, tmp16295, 221);
__m512 tmp16311 = _mm512_shuffle_f32x4(tmp16292, tmp16296, 136);
__m512 tmp16312 = _mm512_shuffle_f32x4(tmp16292, tmp16296, 221);
// Stage 4: final lane regroup. wt1083..wt1098 are reused for the result: wt1083+j
// now holds source column j (j = 0..15), with one element per source row.
wt1083 = _mm512_shuffle_f32x4(tmp16297, tmp16305, 136);
wt1091 = _mm512_shuffle_f32x4(tmp16297, tmp16305, 221);
wt1084 = _mm512_shuffle_f32x4(tmp16299, tmp16307, 136);
wt1092 = _mm512_shuffle_f32x4(tmp16299, tmp16307, 221);
wt1085 = _mm512_shuffle_f32x4(tmp16301, tmp16309, 136);
wt1093 = _mm512_shuffle_f32x4(tmp16301, tmp16309, 221);
wt1086 = _mm512_shuffle_f32x4(tmp16303, tmp16311, 136);
wt1094 = _mm512_shuffle_f32x4(tmp16303, tmp16311, 221);
wt1087 = _mm512_shuffle_f32x4(tmp16298, tmp16306, 136);
wt1095 = _mm512_shuffle_f32x4(tmp16298, tmp16306, 221);
wt1088 = _mm512_shuffle_f32x4(tmp16300, tmp16308, 136);
wt1096 = _mm512_shuffle_f32x4(tmp16300, tmp16308, 221);
wt1089 = _mm512_shuffle_f32x4(tmp16302, tmp16310, 136);
wt1097 = _mm512_shuffle_f32x4(tmp16302, tmp16310, 221);
wt1090 = _mm512_shuffle_f32x4(tmp16304, tmp16312, 136);
wt1098 = _mm512_shuffle_f32x4(tmp16304, tmp16312, 221);
// Lane-wise scaling by postMul81. After the transpose, lane r of every vector
// belongs to source row r, so each row gets its own factor.
wt1083 = _mm512_mul_ps(wt1083, postMul81);
wt1084 = _mm512_mul_ps(wt1084, postMul81);
wt1085 = _mm512_mul_ps(wt1085, postMul81);
wt1086 = _mm512_mul_ps(wt1086, postMul81);
wt1087 = _mm512_mul_ps(wt1087, postMul81);
wt1088 = _mm512_mul_ps(wt1088, postMul81);
wt1089 = _mm512_mul_ps(wt1089, postMul81);
wt1090 = _mm512_mul_ps(wt1090, postMul81);
wt1091 = _mm512_mul_ps(wt1091, postMul81);
wt1092 = _mm512_mul_ps(wt1092, postMul81);
wt1093 = _mm512_mul_ps(wt1093, postMul81);
wt1094 = _mm512_mul_ps(wt1094, postMul81);
wt1095 = _mm512_mul_ps(wt1095, postMul81);
wt1096 = _mm512_mul_ps(wt1096, postMul81);
wt1097 = _mm512_mul_ps(wt1097, postMul81);
wt1098 = _mm512_mul_ps(wt1098, postMul81);
// Scatter: each vector goes to slot 24*(j+16*c78), j = 1..16, of four destinations
// at extra offsets 0, 9072, 18144 and 27216. With cut36 == 4 (assigned just before
// this loop) the masks evaluate to 3, 252, 16128 and 49152, i.e. lanes 0..1, 2..7,
// 8..13 and 14..15, so every lane is written exactly once.
// First destination (+0): lanes selected by 63>>cut36.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1083);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1084);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1085);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1086);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1087);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1088);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1089);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1090);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1091);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1092);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1093);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1094);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1095);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1096);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1097);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1098);
// Second destination (+9072): lanes selected by 4032>>cut36.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1083);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1084);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1085);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1086);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1087);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1088);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1089);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1090);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1091);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1092);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1093);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1094);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1095);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1096);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1097);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1098);
// Third destination (+18144): lanes selected by 258048>>cut36.
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1083);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1084);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1085);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1086);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1087);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1088);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1089);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1090);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1091);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1092);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1093);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1094);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1095);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1096);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1097);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1098);
// Fourth destination (+27216): all remaining high lanes, 65535-(262143>>cut36).
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1083);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1084);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1085);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1086);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1087);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1088);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1089);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1090);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1091);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1092);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(11+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1093);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(12+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1094);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(13+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1095);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(14+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1096);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(15+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1097);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(16+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1098);
}
__m512 wt1099 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)0);
__m512 wt1100 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)8192);
__m512 wt1101 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)16384);
__m512 wt1102 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)24576);
__m512 wt1103 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)32768);
__m512 wt1104 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)40960);
__m512 wt1105 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)49152);
__m512 wt1106 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)57344);
__m512 wt1107 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)65536);
__m512 wt1108 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)73728);
__m512 wt1109 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)81920);
__m512 wt1110 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)90112);
__m512 wt1111 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)98304);
__m512 wt1112 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)106496);
__m512 wt1113 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)114688);
__m512 wt1114 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k202+64*c78+(ptrdiff_t)122880);
__m512 tmp16313 = _mm512_unpacklo_ps(wt1099, wt1100);
__m512 tmp16314 = _mm512_unpackhi_ps(wt1099, wt1100);
__m512 tmp16315 = _mm512_unpacklo_ps(wt1101, wt1102);
__m512 tmp16316 = _mm512_unpackhi_ps(wt1101, wt1102);
__m512 tmp16317 = _mm512_unpacklo_ps(wt1103, wt1104);
__m512 tmp16318 = _mm512_unpackhi_ps(wt1103, wt1104);
__m512 tmp16319 = _mm512_unpacklo_ps(wt1105, wt1106);
__m512 tmp16320 = _mm512_unpackhi_ps(wt1105, wt1106);
__m512 tmp16321 = _mm512_unpacklo_ps(wt1107, wt1108);
__m512 tmp16322 = _mm512_unpackhi_ps(wt1107, wt1108);
__m512 tmp16323 = _mm512_unpacklo_ps(wt1109, wt1110);
__m512 tmp16324 = _mm512_unpackhi_ps(wt1109, wt1110);
__m512 tmp16325 = _mm512_unpacklo_ps(wt1111, wt1112);
__m512 tmp16326 = _mm512_unpackhi_ps(wt1111, wt1112);
__m512 tmp16327 = _mm512_unpacklo_ps(wt1113, wt1114);
__m512 tmp16328 = _mm512_unpackhi_ps(wt1113, wt1114);
__m512 tmp16329 = _mm512_shuffle_ps(tmp16313, tmp16315, 68);
__m512 tmp16330 = _mm512_shuffle_ps(tmp16313, tmp16315, 238);
__m512 tmp16331 = _mm512_shuffle_ps(tmp16314, tmp16316, 68);
__m512 tmp16332 = _mm512_shuffle_ps(tmp16314, tmp16316, 238);
__m512 tmp16333 = _mm512_shuffle_ps(tmp16317, tmp16319, 68);
__m512 tmp16334 = _mm512_shuffle_ps(tmp16317, tmp16319, 238);
__m512 tmp16335 = _mm512_shuffle_ps(tmp16318, tmp16320, 68);
__m512 tmp16336 = _mm512_shuffle_ps(tmp16318, tmp16320, 238);
__m512 tmp16337 = _mm512_shuffle_ps(tmp16321, tmp16323, 68);
__m512 tmp16338 = _mm512_shuffle_ps(tmp16321, tmp16323, 238);
__m512 tmp16339 = _mm512_shuffle_ps(tmp16322, tmp16324, 68);
__m512 tmp16340 = _mm512_shuffle_ps(tmp16322, tmp16324, 238);
__m512 tmp16341 = _mm512_shuffle_ps(tmp16325, tmp16327, 68);
__m512 tmp16342 = _mm512_shuffle_ps(tmp16325, tmp16327, 238);
__m512 tmp16343 = _mm512_shuffle_ps(tmp16326, tmp16328, 68);
__m512 tmp16344 = _mm512_shuffle_ps(tmp16326, tmp16328, 238);
__m512 tmp16345 = _mm512_shuffle_f32x4(tmp16329, tmp16333, 136);
__m512 tmp16346 = _mm512_shuffle_f32x4(tmp16329, tmp16333, 221);
__m512 tmp16347 = _mm512_shuffle_f32x4(tmp16330, tmp16334, 136);
__m512 tmp16348 = _mm512_shuffle_f32x4(tmp16330, tmp16334, 221);
__m512 tmp16349 = _mm512_shuffle_f32x4(tmp16331, tmp16335, 136);
__m512 tmp16350 = _mm512_shuffle_f32x4(tmp16331, tmp16335, 221);
__m512 tmp16351 = _mm512_shuffle_f32x4(tmp16332, tmp16336, 136);
__m512 tmp16352 = _mm512_shuffle_f32x4(tmp16332, tmp16336, 221);
__m512 tmp16353 = _mm512_shuffle_f32x4(tmp16337, tmp16341, 136);
__m512 tmp16354 = _mm512_shuffle_f32x4(tmp16337, tmp16341, 221);
__m512 tmp16355 = _mm512_shuffle_f32x4(tmp16338, tmp16342, 136);
__m512 tmp16356 = _mm512_shuffle_f32x4(tmp16338, tmp16342, 221);
__m512 tmp16357 = _mm512_shuffle_f32x4(tmp16339, tmp16343, 136);
__m512 tmp16358 = _mm512_shuffle_f32x4(tmp16339, tmp16343, 221);
__m512 tmp16359 = _mm512_shuffle_f32x4(tmp16340, tmp16344, 136);
__m512 tmp16360 = _mm512_shuffle_f32x4(tmp16340, tmp16344, 221);
wt1099 = _mm512_shuffle_f32x4(tmp16345, tmp16353, 136);
wt1107 = _mm512_shuffle_f32x4(tmp16345, tmp16353, 221);
wt1100 = _mm512_shuffle_f32x4(tmp16347, tmp16355, 136);
wt1108 = _mm512_shuffle_f32x4(tmp16347, tmp16355, 221);
wt1101 = _mm512_shuffle_f32x4(tmp16349, tmp16357, 136);
wt1102 = _mm512_shuffle_f32x4(tmp16351, tmp16359, 136);
wt1103 = _mm512_shuffle_f32x4(tmp16346, tmp16354, 136);
wt1104 = _mm512_shuffle_f32x4(tmp16348, tmp16356, 136);
wt1105 = _mm512_shuffle_f32x4(tmp16350, tmp16358, 136);
wt1106 = _mm512_shuffle_f32x4(tmp16352, tmp16360, 136);
wt1099 = _mm512_mul_ps(wt1099, postMul81);
wt1100 = _mm512_mul_ps(wt1100, postMul81);
wt1101 = _mm512_mul_ps(wt1101, postMul81);
wt1102 = _mm512_mul_ps(wt1102, postMul81);
wt1103 = _mm512_mul_ps(wt1103, postMul81);
wt1104 = _mm512_mul_ps(wt1104, postMul81);
wt1105 = _mm512_mul_ps(wt1105, postMul81);
wt1106 = _mm512_mul_ps(wt1106, postMul81);
wt1107 = _mm512_mul_ps(wt1107, postMul81);
wt1108 = _mm512_mul_ps(wt1108, postMul81);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1099);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1100);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1101);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1102);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1103);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1104);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1105);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1106);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1107);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)0, 63>>cut36, wt1108);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1099);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1100);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1101);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1102);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1103);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1104);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1105);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1106);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1107);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)9072, 4032>>cut36, wt1108);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1099);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1100);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1101);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1102);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1103);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1104);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1105);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1106);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1107);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)18144, 258048>>cut36, wt1108);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(1+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1099);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(2+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1100);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(3+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1101);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(4+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1102);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(5+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1103);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(6+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1104);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(7+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1105);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(8+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1106);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(9+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1107);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l89+4*cut36+24*(10+16*c78)+(ptrdiff_t)27216, 65535-(262143>>cut36), wt1108);
}
}
} else {
ptrdiff_t k201 = 1008;
ptrdiff_t l88 = (size_t)(0+k201)/6;
ptrdiff_t cut35 = (size_t)(0+k201)%6;
__m512 sum754 = _mm512_setzero_ps();
__m512i pmMul57 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd57 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 masLo49 = _mm512_loadu_ps(bnPtr26+(ptrdiff_t)8*(k201+1024*i90));
__m512 masHi49 = _mm512_maskz_loadu_ps(65535, bnPtr26+(ptrdiff_t)8*(k201+1024*i90)+(ptrdiff_t)64);
__m512 postMul79 = _mm512_permutex2var_ps(masLo49, pmMul57, masHi49);
__m512 postAdd55 = _mm512_permutex2var_ps(masLo49, pmAdd57, masHi49);
(void)postAdd55;
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*0+(ptrdiff_t)0, 63>>cut35, sum754);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*0+(ptrdiff_t)9072, 4032>>cut35, sum754);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*0+(ptrdiff_t)18144, 65535-(4095>>cut35), sum754);
ptrdiff_t c76 = 0;
for (; c76 != 23; ++c76) {
__m512 wt1019 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)0);
__m512 wt1020 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)8192);
__m512 wt1021 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)16384);
__m512 wt1022 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)24576);
__m512 wt1023 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)32768);
__m512 wt1024 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)40960);
__m512 wt1025 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)49152);
__m512 wt1026 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)57344);
__m512 wt1027 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)65536);
__m512 wt1028 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)73728);
__m512 wt1029 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)81920);
__m512 wt1030 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)90112);
__m512 wt1031 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)98304);
__m512 wt1032 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)106496);
__m512 wt1033 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)114688);
__m512 wt1034 = _mm512_maskz_loadu_ps(65535, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)122880);
__m512 tmp16361 = _mm512_unpacklo_ps(wt1019, wt1020);
__m512 tmp16362 = _mm512_unpackhi_ps(wt1019, wt1020);
__m512 tmp16363 = _mm512_unpacklo_ps(wt1021, wt1022);
__m512 tmp16364 = _mm512_unpackhi_ps(wt1021, wt1022);
__m512 tmp16365 = _mm512_unpacklo_ps(wt1023, wt1024);
__m512 tmp16366 = _mm512_unpackhi_ps(wt1023, wt1024);
__m512 tmp16367 = _mm512_unpacklo_ps(wt1025, wt1026);
__m512 tmp16368 = _mm512_unpackhi_ps(wt1025, wt1026);
__m512 tmp16369 = _mm512_unpacklo_ps(wt1027, wt1028);
__m512 tmp16370 = _mm512_unpackhi_ps(wt1027, wt1028);
__m512 tmp16371 = _mm512_unpacklo_ps(wt1029, wt1030);
__m512 tmp16372 = _mm512_unpackhi_ps(wt1029, wt1030);
__m512 tmp16373 = _mm512_unpacklo_ps(wt1031, wt1032);
__m512 tmp16374 = _mm512_unpackhi_ps(wt1031, wt1032);
__m512 tmp16375 = _mm512_unpacklo_ps(wt1033, wt1034);
__m512 tmp16376 = _mm512_unpackhi_ps(wt1033, wt1034);
__m512 tmp16377 = _mm512_shuffle_ps(tmp16361, tmp16363, 68);
__m512 tmp16378 = _mm512_shuffle_ps(tmp16361, tmp16363, 238);
__m512 tmp16379 = _mm512_shuffle_ps(tmp16362, tmp16364, 68);
__m512 tmp16380 = _mm512_shuffle_ps(tmp16362, tmp16364, 238);
__m512 tmp16381 = _mm512_shuffle_ps(tmp16365, tmp16367, 68);
__m512 tmp16382 = _mm512_shuffle_ps(tmp16365, tmp16367, 238);
__m512 tmp16383 = _mm512_shuffle_ps(tmp16366, tmp16368, 68);
__m512 tmp16384 = _mm512_shuffle_ps(tmp16366, tmp16368, 238);
__m512 tmp16385 = _mm512_shuffle_ps(tmp16369, tmp16371, 68);
__m512 tmp16386 = _mm512_shuffle_ps(tmp16369, tmp16371, 238);
__m512 tmp16387 = _mm512_shuffle_ps(tmp16370, tmp16372, 68);
__m512 tmp16388 = _mm512_shuffle_ps(tmp16370, tmp16372, 238);
__m512 tmp16389 = _mm512_shuffle_ps(tmp16373, tmp16375, 68);
__m512 tmp16390 = _mm512_shuffle_ps(tmp16373, tmp16375, 238);
__m512 tmp16391 = _mm512_shuffle_ps(tmp16374, tmp16376, 68);
__m512 tmp16392 = _mm512_shuffle_ps(tmp16374, tmp16376, 238);
__m512 tmp16393 = _mm512_shuffle_f32x4(tmp16377, tmp16381, 136);
__m512 tmp16394 = _mm512_shuffle_f32x4(tmp16377, tmp16381, 221);
__m512 tmp16395 = _mm512_shuffle_f32x4(tmp16378, tmp16382, 136);
__m512 tmp16396 = _mm512_shuffle_f32x4(tmp16378, tmp16382, 221);
__m512 tmp16397 = _mm512_shuffle_f32x4(tmp16379, tmp16383, 136);
__m512 tmp16398 = _mm512_shuffle_f32x4(tmp16379, tmp16383, 221);
__m512 tmp16399 = _mm512_shuffle_f32x4(tmp16380, tmp16384, 136);
__m512 tmp16400 = _mm512_shuffle_f32x4(tmp16380, tmp16384, 221);
__m512 tmp16401 = _mm512_shuffle_f32x4(tmp16385, tmp16389, 136);
__m512 tmp16402 = _mm512_shuffle_f32x4(tmp16385, tmp16389, 221);
__m512 tmp16403 = _mm512_shuffle_f32x4(tmp16386, tmp16390, 136);
__m512 tmp16404 = _mm512_shuffle_f32x4(tmp16386, tmp16390, 221);
__m512 tmp16405 = _mm512_shuffle_f32x4(tmp16387, tmp16391, 136);
__m512 tmp16406 = _mm512_shuffle_f32x4(tmp16387, tmp16391, 221);
__m512 tmp16407 = _mm512_shuffle_f32x4(tmp16388, tmp16392, 136);
__m512 tmp16408 = _mm512_shuffle_f32x4(tmp16388, tmp16392, 221);
wt1019 = _mm512_shuffle_f32x4(tmp16393, tmp16401, 136);
wt1027 = _mm512_shuffle_f32x4(tmp16393, tmp16401, 221);
wt1020 = _mm512_shuffle_f32x4(tmp16395, tmp16403, 136);
wt1028 = _mm512_shuffle_f32x4(tmp16395, tmp16403, 221);
wt1021 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 136);
wt1029 = _mm512_shuffle_f32x4(tmp16397, tmp16405, 221);
wt1022 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 136);
wt1030 = _mm512_shuffle_f32x4(tmp16399, tmp16407, 221);
wt1023 = _mm512_shuffle_f32x4(tmp16394, tmp16402, 136);
wt1031 = _mm512_shuffle_f32x4(tmp16394, tmp16402, 221);
wt1024 = _mm512_shuffle_f32x4(tmp16396, tmp16404, 136);
wt1032 = _mm512_shuffle_f32x4(tmp16396, tmp16404, 221);
wt1025 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 136);
wt1033 = _mm512_shuffle_f32x4(tmp16398, tmp16406, 221);
wt1026 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 136);
wt1034 = _mm512_shuffle_f32x4(tmp16400, tmp16408, 221);
wt1019 = _mm512_mul_ps(wt1019, postMul79);
wt1020 = _mm512_mul_ps(wt1020, postMul79);
wt1021 = _mm512_mul_ps(wt1021, postMul79);
wt1022 = _mm512_mul_ps(wt1022, postMul79);
wt1023 = _mm512_mul_ps(wt1023, postMul79);
wt1024 = _mm512_mul_ps(wt1024, postMul79);
wt1025 = _mm512_mul_ps(wt1025, postMul79);
wt1026 = _mm512_mul_ps(wt1026, postMul79);
wt1027 = _mm512_mul_ps(wt1027, postMul79);
wt1028 = _mm512_mul_ps(wt1028, postMul79);
wt1029 = _mm512_mul_ps(wt1029, postMul79);
wt1030 = _mm512_mul_ps(wt1030, postMul79);
wt1031 = _mm512_mul_ps(wt1031, postMul79);
wt1032 = _mm512_mul_ps(wt1032, postMul79);
wt1033 = _mm512_mul_ps(wt1033, postMul79);
wt1034 = _mm512_mul_ps(wt1034, postMul79);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(1+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1019);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(2+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1020);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(3+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1021);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(4+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1022);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(5+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1023);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(6+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1024);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(7+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1025);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(8+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1026);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(9+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1027);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(10+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1028);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(11+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1029);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(12+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1030);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(13+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1031);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(14+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1032);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(15+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1033);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(16+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1034);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(1+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1019);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(2+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1020);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(3+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1021);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(4+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1022);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(5+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1023);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(6+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1024);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(7+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1025);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(8+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1026);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(9+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1027);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(10+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1028);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(11+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1029);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(12+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1030);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(13+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1031);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(14+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1032);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(15+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1033);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(16+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1034);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(1+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1019);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(2+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1020);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(3+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1021);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(4+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1022);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(5+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1023);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(6+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1024);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(7+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1025);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(8+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1026);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(9+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1027);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(10+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1028);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(11+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1029);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(12+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1030);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(13+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1031);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(14+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1032);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(15+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1033);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(16+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1034);
}
__m512 wt1035 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)0);
__m512 wt1036 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)8192);
__m512 wt1037 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)16384);
__m512 wt1038 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)24576);
__m512 wt1039 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)32768);
__m512 wt1040 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)40960);
__m512 wt1041 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)49152);
__m512 wt1042 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)57344);
__m512 wt1043 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)65536);
__m512 wt1044 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)73728);
__m512 wt1045 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)81920);
__m512 wt1046 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)90112);
__m512 wt1047 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)98304);
__m512 wt1048 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)106496);
__m512 wt1049 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)114688);
__m512 wt1050 = _mm512_maskz_loadu_ps(1023, wtPtr25+8388608*i90+8192*k201+64*c76+(ptrdiff_t)122880);
__m512 tmp16409 = _mm512_unpacklo_ps(wt1035, wt1036);
__m512 tmp16410 = _mm512_unpackhi_ps(wt1035, wt1036);
__m512 tmp16411 = _mm512_unpacklo_ps(wt1037, wt1038);
__m512 tmp16412 = _mm512_unpackhi_ps(wt1037, wt1038);
__m512 tmp16413 = _mm512_unpacklo_ps(wt1039, wt1040);
__m512 tmp16414 = _mm512_unpackhi_ps(wt1039, wt1040);
__m512 tmp16415 = _mm512_unpacklo_ps(wt1041, wt1042);
__m512 tmp16416 = _mm512_unpackhi_ps(wt1041, wt1042);
__m512 tmp16417 = _mm512_unpacklo_ps(wt1043, wt1044);
__m512 tmp16418 = _mm512_unpackhi_ps(wt1043, wt1044);
__m512 tmp16419 = _mm512_unpacklo_ps(wt1045, wt1046);
__m512 tmp16420 = _mm512_unpackhi_ps(wt1045, wt1046);
__m512 tmp16421 = _mm512_unpacklo_ps(wt1047, wt1048);
__m512 tmp16422 = _mm512_unpackhi_ps(wt1047, wt1048);
__m512 tmp16423 = _mm512_unpacklo_ps(wt1049, wt1050);
__m512 tmp16424 = _mm512_unpackhi_ps(wt1049, wt1050);
__m512 tmp16425 = _mm512_shuffle_ps(tmp16409, tmp16411, 68);
__m512 tmp16426 = _mm512_shuffle_ps(tmp16409, tmp16411, 238);
__m512 tmp16427 = _mm512_shuffle_ps(tmp16410, tmp16412, 68);
__m512 tmp16428 = _mm512_shuffle_ps(tmp16410, tmp16412, 238);
__m512 tmp16429 = _mm512_shuffle_ps(tmp16413, tmp16415, 68);
__m512 tmp16430 = _mm512_shuffle_ps(tmp16413, tmp16415, 238);
__m512 tmp16431 = _mm512_shuffle_ps(tmp16414, tmp16416, 68);
__m512 tmp16432 = _mm512_shuffle_ps(tmp16414, tmp16416, 238);
__m512 tmp16433 = _mm512_shuffle_ps(tmp16417, tmp16419, 68);
__m512 tmp16434 = _mm512_shuffle_ps(tmp16417, tmp16419, 238);
__m512 tmp16435 = _mm512_shuffle_ps(tmp16418, tmp16420, 68);
__m512 tmp16436 = _mm512_shuffle_ps(tmp16418, tmp16420, 238);
__m512 tmp16437 = _mm512_shuffle_ps(tmp16421, tmp16423, 68);
__m512 tmp16438 = _mm512_shuffle_ps(tmp16421, tmp16423, 238);
__m512 tmp16439 = _mm512_shuffle_ps(tmp16422, tmp16424, 68);
__m512 tmp16440 = _mm512_shuffle_ps(tmp16422, tmp16424, 238);
__m512 tmp16441 = _mm512_shuffle_f32x4(tmp16425, tmp16429, 136);
__m512 tmp16442 = _mm512_shuffle_f32x4(tmp16425, tmp16429, 221);
__m512 tmp16443 = _mm512_shuffle_f32x4(tmp16426, tmp16430, 136);
__m512 tmp16444 = _mm512_shuffle_f32x4(tmp16426, tmp16430, 221);
__m512 tmp16445 = _mm512_shuffle_f32x4(tmp16427, tmp16431, 136);
__m512 tmp16446 = _mm512_shuffle_f32x4(tmp16427, tmp16431, 221);
__m512 tmp16447 = _mm512_shuffle_f32x4(tmp16428, tmp16432, 136);
__m512 tmp16448 = _mm512_shuffle_f32x4(tmp16428, tmp16432, 221);
__m512 tmp16449 = _mm512_shuffle_f32x4(tmp16433, tmp16437, 136);
__m512 tmp16450 = _mm512_shuffle_f32x4(tmp16433, tmp16437, 221);
__m512 tmp16451 = _mm512_shuffle_f32x4(tmp16434, tmp16438, 136);
__m512 tmp16452 = _mm512_shuffle_f32x4(tmp16434, tmp16438, 221);
__m512 tmp16453 = _mm512_shuffle_f32x4(tmp16435, tmp16439, 136);
__m512 tmp16454 = _mm512_shuffle_f32x4(tmp16435, tmp16439, 221);
__m512 tmp16455 = _mm512_shuffle_f32x4(tmp16436, tmp16440, 136);
__m512 tmp16456 = _mm512_shuffle_f32x4(tmp16436, tmp16440, 221);
wt1035 = _mm512_shuffle_f32x4(tmp16441, tmp16449, 136);
wt1043 = _mm512_shuffle_f32x4(tmp16441, tmp16449, 221);
wt1036 = _mm512_shuffle_f32x4(tmp16443, tmp16451, 136);
wt1044 = _mm512_shuffle_f32x4(tmp16443, tmp16451, 221);
wt1037 = _mm512_shuffle_f32x4(tmp16445, tmp16453, 136);
wt1038 = _mm512_shuffle_f32x4(tmp16447, tmp16455, 136);
wt1039 = _mm512_shuffle_f32x4(tmp16442, tmp16450, 136);
wt1040 = _mm512_shuffle_f32x4(tmp16444, tmp16452, 136);
wt1041 = _mm512_shuffle_f32x4(tmp16446, tmp16454, 136);
wt1042 = _mm512_shuffle_f32x4(tmp16448, tmp16456, 136);
wt1035 = _mm512_mul_ps(wt1035, postMul79);
wt1036 = _mm512_mul_ps(wt1036, postMul79);
wt1037 = _mm512_mul_ps(wt1037, postMul79);
wt1038 = _mm512_mul_ps(wt1038, postMul79);
wt1039 = _mm512_mul_ps(wt1039, postMul79);
wt1040 = _mm512_mul_ps(wt1040, postMul79);
wt1041 = _mm512_mul_ps(wt1041, postMul79);
wt1042 = _mm512_mul_ps(wt1042, postMul79);
wt1043 = _mm512_mul_ps(wt1043, postMul79);
wt1044 = _mm512_mul_ps(wt1044, postMul79);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(1+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1035);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(2+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1036);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(3+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1037);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(4+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1038);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(5+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1039);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(6+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1040);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(7+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1041);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(8+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1042);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(9+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1043);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(10+16*c76)+(ptrdiff_t)0, 63>>cut35, wt1044);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(1+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1035);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(2+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1036);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(3+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1037);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(4+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1038);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(5+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1039);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(6+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1040);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(7+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1041);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(8+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1042);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(9+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1043);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+24*(10+16*c76)+(ptrdiff_t)9072, 4032>>cut35, wt1044);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(1+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1035);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(2+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1036);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(3+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1037);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(4+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1038);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(5+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1039);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(6+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1040);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(7+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1041);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(8+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1042);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(9+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1043);
_mm512_mask_storeu_ps(arranged30+1552384*i90+9096*l88+4*cut35+16*(10+16*c76)+(ptrdiff_t)18144, 65535-(4095>>cut35), wt1044);
}
}
}
}

// Launches the weight-arrangement pass for this layer on the given thread team.
//
// team89     - thread team handed to ResNeXt50ThreaderDo1.
// tensors151 - tensor pointer table; forwarded untouched to the callee
//              through the task's `any1` slot.
//
// The task is a 3-dimensional iteration space with hull 64 x 1 x 3; every
// point in it is dispatched to ResNeXt50OneArrangeWts15Callee1.
static void ResNeXt50OneArrangeWts15(ResNeXt50ThreaderTeam1* team89, char** tensors151) {
    ResNeXt50ThreaderTask1 job;

    // Shape of the iteration space.
    job.nd1 = 3;
    job.hull1[0] = 64;
    job.hull1[1] = 1;
    job.hull1[2] = 3;

    // Work function and its opaque argument.
    job.any1 = tensors151;
    job.callee1 = ResNeXt50OneArrangeWts15Callee1;

    ResNeXt50ThreaderDo1(team89, &job);
}

// Per-task worker of the data-arrangement pass.
//
// task156 - task descriptor; its `any1` slot holds a char* table where
//           entry 0 is the source buffer and entry 1 the destination buffer.
// pt83    - coordinates of this task: pt83[0] selects which half of the row
//           range to process, pt83[3] selects one of three parts (0, 1, 2).
//
// For every row in the selected range, 49 floats (three full 16-lane vectors
// plus a single lane) are copied from the source, where rows are 320 bytes
// apart, to the destination, where rows are 256 bytes apart.
// NOTE(review): rows presumably correspond to input channels and the 49 floats
// to a 7x7 spatial plane -- confirm against the generator's layout.
static void ResNeXt50OneArrangeDats15Callee1(ResNeXt50ThreaderTask1* task156, int64_t* pt83) {
    char** bufs = task156->any1;
    const ptrdiff_t half = pt83[0];
    const ptrdiff_t part = pt83[3];

    char*restrict src;
    char*restrict dst;
    ptrdiff_t first;
    ptrdiff_t count;

    if (part < 2) {
        // Parts 0 and 1 hold 835 rows each, split 417 / 418 between the halves.
        src = bufs[0]+(ptrdiff_t)267200*part;
        dst = bufs[1]+(ptrdiff_t)213760*part;
        first = 417*half;
        count = 417;
        if (half >= 1) {
            count = 418;
        }
    } else {
        // Last part holds 378 rows, split evenly (189 per half).
        src = bufs[0]+(ptrdiff_t)267200*2;
        dst = bufs[1]+(ptrdiff_t)213760*2;
        first = 189*half;
        count = 189;
    }

    const ptrdiff_t last = first+count;
    for (ptrdiff_t row = first; row < last; ++row) {
        char* in = src+320*row;
        char* out = dst+256*row;

        // Read the whole row before writing it, same order as the loads/stores
        // were issued originally.
        __m512 v0 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)0);
        __m512 v1 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)64);
        __m512 v2 = _mm512_maskz_loadu_ps(65535, in+(ptrdiff_t)128);
        __m512 v3 = _mm512_maskz_loadu_ps(1, in+(ptrdiff_t)192);  // only lane 0

        _mm512_mask_storeu_ps(out+(ptrdiff_t)0, 65535, v0);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)64, 65535, v1);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)128, 65535, v2);
        _mm512_mask_storeu_ps(out+(ptrdiff_t)192, 1, v3);  // only lane 0
    }
}

// Launches the data-arrangement pass for this layer on the given thread team.
//
// team90     - thread team handed to ResNeXt50ThreaderDo1.
// tensors153 - tensor pointer table; forwarded untouched to the callee
//              through the task's `any1` slot.
//
// The task is a 4-dimensional iteration space with hull 2 x 1 x 1 x 3.
// The callee reads coordinate 0 (two halves of a row range) and
// coordinate 3 (three parts); the two middle axes have extent 1.
static void ResNeXt50OneArrangeDats15(ResNeXt50ThreaderTeam1* team90, char** tensors153) {
    ResNeXt50ThreaderTask1 job;

    // Shape of the iteration space.
    job.nd1 = 4;
    job.hull1[0] = 2;
    job.hull1[1] = 1;
    job.hull1[2] = 1;
    job.hull1[3] = 3;

    // Work function and its opaque argument.
    job.any1 = tensors153;
    job.callee1 = ResNeXt50OneArrangeDats15Callee1;

    ResNeXt50ThreaderDo1(team90, &job);
}

/*
 * Worker for one unit of the OneApply15 multiply-accumulate stage
 * (overwriting variant: results replace whatever the output held).
 *
 * task158->any1 is read as an array whose first entry is a char** of
 * buffers: [0] packed weights, [1] packed input data, [2] output.
 * pt84[0] selects the output group handled by this call.
 *
 * A group other than 170 yields six output rows; group 170 is the tail and
 * yields four.  Each row consists of four 16-float vectors placed 64 bytes
 * apart; the first three are stored in full and the fourth through a
 * single-lane mask.  Rows are 320 bytes apart and groups 1920 bytes apart.
 *
 * Every accumulator of a row starts from the scalar stored at the head of
 * the group's weight record (presumably the bias - confirm against the
 * weight-arranging code) and then receives one fused multiply-add per step
 * for 835 steps, each step reading 256 bytes of packed input.
 */
static void ResNeXt50OneApply15Callee1(ResNeXt50ThreaderTask1* task158, int64_t* pt84) {
    enum { kSteps = 835, kCols = 4, kMainRows = 6, kTailRows = 4, kTailGroup = 170 };
    void** args = task158->any1;
    char** bufs = args[0];
    const ptrdiff_t group = pt84[0];
    /* Weight records are 20064 bytes per group, output slabs 1920 bytes. */
    char*restrict wtBase = bufs[0]+(ptrdiff_t)20064*group;
    char*restrict datBase = bufs[1];
    char*restrict outBase = bufs[2]+(ptrdiff_t)1920*group;

    if (group != kTailGroup) {
        /* Main path: six rows, weight stride of 24 bytes per step. */
        __m512 acc[kMainRows][kCols];
        for (int r = 0; r < kMainRows; ++r) {
            const __m512 seed = _mm512_set1_ps(*(float*)(wtBase+4*r));
            for (int c = 0; c < kCols; ++c) {
                acc[r][c] = seed;
            }
        }
        for (ptrdiff_t step = 0; step < kSteps; ++step) {
            __m512 in[kCols];
            for (int c = 0; c < kCols; ++c) {
                in[c] = _mm512_loadu_ps(datBase+256*step+64*c);
            }
            /* The first 24 bytes of the record hold the six seed scalars. */
            char* coef = wtBase+24*step+24;
            for (int r = 0; r < kMainRows; ++r) {
                const __m512 wt = _mm512_set1_ps(*(float*)(coef+4*r));
                for (int c = 0; c < kCols; ++c) {
                    acc[r][c] = _mm512_fmadd_ps(wt, in[c], acc[r][c]);
                }
            }
        }
        for (int r = 0; r < kMainRows; ++r) {
            for (int c = 0; c < kCols; ++c) {
                /* Last column keeps only lane 0. */
                _mm512_mask_storeu_ps(outBase+320*r+64*c, c+1 < kCols ? 65535 : 1, acc[r][c]);
            }
        }
        return;
    }

    /* Tail path (group 170): four rows, weight stride of 16 bytes per step. */
    __m512 acc[kTailRows][kCols];
    for (int r = 0; r < kTailRows; ++r) {
        const __m512 seed = _mm512_set1_ps(*(float*)(wtBase+4*r));
        for (int c = 0; c < kCols; ++c) {
            acc[r][c] = seed;
        }
    }
    for (ptrdiff_t step = 0; step < kSteps; ++step) {
        __m512 in[kCols];
        for (int c = 0; c < kCols; ++c) {
            in[c] = _mm512_loadu_ps(datBase+256*step+64*c);
        }
        /* The first 16 bytes of the record hold the four seed scalars. */
        char* coef = wtBase+16*step+16;
        for (int r = 0; r < kTailRows; ++r) {
            const __m512 wt = _mm512_set1_ps(*(float*)(coef+4*r));
            for (int c = 0; c < kCols; ++c) {
                acc[r][c] = _mm512_fmadd_ps(wt, in[c], acc[r][c]);
            }
        }
    }
    for (int r = 0; r < kTailRows; ++r) {
        for (int c = 0; c < kCols; ++c) {
            _mm512_mask_storeu_ps(outBase+320*r+64*c, c+1 < kCols ? 65535 : 1, acc[r][c]);
        }
    }
}

/*
 * Worker for one unit of the OneApply15 multiply-accumulate stage
 * (accumulating variant: results are added to what the output already holds).
 *
 * task159->any1 is read as a two-entry array: entry 0 is a char** of buffers
 * ([0] packed weights, [1] packed input data, [2] output) and entry 1 is an
 * integer index carried in a pointer.  That index offsets the weight buffer
 * by 3424256 bytes and the input buffer by 213760 bytes per unit
 * (presumably it selects a slice of the reduction - confirm with the caller).
 * pt85[0] selects the output group handled by this call.
 *
 * A group other than 170 yields six output rows; group 170 is the tail and
 * yields four.  Each row consists of four 16-float vectors placed 64 bytes
 * apart; the first three are handled in full and the fourth through a
 * single-lane mask.  Rows are 320 bytes apart and groups 1920 bytes apart.
 *
 * Every accumulator of a row starts from the scalar stored at the head of
 * the group's weight record, receives one fused multiply-add per step for
 * 835 steps, is summed with the current output contents, and is written
 * back to the same location.
 */
static void ResNeXt50OneApply15Callee2(ResNeXt50ThreaderTask1* task159, int64_t* pt85) {
    enum { kSteps = 835, kCols = 4, kMainRows = 6, kTailRows = 4, kTailGroup = 170 };
    void** args = task159->any1;
    char** bufs = args[0];
    const ptrdiff_t slice = (ptrdiff_t)args[1];
    const ptrdiff_t group = pt85[0];
    /* Weight records are 20064 bytes per group, output slabs 1920 bytes. */
    char*restrict wtBase = bufs[0]+3424256*slice+(ptrdiff_t)20064*group;
    char*restrict datBase = bufs[1]+213760*slice;
    char*restrict outBase = bufs[2]+(ptrdiff_t)1920*group;

    if (group != kTailGroup) {
        /* Main path: six rows, weight stride of 24 bytes per step. */
        __m512 acc[kMainRows][kCols];
        for (int r = 0; r < kMainRows; ++r) {
            const __m512 seed = _mm512_set1_ps(*(float*)(wtBase+4*r));
            for (int c = 0; c < kCols; ++c) {
                acc[r][c] = seed;
            }
        }
        for (ptrdiff_t step = 0; step < kSteps; ++step) {
            __m512 in[kCols];
            for (int c = 0; c < kCols; ++c) {
                in[c] = _mm512_loadu_ps(datBase+256*step+64*c);
            }
            /* The first 24 bytes of the record hold the six seed scalars. */
            char* coef = wtBase+24*step+24;
            for (int r = 0; r < kMainRows; ++r) {
                const __m512 wt = _mm512_set1_ps(*(float*)(coef+4*r));
                for (int c = 0; c < kCols; ++c) {
                    acc[r][c] = _mm512_fmadd_ps(wt, in[c], acc[r][c]);
                }
            }
        }
        for (int r = 0; r < kMainRows; ++r) {
            char* row = outBase+320*r;
            /* Fold in the existing row first, then write the whole row back. */
            for (int c = 0; c < kCols; ++c) {
                acc[r][c] = _mm512_add_ps(acc[r][c], _mm512_maskz_loadu_ps(c+1 < kCols ? 65535 : 1, row+64*c));
            }
            for (int c = 0; c < kCols; ++c) {
                _mm512_mask_storeu_ps(row+64*c, c+1 < kCols ? 65535 : 1, acc[r][c]);
            }
        }
        return;
    }

    /* Tail path (group 170): four rows, weight stride of 16 bytes per step. */
    __m512 acc[kTailRows][kCols];
    for (int r = 0; r < kTailRows; ++r) {
        const __m512 seed = _mm512_set1_ps(*(float*)(wtBase+4*r));
        for (int c = 0; c < kCols; ++c) {
            acc[r][c] = seed;
        }
    }
    for (ptrdiff_t step = 0; step < kSteps; ++step) {
        __m512 in[kCols];
        for (int c = 0; c < kCols; ++c) {
            in[c] = _mm512_loadu_ps(datBase+256*step+64*c);
        }
        /* The first 16 bytes of the record hold the four seed scalars. */
        char* coef = wtBase+16*step+16;
        for (int r = 0; r < kTailRows; ++r) {
            const __m512 wt = _mm512_set1_ps(*(float*)(coef+4*r));
            for (int c = 0; c < kCols; ++c) {
                acc[r][c] = _mm512_fmadd_ps(wt, in[c], acc[r][c]);
            }
        }
    }
    for (int r = 0; r < kTailRows; ++r) {
        char* row = outBase+320*r;
        for (int c = 0; c < kCols; ++c) {
            acc[r][c] = _mm512_add_ps(acc[r][c], _mm512_maskz_loadu_ps(c+1 < kCols ? 65535 : 1, row+64*c));
        }
        for (int c = 0; c < kCols; ++c) {
            _mm512_mask_storeu_ps(row+64*c, c+1 < kCols ? 65535 : 1, acc[r][c]);
        }
    }
}

static void ResNeXt50OneApply15Callee3(ResNeXt50ThreaderTask1* task160, int64_t* pt86) {
void** pair40 = task160->any1;
char** tensors158 = pair40[0];
ptrdiff_t e49 = 2;
ptrdiff_t g50 = 0;
ptrdiff_t d33 = 0;
ptrdiff_t w82 = pt86[0];
char*restrict arrangedWts17 = tensors158[0]+3424256*e49+(ptrdiff_t)1552384*1*g50;
char*restrict arrangedDats17 = tensors158[1]+213760*e49+(ptrdiff_t)96768*1*g50;
char*restrict datPtr53 = tensors158[2]+(ptrdiff_t)327680*1*g50;
ptrdiff_t ii72 = 1;
for (ptrdiff_t i95 = 0; i95 < ii72; ++i95) {
ptrdiff_t j86 = 1*d33;
ptrdiff_t k207 = 2*w82;
ptrdiff_t kk75 = k207+(w82 < 84 ? 1 : 2);
for (; k207 != 170; ++k207) {
ptrdiff_t s95 = -1;
__m512 sum837 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)24));
__m512 sum841 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)28));
__m512 sum845 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)32));
__m512 sum849 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)36));
__m512 sum853 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)40));
__m512 sum857 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)44));
__m512 sum838 = sum837;
__m512 sum839 = sum837;
__m512 sum840 = sum837;
__m512 sum842 = sum841;
__m512 sum843 = sum841;
__m512 sum844 = sum841;
__m512 sum846 = sum845;
__m512 sum847 = sum845;
__m512 sum848 = sum845;
__m512 sum850 = sum849;
__m512 sum851 = sum849;
__m512 sum852 = sum849;
__m512 sum854 = sum853;
__m512 sum855 = sum853;
__m512 sum856 = sum853;
__m512 sum858 = sum857;
__m512 sum859 = sum857;
__m512 sum860 = sum857;
for (s95 = 0; s95 < 378; ++s95) {
__m512 dat2695 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s95+(ptrdiff_t)0);
__m512 dat2696 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s95+(ptrdiff_t)64);
__m512 dat2697 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s95+(ptrdiff_t)128);
__m512 dat2698 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s95+(ptrdiff_t)192);
__m512 wt1135 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)24));
sum837 = _mm512_fmadd_ps(wt1135, dat2695, sum837);
sum838 = _mm512_fmadd_ps(wt1135, dat2696, sum838);
sum839 = _mm512_fmadd_ps(wt1135, dat2697, sum839);
sum840 = _mm512_fmadd_ps(wt1135, dat2698, sum840);
__m512 wt1136 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)28));
sum841 = _mm512_fmadd_ps(wt1136, dat2695, sum841);
sum842 = _mm512_fmadd_ps(wt1136, dat2696, sum842);
sum843 = _mm512_fmadd_ps(wt1136, dat2697, sum843);
sum844 = _mm512_fmadd_ps(wt1136, dat2698, sum844);
__m512 wt1137 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)32));
sum845 = _mm512_fmadd_ps(wt1137, dat2695, sum845);
sum846 = _mm512_fmadd_ps(wt1137, dat2696, sum846);
sum847 = _mm512_fmadd_ps(wt1137, dat2697, sum847);
sum848 = _mm512_fmadd_ps(wt1137, dat2698, sum848);
__m512 wt1138 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)36));
sum849 = _mm512_fmadd_ps(wt1138, dat2695, sum849);
sum850 = _mm512_fmadd_ps(wt1138, dat2696, sum850);
sum851 = _mm512_fmadd_ps(wt1138, dat2697, sum851);
sum852 = _mm512_fmadd_ps(wt1138, dat2698, sum852);
__m512 wt1139 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)40));
sum853 = _mm512_fmadd_ps(wt1139, dat2695, sum853);
sum854 = _mm512_fmadd_ps(wt1139, dat2696, sum854);
sum855 = _mm512_fmadd_ps(wt1139, dat2697, sum855);
sum856 = _mm512_fmadd_ps(wt1139, dat2698, sum856);
__m512 wt1140 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+24*s95+(ptrdiff_t)44));
sum857 = _mm512_fmadd_ps(wt1140, dat2695, sum857);
sum858 = _mm512_fmadd_ps(wt1140, dat2696, sum858);
sum859 = _mm512_fmadd_ps(wt1140, dat2697, sum859);
sum860 = _mm512_fmadd_ps(wt1140, dat2698, sum860);
}
sum837 = _mm512_add_ps(sum837, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)0));
sum838 = _mm512_add_ps(sum838, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)64));
sum839 = _mm512_add_ps(sum839, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)128));
sum840 = _mm512_add_ps(sum840, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)192));
sum837 = _mm512_max_ps(_mm512_setzero_ps(), sum837);
sum838 = _mm512_max_ps(_mm512_setzero_ps(), sum838);
sum839 = _mm512_max_ps(_mm512_setzero_ps(), sum839);
sum840 = _mm512_max_ps(_mm512_setzero_ps(), sum840);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)0, 65535, sum837);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)64, 65535, sum838);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)128, 65535, sum839);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)192, 1, sum840);
sum841 = _mm512_add_ps(sum841, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)320));
sum842 = _mm512_add_ps(sum842, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)384));
sum843 = _mm512_add_ps(sum843, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)448));
sum844 = _mm512_add_ps(sum844, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)512));
sum841 = _mm512_max_ps(_mm512_setzero_ps(), sum841);
sum842 = _mm512_max_ps(_mm512_setzero_ps(), sum842);
sum843 = _mm512_max_ps(_mm512_setzero_ps(), sum843);
sum844 = _mm512_max_ps(_mm512_setzero_ps(), sum844);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)320, 65535, sum841);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)384, 65535, sum842);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)448, 65535, sum843);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)512, 1, sum844);
sum845 = _mm512_add_ps(sum845, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)640));
sum846 = _mm512_add_ps(sum846, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)704));
sum847 = _mm512_add_ps(sum847, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)768));
sum848 = _mm512_add_ps(sum848, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)832));
sum845 = _mm512_max_ps(_mm512_setzero_ps(), sum845);
sum846 = _mm512_max_ps(_mm512_setzero_ps(), sum846);
sum847 = _mm512_max_ps(_mm512_setzero_ps(), sum847);
sum848 = _mm512_max_ps(_mm512_setzero_ps(), sum848);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)640, 65535, sum845);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)704, 65535, sum846);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)768, 65535, sum847);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)832, 1, sum848);
sum849 = _mm512_add_ps(sum849, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)960));
sum850 = _mm512_add_ps(sum850, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1024));
sum851 = _mm512_add_ps(sum851, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1088));
sum852 = _mm512_add_ps(sum852, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1152));
sum849 = _mm512_max_ps(_mm512_setzero_ps(), sum849);
sum850 = _mm512_max_ps(_mm512_setzero_ps(), sum850);
sum851 = _mm512_max_ps(_mm512_setzero_ps(), sum851);
sum852 = _mm512_max_ps(_mm512_setzero_ps(), sum852);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)960, 65535, sum849);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1024, 65535, sum850);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1088, 65535, sum851);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1152, 1, sum852);
sum853 = _mm512_add_ps(sum853, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1280));
sum854 = _mm512_add_ps(sum854, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1344));
sum855 = _mm512_add_ps(sum855, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1408));
sum856 = _mm512_add_ps(sum856, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1472));
sum853 = _mm512_max_ps(_mm512_setzero_ps(), sum853);
sum854 = _mm512_max_ps(_mm512_setzero_ps(), sum854);
sum855 = _mm512_max_ps(_mm512_setzero_ps(), sum855);
sum856 = _mm512_max_ps(_mm512_setzero_ps(), sum856);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1280, 65535, sum853);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1344, 65535, sum854);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1408, 65535, sum855);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1472, 1, sum856);
sum857 = _mm512_add_ps(sum857, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1600));
sum858 = _mm512_add_ps(sum858, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1664));
sum859 = _mm512_add_ps(sum859, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1728));
sum860 = _mm512_add_ps(sum860, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1792));
sum857 = _mm512_max_ps(_mm512_setzero_ps(), sum857);
sum858 = _mm512_max_ps(_mm512_setzero_ps(), sum858);
sum859 = _mm512_max_ps(_mm512_setzero_ps(), sum859);
sum860 = _mm512_max_ps(_mm512_setzero_ps(), sum860);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1600, 65535, sum857);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1664, 65535, sum858);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1728, 65535, sum859);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1792, 1, sum860);
if (k207 >= kk75) return;
}
ptrdiff_t s96 = -1;
__m512 sum861 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)16));
__m512 sum865 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)20));
__m512 sum869 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)24));
__m512 sum873 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)28));
__m512 sum862 = sum861;
__m512 sum863 = sum861;
__m512 sum864 = sum861;
__m512 sum866 = sum865;
__m512 sum867 = sum865;
__m512 sum868 = sum865;
__m512 sum870 = sum869;
__m512 sum871 = sum869;
__m512 sum872 = sum869;
__m512 sum874 = sum873;
__m512 sum875 = sum873;
__m512 sum876 = sum873;
for (s96 = 0; s96 < 378; ++s96) {
__m512 dat2699 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s96+(ptrdiff_t)0);
__m512 dat2700 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s96+(ptrdiff_t)64);
__m512 dat2701 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s96+(ptrdiff_t)128);
__m512 dat2702 = _mm512_loadu_ps(arrangedDats17+96768*i95+96768*j86+256*s96+(ptrdiff_t)192);
__m512 wt1141 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)16));
sum861 = _mm512_fmadd_ps(wt1141, dat2699, sum861);
sum862 = _mm512_fmadd_ps(wt1141, dat2700, sum862);
sum863 = _mm512_fmadd_ps(wt1141, dat2701, sum863);
sum864 = _mm512_fmadd_ps(wt1141, dat2702, sum864);
__m512 wt1142 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)20));
sum865 = _mm512_fmadd_ps(wt1142, dat2699, sum865);
sum866 = _mm512_fmadd_ps(wt1142, dat2700, sum866);
sum867 = _mm512_fmadd_ps(wt1142, dat2701, sum867);
sum868 = _mm512_fmadd_ps(wt1142, dat2702, sum868);
__m512 wt1143 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)24));
sum869 = _mm512_fmadd_ps(wt1143, dat2699, sum869);
sum870 = _mm512_fmadd_ps(wt1143, dat2700, sum870);
sum871 = _mm512_fmadd_ps(wt1143, dat2701, sum871);
sum872 = _mm512_fmadd_ps(wt1143, dat2702, sum872);
__m512 wt1144 = _mm512_set1_ps(*(float*)(arrangedWts17+1552384*i95+9096*k207+16*s96+(ptrdiff_t)28));
sum873 = _mm512_fmadd_ps(wt1144, dat2699, sum873);
sum874 = _mm512_fmadd_ps(wt1144, dat2700, sum874);
sum875 = _mm512_fmadd_ps(wt1144, dat2701, sum875);
sum876 = _mm512_fmadd_ps(wt1144, dat2702, sum876);
}
sum861 = _mm512_add_ps(sum861, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)0));
sum862 = _mm512_add_ps(sum862, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)64));
sum863 = _mm512_add_ps(sum863, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)128));
sum864 = _mm512_add_ps(sum864, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)192));
sum861 = _mm512_max_ps(_mm512_setzero_ps(), sum861);
sum862 = _mm512_max_ps(_mm512_setzero_ps(), sum862);
sum863 = _mm512_max_ps(_mm512_setzero_ps(), sum863);
sum864 = _mm512_max_ps(_mm512_setzero_ps(), sum864);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)0, 65535, sum861);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)64, 65535, sum862);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)128, 65535, sum863);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)192, 1, sum864);
sum865 = _mm512_add_ps(sum865, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)320));
sum866 = _mm512_add_ps(sum866, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)384));
sum867 = _mm512_add_ps(sum867, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)448));
sum868 = _mm512_add_ps(sum868, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)512));
sum865 = _mm512_max_ps(_mm512_setzero_ps(), sum865);
sum866 = _mm512_max_ps(_mm512_setzero_ps(), sum866);
sum867 = _mm512_max_ps(_mm512_setzero_ps(), sum867);
sum868 = _mm512_max_ps(_mm512_setzero_ps(), sum868);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)320, 65535, sum865);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)384, 65535, sum866);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)448, 65535, sum867);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)512, 1, sum868);
sum869 = _mm512_add_ps(sum869, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)640));
sum870 = _mm512_add_ps(sum870, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)704));
sum871 = _mm512_add_ps(sum871, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)768));
sum872 = _mm512_add_ps(sum872, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)832));
sum869 = _mm512_max_ps(_mm512_setzero_ps(), sum869);
sum870 = _mm512_max_ps(_mm512_setzero_ps(), sum870);
sum871 = _mm512_max_ps(_mm512_setzero_ps(), sum871);
sum872 = _mm512_max_ps(_mm512_setzero_ps(), sum872);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)640, 65535, sum869);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)704, 65535, sum870);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)768, 65535, sum871);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)832, 1, sum872);
sum873 = _mm512_add_ps(sum873, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)960));
sum874 = _mm512_add_ps(sum874, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1024));
sum875 = _mm512_add_ps(sum875, _mm512_maskz_loadu_ps(65535, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1088));
sum876 = _mm512_add_ps(sum876, _mm512_maskz_loadu_ps(1, datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1152));
sum873 = _mm512_max_ps(_mm512_setzero_ps(), sum873);
sum874 = _mm512_max_ps(_mm512_setzero_ps(), sum874);
sum875 = _mm512_max_ps(_mm512_setzero_ps(), sum875);
sum876 = _mm512_max_ps(_mm512_setzero_ps(), sum876);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)960, 65535, sum873);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1024, 65535, sum874);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1088, 65535, sum875);
_mm512_mask_storeu_ps(datPtr53+327680*i95+256*j86+1920*k207+(ptrdiff_t)1152, 1, sum876);
}
}

// Submits the three OneApply15 callees to the thread team, one after another.
//
// team91:     thread team passed through to ResNeXt50ThreaderDo1.
// tensors155: tensor pointer table; forwarded to every callee in slot 0 of a
//             two-element argument pair.
//
// Slot 1 of the pair carries an integer tag cast to a pointer: it is 0 for
// Callee1, 1 for Callee2 and 2 for Callee3. Callee1 and Callee2 are given a
// 171x1x1 hull, Callee3 an 85x1x1 hull.
// NOTE(review): the pair lives on this stack frame, so this presumably relies
// on ResNeXt50ThreaderDo1 not returning before the callees finish - confirm.
static void ResNeXt50OneApply15(ResNeXt50ThreaderTeam1* team91, char** tensors155) {
    void* args[] = {tensors155, 0};

    // First submission: tag 0 (set by the initializer above).
    ResNeXt50ThreaderTask1 head;
    head.nd1 = 3;
    head.hull1[0] = 171;
    head.hull1[1] = 1;
    head.hull1[2] = 1;
    head.any1 = args;
    head.callee1 = ResNeXt50OneApply15Callee1;
    ResNeXt50ThreaderDo1(team91, &head);

    // Middle submissions: tags in [1, 2), i.e. exactly one pass with tag 1.
    ptrdiff_t tag = 1;
    while (tag < 2) {
        args[1] = (void*)tag;
        ResNeXt50ThreaderTask1 middle;
        middle.nd1 = 3;
        middle.hull1[0] = 171;
        middle.hull1[1] = 1;
        middle.hull1[2] = 1;
        middle.any1 = args;
        middle.callee1 = ResNeXt50OneApply15Callee2;
        ResNeXt50ThreaderDo1(team91, &middle);
        ++tag;
    }

    // Last submission: tag 2, smaller hull.
    args[1] = (void*)2;
    ResNeXt50ThreaderTask1 tail;
    tail.nd1 = 3;
    tail.hull1[0] = 85;
    tail.hull1[1] = 1;
    tail.hull1[2] = 1;
    tail.any1 = args;
    tail.callee1 = ResNeXt50OneApply15Callee3;
    ResNeXt50ThreaderDo1(team91, &tail);
}

// Task callee that rewrites a block of float filter weights into a transformed,
// half-precision layout and prepares the matching per-channel bias values.
//
// task20: task descriptor; only task20->any1 is read, as a table of byte pointers:
//           [0] source weights (float), read 9 lanes at a time
//           [1] source biases (float), read 4 lanes per outer iteration
//           [2] batch-norm parameters, interleaved (multiplier, addend) float pairs
//           [3] destination: 512 bytes of bias floats, followed by the fp16 weights
// pt15:   task coordinates; unused (b45, g7 and e7 are hard-wired to 0).
//
// NOTE(review): the arithmetic (factors 4 and 2, a transposition between two
// identical passes, then scaling by fixed rational constants) looks like a
// Winograd-style filter transform - confirm against the generator.
static void ResNeXt50ThreeArrangeFilts1Callee1(ResNeXt50ThreaderTask1* task20, int64_t* pt15) {
char** tensors18 = task20->any1;
ptrdiff_t b45 = 0;
ptrdiff_t g7 = 0;
ptrdiff_t e7 = 0;
(void)pt15;
// With e7 == 0 every e7-scaled offset below is zero.
char*restrict bfPtr4 = tensors18[3]+512*e7;
char*restrict wfPtr4 = tensors18[3]+512+6488064*e7;
char*restrict wtPtr4 = tensors18[0]+14256*e7;
char*restrict biasPtr4 = tensors18[1];
char*restrict bnPtr5 = tensors18[2];
// i16 walks 0..31; each step consumes 576 bytes of weights and emits 2048 bytes.
ptrdiff_t i16 = 32*g7;
ptrdiff_t ii4 = i16+31;
for (; i16 <= ii4; ++i16) {
// j10 is always 0 here (b45 == 0), so the branch below is always taken.
ptrdiff_t j10 = 1*b45;
if (j10 == 0) {
ptrdiff_t k50 = 0+1*j10;
ptrdiff_t cut3 = 0;
// Broadcast the batch-norm multipliers (even-indexed floats of bnPtr5) for the
// four consecutive channels 4*i16+0 .. 4*i16+3.
__m512 postMul8 = _mm512_set1_ps(((float*)bnPtr5+(ptrdiff_t)2*(0+4*i16+4*j10))[0]);
__m512 postMul9 = _mm512_set1_ps(((float*)bnPtr5+(ptrdiff_t)2*(1+4*i16+4*j10))[0]);
__m512 postMul10 = _mm512_set1_ps(((float*)bnPtr5+(ptrdiff_t)2*(2+4*i16+4*j10))[0]);
__m512 postMul11 = _mm512_set1_ps(((float*)bnPtr5+(ptrdiff_t)2*(3+4*i16+4*j10))[0]);
ptrdiff_t s11 = 0;
for (; s11 != 4; ++s11) {
// Load 9 floats (mask 511) from four rows spaced 144 bytes apart; s11 advances
// 36 bytes (9 floats) within each row.
// presumably each 9-float group is one 3x3 filter - TODO confirm
__m512 wt85 = _mm512_maskz_loadu_ps(511, wtPtr4+0+576*i16+576*j10+36*s11);
__m512 wt86 = _mm512_maskz_loadu_ps(511, wtPtr4+144+576*i16+576*j10+36*s11);
__m512 wt87 = _mm512_maskz_loadu_ps(511, wtPtr4+288+576*i16+576*j10+36*s11);
__m512 wt88 = _mm512_maskz_loadu_ps(511, wtPtr4+432+576*i16+576*j10+36*s11);
// Fold the batch-norm multiplier of each row's channel into its weights.
wt85 = _mm512_mul_ps(wt85, postMul8);
wt86 = _mm512_mul_ps(wt86, postMul9);
wt87 = _mm512_mul_ps(wt87, postMul10);
wt88 = _mm512_mul_ps(wt88, postMul11);
// pm63 / pm64 interleave 3-lane groups taken alternately from the first operand
// (indices 0-15) and the second operand (indices 16-31); pm64 starts 6 lanes later.
__m512i pm63 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm64 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp193 = _mm512_permutex2var_ps(wt85, pm63, wt87);
__m512 tmp194 = _mm512_permutex2var_ps(wt86, pm63, wt88);
__m512 tmp195 = _mm512_permutex2var_ps(wt85, pm64, wt87);
__m512 tmp196 = _mm512_permutex2var_ps(wt86, pm64, wt88);
// Second interleave level: in11 / in12 / in13 are the three inputs of the first pass.
__m512 in11 = _mm512_permutex2var_ps(tmp193, pm63, tmp194);
__m512 in12 = _mm512_permutex2var_ps(tmp193, pm64, tmp194);
__m512 in13 = _mm512_permutex2var_ps(tmp195, pm63, tmp196);
// First pass: expand the three inputs into eight linear combinations
// (in11, tmp200, tmp198, tmp201, tmp199, tmp197, tmp202, in13).
__m512 tmp197 = _mm512_fmadd_ps(in11, _mm512_set1_ps(4e+00f), in13);
__m512 tmp198 = _mm512_add_ps(in11, in13);
__m512 tmp199 = _mm512_fmadd_ps(in13, _mm512_set1_ps(4e+00f), in11);
__m512 tmp200 = _mm512_add_ps(in12, tmp198);
__m512 tmp201 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp199);
tmp199 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp199);
__m512 tmp202 = _mm512_fnmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp197);
tmp197 = _mm512_fmadd_ps(in12, _mm512_set1_ps(2e+00f), tmp197);
tmp198 = _mm512_sub_ps(tmp198, in12);
// Rearrange the eight vectors with unpack / shuffle_ps / shuffle_f32x4 stages
// so the second pass can run along the other axis.
__m512 tmp219 = _mm512_unpacklo_ps(in11, tmp200);
__m512 tmp220 = _mm512_unpackhi_ps(in11, tmp200);
__m512 tmp221 = _mm512_unpacklo_ps(tmp198, tmp201);
__m512 tmp222 = _mm512_unpackhi_ps(tmp198, tmp201);
__m512 tmp223 = _mm512_unpacklo_ps(tmp199, tmp197);
__m512 tmp224 = _mm512_unpackhi_ps(tmp199, tmp197);
__m512 tmp225 = _mm512_unpacklo_ps(tmp202, in13);
__m512 tmp226 = _mm512_unpackhi_ps(tmp202, in13);
__m512 tmp227 = _mm512_shuffle_ps(tmp219, tmp221, 68);
__m512 tmp228 = _mm512_shuffle_ps(tmp219, tmp221, 238);
__m512 tmp229 = _mm512_shuffle_ps(tmp220, tmp222, 68);
__m512 tmp230 = _mm512_shuffle_ps(tmp220, tmp222, 238);
__m512 tmp231 = _mm512_shuffle_ps(tmp223, tmp225, 68);
__m512 tmp232 = _mm512_shuffle_ps(tmp223, tmp225, 238);
__m512 tmp233 = _mm512_shuffle_ps(tmp224, tmp226, 68);
__m512 tmp234 = _mm512_shuffle_ps(tmp224, tmp226, 238);
__m512 tmp235 = _mm512_shuffle_f32x4(tmp227, tmp231, 136);
__m512 tmp236 = _mm512_shuffle_f32x4(tmp227, tmp231, 221);
__m512 tmp237 = _mm512_shuffle_f32x4(tmp228, tmp232, 136);
__m512 tmp238 = _mm512_shuffle_f32x4(tmp228, tmp232, 221);
__m512 tmp239 = _mm512_shuffle_f32x4(tmp229, tmp233, 136);
__m512 tmp240 = _mm512_shuffle_f32x4(tmp229, tmp233, 221);
__m512 tmp241 = _mm512_shuffle_f32x4(tmp230, tmp234, 136);
__m512 tmp242 = _mm512_shuffle_f32x4(tmp230, tmp234, 221);
// Split each rearranged vector into its even (136) and odd (221) 128-bit lanes ...
in11 = _mm512_shuffle_f32x4(tmp235, tmp235, 136);
__m512 tmp203 = _mm512_shuffle_f32x4(tmp235, tmp235, 221);
tmp200 = _mm512_shuffle_f32x4(tmp237, tmp237, 136);
__m512 tmp204 = _mm512_shuffle_f32x4(tmp237, tmp237, 221);
tmp198 = _mm512_shuffle_f32x4(tmp239, tmp239, 136);
__m512 tmp205 = _mm512_shuffle_f32x4(tmp239, tmp239, 221);
tmp201 = _mm512_shuffle_f32x4(tmp241, tmp241, 136);
__m512 tmp206 = _mm512_shuffle_f32x4(tmp241, tmp241, 221);
tmp199 = _mm512_shuffle_f32x4(tmp236, tmp236, 136);
tmp197 = _mm512_shuffle_f32x4(tmp238, tmp238, 136);
tmp202 = _mm512_shuffle_f32x4(tmp240, tmp240, 136);
in13 = _mm512_shuffle_f32x4(tmp242, tmp242, 136);
// ... and pair the low halves of two vectors into each of six packed inputs.
in11 = _mm512_shuffle_f32x4(in11, tmp201, 68);
tmp200 = _mm512_shuffle_f32x4(tmp200, tmp199, 68);
tmp198 = _mm512_shuffle_f32x4(tmp198, tmp197, 68);
tmp202 = _mm512_shuffle_f32x4(tmp202, tmp204, 68);
in13 = _mm512_shuffle_f32x4(in13, tmp205, 68);
tmp203 = _mm512_shuffle_f32x4(tmp203, tmp206, 68);
// Second pass: the same combinations as the first pass, applied to two
// independent triples at once: (in11, tmp200, tmp198) and (tmp202, in13, tmp203).
__m512 tmp207 = _mm512_fmadd_ps(in11, _mm512_set1_ps(4e+00f), tmp198);
__m512 tmp213 = _mm512_fmadd_ps(tmp202, _mm512_set1_ps(4e+00f), tmp203);
__m512 tmp208 = _mm512_add_ps(in11, tmp198);
__m512 tmp214 = _mm512_add_ps(tmp202, tmp203);
__m512 tmp209 = _mm512_fmadd_ps(tmp198, _mm512_set1_ps(4e+00f), in11);
__m512 tmp215 = _mm512_fmadd_ps(tmp203, _mm512_set1_ps(4e+00f), tmp202);
__m512 tmp210 = _mm512_add_ps(tmp200, tmp208);
__m512 tmp216 = _mm512_add_ps(in13, tmp214);
__m512 tmp211 = _mm512_fmadd_ps(tmp200, _mm512_set1_ps(2e+00f), tmp209);
__m512 tmp217 = _mm512_fmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp215);
tmp209 = _mm512_fnmadd_ps(tmp200, _mm512_set1_ps(2e+00f), tmp209);
tmp215 = _mm512_fnmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp215);
__m512 tmp212 = _mm512_fnmadd_ps(tmp200, _mm512_set1_ps(2e+00f), tmp207);
__m512 tmp218 = _mm512_fnmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp213);
tmp207 = _mm512_fmadd_ps(tmp200, _mm512_set1_ps(2e+00f), tmp207);
tmp213 = _mm512_fmadd_ps(in13, _mm512_set1_ps(2e+00f), tmp213);
tmp208 = _mm512_sub_ps(tmp208, tmp200);
tmp214 = _mm512_sub_ps(tmp214, in13);
// Apply the fixed per-lane scale factors. Each constant vector repeats the same
// 8-lane pattern in both 256-bit halves; the values are products of
// 1, -2/9, 1/90 and 1/180.
in11 = _mm512_mul_ps(in11, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp210 = _mm512_mul_ps(tmp210, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp208 = _mm512_mul_ps(tmp208, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp211 = _mm512_mul_ps(tmp211, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp209 = _mm512_mul_ps(tmp209, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp207 = _mm512_mul_ps(tmp207, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp212 = _mm512_mul_ps(tmp212, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp198 = _mm512_mul_ps(tmp198, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp202 = _mm512_mul_ps(tmp202, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp216 = _mm512_mul_ps(tmp216, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp214 = _mm512_mul_ps(tmp214, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp217 = _mm512_mul_ps(tmp217, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp215 = _mm512_mul_ps(tmp215, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp213 = _mm512_mul_ps(tmp213, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp218 = _mm512_mul_ps(tmp218, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp203 = _mm512_mul_ps(tmp203, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup 256-bit halves: selector 68 joins the low halves of both operands,
// 238 joins the high halves.
__m512 out7 = _mm512_shuffle_f32x4(in11, tmp210, 68);
__m512 out11 = _mm512_shuffle_f32x4(in11, tmp210, 238);
__m512 out8 = _mm512_shuffle_f32x4(tmp208, tmp211, 68);
__m512 out12 = _mm512_shuffle_f32x4(tmp208, tmp211, 238);
__m512 out9 = _mm512_shuffle_f32x4(tmp209, tmp207, 68);
__m512 out13 = _mm512_shuffle_f32x4(tmp209, tmp207, 238);
__m512 out10 = _mm512_shuffle_f32x4(tmp212, tmp198, 68);
__m512 out14 = _mm512_shuffle_f32x4(tmp212, tmp198, 238);
__m512 out15 = _mm512_shuffle_f32x4(tmp202, tmp216, 68);
__m512 out19 = _mm512_shuffle_f32x4(tmp202, tmp216, 238);
__m512 out16 = _mm512_shuffle_f32x4(tmp214, tmp217, 68);
__m512 out20 = _mm512_shuffle_f32x4(tmp214, tmp217, 238);
__m512 out17 = _mm512_shuffle_f32x4(tmp215, tmp213, 68);
__m512 out21 = _mm512_shuffle_f32x4(tmp215, tmp213, 238);
__m512 out18 = _mm512_shuffle_f32x4(tmp218, tmp203, 68);
__m512 out22 = _mm512_shuffle_f32x4(tmp218, tmp203, 238);
// Destination byte offsets of the four 32-byte slots; with cut3 == 0 they
// evaluate to 0, 32, 64 and 96.
ptrdiff_t off1 = 32*cut3;
ptrdiff_t off2 = (size_t)(cut3+1)/4*512+(size_t)(cut3+1)%4*32;
ptrdiff_t off3 = (size_t)(cut3+2)/4*512+(size_t)(cut3+2)%4*32;
ptrdiff_t off4 = (size_t)(cut3+3)/4*512+(size_t)(cut3+3)%4*32;
// Round each result to half precision (round-to-nearest, exceptions suppressed);
// the 16 halves occupy the low 256 bits of the widened register.
__m512i wf33 = _mm512_castsi256_si512(_mm512_cvtps_ph(out7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf34 = _mm512_castsi256_si512(_mm512_cvtps_ph(out11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf35 = _mm512_castsi256_si512(_mm512_cvtps_ph(out15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf36 = _mm512_castsi256_si512(_mm512_cvtps_ph(out19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf37 = _mm512_castsi256_si512(_mm512_cvtps_ph(out8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf38 = _mm512_castsi256_si512(_mm512_cvtps_ph(out12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf39 = _mm512_castsi256_si512(_mm512_cvtps_ph(out16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf40 = _mm512_castsi256_si512(_mm512_cvtps_ph(out20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf41 = _mm512_castsi256_si512(_mm512_cvtps_ph(out9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf42 = _mm512_castsi256_si512(_mm512_cvtps_ph(out13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf43 = _mm512_castsi256_si512(_mm512_cvtps_ph(out17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf44 = _mm512_castsi256_si512(_mm512_cvtps_ph(out21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf45 = _mm512_castsi256_si512(_mm512_cvtps_ph(out10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf46 = _mm512_castsi256_si512(_mm512_cvtps_ph(out14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf47 = _mm512_castsi256_si512(_mm512_cvtps_ph(out18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf48 = _mm512_castsi256_si512(_mm512_cvtps_ph(out22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 on 32-bit lanes writes only the low 32 bytes (the 16 halves).
// Layout: 2048 bytes per i16, four 512-byte planes, 128 bytes per s11,
// four 32-byte slots (off1..off4) inside each.
_mm512_mask_storeu_epi32(wfPtr4+0+2048*i16+512*k50+off1+128*s11, 255, wf33);
_mm512_mask_storeu_epi32(wfPtr4+0+2048*i16+512*k50+off2+128*s11, 255, wf34);
_mm512_mask_storeu_epi32(wfPtr4+0+2048*i16+512*k50+off3+128*s11, 255, wf35);
_mm512_mask_storeu_epi32(wfPtr4+0+2048*i16+512*k50+off4+128*s11, 255, wf36);
_mm512_mask_storeu_epi32(wfPtr4+512+2048*i16+512*k50+off1+128*s11, 255, wf37);
_mm512_mask_storeu_epi32(wfPtr4+512+2048*i16+512*k50+off2+128*s11, 255, wf38);
_mm512_mask_storeu_epi32(wfPtr4+512+2048*i16+512*k50+off3+128*s11, 255, wf39);
_mm512_mask_storeu_epi32(wfPtr4+512+2048*i16+512*k50+off4+128*s11, 255, wf40);
_mm512_mask_storeu_epi32(wfPtr4+1024+2048*i16+512*k50+off1+128*s11, 255, wf41);
_mm512_mask_storeu_epi32(wfPtr4+1024+2048*i16+512*k50+off2+128*s11, 255, wf42);
_mm512_mask_storeu_epi32(wfPtr4+1024+2048*i16+512*k50+off3+128*s11, 255, wf43);
_mm512_mask_storeu_epi32(wfPtr4+1024+2048*i16+512*k50+off4+128*s11, 255, wf44);
_mm512_mask_storeu_epi32(wfPtr4+1536+2048*i16+512*k50+off1+128*s11, 255, wf45);
_mm512_mask_storeu_epi32(wfPtr4+1536+2048*i16+512*k50+off2+128*s11, 255, wf46);
_mm512_mask_storeu_epi32(wfPtr4+1536+2048*i16+512*k50+off3+128*s11, 255, wf47);
_mm512_mask_storeu_epi32(wfPtr4+1536+2048*i16+512*k50+off4+128*s11, 255, wf48);
}
// Bias for the same four channels: out = bias * bnMul + bnAdd.
// The !e7 test is always true here because e7 is fixed at 0.
__m512 bias2 = _mm512_setzero_ps();
if (!e7) {
bias2 = _mm512_maskz_loadu_ps(15, biasPtr4-0+16*i16+16*j10);
// De-interleave the (multiplier, addend) pairs: even lanes -> multiplier,
// odd lanes -> addend.
__m512i pmMul6 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd6 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas6 = _mm512_maskz_loadu_ps(255, bnPtr5+(ptrdiff_t)8*(0+4*i16+4*j10));
__m512 postMul12 = _mm512_permutexvar_ps(pmMul6, mas6);
__m512 postAdd6 = _mm512_permutexvar_ps(pmAdd6, mas6);
bias2 = _mm512_fmadd_ps(bias2, postMul12, postAdd6);
}
// Store the four bias floats (mask 15) at 16 bytes per i16.
_mm512_mask_storeu_ps(bfPtr4-0+16*i16+16*j10, 15, bias2);
// No effect on later iterations: j10 is re-declared at the top of the loop body.
j10 = 1;
}
}
}

// Hands ResNeXt50ThreeArrangeFilts1Callee1 to the thread team as a single
// three-dimensional task whose hull is 1x1x1.
//
// team22:    thread team passed through to ResNeXt50ThreaderDo1.
// tensors17: tensor pointer table, forwarded to the callee via the task's any1 field.
static void ResNeXt50ThreeArrangeFilts1(ResNeXt50ThreaderTeam1* team22, char** tensors17) {
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 3;
    // Every dimension of the hull has extent 1.
    for (int dim = 0; dim < 3; ++dim) {
        job.hull1[dim] = 1;
    }
    job.any1 = tensors17;
    job.callee1 = ResNeXt50ThreeArrangeFilts1Callee1;
    ResNeXt50ThreaderDo1(team22, &job);
}

static void ResNeXt50ThreeArrangeDats1Callee1(ResNeXt50ThreaderTask1* task22, int64_t* pt16) {
char** tensors20 = task22->any1;
ptrdiff_t s12 = 0;
ptrdiff_t c10 = 0;
ptrdiff_t g8 = pt16[2];
ptrdiff_t e8 = 0;
char*restrict datPtr5 = tensors20[0]-228+4992768*e8;
char*restrict dfPtr4 = tensors20[1]+324403200*e8;
ptrdiff_t i17 = 2*g8;
ptrdiff_t ii5 = i17+1;
for (; i17 <= ii5; ++i17) {
ptrdiff_t j11 = 17*c10;
if (j11 < 2) {
ptrdiff_t rel7 = j11-0;
ptrdiff_t base7 = 0;
if (rel7 < 1) {
ptrdiff_t h20 = base7+0;
ptrdiff_t w23 = 0;
ptrdiff_t k51 = 0;
for (; k51 != 2; ++k51) {
__m512 dat917 = _mm512_maskz_loadu_ps(8191, datPtr5+228+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat917 = _mm512_max_ps(_mm512_setzero_ps(), dat917);
__m512 dat918 = _mm512_maskz_loadu_ps(16383, datPtr5+272+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat918 = _mm512_max_ps(_mm512_setzero_ps(), dat918);
__m512i pm65 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in14 = _mm512_permutexvar_ps(pm65, dat917);
__m512i pm66 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in21 = _mm512_permutexvar_ps(pm66, dat918);
__m512 dat919 = _mm512_maskz_loadu_ps(8191, datPtr5+452+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat919 = _mm512_max_ps(_mm512_setzero_ps(), dat919);
__m512 dat920 = _mm512_maskz_loadu_ps(16383, datPtr5+496+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat920 = _mm512_max_ps(_mm512_setzero_ps(), dat920);
__m512 in15 = _mm512_permutexvar_ps(pm65, dat919);
__m512 in22 = _mm512_permutexvar_ps(pm66, dat920);
__m512 dat921 = _mm512_maskz_loadu_ps(8191, datPtr5+676+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat921 = _mm512_max_ps(_mm512_setzero_ps(), dat921);
__m512 dat922 = _mm512_maskz_loadu_ps(16383, datPtr5+720+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat922 = _mm512_max_ps(_mm512_setzero_ps(), dat922);
__m512 in16 = _mm512_permutexvar_ps(pm65, dat921);
__m512 in23 = _mm512_permutexvar_ps(pm66, dat922);
__m512 dat923 = _mm512_maskz_loadu_ps(8191, datPtr5+900+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat923 = _mm512_max_ps(_mm512_setzero_ps(), dat923);
__m512 dat924 = _mm512_maskz_loadu_ps(16383, datPtr5+944+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat924 = _mm512_max_ps(_mm512_setzero_ps(), dat924);
__m512 in17 = _mm512_permutexvar_ps(pm65, dat923);
__m512 in24 = _mm512_permutexvar_ps(pm66, dat924);
__m512 dat925 = _mm512_maskz_loadu_ps(8191, datPtr5+1124+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat925 = _mm512_max_ps(_mm512_setzero_ps(), dat925);
__m512 dat926 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat926 = _mm512_max_ps(_mm512_setzero_ps(), dat926);
__m512 in18 = _mm512_permutexvar_ps(pm65, dat925);
__m512 in25 = _mm512_permutexvar_ps(pm66, dat926);
__m512 dat927 = _mm512_maskz_loadu_ps(8191, datPtr5+1348+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat927 = _mm512_max_ps(_mm512_setzero_ps(), dat927);
__m512 dat928 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat928 = _mm512_max_ps(_mm512_setzero_ps(), dat928);
__m512 in19 = _mm512_permutexvar_ps(pm65, dat927);
__m512 in26 = _mm512_permutexvar_ps(pm66, dat928);
__m512 dat929 = _mm512_maskz_loadu_ps(8191, datPtr5+1572+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat929 = _mm512_max_ps(_mm512_setzero_ps(), dat929);
__m512 dat930 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat930 = _mm512_max_ps(_mm512_setzero_ps(), dat930);
__m512 in20 = _mm512_permutexvar_ps(pm65, dat929);
__m512 in27 = _mm512_permutexvar_ps(pm66, dat930);
__m512 tmp243 = _mm512_add_ps(in14, in18);
__m512 tmp248 = _mm512_add_ps(in21, in25);
__m512 tmp244 = _mm512_sub_ps(in17, in15);
__m512 tmp249 = _mm512_sub_ps(in24, in22);
__m512 tmp245 = _mm512_add_ps(in15, in19);
__m512 tmp250 = _mm512_add_ps(in22, in26);
__m512 tmp246 = _mm512_sub_ps(_mm512_setzero_ps(), in19);
__m512 tmp251 = _mm512_sub_ps(_mm512_setzero_ps(), in26);
tmp243 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-4.25e+00f), tmp243);
tmp248 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-4.25e+00f), tmp248);
tmp245 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-4.25e+00f), tmp245);
// Tail of the first elementwise pass over two register sets, in14..in20 and
// in21..in27; the pass begins above this excerpt. Statements are emitted in
// pairs: each operation is applied to the first set, then to the second set.
tmp250 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-4.25e+00f), tmp250);
tmp246 = _mm512_fmadd_ps(tmp244, _mm512_set1_ps(5.25e+00f), tmp246);
tmp251 = _mm512_fmadd_ps(tmp249, _mm512_set1_ps(5.25e+00f), tmp251);
// From the same two inputs: tmp244/tmp249 = 0.25*x + y, then in15/in22 = 4*x + y.
tmp244 = _mm512_fmadd_ps(in15, _mm512_set1_ps(2.5e-01f), in19);
tmp249 = _mm512_fmadd_ps(in22, _mm512_set1_ps(2.5e-01f), in26);
in15 = _mm512_fmadd_ps(in15, _mm512_set1_ps(4e+00f), in19);
in22 = _mm512_fmadd_ps(in22, _mm512_set1_ps(4e+00f), in26);
// Difference (new tmp247/tmp252) and sum (tmp245/tmp250) of the two partial sums.
__m512 tmp247 = _mm512_sub_ps(tmp245, tmp243);
__m512 tmp252 = _mm512_sub_ps(tmp250, tmp248);
tmp245 = _mm512_add_ps(tmp243, tmp245);
tmp250 = _mm512_add_ps(tmp248, tmp250);
// tmp243/tmp248 are reused as scratch from here on.
tmp243 = _mm512_fmadd_ps(in14, _mm512_set1_ps(2.5e-01f), in18);
tmp248 = _mm512_fmadd_ps(in21, _mm512_set1_ps(2.5e-01f), in25);
tmp244 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-1.25e+00f), tmp244);
tmp249 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-1.25e+00f), tmp249);
in17 = _mm512_fmadd_ps(in17, _mm512_set1_ps(-5e+00f), in15);
in24 = _mm512_fmadd_ps(in24, _mm512_set1_ps(-5e+00f), in22);
tmp243 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-1.25e+00f), tmp243);
tmp248 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-1.25e+00f), tmp248);
// Butterfly: in19/in26 = tmp244 + 2*tmp243, then tmp244/tmp249 = tmp244 - 2*tmp243.
in19 = _mm512_fmadd_ps(tmp243, _mm512_set1_ps(2e+00f), tmp244);
in26 = _mm512_fmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp249);
tmp244 = _mm512_fnmadd_ps(tmp243, _mm512_set1_ps(2e+00f), tmp244);
tmp249 = _mm512_fnmadd_ps(tmp248, _mm512_set1_ps(2e+00f), tmp249);
// in14/in21 are read here for the last time as plain inputs; the two lines
// after that overwrite them with in20 - in14 and in27 - in21.
tmp243 = _mm512_fmadd_ps(in18, _mm512_set1_ps(2.5e-01f), in14);
tmp248 = _mm512_fmadd_ps(in25, _mm512_set1_ps(2.5e-01f), in21);
in14 = _mm512_sub_ps(in20, in14);
in21 = _mm512_sub_ps(in27, in21);
tmp243 = _mm512_fmadd_ps(in16, _mm512_set1_ps(-1.25e+00f), tmp243);
tmp248 = _mm512_fmadd_ps(in23, _mm512_set1_ps(-1.25e+00f), tmp248);
// in16/in23 = 5.25*(in16 - in18) + in14 (with in14 already replaced above).
in16 = _mm512_sub_ps(in16, in18);
in23 = _mm512_sub_ps(in23, in25);
in16 = _mm512_fmadd_ps(in16, _mm512_set1_ps(5.25e+00f), in14);
in23 = _mm512_fmadd_ps(in23, _mm512_set1_ps(5.25e+00f), in21);
// Butterfly: in15/in22 = in17 + 2*tmp243, then in17/in24 = in17 - 2*tmp243.
in15 = _mm512_fmadd_ps(tmp243, _mm512_set1_ps(2e+00f), in17);
in22 = _mm512_fmadd_ps(tmp248, _mm512_set1_ps(2e+00f), in24);
in17 = _mm512_fnmadd_ps(tmp243, _mm512_set1_ps(2e+00f), in17);
in24 = _mm512_fnmadd_ps(tmp248, _mm512_set1_ps(2e+00f), in24);
// Transpose sixteen vectors, viewed as a 16x16 matrix of floats.
// Row order on entry: tmp246, tmp245, tmp247, in19, tmp244, in15, in17, in16
// (first set), then tmp251, tmp250, tmp252, in26, tmp249, in22, in24, in23
// (second set).
// Stage 1: interleave the 32-bit elements of two rows within each 128-bit lane
// (unpacklo takes the lower two elements of each lane, unpackhi the upper two).
__m512 tmp261 = _mm512_unpacklo_ps(tmp246, tmp245);
__m512 tmp262 = _mm512_unpackhi_ps(tmp246, tmp245);
__m512 tmp263 = _mm512_unpacklo_ps(tmp247, in19);
__m512 tmp264 = _mm512_unpackhi_ps(tmp247, in19);
__m512 tmp265 = _mm512_unpacklo_ps(tmp244, in15);
__m512 tmp266 = _mm512_unpackhi_ps(tmp244, in15);
__m512 tmp267 = _mm512_unpacklo_ps(in17, in16);
__m512 tmp268 = _mm512_unpackhi_ps(in17, in16);
__m512 tmp269 = _mm512_unpacklo_ps(tmp251, tmp250);
__m512 tmp270 = _mm512_unpackhi_ps(tmp251, tmp250);
__m512 tmp271 = _mm512_unpacklo_ps(tmp252, in26);
__m512 tmp272 = _mm512_unpackhi_ps(tmp252, in26);
__m512 tmp273 = _mm512_unpacklo_ps(tmp249, in22);
__m512 tmp274 = _mm512_unpackhi_ps(tmp249, in22);
__m512 tmp275 = _mm512_unpacklo_ps(in24, in23);
__m512 tmp276 = _mm512_unpackhi_ps(in24, in23);
// Stage 2: per 128-bit lane, immediate 68 (0x44) keeps elements 0,1 of both
// operands and 238 (0xEE) keeps elements 2,3, so each lane now holds one
// column position from four consecutive rows.
__m512 tmp277 = _mm512_shuffle_ps(tmp261, tmp263, 68);
__m512 tmp278 = _mm512_shuffle_ps(tmp261, tmp263, 238);
__m512 tmp279 = _mm512_shuffle_ps(tmp262, tmp264, 68);
__m512 tmp280 = _mm512_shuffle_ps(tmp262, tmp264, 238);
__m512 tmp281 = _mm512_shuffle_ps(tmp265, tmp267, 68);
__m512 tmp282 = _mm512_shuffle_ps(tmp265, tmp267, 238);
__m512 tmp283 = _mm512_shuffle_ps(tmp266, tmp268, 68);
__m512 tmp284 = _mm512_shuffle_ps(tmp266, tmp268, 238);
__m512 tmp285 = _mm512_shuffle_ps(tmp269, tmp271, 68);
__m512 tmp286 = _mm512_shuffle_ps(tmp269, tmp271, 238);
__m512 tmp287 = _mm512_shuffle_ps(tmp270, tmp272, 68);
__m512 tmp288 = _mm512_shuffle_ps(tmp270, tmp272, 238);
__m512 tmp289 = _mm512_shuffle_ps(tmp273, tmp275, 68);
__m512 tmp290 = _mm512_shuffle_ps(tmp273, tmp275, 238);
__m512 tmp291 = _mm512_shuffle_ps(tmp274, tmp276, 68);
__m512 tmp292 = _mm512_shuffle_ps(tmp274, tmp276, 238);
// Stage 3: move whole 128-bit lanes. Immediate 136 (0x88) takes lanes 0 and 2
// of each operand, 221 (0xDD) takes lanes 1 and 3. This joins rows 0..3 with
// rows 4..7 of each set.
__m512 tmp293 = _mm512_shuffle_f32x4(tmp277, tmp281, 136);
__m512 tmp294 = _mm512_shuffle_f32x4(tmp277, tmp281, 221);
__m512 tmp295 = _mm512_shuffle_f32x4(tmp278, tmp282, 136);
__m512 tmp296 = _mm512_shuffle_f32x4(tmp278, tmp282, 221);
__m512 tmp297 = _mm512_shuffle_f32x4(tmp279, tmp283, 136);
__m512 tmp298 = _mm512_shuffle_f32x4(tmp279, tmp283, 221);
__m512 tmp299 = _mm512_shuffle_f32x4(tmp280, tmp284, 136);
__m512 tmp300 = _mm512_shuffle_f32x4(tmp280, tmp284, 221);
__m512 tmp301 = _mm512_shuffle_f32x4(tmp285, tmp289, 136);
__m512 tmp302 = _mm512_shuffle_f32x4(tmp285, tmp289, 221);
__m512 tmp303 = _mm512_shuffle_f32x4(tmp286, tmp290, 136);
__m512 tmp304 = _mm512_shuffle_f32x4(tmp286, tmp290, 221);
__m512 tmp305 = _mm512_shuffle_f32x4(tmp287, tmp291, 136);
__m512 tmp306 = _mm512_shuffle_f32x4(tmp287, tmp291, 221);
__m512 tmp307 = _mm512_shuffle_f32x4(tmp288, tmp292, 136);
__m512 tmp308 = _mm512_shuffle_f32x4(tmp288, tmp292, 221);
// Stage 4: the same lane shuffles join the first-set half with the second-set
// half. The results go back into the original sixteen names: the k-th name of
// the first-set list now holds element k of every input row, the k-th name of
// the second-set list holds element k+8. In each result, lanes 0..7 come from
// first-set rows and lanes 8..15 from second-set rows.
tmp246 = _mm512_shuffle_f32x4(tmp293, tmp301, 136);
tmp251 = _mm512_shuffle_f32x4(tmp293, tmp301, 221);
tmp245 = _mm512_shuffle_f32x4(tmp295, tmp303, 136);
tmp250 = _mm512_shuffle_f32x4(tmp295, tmp303, 221);
tmp247 = _mm512_shuffle_f32x4(tmp297, tmp305, 136);
tmp252 = _mm512_shuffle_f32x4(tmp297, tmp305, 221);
in19 = _mm512_shuffle_f32x4(tmp299, tmp307, 136);
in26 = _mm512_shuffle_f32x4(tmp299, tmp307, 221);
tmp244 = _mm512_shuffle_f32x4(tmp294, tmp302, 136);
tmp249 = _mm512_shuffle_f32x4(tmp294, tmp302, 221);
in15 = _mm512_shuffle_f32x4(tmp296, tmp304, 136);
in22 = _mm512_shuffle_f32x4(tmp296, tmp304, 221);
in17 = _mm512_shuffle_f32x4(tmp298, tmp306, 136);
in24 = _mm512_shuffle_f32x4(tmp298, tmp306, 221);
in16 = _mm512_shuffle_f32x4(tmp300, tmp308, 136);
in23 = _mm512_shuffle_f32x4(tmp300, tmp308, 221);
// Second elementwise pass, applied to the transposed vectors. With
// c0..c7 = tmp246, tmp245, tmp247, in19, tmp244, in15, in17, in16 on entry,
// the values left at the end are (up to floating-point rounding of the FMAs):
//   tmp246 = (c0 - c6) + 5.25*(c4 - c2)
//   tmp255 = (c2 + c6 - 4.25*c4) + (c1 + c5 - 4.25*c3)
//   tmp256 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   in17   = (0.25*c2 + c6 - 1.25*c4) + 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp254 = (0.25*c2 + c6 - 1.25*c4) - 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp247 = (4*c2 + c6 - 5*c4) + 2*(c1 + 0.25*c5 - 1.25*c3)
//   tmp244 = (4*c2 + c6 - 5*c4) - 2*(c1 + 0.25*c5 - 1.25*c3)
//   in19   = (c7 - c1) + 5.25*(c3 - c5)
// Every statement is paired with the same operation on the second set
// (inputs tmp251, tmp250, tmp252, in26, tmp249, in22, in24, in23; results
// tmp251, tmp259, tmp260, in24, tmp258, tmp252, tmp249, in26).
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform; confirm against the generator before relying on that.
__m512 tmp253 = _mm512_add_ps(tmp245, in15);
__m512 tmp257 = _mm512_add_ps(tmp250, in22);
__m512 tmp254 = _mm512_sub_ps(tmp244, tmp247);
__m512 tmp258 = _mm512_sub_ps(tmp249, tmp252);
__m512 tmp255 = _mm512_add_ps(tmp247, in17);
__m512 tmp259 = _mm512_add_ps(tmp252, in24);
tmp246 = _mm512_sub_ps(tmp246, in17);
tmp251 = _mm512_sub_ps(tmp251, in24);
tmp253 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-4.25e+00f), tmp253);
tmp257 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-4.25e+00f), tmp257);
tmp255 = _mm512_fmadd_ps(tmp244, _mm512_set1_ps(-4.25e+00f), tmp255);
tmp259 = _mm512_fmadd_ps(tmp249, _mm512_set1_ps(-4.25e+00f), tmp259);
tmp246 = _mm512_fmadd_ps(tmp254, _mm512_set1_ps(5.25e+00f), tmp246);
tmp251 = _mm512_fmadd_ps(tmp258, _mm512_set1_ps(5.25e+00f), tmp251);
// tmp254/tmp258 are reused as scratch: 0.25*c2 + c6, while tmp247/tmp252 become 4*c2 + c6.
tmp254 = _mm512_fmadd_ps(tmp247, _mm512_set1_ps(2.5e-01f), in17);
tmp258 = _mm512_fmadd_ps(tmp252, _mm512_set1_ps(2.5e-01f), in24);
tmp247 = _mm512_fmadd_ps(tmp247, _mm512_set1_ps(4e+00f), in17);
tmp252 = _mm512_fmadd_ps(tmp252, _mm512_set1_ps(4e+00f), in24);
// Difference (tmp256/tmp260) and sum (tmp255/tmp259) of the two partial sums.
__m512 tmp256 = _mm512_sub_ps(tmp255, tmp253);
__m512 tmp260 = _mm512_sub_ps(tmp259, tmp257);
tmp255 = _mm512_add_ps(tmp253, tmp255);
tmp259 = _mm512_add_ps(tmp257, tmp259);
tmp253 = _mm512_fmadd_ps(tmp245, _mm512_set1_ps(2.5e-01f), in15);
tmp257 = _mm512_fmadd_ps(tmp250, _mm512_set1_ps(2.5e-01f), in22);
tmp254 = _mm512_fmadd_ps(tmp244, _mm512_set1_ps(-1.25e+00f), tmp254);
tmp258 = _mm512_fmadd_ps(tmp249, _mm512_set1_ps(-1.25e+00f), tmp258);
tmp244 = _mm512_fmadd_ps(tmp244, _mm512_set1_ps(-5e+00f), tmp247);
tmp249 = _mm512_fmadd_ps(tmp249, _mm512_set1_ps(-5e+00f), tmp252);
tmp253 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp253);
tmp257 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp257);
// Butterfly: in17/in24 = tmp254 + 2*tmp253, then tmp254/tmp258 = tmp254 - 2*tmp253.
in17 = _mm512_fmadd_ps(tmp253, _mm512_set1_ps(2e+00f), tmp254);
in24 = _mm512_fmadd_ps(tmp257, _mm512_set1_ps(2e+00f), tmp258);
tmp254 = _mm512_fnmadd_ps(tmp253, _mm512_set1_ps(2e+00f), tmp254);
tmp258 = _mm512_fnmadd_ps(tmp257, _mm512_set1_ps(2e+00f), tmp258);
// tmp245/tmp250 are read here for the last time as plain inputs, then
// overwritten with in16 - tmp245 and in23 - tmp250.
tmp253 = _mm512_fmadd_ps(in15, _mm512_set1_ps(2.5e-01f), tmp245);
tmp257 = _mm512_fmadd_ps(in22, _mm512_set1_ps(2.5e-01f), tmp250);
tmp245 = _mm512_sub_ps(in16, tmp245);
tmp250 = _mm512_sub_ps(in23, tmp250);
tmp253 = _mm512_fmadd_ps(in19, _mm512_set1_ps(-1.25e+00f), tmp253);
tmp257 = _mm512_fmadd_ps(in26, _mm512_set1_ps(-1.25e+00f), tmp257);
in19 = _mm512_sub_ps(in19, in15);
in26 = _mm512_sub_ps(in26, in22);
in19 = _mm512_fmadd_ps(in19, _mm512_set1_ps(5.25e+00f), tmp245);
in26 = _mm512_fmadd_ps(in26, _mm512_set1_ps(5.25e+00f), tmp250);
// Butterfly: tmp247/tmp252 = tmp244 + 2*tmp253, then tmp244/tmp249 = tmp244 - 2*tmp253.
tmp247 = _mm512_fmadd_ps(tmp253, _mm512_set1_ps(2e+00f), tmp244);
tmp252 = _mm512_fmadd_ps(tmp257, _mm512_set1_ps(2e+00f), tmp249);
tmp244 = _mm512_fnmadd_ps(tmp253, _mm512_set1_ps(2e+00f), tmp244);
tmp249 = _mm512_fnmadd_ps(tmp257, _mm512_set1_ps(2e+00f), tmp249);
// Pack the second-pass results pairwise into 16 output vectors.
// shuffle_f32x4 with 68 (0x44) concatenates the low 256 bits of both operands
// (lanes 0..7); 238 (0xEE) concatenates the high 256 bits (lanes 8..15).
// The lane halves are the first/second register set of the preceding
// transpose, so out23..out30 carry the low-lane data and out31..out38 the
// high-lane data.
__m512 out23 = _mm512_shuffle_f32x4(tmp246, tmp255, 68);
__m512 out31 = _mm512_shuffle_f32x4(tmp246, tmp255, 238);
__m512 out24 = _mm512_shuffle_f32x4(tmp256, in17, 68);
__m512 out32 = _mm512_shuffle_f32x4(tmp256, in17, 238);
__m512 out25 = _mm512_shuffle_f32x4(tmp254, tmp247, 68);
__m512 out33 = _mm512_shuffle_f32x4(tmp254, tmp247, 238);
__m512 out26 = _mm512_shuffle_f32x4(tmp244, in19, 68);
__m512 out34 = _mm512_shuffle_f32x4(tmp244, in19, 238);
__m512 out27 = _mm512_shuffle_f32x4(tmp251, tmp259, 68);
__m512 out35 = _mm512_shuffle_f32x4(tmp251, tmp259, 238);
__m512 out28 = _mm512_shuffle_f32x4(tmp260, in24, 68);
__m512 out36 = _mm512_shuffle_f32x4(tmp260, in24, 238);
__m512 out29 = _mm512_shuffle_f32x4(tmp258, tmp252, 68);
__m512 out37 = _mm512_shuffle_f32x4(tmp258, tmp252, 238);
__m512 out30 = _mm512_shuffle_f32x4(tmp249, in26, 68);
__m512 out38 = _mm512_shuffle_f32x4(tmp249, in26, 238);
// Store the 16 vectors to dfPtr4 with unaligned stores. Every address shares
// the term 102400*i17+1536*j11+1536*s12+768*k51; the constant parts are
// {0, 64, 128, 192} added to each of 0, 25600, 51200 and 76800.
// NOTE(review): dfPtr4 is declared outside this excerpt, so the unit of these
// offsets (bytes vs. elements) cannot be confirmed from here.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k51, out23);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k51, out31);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k51, out27);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k51, out35);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k51, out24);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k51, out32);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k51, out28);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k51, out36);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k51, out25);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k51, out33);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k51, out29);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k51, out37);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k51, out26);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k51, out34);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k51, out30);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k51, out38);
// Load seven rows for each of two register sets (in28..in34 and in35..in41)
// from datPtr5, clamp negative values to zero with max(0, x), and rearrange
// lanes with a fixed permutation per set.
//   - Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
//   - Mask 8191 (0x1FFF) loads lanes 0..12 and zeroes lanes 13..15.
//   - Consecutive rows differ by 224 in the constant offset
//     (320, 544, ..., 1664 for the first set; 12836, 13060, ..., 14180 for
//     the second set).
// NOTE(review): datPtr5 is declared outside this excerpt, so the unit of these
// offsets (bytes vs. elements) cannot be confirmed from here.
__m512 dat931 = _mm512_maskz_loadu_ps(16383, datPtr5+320+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat931 = _mm512_max_ps(_mm512_setzero_ps(), dat931);
__m512 dat932 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat932 = _mm512_max_ps(_mm512_setzero_ps(), dat932);
// pm67 (arguments are listed from lane 15 down to lane 0): result lanes 0..7
// take source lanes 0..7 and result lanes 8..15 take source lanes 6..13, i.e.
// two 8-lane windows that overlap by two lanes.
__m512i pm67 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in28 = _mm512_permutexvar_ps(pm67, dat931);
// pm68: result lane 0 takes source lane 15, which the 8191 mask has zeroed;
// result lanes 1..7 take source lanes 0..6 and result lanes 8..15 take source
// lanes 5..12. The first window therefore starts with a zero element.
__m512i pm68 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in35 = _mm512_permutexvar_ps(pm68, dat932);
__m512 dat933 = _mm512_maskz_loadu_ps(16383, datPtr5+544+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat933 = _mm512_max_ps(_mm512_setzero_ps(), dat933);
__m512 dat934 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat934 = _mm512_max_ps(_mm512_setzero_ps(), dat934);
__m512 in29 = _mm512_permutexvar_ps(pm67, dat933);
__m512 in36 = _mm512_permutexvar_ps(pm68, dat934);
__m512 dat935 = _mm512_maskz_loadu_ps(16383, datPtr5+768+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat935 = _mm512_max_ps(_mm512_setzero_ps(), dat935);
__m512 dat936 = _mm512_maskz_loadu_ps(8191, datPtr5+13284+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat936 = _mm512_max_ps(_mm512_setzero_ps(), dat936);
__m512 in30 = _mm512_permutexvar_ps(pm67, dat935);
__m512 in37 = _mm512_permutexvar_ps(pm68, dat936);
__m512 dat937 = _mm512_maskz_loadu_ps(16383, datPtr5+992+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat937 = _mm512_max_ps(_mm512_setzero_ps(), dat937);
__m512 dat938 = _mm512_maskz_loadu_ps(8191, datPtr5+13508+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat938 = _mm512_max_ps(_mm512_setzero_ps(), dat938);
__m512 in31 = _mm512_permutexvar_ps(pm67, dat937);
__m512 in38 = _mm512_permutexvar_ps(pm68, dat938);
__m512 dat939 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat939 = _mm512_max_ps(_mm512_setzero_ps(), dat939);
__m512 dat940 = _mm512_maskz_loadu_ps(8191, datPtr5+13732+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat940 = _mm512_max_ps(_mm512_setzero_ps(), dat940);
__m512 in32 = _mm512_permutexvar_ps(pm67, dat939);
__m512 in39 = _mm512_permutexvar_ps(pm68, dat940);
__m512 dat941 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat941 = _mm512_max_ps(_mm512_setzero_ps(), dat941);
__m512 dat942 = _mm512_maskz_loadu_ps(8191, datPtr5+13956+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat942 = _mm512_max_ps(_mm512_setzero_ps(), dat942);
__m512 in33 = _mm512_permutexvar_ps(pm67, dat941);
__m512 in40 = _mm512_permutexvar_ps(pm68, dat942);
__m512 dat943 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat943 = _mm512_max_ps(_mm512_setzero_ps(), dat943);
__m512 dat944 = _mm512_maskz_loadu_ps(8191, datPtr5+14180+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat944 = _mm512_max_ps(_mm512_setzero_ps(), dat944);
__m512 in34 = _mm512_permutexvar_ps(pm67, dat943);
__m512 in41 = _mm512_permutexvar_ps(pm68, dat944);
// First elementwise pass over the seven loaded rows of each set. With
// a1..a7 = in28..in34 on entry, the values left at the end are (up to
// floating-point rounding of the FMAs):
//   tmp312 = -a6 + 5.25*(a4 - a2)
//   tmp311 = (a2 + a6 - 4.25*a4) + (a1 + a5 - 4.25*a3)
//   tmp313 = (a2 + a6 - 4.25*a4) - (a1 + a5 - 4.25*a3)
//   in33   = (0.25*a2 + a6 - 1.25*a4) + 2*(0.25*a1 + a5 - 1.25*a3)
//   tmp310 = (0.25*a2 + a6 - 1.25*a4) - 2*(0.25*a1 + a5 - 1.25*a3)
//   in29   = (4*a2 + a6 - 5*a4) + 2*(a1 + 0.25*a5 - 1.25*a3)
//   in31   = (4*a2 + a6 - 5*a4) - 2*(a1 + 0.25*a5 - 1.25*a3)
//   in30   = (a7 - a1) + 5.25*(a3 - a5)
// Every statement is paired with the same operation on the second set
// (inputs in35..in41; results tmp317, tmp316, tmp318, in40, tmp315, in36,
// in38, in37).
// NOTE(review): these coefficients look like the Winograd F(6,3) input
// transform with the first of eight inputs taken as zero; confirm against
// the generator before relying on that.
__m512 tmp309 = _mm512_add_ps(in28, in32);
__m512 tmp314 = _mm512_add_ps(in35, in39);
__m512 tmp310 = _mm512_sub_ps(in31, in29);
__m512 tmp315 = _mm512_sub_ps(in38, in36);
__m512 tmp311 = _mm512_add_ps(in29, in33);
__m512 tmp316 = _mm512_add_ps(in36, in40);
// Seeded with 0 - in33 (0 - in40): only seven inputs per set feed this pass.
__m512 tmp312 = _mm512_sub_ps(_mm512_setzero_ps(), in33);
__m512 tmp317 = _mm512_sub_ps(_mm512_setzero_ps(), in40);
tmp309 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-4.25e+00f), tmp309);
tmp314 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-4.25e+00f), tmp314);
tmp311 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-4.25e+00f), tmp311);
tmp316 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-4.25e+00f), tmp316);
tmp312 = _mm512_fmadd_ps(tmp310, _mm512_set1_ps(5.25e+00f), tmp312);
tmp317 = _mm512_fmadd_ps(tmp315, _mm512_set1_ps(5.25e+00f), tmp317);
// tmp310/tmp315 are reused as scratch: 0.25*a2 + a6, while in29/in36 become 4*a2 + a6.
tmp310 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), in33);
tmp315 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), in40);
in29 = _mm512_fmadd_ps(in29, _mm512_set1_ps(4e+00f), in33);
in36 = _mm512_fmadd_ps(in36, _mm512_set1_ps(4e+00f), in40);
// Difference (tmp313/tmp318) and sum (tmp311/tmp316) of the two partial sums.
__m512 tmp313 = _mm512_sub_ps(tmp311, tmp309);
__m512 tmp318 = _mm512_sub_ps(tmp316, tmp314);
tmp311 = _mm512_add_ps(tmp309, tmp311);
tmp316 = _mm512_add_ps(tmp314, tmp316);
tmp309 = _mm512_fmadd_ps(in28, _mm512_set1_ps(2.5e-01f), in32);
tmp314 = _mm512_fmadd_ps(in35, _mm512_set1_ps(2.5e-01f), in39);
tmp310 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-1.25e+00f), tmp310);
tmp315 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-1.25e+00f), tmp315);
in31 = _mm512_fmadd_ps(in31, _mm512_set1_ps(-5e+00f), in29);
in38 = _mm512_fmadd_ps(in38, _mm512_set1_ps(-5e+00f), in36);
tmp309 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp309);
tmp314 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp314);
// Butterfly: in33/in40 = tmp310 + 2*tmp309, then tmp310/tmp315 = tmp310 - 2*tmp309.
in33 = _mm512_fmadd_ps(tmp309, _mm512_set1_ps(2e+00f), tmp310);
in40 = _mm512_fmadd_ps(tmp314, _mm512_set1_ps(2e+00f), tmp315);
tmp310 = _mm512_fnmadd_ps(tmp309, _mm512_set1_ps(2e+00f), tmp310);
tmp315 = _mm512_fnmadd_ps(tmp314, _mm512_set1_ps(2e+00f), tmp315);
// in28/in35 are read here for the last time as plain inputs, then overwritten
// with in34 - in28 and in41 - in35.
tmp309 = _mm512_fmadd_ps(in32, _mm512_set1_ps(2.5e-01f), in28);
tmp314 = _mm512_fmadd_ps(in39, _mm512_set1_ps(2.5e-01f), in35);
in28 = _mm512_sub_ps(in34, in28);
in35 = _mm512_sub_ps(in41, in35);
tmp309 = _mm512_fmadd_ps(in30, _mm512_set1_ps(-1.25e+00f), tmp309);
tmp314 = _mm512_fmadd_ps(in37, _mm512_set1_ps(-1.25e+00f), tmp314);
in30 = _mm512_sub_ps(in30, in32);
in37 = _mm512_sub_ps(in37, in39);
in30 = _mm512_fmadd_ps(in30, _mm512_set1_ps(5.25e+00f), in28);
in37 = _mm512_fmadd_ps(in37, _mm512_set1_ps(5.25e+00f), in35);
// Butterfly: in29/in36 = in31 + 2*tmp309, then in31/in38 = in31 - 2*tmp309.
in29 = _mm512_fmadd_ps(tmp309, _mm512_set1_ps(2e+00f), in31);
in36 = _mm512_fmadd_ps(tmp314, _mm512_set1_ps(2e+00f), in38);
in31 = _mm512_fnmadd_ps(tmp309, _mm512_set1_ps(2e+00f), in31);
in38 = _mm512_fnmadd_ps(tmp314, _mm512_set1_ps(2e+00f), in38);
// Transpose sixteen vectors, viewed as a 16x16 matrix of floats.
// Row order on entry: tmp312, tmp311, tmp313, in33, tmp310, in29, in31, in30
// (first set), then tmp317, tmp316, tmp318, in40, tmp315, in36, in38, in37
// (second set).
// Stage 1: interleave the 32-bit elements of two rows within each 128-bit lane
// (unpacklo takes the lower two elements of each lane, unpackhi the upper two).
__m512 tmp327 = _mm512_unpacklo_ps(tmp312, tmp311);
__m512 tmp328 = _mm512_unpackhi_ps(tmp312, tmp311);
__m512 tmp329 = _mm512_unpacklo_ps(tmp313, in33);
__m512 tmp330 = _mm512_unpackhi_ps(tmp313, in33);
__m512 tmp331 = _mm512_unpacklo_ps(tmp310, in29);
__m512 tmp332 = _mm512_unpackhi_ps(tmp310, in29);
__m512 tmp333 = _mm512_unpacklo_ps(in31, in30);
__m512 tmp334 = _mm512_unpackhi_ps(in31, in30);
__m512 tmp335 = _mm512_unpacklo_ps(tmp317, tmp316);
__m512 tmp336 = _mm512_unpackhi_ps(tmp317, tmp316);
__m512 tmp337 = _mm512_unpacklo_ps(tmp318, in40);
__m512 tmp338 = _mm512_unpackhi_ps(tmp318, in40);
__m512 tmp339 = _mm512_unpacklo_ps(tmp315, in36);
__m512 tmp340 = _mm512_unpackhi_ps(tmp315, in36);
__m512 tmp341 = _mm512_unpacklo_ps(in38, in37);
__m512 tmp342 = _mm512_unpackhi_ps(in38, in37);
// Stage 2: per 128-bit lane, immediate 68 (0x44) keeps elements 0,1 of both
// operands and 238 (0xEE) keeps elements 2,3, so each lane now holds one
// column position from four consecutive rows.
__m512 tmp343 = _mm512_shuffle_ps(tmp327, tmp329, 68);
__m512 tmp344 = _mm512_shuffle_ps(tmp327, tmp329, 238);
__m512 tmp345 = _mm512_shuffle_ps(tmp328, tmp330, 68);
__m512 tmp346 = _mm512_shuffle_ps(tmp328, tmp330, 238);
__m512 tmp347 = _mm512_shuffle_ps(tmp331, tmp333, 68);
__m512 tmp348 = _mm512_shuffle_ps(tmp331, tmp333, 238);
__m512 tmp349 = _mm512_shuffle_ps(tmp332, tmp334, 68);
__m512 tmp350 = _mm512_shuffle_ps(tmp332, tmp334, 238);
__m512 tmp351 = _mm512_shuffle_ps(tmp335, tmp337, 68);
__m512 tmp352 = _mm512_shuffle_ps(tmp335, tmp337, 238);
__m512 tmp353 = _mm512_shuffle_ps(tmp336, tmp338, 68);
__m512 tmp354 = _mm512_shuffle_ps(tmp336, tmp338, 238);
__m512 tmp355 = _mm512_shuffle_ps(tmp339, tmp341, 68);
__m512 tmp356 = _mm512_shuffle_ps(tmp339, tmp341, 238);
__m512 tmp357 = _mm512_shuffle_ps(tmp340, tmp342, 68);
__m512 tmp358 = _mm512_shuffle_ps(tmp340, tmp342, 238);
// Stage 3: move whole 128-bit lanes. Immediate 136 (0x88) takes lanes 0 and 2
// of each operand, 221 (0xDD) takes lanes 1 and 3. This joins rows 0..3 with
// rows 4..7 of each set.
__m512 tmp359 = _mm512_shuffle_f32x4(tmp343, tmp347, 136);
__m512 tmp360 = _mm512_shuffle_f32x4(tmp343, tmp347, 221);
__m512 tmp361 = _mm512_shuffle_f32x4(tmp344, tmp348, 136);
__m512 tmp362 = _mm512_shuffle_f32x4(tmp344, tmp348, 221);
__m512 tmp363 = _mm512_shuffle_f32x4(tmp345, tmp349, 136);
__m512 tmp364 = _mm512_shuffle_f32x4(tmp345, tmp349, 221);
__m512 tmp365 = _mm512_shuffle_f32x4(tmp346, tmp350, 136);
__m512 tmp366 = _mm512_shuffle_f32x4(tmp346, tmp350, 221);
__m512 tmp367 = _mm512_shuffle_f32x4(tmp351, tmp355, 136);
__m512 tmp368 = _mm512_shuffle_f32x4(tmp351, tmp355, 221);
__m512 tmp369 = _mm512_shuffle_f32x4(tmp352, tmp356, 136);
__m512 tmp370 = _mm512_shuffle_f32x4(tmp352, tmp356, 221);
__m512 tmp371 = _mm512_shuffle_f32x4(tmp353, tmp357, 136);
__m512 tmp372 = _mm512_shuffle_f32x4(tmp353, tmp357, 221);
__m512 tmp373 = _mm512_shuffle_f32x4(tmp354, tmp358, 136);
__m512 tmp374 = _mm512_shuffle_f32x4(tmp354, tmp358, 221);
// Stage 4: the same lane shuffles join the first-set half with the second-set
// half. The results go back into the original sixteen names: the k-th name of
// the first-set list now holds element k of every input row, the k-th name of
// the second-set list holds element k+8. In each result, lanes 0..7 come from
// first-set rows and lanes 8..15 from second-set rows.
tmp312 = _mm512_shuffle_f32x4(tmp359, tmp367, 136);
tmp317 = _mm512_shuffle_f32x4(tmp359, tmp367, 221);
tmp311 = _mm512_shuffle_f32x4(tmp361, tmp369, 136);
tmp316 = _mm512_shuffle_f32x4(tmp361, tmp369, 221);
tmp313 = _mm512_shuffle_f32x4(tmp363, tmp371, 136);
tmp318 = _mm512_shuffle_f32x4(tmp363, tmp371, 221);
in33 = _mm512_shuffle_f32x4(tmp365, tmp373, 136);
in40 = _mm512_shuffle_f32x4(tmp365, tmp373, 221);
tmp310 = _mm512_shuffle_f32x4(tmp360, tmp368, 136);
tmp315 = _mm512_shuffle_f32x4(tmp360, tmp368, 221);
in29 = _mm512_shuffle_f32x4(tmp362, tmp370, 136);
in36 = _mm512_shuffle_f32x4(tmp362, tmp370, 221);
in31 = _mm512_shuffle_f32x4(tmp364, tmp372, 136);
in38 = _mm512_shuffle_f32x4(tmp364, tmp372, 221);
in30 = _mm512_shuffle_f32x4(tmp366, tmp374, 136);
in37 = _mm512_shuffle_f32x4(tmp366, tmp374, 221);
// Second elementwise pass, applied to the transposed vectors. With
// c0..c7 = tmp312, tmp311, tmp313, in33, tmp310, in29, in31, in30 on entry,
// the values left at the end are (up to floating-point rounding of the FMAs):
//   tmp312 = (c0 - c6) + 5.25*(c4 - c2)
//   tmp321 = (c2 + c6 - 4.25*c4) + (c1 + c5 - 4.25*c3)
//   tmp322 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   in31   = (0.25*c2 + c6 - 1.25*c4) + 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp320 = (0.25*c2 + c6 - 1.25*c4) - 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp313 = (4*c2 + c6 - 5*c4) + 2*(c1 + 0.25*c5 - 1.25*c3)
//   tmp310 = (4*c2 + c6 - 5*c4) - 2*(c1 + 0.25*c5 - 1.25*c3)
//   in33   = (c7 - c1) + 5.25*(c3 - c5)
// Every statement is paired with the same operation on the second set
// (inputs tmp317, tmp316, tmp318, in40, tmp315, in36, in38, in37; results
// tmp317, tmp325, tmp326, in38, tmp324, tmp318, tmp315, in40).
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform; confirm against the generator before relying on that.
__m512 tmp319 = _mm512_add_ps(tmp311, in29);
__m512 tmp323 = _mm512_add_ps(tmp316, in36);
__m512 tmp320 = _mm512_sub_ps(tmp310, tmp313);
__m512 tmp324 = _mm512_sub_ps(tmp315, tmp318);
__m512 tmp321 = _mm512_add_ps(tmp313, in31);
__m512 tmp325 = _mm512_add_ps(tmp318, in38);
tmp312 = _mm512_sub_ps(tmp312, in31);
tmp317 = _mm512_sub_ps(tmp317, in38);
tmp319 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-4.25e+00f), tmp319);
tmp323 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-4.25e+00f), tmp323);
tmp321 = _mm512_fmadd_ps(tmp310, _mm512_set1_ps(-4.25e+00f), tmp321);
tmp325 = _mm512_fmadd_ps(tmp315, _mm512_set1_ps(-4.25e+00f), tmp325);
tmp312 = _mm512_fmadd_ps(tmp320, _mm512_set1_ps(5.25e+00f), tmp312);
tmp317 = _mm512_fmadd_ps(tmp324, _mm512_set1_ps(5.25e+00f), tmp317);
// tmp320/tmp324 are reused as scratch: 0.25*c2 + c6, while tmp313/tmp318 become 4*c2 + c6.
tmp320 = _mm512_fmadd_ps(tmp313, _mm512_set1_ps(2.5e-01f), in31);
tmp324 = _mm512_fmadd_ps(tmp318, _mm512_set1_ps(2.5e-01f), in38);
tmp313 = _mm512_fmadd_ps(tmp313, _mm512_set1_ps(4e+00f), in31);
tmp318 = _mm512_fmadd_ps(tmp318, _mm512_set1_ps(4e+00f), in38);
// Difference (tmp322/tmp326) and sum (tmp321/tmp325) of the two partial sums.
__m512 tmp322 = _mm512_sub_ps(tmp321, tmp319);
__m512 tmp326 = _mm512_sub_ps(tmp325, tmp323);
tmp321 = _mm512_add_ps(tmp319, tmp321);
tmp325 = _mm512_add_ps(tmp323, tmp325);
tmp319 = _mm512_fmadd_ps(tmp311, _mm512_set1_ps(2.5e-01f), in29);
tmp323 = _mm512_fmadd_ps(tmp316, _mm512_set1_ps(2.5e-01f), in36);
tmp320 = _mm512_fmadd_ps(tmp310, _mm512_set1_ps(-1.25e+00f), tmp320);
tmp324 = _mm512_fmadd_ps(tmp315, _mm512_set1_ps(-1.25e+00f), tmp324);
tmp310 = _mm512_fmadd_ps(tmp310, _mm512_set1_ps(-5e+00f), tmp313);
tmp315 = _mm512_fmadd_ps(tmp315, _mm512_set1_ps(-5e+00f), tmp318);
tmp319 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp319);
tmp323 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp323);
// Butterfly: in31/in38 = tmp320 + 2*tmp319, then tmp320/tmp324 = tmp320 - 2*tmp319.
in31 = _mm512_fmadd_ps(tmp319, _mm512_set1_ps(2e+00f), tmp320);
in38 = _mm512_fmadd_ps(tmp323, _mm512_set1_ps(2e+00f), tmp324);
tmp320 = _mm512_fnmadd_ps(tmp319, _mm512_set1_ps(2e+00f), tmp320);
tmp324 = _mm512_fnmadd_ps(tmp323, _mm512_set1_ps(2e+00f), tmp324);
// tmp311/tmp316 are read here for the last time as plain inputs, then
// overwritten with in30 - tmp311 and in37 - tmp316.
tmp319 = _mm512_fmadd_ps(in29, _mm512_set1_ps(2.5e-01f), tmp311);
tmp323 = _mm512_fmadd_ps(in36, _mm512_set1_ps(2.5e-01f), tmp316);
tmp311 = _mm512_sub_ps(in30, tmp311);
tmp316 = _mm512_sub_ps(in37, tmp316);
tmp319 = _mm512_fmadd_ps(in33, _mm512_set1_ps(-1.25e+00f), tmp319);
tmp323 = _mm512_fmadd_ps(in40, _mm512_set1_ps(-1.25e+00f), tmp323);
in33 = _mm512_sub_ps(in33, in29);
in40 = _mm512_sub_ps(in40, in36);
in33 = _mm512_fmadd_ps(in33, _mm512_set1_ps(5.25e+00f), tmp311);
in40 = _mm512_fmadd_ps(in40, _mm512_set1_ps(5.25e+00f), tmp316);
// Butterfly: tmp313/tmp318 = tmp310 + 2*tmp319, then tmp310/tmp315 = tmp310 - 2*tmp319.
tmp313 = _mm512_fmadd_ps(tmp319, _mm512_set1_ps(2e+00f), tmp310);
tmp318 = _mm512_fmadd_ps(tmp323, _mm512_set1_ps(2e+00f), tmp315);
tmp310 = _mm512_fnmadd_ps(tmp319, _mm512_set1_ps(2e+00f), tmp310);
tmp315 = _mm512_fnmadd_ps(tmp323, _mm512_set1_ps(2e+00f), tmp315);
// Pack the second-pass results pairwise into 16 output vectors.
// shuffle_f32x4 with 68 (0x44) concatenates the low 256 bits of both operands
// (lanes 0..7); 238 (0xEE) concatenates the high 256 bits (lanes 8..15).
// The lane halves are the first/second register set of the preceding
// transpose, so out39..out46 carry the low-lane data and out47..out54 the
// high-lane data.
__m512 out39 = _mm512_shuffle_f32x4(tmp312, tmp321, 68);
__m512 out47 = _mm512_shuffle_f32x4(tmp312, tmp321, 238);
__m512 out40 = _mm512_shuffle_f32x4(tmp322, in31, 68);
__m512 out48 = _mm512_shuffle_f32x4(tmp322, in31, 238);
__m512 out41 = _mm512_shuffle_f32x4(tmp320, tmp313, 68);
__m512 out49 = _mm512_shuffle_f32x4(tmp320, tmp313, 238);
__m512 out42 = _mm512_shuffle_f32x4(tmp310, in33, 68);
__m512 out50 = _mm512_shuffle_f32x4(tmp310, in33, 238);
__m512 out43 = _mm512_shuffle_f32x4(tmp317, tmp325, 68);
__m512 out51 = _mm512_shuffle_f32x4(tmp317, tmp325, 238);
__m512 out44 = _mm512_shuffle_f32x4(tmp326, in38, 68);
__m512 out52 = _mm512_shuffle_f32x4(tmp326, in38, 238);
__m512 out45 = _mm512_shuffle_f32x4(tmp324, tmp318, 68);
__m512 out53 = _mm512_shuffle_f32x4(tmp324, tmp318, 238);
__m512 out46 = _mm512_shuffle_f32x4(tmp315, in40, 68);
__m512 out54 = _mm512_shuffle_f32x4(tmp315, in40, 238);
// Store the 16 vectors to dfPtr4 with unaligned stores. Every address shares
// the term 102400*i17+1536*j11+1536*s12+768*k51; the constant parts are
// {256, 320, 384, 448} added to each of 0, 25600, 51200 and 76800.
// NOTE(review): dfPtr4 is declared outside this excerpt, so the unit of these
// offsets (bytes vs. elements) cannot be confirmed from here.
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k51, out39);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k51, out47);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k51, out43);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k51, out51);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k51, out40);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k51, out48);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k51, out44);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k51, out52);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k51, out41);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k51, out49);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k51, out45);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k51, out53);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k51, out42);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k51, out50);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k51, out46);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k51, out54);
// Load seven rows for each of two register sets (in42..in48 and in49..in55)
// from datPtr5, clamp negative values to zero with max(0, x), and rearrange
// lanes with one shared permutation.
//   - Both loads of a row use mask 16383 (0x3FFF): lanes 0..13 are loaded,
//     lanes 14..15 are zeroed.
//   - The second load of each row starts 48 past the first in the constant
//     offset (12880 vs. 12928, and so on).
//   - Consecutive rows differ by 224 in the constant offset.
// NOTE(review): datPtr5 is declared outside this excerpt, so the unit of these
// offsets (bytes vs. elements) cannot be confirmed from here.
__m512 dat945 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat945 = _mm512_max_ps(_mm512_setzero_ps(), dat945);
__m512 dat946 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat946 = _mm512_max_ps(_mm512_setzero_ps(), dat946);
// pm69 (arguments are listed from lane 15 down to lane 0): result lanes 0..7
// take source lanes 0..7 and result lanes 8..15 take source lanes 6..13, i.e.
// two 8-lane windows that overlap by two lanes.
__m512i pm69 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in42 = _mm512_permutexvar_ps(pm69, dat945);
__m512 in49 = _mm512_permutexvar_ps(pm69, dat946);
__m512 dat947 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat947 = _mm512_max_ps(_mm512_setzero_ps(), dat947);
__m512 dat948 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat948 = _mm512_max_ps(_mm512_setzero_ps(), dat948);
__m512 in43 = _mm512_permutexvar_ps(pm69, dat947);
__m512 in50 = _mm512_permutexvar_ps(pm69, dat948);
__m512 dat949 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat949 = _mm512_max_ps(_mm512_setzero_ps(), dat949);
__m512 dat950 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat950 = _mm512_max_ps(_mm512_setzero_ps(), dat950);
__m512 in44 = _mm512_permutexvar_ps(pm69, dat949);
__m512 in51 = _mm512_permutexvar_ps(pm69, dat950);
__m512 dat951 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat951 = _mm512_max_ps(_mm512_setzero_ps(), dat951);
__m512 dat952 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat952 = _mm512_max_ps(_mm512_setzero_ps(), dat952);
__m512 in45 = _mm512_permutexvar_ps(pm69, dat951);
__m512 in52 = _mm512_permutexvar_ps(pm69, dat952);
__m512 dat953 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat953 = _mm512_max_ps(_mm512_setzero_ps(), dat953);
__m512 dat954 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat954 = _mm512_max_ps(_mm512_setzero_ps(), dat954);
__m512 in46 = _mm512_permutexvar_ps(pm69, dat953);
__m512 in53 = _mm512_permutexvar_ps(pm69, dat954);
__m512 dat955 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat955 = _mm512_max_ps(_mm512_setzero_ps(), dat955);
__m512 dat956 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat956 = _mm512_max_ps(_mm512_setzero_ps(), dat956);
__m512 in47 = _mm512_permutexvar_ps(pm69, dat955);
__m512 in54 = _mm512_permutexvar_ps(pm69, dat956);
__m512 dat957 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat957 = _mm512_max_ps(_mm512_setzero_ps(), dat957);
__m512 dat958 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+50432*i17+224*h20+4*w23+50432*s12+25216*k51);
dat958 = _mm512_max_ps(_mm512_setzero_ps(), dat958);
__m512 in48 = _mm512_permutexvar_ps(pm69, dat957);
__m512 in55 = _mm512_permutexvar_ps(pm69, dat958);
// First elementwise pass over the seven loaded rows of each set. With
// a1..a7 = in42..in48 on entry, the values left at the end are (up to
// floating-point rounding of the FMAs):
//   tmp378 = -a6 + 5.25*(a4 - a2)
//   tmp377 = (a2 + a6 - 4.25*a4) + (a1 + a5 - 4.25*a3)
//   tmp379 = (a2 + a6 - 4.25*a4) - (a1 + a5 - 4.25*a3)
//   in47   = (0.25*a2 + a6 - 1.25*a4) + 2*(0.25*a1 + a5 - 1.25*a3)
//   tmp376 = (0.25*a2 + a6 - 1.25*a4) - 2*(0.25*a1 + a5 - 1.25*a3)
//   in43   = (4*a2 + a6 - 5*a4) + 2*(a1 + 0.25*a5 - 1.25*a3)
//   in45   = (4*a2 + a6 - 5*a4) - 2*(a1 + 0.25*a5 - 1.25*a3)
//   in44   = (a7 - a1) + 5.25*(a3 - a5)
// Every statement is paired with the same operation on the second set
// (inputs in49..in55; results tmp383, tmp382, tmp384, in54, tmp381, in50,
// in52, in51).
// NOTE(review): these coefficients look like the Winograd F(6,3) input
// transform with the first of eight inputs taken as zero; confirm against
// the generator before relying on that.
__m512 tmp375 = _mm512_add_ps(in42, in46);
__m512 tmp380 = _mm512_add_ps(in49, in53);
__m512 tmp376 = _mm512_sub_ps(in45, in43);
__m512 tmp381 = _mm512_sub_ps(in52, in50);
__m512 tmp377 = _mm512_add_ps(in43, in47);
__m512 tmp382 = _mm512_add_ps(in50, in54);
// Seeded with 0 - in47 (0 - in54): only seven inputs per set feed this pass.
__m512 tmp378 = _mm512_sub_ps(_mm512_setzero_ps(), in47);
__m512 tmp383 = _mm512_sub_ps(_mm512_setzero_ps(), in54);
tmp375 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-4.25e+00f), tmp375);
tmp380 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-4.25e+00f), tmp380);
tmp377 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-4.25e+00f), tmp377);
tmp382 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-4.25e+00f), tmp382);
tmp378 = _mm512_fmadd_ps(tmp376, _mm512_set1_ps(5.25e+00f), tmp378);
tmp383 = _mm512_fmadd_ps(tmp381, _mm512_set1_ps(5.25e+00f), tmp383);
// tmp376/tmp381 are reused as scratch: 0.25*a2 + a6, while in43/in50 become 4*a2 + a6.
tmp376 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), in47);
tmp381 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), in54);
in43 = _mm512_fmadd_ps(in43, _mm512_set1_ps(4e+00f), in47);
in50 = _mm512_fmadd_ps(in50, _mm512_set1_ps(4e+00f), in54);
// Difference (tmp379/tmp384) and sum (tmp377/tmp382) of the two partial sums.
__m512 tmp379 = _mm512_sub_ps(tmp377, tmp375);
__m512 tmp384 = _mm512_sub_ps(tmp382, tmp380);
tmp377 = _mm512_add_ps(tmp375, tmp377);
tmp382 = _mm512_add_ps(tmp380, tmp382);
tmp375 = _mm512_fmadd_ps(in42, _mm512_set1_ps(2.5e-01f), in46);
tmp380 = _mm512_fmadd_ps(in49, _mm512_set1_ps(2.5e-01f), in53);
tmp376 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-1.25e+00f), tmp376);
tmp381 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-1.25e+00f), tmp381);
in45 = _mm512_fmadd_ps(in45, _mm512_set1_ps(-5e+00f), in43);
in52 = _mm512_fmadd_ps(in52, _mm512_set1_ps(-5e+00f), in50);
tmp375 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp375);
tmp380 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp380);
// Butterfly: in47/in54 = tmp376 + 2*tmp375, then tmp376/tmp381 = tmp376 - 2*tmp375.
in47 = _mm512_fmadd_ps(tmp375, _mm512_set1_ps(2e+00f), tmp376);
in54 = _mm512_fmadd_ps(tmp380, _mm512_set1_ps(2e+00f), tmp381);
tmp376 = _mm512_fnmadd_ps(tmp375, _mm512_set1_ps(2e+00f), tmp376);
tmp381 = _mm512_fnmadd_ps(tmp380, _mm512_set1_ps(2e+00f), tmp381);
// in42/in49 are read here for the last time as plain inputs, then overwritten
// with in48 - in42 and in55 - in49.
tmp375 = _mm512_fmadd_ps(in46, _mm512_set1_ps(2.5e-01f), in42);
tmp380 = _mm512_fmadd_ps(in53, _mm512_set1_ps(2.5e-01f), in49);
in42 = _mm512_sub_ps(in48, in42);
in49 = _mm512_sub_ps(in55, in49);
tmp375 = _mm512_fmadd_ps(in44, _mm512_set1_ps(-1.25e+00f), tmp375);
tmp380 = _mm512_fmadd_ps(in51, _mm512_set1_ps(-1.25e+00f), tmp380);
in44 = _mm512_sub_ps(in44, in46);
in51 = _mm512_sub_ps(in51, in53);
in44 = _mm512_fmadd_ps(in44, _mm512_set1_ps(5.25e+00f), in42);
in51 = _mm512_fmadd_ps(in51, _mm512_set1_ps(5.25e+00f), in49);
// Butterfly: in43/in50 = in45 + 2*tmp375, then in45/in52 = in45 - 2*tmp375.
in43 = _mm512_fmadd_ps(tmp375, _mm512_set1_ps(2e+00f), in45);
in50 = _mm512_fmadd_ps(tmp380, _mm512_set1_ps(2e+00f), in52);
in45 = _mm512_fnmadd_ps(tmp375, _mm512_set1_ps(2e+00f), in45);
in52 = _mm512_fnmadd_ps(tmp380, _mm512_set1_ps(2e+00f), in52);
// Transpose sixteen vectors, viewed as a 16x16 matrix of floats.
// Row order on entry: tmp378, tmp377, tmp379, in47, tmp376, in43, in45, in44
// (first set), then tmp383, tmp382, tmp384, in54, tmp381, in50, in52, in51
// (second set).
// Stage 1: interleave the 32-bit elements of two rows within each 128-bit lane
// (unpacklo takes the lower two elements of each lane, unpackhi the upper two).
__m512 tmp393 = _mm512_unpacklo_ps(tmp378, tmp377);
__m512 tmp394 = _mm512_unpackhi_ps(tmp378, tmp377);
__m512 tmp395 = _mm512_unpacklo_ps(tmp379, in47);
__m512 tmp396 = _mm512_unpackhi_ps(tmp379, in47);
__m512 tmp397 = _mm512_unpacklo_ps(tmp376, in43);
__m512 tmp398 = _mm512_unpackhi_ps(tmp376, in43);
__m512 tmp399 = _mm512_unpacklo_ps(in45, in44);
__m512 tmp400 = _mm512_unpackhi_ps(in45, in44);
__m512 tmp401 = _mm512_unpacklo_ps(tmp383, tmp382);
__m512 tmp402 = _mm512_unpackhi_ps(tmp383, tmp382);
__m512 tmp403 = _mm512_unpacklo_ps(tmp384, in54);
__m512 tmp404 = _mm512_unpackhi_ps(tmp384, in54);
__m512 tmp405 = _mm512_unpacklo_ps(tmp381, in50);
__m512 tmp406 = _mm512_unpackhi_ps(tmp381, in50);
__m512 tmp407 = _mm512_unpacklo_ps(in52, in51);
__m512 tmp408 = _mm512_unpackhi_ps(in52, in51);
// Stage 2: per 128-bit lane, immediate 68 (0x44) keeps elements 0,1 of both
// operands and 238 (0xEE) keeps elements 2,3, so each lane now holds one
// column position from four consecutive rows.
__m512 tmp409 = _mm512_shuffle_ps(tmp393, tmp395, 68);
__m512 tmp410 = _mm512_shuffle_ps(tmp393, tmp395, 238);
__m512 tmp411 = _mm512_shuffle_ps(tmp394, tmp396, 68);
__m512 tmp412 = _mm512_shuffle_ps(tmp394, tmp396, 238);
__m512 tmp413 = _mm512_shuffle_ps(tmp397, tmp399, 68);
__m512 tmp414 = _mm512_shuffle_ps(tmp397, tmp399, 238);
__m512 tmp415 = _mm512_shuffle_ps(tmp398, tmp400, 68);
__m512 tmp416 = _mm512_shuffle_ps(tmp398, tmp400, 238);
__m512 tmp417 = _mm512_shuffle_ps(tmp401, tmp403, 68);
__m512 tmp418 = _mm512_shuffle_ps(tmp401, tmp403, 238);
__m512 tmp419 = _mm512_shuffle_ps(tmp402, tmp404, 68);
__m512 tmp420 = _mm512_shuffle_ps(tmp402, tmp404, 238);
__m512 tmp421 = _mm512_shuffle_ps(tmp405, tmp407, 68);
__m512 tmp422 = _mm512_shuffle_ps(tmp405, tmp407, 238);
__m512 tmp423 = _mm512_shuffle_ps(tmp406, tmp408, 68);
__m512 tmp424 = _mm512_shuffle_ps(tmp406, tmp408, 238);
// Stage 3: move whole 128-bit lanes. Immediate 136 (0x88) takes lanes 0 and 2
// of each operand, 221 (0xDD) takes lanes 1 and 3. This joins rows 0..3 with
// rows 4..7 of each set.
__m512 tmp425 = _mm512_shuffle_f32x4(tmp409, tmp413, 136);
__m512 tmp426 = _mm512_shuffle_f32x4(tmp409, tmp413, 221);
__m512 tmp427 = _mm512_shuffle_f32x4(tmp410, tmp414, 136);
__m512 tmp428 = _mm512_shuffle_f32x4(tmp410, tmp414, 221);
__m512 tmp429 = _mm512_shuffle_f32x4(tmp411, tmp415, 136);
__m512 tmp430 = _mm512_shuffle_f32x4(tmp411, tmp415, 221);
__m512 tmp431 = _mm512_shuffle_f32x4(tmp412, tmp416, 136);
__m512 tmp432 = _mm512_shuffle_f32x4(tmp412, tmp416, 221);
__m512 tmp433 = _mm512_shuffle_f32x4(tmp417, tmp421, 136);
__m512 tmp434 = _mm512_shuffle_f32x4(tmp417, tmp421, 221);
__m512 tmp435 = _mm512_shuffle_f32x4(tmp418, tmp422, 136);
__m512 tmp436 = _mm512_shuffle_f32x4(tmp418, tmp422, 221);
__m512 tmp437 = _mm512_shuffle_f32x4(tmp419, tmp423, 136);
__m512 tmp438 = _mm512_shuffle_f32x4(tmp419, tmp423, 221);
__m512 tmp439 = _mm512_shuffle_f32x4(tmp420, tmp424, 136);
__m512 tmp440 = _mm512_shuffle_f32x4(tmp420, tmp424, 221);
// Stage 4: the same lane shuffles join the first-set half with the second-set
// half. The results go back into the original sixteen names: the k-th name of
// the first-set list now holds element k of every input row, the k-th name of
// the second-set list holds element k+8. In each result, lanes 0..7 come from
// first-set rows and lanes 8..15 from second-set rows.
tmp378 = _mm512_shuffle_f32x4(tmp425, tmp433, 136);
tmp383 = _mm512_shuffle_f32x4(tmp425, tmp433, 221);
tmp377 = _mm512_shuffle_f32x4(tmp427, tmp435, 136);
tmp382 = _mm512_shuffle_f32x4(tmp427, tmp435, 221);
tmp379 = _mm512_shuffle_f32x4(tmp429, tmp437, 136);
tmp384 = _mm512_shuffle_f32x4(tmp429, tmp437, 221);
in47 = _mm512_shuffle_f32x4(tmp431, tmp439, 136);
in54 = _mm512_shuffle_f32x4(tmp431, tmp439, 221);
tmp376 = _mm512_shuffle_f32x4(tmp426, tmp434, 136);
tmp381 = _mm512_shuffle_f32x4(tmp426, tmp434, 221);
in43 = _mm512_shuffle_f32x4(tmp428, tmp436, 136);
in50 = _mm512_shuffle_f32x4(tmp428, tmp436, 221);
in45 = _mm512_shuffle_f32x4(tmp430, tmp438, 136);
in52 = _mm512_shuffle_f32x4(tmp430, tmp438, 221);
in44 = _mm512_shuffle_f32x4(tmp432, tmp440, 136);
in51 = _mm512_shuffle_f32x4(tmp432, tmp440, 221);
// Start of the second elementwise pass over the transposed vectors
// (first set: tmp378, tmp377, tmp379, in47, tmp376, in43, in45, in44;
// second set: tmp383, tmp382, tmp384, in54, tmp381, in50, in52, in51).
// Statements are emitted in pairs, first set then second set. The rest of
// this pass lies below this excerpt.
__m512 tmp385 = _mm512_add_ps(tmp377, in43);
__m512 tmp389 = _mm512_add_ps(tmp382, in50);
__m512 tmp386 = _mm512_sub_ps(tmp376, tmp379);
__m512 tmp390 = _mm512_sub_ps(tmp381, tmp384);
__m512 tmp387 = _mm512_add_ps(tmp379, in45);
__m512 tmp391 = _mm512_add_ps(tmp384, in52);
tmp378 = _mm512_sub_ps(tmp378, in45);
tmp383 = _mm512_sub_ps(tmp383, in52);
tmp385 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-4.25e+00f), tmp385);
tmp389 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-4.25e+00f), tmp389);
tmp387 = _mm512_fmadd_ps(tmp376, _mm512_set1_ps(-4.25e+00f), tmp387);
tmp391 = _mm512_fmadd_ps(tmp381, _mm512_set1_ps(-4.25e+00f), tmp391);
// tmp378/tmp383 are complete here: (tmp378 - in45) + 5.25*(tmp376 - tmp379),
// in terms of the values on entry to this pass.
tmp378 = _mm512_fmadd_ps(tmp386, _mm512_set1_ps(5.25e+00f), tmp378);
tmp383 = _mm512_fmadd_ps(tmp390, _mm512_set1_ps(5.25e+00f), tmp383);
// tmp386/tmp390 are reused as scratch: 0.25*x + y, while tmp379/tmp384 become 4*x + y.
tmp386 = _mm512_fmadd_ps(tmp379, _mm512_set1_ps(2.5e-01f), in45);
tmp390 = _mm512_fmadd_ps(tmp384, _mm512_set1_ps(2.5e-01f), in52);
tmp379 = _mm512_fmadd_ps(tmp379, _mm512_set1_ps(4e+00f), in45);
tmp384 = _mm512_fmadd_ps(tmp384, _mm512_set1_ps(4e+00f), in52);
// Difference (tmp388/tmp392) and sum (tmp387/tmp391) of the two partial sums.
__m512 tmp388 = _mm512_sub_ps(tmp387, tmp385);
__m512 tmp392 = _mm512_sub_ps(tmp391, tmp389);
tmp387 = _mm512_add_ps(tmp385, tmp387);
tmp391 = _mm512_add_ps(tmp389, tmp391);
tmp385 = _mm512_fmadd_ps(tmp377, _mm512_set1_ps(2.5e-01f), in43);
tmp389 = _mm512_fmadd_ps(tmp382, _mm512_set1_ps(2.5e-01f), in50);
tmp386 = _mm512_fmadd_ps(tmp376, _mm512_set1_ps(-1.25e+00f), tmp386);
tmp390 = _mm512_fmadd_ps(tmp381, _mm512_set1_ps(-1.25e+00f), tmp390);
tmp376 = _mm512_fmadd_ps(tmp376, _mm512_set1_ps(-5e+00f), tmp379);
tmp381 = _mm512_fmadd_ps(tmp381, _mm512_set1_ps(-5e+00f), tmp384);
tmp385 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp385);
tmp389 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp389);
// Butterfly: in45/in52 = tmp386 + 2*tmp385, then tmp386/tmp390 = tmp386 - 2*tmp385.
in45 = _mm512_fmadd_ps(tmp385, _mm512_set1_ps(2e+00f), tmp386);
in52 = _mm512_fmadd_ps(tmp389, _mm512_set1_ps(2e+00f), tmp390);
tmp386 = _mm512_fnmadd_ps(tmp385, _mm512_set1_ps(2e+00f), tmp386);
tmp390 = _mm512_fnmadd_ps(tmp389, _mm512_set1_ps(2e+00f), tmp390);
tmp385 = _mm512_fmadd_ps(in43, _mm512_set1_ps(2.5e-01f), tmp377);
tmp389 = _mm512_fmadd_ps(in50, _mm512_set1_ps(2.5e-01f), tmp382);
tmp377 = _mm512_sub_ps(in44, tmp377);
tmp382 = _mm512_sub_ps(in51, tmp382);
tmp385 = _mm512_fmadd_ps(in47, _mm512_set1_ps(-1.25e+00f), tmp385);
tmp389 = _mm512_fmadd_ps(in54, _mm512_set1_ps(-1.25e+00f), tmp389);
in47 = _mm512_sub_ps(in47, in43);
in54 = _mm512_sub_ps(in54, in50);
in47 = _mm512_fmadd_ps(in47, _mm512_set1_ps(5.25e+00f), tmp377);
in54 = _mm512_fmadd_ps(in54, _mm512_set1_ps(5.25e+00f), tmp382);
tmp379 = _mm512_fmadd_ps(tmp385, _mm512_set1_ps(2e+00f), tmp376);
tmp384 = _mm512_fmadd_ps(tmp389, _mm512_set1_ps(2e+00f), tmp381);
tmp376 = _mm512_fnmadd_ps(tmp385, _mm512_set1_ps(2e+00f), tmp376);
tmp381 = _mm512_fnmadd_ps(tmp389, _mm512_set1_ps(2e+00f), tmp381);
__m512 out55 = _mm512_shuffle_f32x4(tmp378, tmp387, 68);
__m512 out63 = _mm512_shuffle_f32x4(tmp378, tmp387, 238);
__m512 out56 = _mm512_shuffle_f32x4(tmp388, in45, 68);
__m512 out64 = _mm512_shuffle_f32x4(tmp388, in45, 238);
__m512 out57 = _mm512_shuffle_f32x4(tmp386, tmp379, 68);
__m512 out65 = _mm512_shuffle_f32x4(tmp386, tmp379, 238);
__m512 out58 = _mm512_shuffle_f32x4(tmp376, in47, 68);
__m512 out66 = _mm512_shuffle_f32x4(tmp376, in47, 238);
__m512 out59 = _mm512_shuffle_f32x4(tmp383, tmp391, 68);
__m512 out67 = _mm512_shuffle_f32x4(tmp383, tmp391, 238);
__m512 out60 = _mm512_shuffle_f32x4(tmp392, in52, 68);
__m512 out68 = _mm512_shuffle_f32x4(tmp392, in52, 238);
__m512 out61 = _mm512_shuffle_f32x4(tmp390, tmp384, 68);
__m512 out69 = _mm512_shuffle_f32x4(tmp390, tmp384, 238);
__m512 out62 = _mm512_shuffle_f32x4(tmp381, in54, 68);
__m512 out70 = _mm512_shuffle_f32x4(tmp381, in54, 238);
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k51, out55);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k51, out63);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k51, out59);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k51, out67);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k51, out56);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k51, out64);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k51, out60);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k51, out68);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k51, out57);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k51, out65);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k51, out61);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k51, out69);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k51, out58);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k51, out66);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k51, out62);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k51, out70);
}
++j11;
rel7 = 1;
}
// Values consumed by the loop that follows: h21 and w24 enter every datPtr5
// load address as 224*h21 + 4*w24, and k52 is the counter of the
// two-iteration loop (it contributes 25216*k52 to the load addresses and
// 768*k52 to the dfPtr4 store addresses).
ptrdiff_t h21 = base7+0;
ptrdiff_t w24 = 36;
ptrdiff_t k52 = 0;
for (; k52 != 2; ++k52) {
// Load pattern used for every input vector pair of this tile group:
//  * mask 16383 reads the low 14 lanes and mask 511 reads the low 9 lanes;
//    all other lanes are zeroed by the maskz load.
//  * _mm512_max_ps against zero clamps negative inputs to zero (ReLU).
//  * pm70 builds two 8-lane windows from one load: output lanes 0-7 take
//    source lanes 0-7 and output lanes 8-15 take source lanes 6-13, so the
//    two windows overlap by two lanes.
//  * pm71 does the same for the second load (constant offset 48 higher), but
//    only source lanes 6-8 exist for the upper window; output lanes 11-15
//    read source lane 15, which the 511 mask has already zeroed.
// NOTE(review): the zero-filled lanes presumably represent padding past the
// edge of the input -- confirm against the generator.
__m512 dat959 = _mm512_maskz_loadu_ps(16383, datPtr5+224+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat959 = _mm512_max_ps(_mm512_setzero_ps(), dat959);
__m512 dat960 = _mm512_maskz_loadu_ps(511, datPtr5+272+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat960 = _mm512_max_ps(_mm512_setzero_ps(), dat960);
__m512i pm70 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in56 = _mm512_permutexvar_ps(pm70, dat959);
__m512i pm71 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in63 = _mm512_permutexvar_ps(pm71, dat960);
__m512 dat961 = _mm512_maskz_loadu_ps(16383, datPtr5+448+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat961 = _mm512_max_ps(_mm512_setzero_ps(), dat961);
__m512 dat962 = _mm512_maskz_loadu_ps(511, datPtr5+496+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat962 = _mm512_max_ps(_mm512_setzero_ps(), dat962);
__m512 in57 = _mm512_permutexvar_ps(pm70, dat961);
__m512 in64 = _mm512_permutexvar_ps(pm71, dat962);
__m512 dat963 = _mm512_maskz_loadu_ps(16383, datPtr5+672+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat963 = _mm512_max_ps(_mm512_setzero_ps(), dat963);
__m512 dat964 = _mm512_maskz_loadu_ps(511, datPtr5+720+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat964 = _mm512_max_ps(_mm512_setzero_ps(), dat964);
__m512 in58 = _mm512_permutexvar_ps(pm70, dat963);
__m512 in65 = _mm512_permutexvar_ps(pm71, dat964);
__m512 dat965 = _mm512_maskz_loadu_ps(16383, datPtr5+896+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat965 = _mm512_max_ps(_mm512_setzero_ps(), dat965);
__m512 dat966 = _mm512_maskz_loadu_ps(511, datPtr5+944+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat966 = _mm512_max_ps(_mm512_setzero_ps(), dat966);
__m512 in59 = _mm512_permutexvar_ps(pm70, dat965);
__m512 in66 = _mm512_permutexvar_ps(pm71, dat966);
__m512 dat967 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat967 = _mm512_max_ps(_mm512_setzero_ps(), dat967);
__m512 dat968 = _mm512_maskz_loadu_ps(511, datPtr5+1168+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat968 = _mm512_max_ps(_mm512_setzero_ps(), dat968);
__m512 in60 = _mm512_permutexvar_ps(pm70, dat967);
__m512 in67 = _mm512_permutexvar_ps(pm71, dat968);
__m512 dat969 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat969 = _mm512_max_ps(_mm512_setzero_ps(), dat969);
__m512 dat970 = _mm512_maskz_loadu_ps(511, datPtr5+1392+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat970 = _mm512_max_ps(_mm512_setzero_ps(), dat970);
__m512 in61 = _mm512_permutexvar_ps(pm70, dat969);
__m512 in68 = _mm512_permutexvar_ps(pm71, dat970);
__m512 dat971 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat971 = _mm512_max_ps(_mm512_setzero_ps(), dat971);
__m512 dat972 = _mm512_maskz_loadu_ps(511, datPtr5+1616+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat972 = _mm512_max_ps(_mm512_setzero_ps(), dat972);
__m512 in62 = _mm512_permutexvar_ps(pm70, dat971);
__m512 in69 = _mm512_permutexvar_ps(pm71, dat972);
// Start of the lane-wise linear combination of the loaded vectors. Two
// independent chains are interleaved statement by statement: in56..in62
// (tmp441..tmp445) and in63..in69 (tmp446..tmp450).
// Only seven vectors were loaded per chain, so tmp444/tmp449 are seeded as
// 0 - x rather than (first vector) - x.
// NOTE(review): the zero presumably stands in for a padding row above the
// input -- confirm against the generator.
// NOTE(review): the constants used by the fmadd/fnmadd statements that follow
// (5.25, -4.25, 0.25, 4, -1.25, -5, 2) look like the input-transform matrix
// of Winograd F(6x6, 3x3); inferred from the coefficients only -- confirm.
__m512 tmp441 = _mm512_add_ps(in56, in60);
__m512 tmp446 = _mm512_add_ps(in63, in67);
__m512 tmp442 = _mm512_sub_ps(in59, in57);
__m512 tmp447 = _mm512_sub_ps(in66, in64);
__m512 tmp443 = _mm512_add_ps(in57, in61);
__m512 tmp448 = _mm512_add_ps(in64, in68);
__m512 tmp444 = _mm512_sub_ps(_mm512_setzero_ps(), in61);
__m512 tmp449 = _mm512_sub_ps(_mm512_setzero_ps(), in68);
tmp441 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-4.25e+00f), tmp441);
tmp446 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-4.25e+00f), tmp446);
tmp443 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-4.25e+00f), tmp443);
tmp448 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-4.25e+00f), tmp448);
tmp444 = _mm512_fmadd_ps(tmp442, _mm512_set1_ps(5.25e+00f), tmp444);
tmp449 = _mm512_fmadd_ps(tmp447, _mm512_set1_ps(5.25e+00f), tmp449);
tmp442 = _mm512_fmadd_ps(in57, _mm512_set1_ps(2.5e-01f), in61);
tmp447 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), in68);
in57 = _mm512_fmadd_ps(in57, _mm512_set1_ps(4e+00f), in61);
in64 = _mm512_fmadd_ps(in64, _mm512_set1_ps(4e+00f), in68);
__m512 tmp445 = _mm512_sub_ps(tmp443, tmp441);
__m512 tmp450 = _mm512_sub_ps(tmp448, tmp446);
tmp443 = _mm512_add_ps(tmp441, tmp443);
tmp448 = _mm512_add_ps(tmp446, tmp448);
tmp441 = _mm512_fmadd_ps(in56, _mm512_set1_ps(2.5e-01f), in60);
tmp446 = _mm512_fmadd_ps(in63, _mm512_set1_ps(2.5e-01f), in67);
tmp442 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-1.25e+00f), tmp442);
tmp447 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-1.25e+00f), tmp447);
in59 = _mm512_fmadd_ps(in59, _mm512_set1_ps(-5e+00f), in57);
in66 = _mm512_fmadd_ps(in66, _mm512_set1_ps(-5e+00f), in64);
tmp441 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-1.25e+00f), tmp441);
tmp446 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp446);
in61 = _mm512_fmadd_ps(tmp441, _mm512_set1_ps(2e+00f), tmp442);
in68 = _mm512_fmadd_ps(tmp446, _mm512_set1_ps(2e+00f), tmp447);
tmp442 = _mm512_fnmadd_ps(tmp441, _mm512_set1_ps(2e+00f), tmp442);
tmp447 = _mm512_fnmadd_ps(tmp446, _mm512_set1_ps(2e+00f), tmp447);
tmp441 = _mm512_fmadd_ps(in60, _mm512_set1_ps(2.5e-01f), in56);
tmp446 = _mm512_fmadd_ps(in67, _mm512_set1_ps(2.5e-01f), in63);
in56 = _mm512_sub_ps(in62, in56);
in63 = _mm512_sub_ps(in69, in63);
tmp441 = _mm512_fmadd_ps(in58, _mm512_set1_ps(-1.25e+00f), tmp441);
tmp446 = _mm512_fmadd_ps(in65, _mm512_set1_ps(-1.25e+00f), tmp446);
in58 = _mm512_sub_ps(in58, in60);
in65 = _mm512_sub_ps(in65, in67);
in58 = _mm512_fmadd_ps(in58, _mm512_set1_ps(5.25e+00f), in56);
in65 = _mm512_fmadd_ps(in65, _mm512_set1_ps(5.25e+00f), in63);
in57 = _mm512_fmadd_ps(tmp441, _mm512_set1_ps(2e+00f), in59);
in64 = _mm512_fmadd_ps(tmp446, _mm512_set1_ps(2e+00f), in66);
in59 = _mm512_fnmadd_ps(tmp441, _mm512_set1_ps(2e+00f), in59);
in66 = _mm512_fnmadd_ps(tmp446, _mm512_set1_ps(2e+00f), in66);
// First of three rearrangement stages applied to the sixteen result vectors.
// Here pairs of vectors are interleaved at 32-bit granularity within each
// 128-bit lane (unpacklo/unpackhi); the statements after this run continue
// with _mm512_shuffle_ps (selectors 68/238) and two rounds of
// _mm512_shuffle_f32x4 (selectors 136/221).
// NOTE(review): taken together this looks like an in-register transpose so
// that the same linear combination can then be applied along the other axis
// of each tile -- confirm against the generator.
__m512 tmp459 = _mm512_unpacklo_ps(tmp444, tmp443);
__m512 tmp460 = _mm512_unpackhi_ps(tmp444, tmp443);
__m512 tmp461 = _mm512_unpacklo_ps(tmp445, in61);
__m512 tmp462 = _mm512_unpackhi_ps(tmp445, in61);
__m512 tmp463 = _mm512_unpacklo_ps(tmp442, in57);
__m512 tmp464 = _mm512_unpackhi_ps(tmp442, in57);
__m512 tmp465 = _mm512_unpacklo_ps(in59, in58);
__m512 tmp466 = _mm512_unpackhi_ps(in59, in58);
__m512 tmp467 = _mm512_unpacklo_ps(tmp449, tmp448);
__m512 tmp468 = _mm512_unpackhi_ps(tmp449, tmp448);
__m512 tmp469 = _mm512_unpacklo_ps(tmp450, in68);
__m512 tmp470 = _mm512_unpackhi_ps(tmp450, in68);
__m512 tmp471 = _mm512_unpacklo_ps(tmp447, in64);
__m512 tmp472 = _mm512_unpackhi_ps(tmp447, in64);
__m512 tmp473 = _mm512_unpacklo_ps(in66, in65);
__m512 tmp474 = _mm512_unpackhi_ps(in66, in65);
__m512 tmp475 = _mm512_shuffle_ps(tmp459, tmp461, 68);
__m512 tmp476 = _mm512_shuffle_ps(tmp459, tmp461, 238);
__m512 tmp477 = _mm512_shuffle_ps(tmp460, tmp462, 68);
__m512 tmp478 = _mm512_shuffle_ps(tmp460, tmp462, 238);
__m512 tmp479 = _mm512_shuffle_ps(tmp463, tmp465, 68);
__m512 tmp480 = _mm512_shuffle_ps(tmp463, tmp465, 238);
__m512 tmp481 = _mm512_shuffle_ps(tmp464, tmp466, 68);
__m512 tmp482 = _mm512_shuffle_ps(tmp464, tmp466, 238);
__m512 tmp483 = _mm512_shuffle_ps(tmp467, tmp469, 68);
__m512 tmp484 = _mm512_shuffle_ps(tmp467, tmp469, 238);
__m512 tmp485 = _mm512_shuffle_ps(tmp468, tmp470, 68);
__m512 tmp486 = _mm512_shuffle_ps(tmp468, tmp470, 238);
__m512 tmp487 = _mm512_shuffle_ps(tmp471, tmp473, 68);
__m512 tmp488 = _mm512_shuffle_ps(tmp471, tmp473, 238);
__m512 tmp489 = _mm512_shuffle_ps(tmp472, tmp474, 68);
__m512 tmp490 = _mm512_shuffle_ps(tmp472, tmp474, 238);
__m512 tmp491 = _mm512_shuffle_f32x4(tmp475, tmp479, 136);
__m512 tmp492 = _mm512_shuffle_f32x4(tmp475, tmp479, 221);
__m512 tmp493 = _mm512_shuffle_f32x4(tmp476, tmp480, 136);
__m512 tmp494 = _mm512_shuffle_f32x4(tmp476, tmp480, 221);
__m512 tmp495 = _mm512_shuffle_f32x4(tmp477, tmp481, 136);
__m512 tmp496 = _mm512_shuffle_f32x4(tmp477, tmp481, 221);
__m512 tmp497 = _mm512_shuffle_f32x4(tmp478, tmp482, 136);
__m512 tmp498 = _mm512_shuffle_f32x4(tmp478, tmp482, 221);
__m512 tmp499 = _mm512_shuffle_f32x4(tmp483, tmp487, 136);
__m512 tmp500 = _mm512_shuffle_f32x4(tmp483, tmp487, 221);
__m512 tmp501 = _mm512_shuffle_f32x4(tmp484, tmp488, 136);
__m512 tmp502 = _mm512_shuffle_f32x4(tmp484, tmp488, 221);
__m512 tmp503 = _mm512_shuffle_f32x4(tmp485, tmp489, 136);
__m512 tmp504 = _mm512_shuffle_f32x4(tmp485, tmp489, 221);
__m512 tmp505 = _mm512_shuffle_f32x4(tmp486, tmp490, 136);
__m512 tmp506 = _mm512_shuffle_f32x4(tmp486, tmp490, 221);
tmp444 = _mm512_shuffle_f32x4(tmp491, tmp499, 136);
tmp449 = _mm512_shuffle_f32x4(tmp491, tmp499, 221);
tmp443 = _mm512_shuffle_f32x4(tmp493, tmp501, 136);
tmp448 = _mm512_shuffle_f32x4(tmp493, tmp501, 221);
tmp445 = _mm512_shuffle_f32x4(tmp495, tmp503, 136);
tmp450 = _mm512_shuffle_f32x4(tmp495, tmp503, 221);
in61 = _mm512_shuffle_f32x4(tmp497, tmp505, 136);
in68 = _mm512_shuffle_f32x4(tmp497, tmp505, 221);
tmp442 = _mm512_shuffle_f32x4(tmp492, tmp500, 136);
tmp447 = _mm512_shuffle_f32x4(tmp492, tmp500, 221);
in57 = _mm512_shuffle_f32x4(tmp494, tmp502, 136);
in64 = _mm512_shuffle_f32x4(tmp494, tmp502, 221);
in59 = _mm512_shuffle_f32x4(tmp496, tmp504, 136);
in66 = _mm512_shuffle_f32x4(tmp496, tmp504, 221);
in58 = _mm512_shuffle_f32x4(tmp498, tmp506, 136);
in65 = _mm512_shuffle_f32x4(tmp498, tmp506, 221);
__m512 tmp451 = _mm512_add_ps(tmp443, in57);
__m512 tmp455 = _mm512_add_ps(tmp448, in64);
__m512 tmp452 = _mm512_sub_ps(tmp442, tmp445);
__m512 tmp456 = _mm512_sub_ps(tmp447, tmp450);
__m512 tmp453 = _mm512_add_ps(tmp445, in59);
__m512 tmp457 = _mm512_add_ps(tmp450, in66);
tmp444 = _mm512_sub_ps(tmp444, in59);
tmp449 = _mm512_sub_ps(tmp449, in66);
tmp451 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-4.25e+00f), tmp451);
tmp455 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-4.25e+00f), tmp455);
tmp453 = _mm512_fmadd_ps(tmp442, _mm512_set1_ps(-4.25e+00f), tmp453);
tmp457 = _mm512_fmadd_ps(tmp447, _mm512_set1_ps(-4.25e+00f), tmp457);
tmp444 = _mm512_fmadd_ps(tmp452, _mm512_set1_ps(5.25e+00f), tmp444);
tmp449 = _mm512_fmadd_ps(tmp456, _mm512_set1_ps(5.25e+00f), tmp449);
tmp452 = _mm512_fmadd_ps(tmp445, _mm512_set1_ps(2.5e-01f), in59);
tmp456 = _mm512_fmadd_ps(tmp450, _mm512_set1_ps(2.5e-01f), in66);
tmp445 = _mm512_fmadd_ps(tmp445, _mm512_set1_ps(4e+00f), in59);
tmp450 = _mm512_fmadd_ps(tmp450, _mm512_set1_ps(4e+00f), in66);
__m512 tmp454 = _mm512_sub_ps(tmp453, tmp451);
__m512 tmp458 = _mm512_sub_ps(tmp457, tmp455);
tmp453 = _mm512_add_ps(tmp451, tmp453);
tmp457 = _mm512_add_ps(tmp455, tmp457);
tmp451 = _mm512_fmadd_ps(tmp443, _mm512_set1_ps(2.5e-01f), in57);
tmp455 = _mm512_fmadd_ps(tmp448, _mm512_set1_ps(2.5e-01f), in64);
tmp452 = _mm512_fmadd_ps(tmp442, _mm512_set1_ps(-1.25e+00f), tmp452);
tmp456 = _mm512_fmadd_ps(tmp447, _mm512_set1_ps(-1.25e+00f), tmp456);
tmp442 = _mm512_fmadd_ps(tmp442, _mm512_set1_ps(-5e+00f), tmp445);
tmp447 = _mm512_fmadd_ps(tmp447, _mm512_set1_ps(-5e+00f), tmp450);
tmp451 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-1.25e+00f), tmp451);
tmp455 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp455);
in59 = _mm512_fmadd_ps(tmp451, _mm512_set1_ps(2e+00f), tmp452);
in66 = _mm512_fmadd_ps(tmp455, _mm512_set1_ps(2e+00f), tmp456);
tmp452 = _mm512_fnmadd_ps(tmp451, _mm512_set1_ps(2e+00f), tmp452);
tmp456 = _mm512_fnmadd_ps(tmp455, _mm512_set1_ps(2e+00f), tmp456);
tmp451 = _mm512_fmadd_ps(in57, _mm512_set1_ps(2.5e-01f), tmp443);
tmp455 = _mm512_fmadd_ps(in64, _mm512_set1_ps(2.5e-01f), tmp448);
tmp443 = _mm512_sub_ps(in58, tmp443);
tmp448 = _mm512_sub_ps(in65, tmp448);
tmp451 = _mm512_fmadd_ps(in61, _mm512_set1_ps(-1.25e+00f), tmp451);
tmp455 = _mm512_fmadd_ps(in68, _mm512_set1_ps(-1.25e+00f), tmp455);
in61 = _mm512_sub_ps(in61, in57);
in68 = _mm512_sub_ps(in68, in64);
in61 = _mm512_fmadd_ps(in61, _mm512_set1_ps(5.25e+00f), tmp443);
in68 = _mm512_fmadd_ps(in68, _mm512_set1_ps(5.25e+00f), tmp448);
tmp445 = _mm512_fmadd_ps(tmp451, _mm512_set1_ps(2e+00f), tmp442);
tmp450 = _mm512_fmadd_ps(tmp455, _mm512_set1_ps(2e+00f), tmp447);
tmp442 = _mm512_fnmadd_ps(tmp451, _mm512_set1_ps(2e+00f), tmp442);
tmp447 = _mm512_fnmadd_ps(tmp455, _mm512_set1_ps(2e+00f), tmp447);
__m512 out71 = _mm512_shuffle_f32x4(tmp444, tmp453, 68);
__m512 out79 = _mm512_shuffle_f32x4(tmp444, tmp453, 238);
__m512 out72 = _mm512_shuffle_f32x4(tmp454, in59, 68);
__m512 out80 = _mm512_shuffle_f32x4(tmp454, in59, 238);
__m512 out73 = _mm512_shuffle_f32x4(tmp452, tmp445, 68);
__m512 out81 = _mm512_shuffle_f32x4(tmp452, tmp445, 238);
__m512 out74 = _mm512_shuffle_f32x4(tmp442, in61, 68);
__m512 out82 = _mm512_shuffle_f32x4(tmp442, in61, 238);
__m512 out75 = _mm512_shuffle_f32x4(tmp449, tmp457, 68);
__m512 out83 = _mm512_shuffle_f32x4(tmp449, tmp457, 238);
__m512 out76 = _mm512_shuffle_f32x4(tmp458, in66, 68);
__m512 out84 = _mm512_shuffle_f32x4(tmp458, in66, 238);
__m512 out77 = _mm512_shuffle_f32x4(tmp456, tmp450, 68);
__m512 out85 = _mm512_shuffle_f32x4(tmp456, tmp450, 238);
__m512 out78 = _mm512_shuffle_f32x4(tmp447, in68, 68);
__m512 out86 = _mm512_shuffle_f32x4(tmp447, in68, 238);
// Write the sixteen result vectors out71..out86 to dfPtr4 with unaligned
// stores. The stores form four groups whose base offsets are 0, 25600, 51200
// and 76800; inside each group the constant offsets are visited in the order
// +0, +128, +64, +192 (so the statements are not in ascending address order).
// Every address also adds 102400*i17 + 1536*j11 + 1536*s12 + 768*k52.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k52, out71);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k52, out79);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k52, out75);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k52, out83);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k52, out72);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k52, out80);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k52, out76);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k52, out84);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k52, out73);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k52, out81);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k52, out77);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k52, out85);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k52, out74);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k52, out82);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k52, out78);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k52, out86);
// Loads for the next tile group. Two different load shapes are mixed here:
//  * mask 8191 (low 13 lanes) + pm72: output lane 0 reads source lane 15,
//    which the mask has zeroed; output lanes 1-7 take source lanes 0-6 and
//    output lanes 8-15 take source lanes 5-12. The first window therefore
//    begins with one zero lane.
//  * mask 16383 (low 14 lanes) + pm73: output lanes 0-7 take source lanes
//    0-7 and output lanes 8-15 take source lanes 6-13, with no zero lane.
// The pm72 loads use constant offsets starting at 1204 while the pm73 loads
// start at 12832, i.e. the two halves of this group come from different
// regions of datPtr5.
// NOTE(review): the leading zero lane presumably represents padding before
// the first input column -- confirm against the generator.
__m512 dat973 = _mm512_maskz_loadu_ps(8191, datPtr5+1204+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat973 = _mm512_max_ps(_mm512_setzero_ps(), dat973);
__m512i pm72 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in70 = _mm512_permutexvar_ps(pm72, dat973);
__m512 dat974 = _mm512_maskz_loadu_ps(8191, datPtr5+1428+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat974 = _mm512_max_ps(_mm512_setzero_ps(), dat974);
__m512 dat975 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat975 = _mm512_max_ps(_mm512_setzero_ps(), dat975);
__m512 in71 = _mm512_permutexvar_ps(pm72, dat974);
__m512i pm73 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in78 = _mm512_permutexvar_ps(pm73, dat975);
__m512 dat976 = _mm512_maskz_loadu_ps(8191, datPtr5+1652+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat976 = _mm512_max_ps(_mm512_setzero_ps(), dat976);
__m512 dat977 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat977 = _mm512_max_ps(_mm512_setzero_ps(), dat977);
__m512 in72 = _mm512_permutexvar_ps(pm72, dat976);
__m512 in79 = _mm512_permutexvar_ps(pm73, dat977);
__m512 dat978 = _mm512_maskz_loadu_ps(8191, datPtr5+1876+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat978 = _mm512_max_ps(_mm512_setzero_ps(), dat978);
__m512 dat979 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat979 = _mm512_max_ps(_mm512_setzero_ps(), dat979);
__m512 in73 = _mm512_permutexvar_ps(pm72, dat978);
__m512 in80 = _mm512_permutexvar_ps(pm73, dat979);
__m512 dat980 = _mm512_maskz_loadu_ps(8191, datPtr5+2100+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat980 = _mm512_max_ps(_mm512_setzero_ps(), dat980);
__m512 dat981 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat981 = _mm512_max_ps(_mm512_setzero_ps(), dat981);
__m512 in74 = _mm512_permutexvar_ps(pm72, dat980);
__m512 in81 = _mm512_permutexvar_ps(pm73, dat981);
__m512 dat982 = _mm512_maskz_loadu_ps(8191, datPtr5+2324+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat982 = _mm512_max_ps(_mm512_setzero_ps(), dat982);
__m512 dat983 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat983 = _mm512_max_ps(_mm512_setzero_ps(), dat983);
__m512 in75 = _mm512_permutexvar_ps(pm72, dat982);
__m512 in82 = _mm512_permutexvar_ps(pm73, dat983);
__m512 dat984 = _mm512_maskz_loadu_ps(8191, datPtr5+2548+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat984 = _mm512_max_ps(_mm512_setzero_ps(), dat984);
__m512 dat985 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat985 = _mm512_max_ps(_mm512_setzero_ps(), dat985);
__m512 in76 = _mm512_permutexvar_ps(pm72, dat984);
__m512 in83 = _mm512_permutexvar_ps(pm73, dat985);
__m512 dat986 = _mm512_maskz_loadu_ps(8191, datPtr5+2772+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat986 = _mm512_max_ps(_mm512_setzero_ps(), dat986);
__m512 dat987 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat987 = _mm512_max_ps(_mm512_setzero_ps(), dat987);
__m512 in77 = _mm512_permutexvar_ps(pm72, dat986);
__m512 in84 = _mm512_permutexvar_ps(pm73, dat987);
// Start of the lane-wise linear combination for this tile group. The two
// interleaved chains differ in their first term: the in70..in77 chain has
// eight loaded vectors, so its first result is accumulated in place as
// in70 - in76, whereas the in78..in84 chain has only seven, so tmp514 is
// seeded as 0 - in83.
// NOTE(review): the zero presumably stands in for a padding row that has no
// backing memory -- confirm against the generator.
__m512 tmp507 = _mm512_add_ps(in71, in75);
__m512 tmp511 = _mm512_add_ps(in78, in82);
__m512 tmp508 = _mm512_sub_ps(in74, in72);
__m512 tmp512 = _mm512_sub_ps(in81, in79);
__m512 tmp509 = _mm512_add_ps(in72, in76);
__m512 tmp513 = _mm512_add_ps(in79, in83);
in70 = _mm512_sub_ps(in70, in76);
__m512 tmp514 = _mm512_sub_ps(_mm512_setzero_ps(), in83);
tmp507 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-4.25e+00f), tmp507);
tmp511 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-4.25e+00f), tmp511);
tmp509 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-4.25e+00f), tmp509);
tmp513 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-4.25e+00f), tmp513);
in70 = _mm512_fmadd_ps(tmp508, _mm512_set1_ps(5.25e+00f), in70);
tmp514 = _mm512_fmadd_ps(tmp512, _mm512_set1_ps(5.25e+00f), tmp514);
tmp508 = _mm512_fmadd_ps(in72, _mm512_set1_ps(2.5e-01f), in76);
tmp512 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), in83);
in72 = _mm512_fmadd_ps(in72, _mm512_set1_ps(4e+00f), in76);
in79 = _mm512_fmadd_ps(in79, _mm512_set1_ps(4e+00f), in83);
__m512 tmp510 = _mm512_sub_ps(tmp509, tmp507);
__m512 tmp515 = _mm512_sub_ps(tmp513, tmp511);
tmp509 = _mm512_add_ps(tmp507, tmp509);
tmp513 = _mm512_add_ps(tmp511, tmp513);
tmp507 = _mm512_fmadd_ps(in71, _mm512_set1_ps(2.5e-01f), in75);
tmp511 = _mm512_fmadd_ps(in78, _mm512_set1_ps(2.5e-01f), in82);
tmp508 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-1.25e+00f), tmp508);
tmp512 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-1.25e+00f), tmp512);
in74 = _mm512_fmadd_ps(in74, _mm512_set1_ps(-5e+00f), in72);
in81 = _mm512_fmadd_ps(in81, _mm512_set1_ps(-5e+00f), in79);
tmp507 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-1.25e+00f), tmp507);
tmp511 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp511);
in76 = _mm512_fmadd_ps(tmp507, _mm512_set1_ps(2e+00f), tmp508);
in83 = _mm512_fmadd_ps(tmp511, _mm512_set1_ps(2e+00f), tmp512);
tmp508 = _mm512_fnmadd_ps(tmp507, _mm512_set1_ps(2e+00f), tmp508);
tmp512 = _mm512_fnmadd_ps(tmp511, _mm512_set1_ps(2e+00f), tmp512);
tmp507 = _mm512_fmadd_ps(in75, _mm512_set1_ps(2.5e-01f), in71);
tmp511 = _mm512_fmadd_ps(in82, _mm512_set1_ps(2.5e-01f), in78);
in71 = _mm512_sub_ps(in77, in71);
in78 = _mm512_sub_ps(in84, in78);
tmp507 = _mm512_fmadd_ps(in73, _mm512_set1_ps(-1.25e+00f), tmp507);
tmp511 = _mm512_fmadd_ps(in80, _mm512_set1_ps(-1.25e+00f), tmp511);
in73 = _mm512_sub_ps(in73, in75);
in80 = _mm512_sub_ps(in80, in82);
in73 = _mm512_fmadd_ps(in73, _mm512_set1_ps(5.25e+00f), in71);
in80 = _mm512_fmadd_ps(in80, _mm512_set1_ps(5.25e+00f), in78);
in72 = _mm512_fmadd_ps(tmp507, _mm512_set1_ps(2e+00f), in74);
in79 = _mm512_fmadd_ps(tmp511, _mm512_set1_ps(2e+00f), in81);
in74 = _mm512_fnmadd_ps(tmp507, _mm512_set1_ps(2e+00f), in74);
in81 = _mm512_fnmadd_ps(tmp511, _mm512_set1_ps(2e+00f), in81);
__m512 tmp524 = _mm512_unpacklo_ps(in70, tmp509);
__m512 tmp525 = _mm512_unpackhi_ps(in70, tmp509);
__m512 tmp526 = _mm512_unpacklo_ps(tmp510, in76);
__m512 tmp527 = _mm512_unpackhi_ps(tmp510, in76);
__m512 tmp528 = _mm512_unpacklo_ps(tmp508, in72);
__m512 tmp529 = _mm512_unpackhi_ps(tmp508, in72);
__m512 tmp530 = _mm512_unpacklo_ps(in74, in73);
__m512 tmp531 = _mm512_unpackhi_ps(in74, in73);
__m512 tmp532 = _mm512_unpacklo_ps(tmp514, tmp513);
__m512 tmp533 = _mm512_unpackhi_ps(tmp514, tmp513);
__m512 tmp534 = _mm512_unpacklo_ps(tmp515, in83);
__m512 tmp535 = _mm512_unpackhi_ps(tmp515, in83);
__m512 tmp536 = _mm512_unpacklo_ps(tmp512, in79);
__m512 tmp537 = _mm512_unpackhi_ps(tmp512, in79);
__m512 tmp538 = _mm512_unpacklo_ps(in81, in80);
__m512 tmp539 = _mm512_unpackhi_ps(in81, in80);
__m512 tmp540 = _mm512_shuffle_ps(tmp524, tmp526, 68);
__m512 tmp541 = _mm512_shuffle_ps(tmp524, tmp526, 238);
__m512 tmp542 = _mm512_shuffle_ps(tmp525, tmp527, 68);
__m512 tmp543 = _mm512_shuffle_ps(tmp525, tmp527, 238);
__m512 tmp544 = _mm512_shuffle_ps(tmp528, tmp530, 68);
__m512 tmp545 = _mm512_shuffle_ps(tmp528, tmp530, 238);
__m512 tmp546 = _mm512_shuffle_ps(tmp529, tmp531, 68);
__m512 tmp547 = _mm512_shuffle_ps(tmp529, tmp531, 238);
__m512 tmp548 = _mm512_shuffle_ps(tmp532, tmp534, 68);
__m512 tmp549 = _mm512_shuffle_ps(tmp532, tmp534, 238);
__m512 tmp550 = _mm512_shuffle_ps(tmp533, tmp535, 68);
__m512 tmp551 = _mm512_shuffle_ps(tmp533, tmp535, 238);
__m512 tmp552 = _mm512_shuffle_ps(tmp536, tmp538, 68);
__m512 tmp553 = _mm512_shuffle_ps(tmp536, tmp538, 238);
__m512 tmp554 = _mm512_shuffle_ps(tmp537, tmp539, 68);
__m512 tmp555 = _mm512_shuffle_ps(tmp537, tmp539, 238);
__m512 tmp556 = _mm512_shuffle_f32x4(tmp540, tmp544, 136);
__m512 tmp557 = _mm512_shuffle_f32x4(tmp540, tmp544, 221);
__m512 tmp558 = _mm512_shuffle_f32x4(tmp541, tmp545, 136);
__m512 tmp559 = _mm512_shuffle_f32x4(tmp541, tmp545, 221);
__m512 tmp560 = _mm512_shuffle_f32x4(tmp542, tmp546, 136);
__m512 tmp561 = _mm512_shuffle_f32x4(tmp542, tmp546, 221);
__m512 tmp562 = _mm512_shuffle_f32x4(tmp543, tmp547, 136);
__m512 tmp563 = _mm512_shuffle_f32x4(tmp543, tmp547, 221);
__m512 tmp564 = _mm512_shuffle_f32x4(tmp548, tmp552, 136);
__m512 tmp565 = _mm512_shuffle_f32x4(tmp548, tmp552, 221);
__m512 tmp566 = _mm512_shuffle_f32x4(tmp549, tmp553, 136);
__m512 tmp567 = _mm512_shuffle_f32x4(tmp549, tmp553, 221);
__m512 tmp568 = _mm512_shuffle_f32x4(tmp550, tmp554, 136);
__m512 tmp569 = _mm512_shuffle_f32x4(tmp550, tmp554, 221);
__m512 tmp570 = _mm512_shuffle_f32x4(tmp551, tmp555, 136);
__m512 tmp571 = _mm512_shuffle_f32x4(tmp551, tmp555, 221);
in70 = _mm512_shuffle_f32x4(tmp556, tmp564, 136);
tmp514 = _mm512_shuffle_f32x4(tmp556, tmp564, 221);
tmp509 = _mm512_shuffle_f32x4(tmp558, tmp566, 136);
tmp513 = _mm512_shuffle_f32x4(tmp558, tmp566, 221);
tmp510 = _mm512_shuffle_f32x4(tmp560, tmp568, 136);
tmp515 = _mm512_shuffle_f32x4(tmp560, tmp568, 221);
in76 = _mm512_shuffle_f32x4(tmp562, tmp570, 136);
in83 = _mm512_shuffle_f32x4(tmp562, tmp570, 221);
tmp508 = _mm512_shuffle_f32x4(tmp557, tmp565, 136);
tmp512 = _mm512_shuffle_f32x4(tmp557, tmp565, 221);
in72 = _mm512_shuffle_f32x4(tmp559, tmp567, 136);
in79 = _mm512_shuffle_f32x4(tmp559, tmp567, 221);
in74 = _mm512_shuffle_f32x4(tmp561, tmp569, 136);
in81 = _mm512_shuffle_f32x4(tmp561, tmp569, 221);
in73 = _mm512_shuffle_f32x4(tmp563, tmp571, 136);
in80 = _mm512_shuffle_f32x4(tmp563, tmp571, 221);
__m512 tmp516 = _mm512_add_ps(tmp509, in72);
__m512 tmp520 = _mm512_add_ps(tmp513, in79);
__m512 tmp517 = _mm512_sub_ps(tmp508, tmp510);
__m512 tmp521 = _mm512_sub_ps(tmp512, tmp515);
__m512 tmp518 = _mm512_add_ps(tmp510, in74);
__m512 tmp522 = _mm512_add_ps(tmp515, in81);
in70 = _mm512_sub_ps(in70, in74);
tmp514 = _mm512_sub_ps(tmp514, in81);
tmp516 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-4.25e+00f), tmp516);
tmp520 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-4.25e+00f), tmp520);
tmp518 = _mm512_fmadd_ps(tmp508, _mm512_set1_ps(-4.25e+00f), tmp518);
tmp522 = _mm512_fmadd_ps(tmp512, _mm512_set1_ps(-4.25e+00f), tmp522);
in70 = _mm512_fmadd_ps(tmp517, _mm512_set1_ps(5.25e+00f), in70);
tmp514 = _mm512_fmadd_ps(tmp521, _mm512_set1_ps(5.25e+00f), tmp514);
tmp517 = _mm512_fmadd_ps(tmp510, _mm512_set1_ps(2.5e-01f), in74);
tmp521 = _mm512_fmadd_ps(tmp515, _mm512_set1_ps(2.5e-01f), in81);
tmp510 = _mm512_fmadd_ps(tmp510, _mm512_set1_ps(4e+00f), in74);
tmp515 = _mm512_fmadd_ps(tmp515, _mm512_set1_ps(4e+00f), in81);
__m512 tmp519 = _mm512_sub_ps(tmp518, tmp516);
__m512 tmp523 = _mm512_sub_ps(tmp522, tmp520);
tmp518 = _mm512_add_ps(tmp516, tmp518);
tmp522 = _mm512_add_ps(tmp520, tmp522);
tmp516 = _mm512_fmadd_ps(tmp509, _mm512_set1_ps(2.5e-01f), in72);
tmp520 = _mm512_fmadd_ps(tmp513, _mm512_set1_ps(2.5e-01f), in79);
tmp517 = _mm512_fmadd_ps(tmp508, _mm512_set1_ps(-1.25e+00f), tmp517);
tmp521 = _mm512_fmadd_ps(tmp512, _mm512_set1_ps(-1.25e+00f), tmp521);
tmp508 = _mm512_fmadd_ps(tmp508, _mm512_set1_ps(-5e+00f), tmp510);
tmp512 = _mm512_fmadd_ps(tmp512, _mm512_set1_ps(-5e+00f), tmp515);
tmp516 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-1.25e+00f), tmp516);
tmp520 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp520);
in74 = _mm512_fmadd_ps(tmp516, _mm512_set1_ps(2e+00f), tmp517);
in81 = _mm512_fmadd_ps(tmp520, _mm512_set1_ps(2e+00f), tmp521);
tmp517 = _mm512_fnmadd_ps(tmp516, _mm512_set1_ps(2e+00f), tmp517);
tmp521 = _mm512_fnmadd_ps(tmp520, _mm512_set1_ps(2e+00f), tmp521);
tmp516 = _mm512_fmadd_ps(in72, _mm512_set1_ps(2.5e-01f), tmp509);
tmp520 = _mm512_fmadd_ps(in79, _mm512_set1_ps(2.5e-01f), tmp513);
tmp509 = _mm512_sub_ps(in73, tmp509);
tmp513 = _mm512_sub_ps(in80, tmp513);
tmp516 = _mm512_fmadd_ps(in76, _mm512_set1_ps(-1.25e+00f), tmp516);
tmp520 = _mm512_fmadd_ps(in83, _mm512_set1_ps(-1.25e+00f), tmp520);
in76 = _mm512_sub_ps(in76, in72);
in83 = _mm512_sub_ps(in83, in79);
in76 = _mm512_fmadd_ps(in76, _mm512_set1_ps(5.25e+00f), tmp509);
in83 = _mm512_fmadd_ps(in83, _mm512_set1_ps(5.25e+00f), tmp513);
tmp510 = _mm512_fmadd_ps(tmp516, _mm512_set1_ps(2e+00f), tmp508);
tmp515 = _mm512_fmadd_ps(tmp520, _mm512_set1_ps(2e+00f), tmp512);
tmp508 = _mm512_fnmadd_ps(tmp516, _mm512_set1_ps(2e+00f), tmp508);
tmp512 = _mm512_fnmadd_ps(tmp520, _mm512_set1_ps(2e+00f), tmp512);
__m512 out87 = _mm512_shuffle_f32x4(in70, tmp518, 68);
__m512 out95 = _mm512_shuffle_f32x4(in70, tmp518, 238);
__m512 out88 = _mm512_shuffle_f32x4(tmp519, in74, 68);
__m512 out96 = _mm512_shuffle_f32x4(tmp519, in74, 238);
__m512 out89 = _mm512_shuffle_f32x4(tmp517, tmp510, 68);
__m512 out97 = _mm512_shuffle_f32x4(tmp517, tmp510, 238);
__m512 out90 = _mm512_shuffle_f32x4(tmp508, in76, 68);
__m512 out98 = _mm512_shuffle_f32x4(tmp508, in76, 238);
__m512 out91 = _mm512_shuffle_f32x4(tmp514, tmp522, 68);
__m512 out99 = _mm512_shuffle_f32x4(tmp514, tmp522, 238);
__m512 out92 = _mm512_shuffle_f32x4(tmp523, in81, 68);
__m512 out100 = _mm512_shuffle_f32x4(tmp523, in81, 238);
__m512 out93 = _mm512_shuffle_f32x4(tmp521, tmp515, 68);
__m512 out101 = _mm512_shuffle_f32x4(tmp521, tmp515, 238);
__m512 out94 = _mm512_shuffle_f32x4(tmp512, in83, 68);
__m512 out102 = _mm512_shuffle_f32x4(tmp512, in83, 238);
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k52, out87);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k52, out95);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k52, out91);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k52, out99);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k52, out88);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k52, out96);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k52, out92);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k52, out100);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k52, out89);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k52, out97);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k52, out93);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k52, out101);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k52, out90);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k52, out98);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k52, out94);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k52, out102);
// Load stage: reads two column groups, clamps every value at zero and
// rearranges lanes. One group yields in85..in91 (7 rows, mask 511), the other
// yields in92..in99 (8 rows, mask 8191). Within a group, consecutive rows are
// 224 apart in the datPtr5 offset.
// NOTE(review): the offsets (4*w24, 224 per row) look like byte offsets -
// confirm against the declaration of datPtr5, which is outside this view.
// Mask 8191 (0x1FFF) loads 13 floats; lanes 13-15 are zeroed.
__m512 dat988 = _mm512_maskz_loadu_ps(8191, datPtr5+13812+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
// Clamp negatives to zero (max with 0); repeated after every load below.
dat988 = _mm512_max_ps(_mm512_setzero_ps(), dat988);
// pm74 (arguments run from lane 15 down to lane 0): lane 0 takes source lane
// 15, which the 13-float mask zeroed; lanes 1-7 take source lanes 0-6;
// lanes 8-15 take source lanes 5-12.
__m512i pm74 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in92 = _mm512_permutexvar_ps(pm74, dat988);
// Mask 511 (0x1FF) loads 9 floats; lanes 9-15 are zeroed.
__m512 dat989 = _mm512_maskz_loadu_ps(511, datPtr5+12880+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat989 = _mm512_max_ps(_mm512_setzero_ps(), dat989);
__m512 dat990 = _mm512_maskz_loadu_ps(8191, datPtr5+14036+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat990 = _mm512_max_ps(_mm512_setzero_ps(), dat990);
// pm75: lanes 0-7 take source lanes 0-7; lanes 8-10 take source lanes 6-8;
// lanes 11-15 take source lane 15, which the 9-float mask zeroed.
__m512i pm75 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in85 = _mm512_permutexvar_ps(pm75, dat989);
__m512 in93 = _mm512_permutexvar_ps(pm74, dat990);
// The remaining rows repeat the same pattern: a mask-511 load permuted with
// pm75 and a mask-8191 load permuted with pm74, each 224 further on.
__m512 dat991 = _mm512_maskz_loadu_ps(511, datPtr5+13104+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat991 = _mm512_max_ps(_mm512_setzero_ps(), dat991);
__m512 dat992 = _mm512_maskz_loadu_ps(8191, datPtr5+14260+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat992 = _mm512_max_ps(_mm512_setzero_ps(), dat992);
__m512 in86 = _mm512_permutexvar_ps(pm75, dat991);
__m512 in94 = _mm512_permutexvar_ps(pm74, dat992);
__m512 dat993 = _mm512_maskz_loadu_ps(511, datPtr5+13328+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat993 = _mm512_max_ps(_mm512_setzero_ps(), dat993);
__m512 dat994 = _mm512_maskz_loadu_ps(8191, datPtr5+14484+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat994 = _mm512_max_ps(_mm512_setzero_ps(), dat994);
__m512 in87 = _mm512_permutexvar_ps(pm75, dat993);
__m512 in95 = _mm512_permutexvar_ps(pm74, dat994);
__m512 dat995 = _mm512_maskz_loadu_ps(511, datPtr5+13552+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat995 = _mm512_max_ps(_mm512_setzero_ps(), dat995);
__m512 dat996 = _mm512_maskz_loadu_ps(8191, datPtr5+14708+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat996 = _mm512_max_ps(_mm512_setzero_ps(), dat996);
__m512 in88 = _mm512_permutexvar_ps(pm75, dat995);
__m512 in96 = _mm512_permutexvar_ps(pm74, dat996);
__m512 dat997 = _mm512_maskz_loadu_ps(511, datPtr5+13776+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat997 = _mm512_max_ps(_mm512_setzero_ps(), dat997);
__m512 dat998 = _mm512_maskz_loadu_ps(8191, datPtr5+14932+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat998 = _mm512_max_ps(_mm512_setzero_ps(), dat998);
__m512 in89 = _mm512_permutexvar_ps(pm75, dat997);
__m512 in97 = _mm512_permutexvar_ps(pm74, dat998);
__m512 dat999 = _mm512_maskz_loadu_ps(511, datPtr5+14000+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat999 = _mm512_max_ps(_mm512_setzero_ps(), dat999);
__m512 dat1000 = _mm512_maskz_loadu_ps(8191, datPtr5+15156+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat1000 = _mm512_max_ps(_mm512_setzero_ps(), dat1000);
__m512 in90 = _mm512_permutexvar_ps(pm75, dat999);
__m512 in98 = _mm512_permutexvar_ps(pm74, dat1000);
__m512 dat1001 = _mm512_maskz_loadu_ps(511, datPtr5+14224+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat1001 = _mm512_max_ps(_mm512_setzero_ps(), dat1001);
__m512 dat1002 = _mm512_maskz_loadu_ps(8191, datPtr5+15380+50432*i17+224*h21+4*w24+50432*s12+25216*k52);
dat1002 = _mm512_max_ps(_mm512_setzero_ps(), dat1002);
__m512 in91 = _mm512_permutexvar_ps(pm75, dat1001);
__m512 in99 = _mm512_permutexvar_ps(pm74, dat1002);
// First transform pass. The same 8-input linear combination is applied to two
// groups of vectors; statements alternate between the groups.
// Writing d0..d7 for the eight inputs of a group, the eight results are
// (ignoring fused-multiply-add rounding):
//   r0 = d0 - d6 + 5.25*(d4 - d2)
//   r1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   r2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   r3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   r4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   r5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   r6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7 = d7 - d1 + 5.25*(d3 - d5)
// Group in92..in99: d0..d7 = in92..in99; r0..r7 end up in
//   in92, tmp579, tmp580, in98, tmp578, in94, in96, in95.
// Group in85..in91: d1..d7 = in85..in91 and d0 is replaced by zero; r0..r7
//   end up in tmp575, tmp574, tmp576, in90, tmp573, in86, in88, in87.
// NOTE(review): the coefficients look like a Winograd F(6,3)-style input
// transform, with the zero d0 acting as a padding row - confirm against the
// generator.
__m512 tmp572 = _mm512_add_ps(in85, in89);
__m512 tmp577 = _mm512_add_ps(in93, in97);
__m512 tmp573 = _mm512_sub_ps(in88, in86);
__m512 tmp578 = _mm512_sub_ps(in96, in94);
__m512 tmp574 = _mm512_add_ps(in86, in90);
__m512 tmp579 = _mm512_add_ps(in94, in98);
// d0 - d6 with d0 taken as zero for this group (the other group uses in92).
__m512 tmp575 = _mm512_sub_ps(_mm512_setzero_ps(), in90);
in92 = _mm512_sub_ps(in92, in98);
tmp572 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-4.25e+00f), tmp572);
tmp577 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-4.25e+00f), tmp577);
tmp574 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-4.25e+00f), tmp574);
tmp579 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-4.25e+00f), tmp579);
// r0 is complete here.
tmp575 = _mm512_fmadd_ps(tmp573, _mm512_set1_ps(5.25e+00f), tmp575);
in92 = _mm512_fmadd_ps(tmp578, _mm512_set1_ps(5.25e+00f), in92);
tmp573 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), in90);
tmp578 = _mm512_fmadd_ps(in94, _mm512_set1_ps(2.5e-01f), in98);
in86 = _mm512_fmadd_ps(in86, _mm512_set1_ps(4e+00f), in90);
in94 = _mm512_fmadd_ps(in94, _mm512_set1_ps(4e+00f), in98);
// r2 (difference) and r1 (sum) of the two -4.25 partial sums.
__m512 tmp576 = _mm512_sub_ps(tmp574, tmp572);
__m512 tmp580 = _mm512_sub_ps(tmp579, tmp577);
tmp574 = _mm512_add_ps(tmp572, tmp574);
tmp579 = _mm512_add_ps(tmp577, tmp579);
tmp572 = _mm512_fmadd_ps(in85, _mm512_set1_ps(2.5e-01f), in89);
tmp577 = _mm512_fmadd_ps(in93, _mm512_set1_ps(2.5e-01f), in97);
tmp573 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-1.25e+00f), tmp573);
tmp578 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-1.25e+00f), tmp578);
in88 = _mm512_fmadd_ps(in88, _mm512_set1_ps(-5e+00f), in86);
in96 = _mm512_fmadd_ps(in96, _mm512_set1_ps(-5e+00f), in94);
tmp572 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp572);
tmp577 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-1.25e+00f), tmp577);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in90 = _mm512_fmadd_ps(tmp572, _mm512_set1_ps(2e+00f), tmp573);
in98 = _mm512_fmadd_ps(tmp577, _mm512_set1_ps(2e+00f), tmp578);
tmp573 = _mm512_fnmadd_ps(tmp572, _mm512_set1_ps(2e+00f), tmp573);
tmp578 = _mm512_fnmadd_ps(tmp577, _mm512_set1_ps(2e+00f), tmp578);
tmp572 = _mm512_fmadd_ps(in89, _mm512_set1_ps(2.5e-01f), in85);
tmp577 = _mm512_fmadd_ps(in97, _mm512_set1_ps(2.5e-01f), in93);
// d7 - d1; in85/in93 are overwritten only after their last use as d1 above.
in85 = _mm512_sub_ps(in91, in85);
in93 = _mm512_sub_ps(in99, in93);
tmp572 = _mm512_fmadd_ps(in87, _mm512_set1_ps(-1.25e+00f), tmp572);
tmp577 = _mm512_fmadd_ps(in95, _mm512_set1_ps(-1.25e+00f), tmp577);
in87 = _mm512_sub_ps(in87, in89);
in95 = _mm512_sub_ps(in95, in97);
// r7 is complete here.
in87 = _mm512_fmadd_ps(in87, _mm512_set1_ps(5.25e+00f), in85);
in95 = _mm512_fmadd_ps(in95, _mm512_set1_ps(5.25e+00f), in93);
// r5 (fmadd) and r6 (fnmadd).
in86 = _mm512_fmadd_ps(tmp572, _mm512_set1_ps(2e+00f), in88);
in94 = _mm512_fmadd_ps(tmp577, _mm512_set1_ps(2e+00f), in96);
in88 = _mm512_fnmadd_ps(tmp572, _mm512_set1_ps(2e+00f), in88);
in96 = _mm512_fnmadd_ps(tmp577, _mm512_set1_ps(2e+00f), in96);
// 16x16 transpose of sixteen vectors, taken in this order:
//   tmp575, tmp574, tmp576, in90, tmp573, in86, in88, in87,
//   in92, tmp579, tmp580, in98, tmp578, in94, in96, in95.
// Afterwards the same sixteen names, in the same order, hold element positions
// 0..15 of the inputs: lane r of the k-th name is element k of the r-th input.
// Step 1: interleave 32-bit elements of adjacent vector pairs within each
// 128-bit lane.
__m512 tmp589 = _mm512_unpacklo_ps(tmp575, tmp574);
__m512 tmp590 = _mm512_unpackhi_ps(tmp575, tmp574);
__m512 tmp591 = _mm512_unpacklo_ps(tmp576, in90);
__m512 tmp592 = _mm512_unpackhi_ps(tmp576, in90);
__m512 tmp593 = _mm512_unpacklo_ps(tmp573, in86);
__m512 tmp594 = _mm512_unpackhi_ps(tmp573, in86);
__m512 tmp595 = _mm512_unpacklo_ps(in88, in87);
__m512 tmp596 = _mm512_unpackhi_ps(in88, in87);
__m512 tmp597 = _mm512_unpacklo_ps(in92, tmp579);
__m512 tmp598 = _mm512_unpackhi_ps(in92, tmp579);
__m512 tmp599 = _mm512_unpacklo_ps(tmp580, in98);
__m512 tmp600 = _mm512_unpackhi_ps(tmp580, in98);
__m512 tmp601 = _mm512_unpacklo_ps(tmp578, in94);
__m512 tmp602 = _mm512_unpackhi_ps(tmp578, in94);
__m512 tmp603 = _mm512_unpacklo_ps(in96, in95);
__m512 tmp604 = _mm512_unpackhi_ps(in96, in95);
// Step 2: immediate 68 joins the low 64 bits of both operands per 128-bit
// lane, 238 joins the high 64 bits; each 128-bit lane then holds one element
// position taken from four consecutive input vectors.
__m512 tmp605 = _mm512_shuffle_ps(tmp589, tmp591, 68);
__m512 tmp606 = _mm512_shuffle_ps(tmp589, tmp591, 238);
__m512 tmp607 = _mm512_shuffle_ps(tmp590, tmp592, 68);
__m512 tmp608 = _mm512_shuffle_ps(tmp590, tmp592, 238);
__m512 tmp609 = _mm512_shuffle_ps(tmp593, tmp595, 68);
__m512 tmp610 = _mm512_shuffle_ps(tmp593, tmp595, 238);
__m512 tmp611 = _mm512_shuffle_ps(tmp594, tmp596, 68);
__m512 tmp612 = _mm512_shuffle_ps(tmp594, tmp596, 238);
__m512 tmp613 = _mm512_shuffle_ps(tmp597, tmp599, 68);
__m512 tmp614 = _mm512_shuffle_ps(tmp597, tmp599, 238);
__m512 tmp615 = _mm512_shuffle_ps(tmp598, tmp600, 68);
__m512 tmp616 = _mm512_shuffle_ps(tmp598, tmp600, 238);
__m512 tmp617 = _mm512_shuffle_ps(tmp601, tmp603, 68);
__m512 tmp618 = _mm512_shuffle_ps(tmp601, tmp603, 238);
__m512 tmp619 = _mm512_shuffle_ps(tmp602, tmp604, 68);
__m512 tmp620 = _mm512_shuffle_ps(tmp602, tmp604, 238);
// Step 3: 128-bit lane shuffle; immediate 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3.
__m512 tmp621 = _mm512_shuffle_f32x4(tmp605, tmp609, 136);
__m512 tmp622 = _mm512_shuffle_f32x4(tmp605, tmp609, 221);
__m512 tmp623 = _mm512_shuffle_f32x4(tmp606, tmp610, 136);
__m512 tmp624 = _mm512_shuffle_f32x4(tmp606, tmp610, 221);
__m512 tmp625 = _mm512_shuffle_f32x4(tmp607, tmp611, 136);
__m512 tmp626 = _mm512_shuffle_f32x4(tmp607, tmp611, 221);
__m512 tmp627 = _mm512_shuffle_f32x4(tmp608, tmp612, 136);
__m512 tmp628 = _mm512_shuffle_f32x4(tmp608, tmp612, 221);
__m512 tmp629 = _mm512_shuffle_f32x4(tmp613, tmp617, 136);
__m512 tmp630 = _mm512_shuffle_f32x4(tmp613, tmp617, 221);
__m512 tmp631 = _mm512_shuffle_f32x4(tmp614, tmp618, 136);
__m512 tmp632 = _mm512_shuffle_f32x4(tmp614, tmp618, 221);
__m512 tmp633 = _mm512_shuffle_f32x4(tmp615, tmp619, 136);
__m512 tmp634 = _mm512_shuffle_f32x4(tmp615, tmp619, 221);
__m512 tmp635 = _mm512_shuffle_f32x4(tmp616, tmp620, 136);
__m512 tmp636 = _mm512_shuffle_f32x4(tmp616, tmp620, 221);
// Step 4: second 128-bit lane shuffle; results are written back into the
// original variable names (136 -> element positions 0-7, 221 -> 8-15).
tmp575 = _mm512_shuffle_f32x4(tmp621, tmp629, 136);
in92 = _mm512_shuffle_f32x4(tmp621, tmp629, 221);
tmp574 = _mm512_shuffle_f32x4(tmp623, tmp631, 136);
tmp579 = _mm512_shuffle_f32x4(tmp623, tmp631, 221);
tmp576 = _mm512_shuffle_f32x4(tmp625, tmp633, 136);
tmp580 = _mm512_shuffle_f32x4(tmp625, tmp633, 221);
in90 = _mm512_shuffle_f32x4(tmp627, tmp635, 136);
in98 = _mm512_shuffle_f32x4(tmp627, tmp635, 221);
tmp573 = _mm512_shuffle_f32x4(tmp622, tmp630, 136);
tmp578 = _mm512_shuffle_f32x4(tmp622, tmp630, 221);
in86 = _mm512_shuffle_f32x4(tmp624, tmp632, 136);
in94 = _mm512_shuffle_f32x4(tmp624, tmp632, 221);
in88 = _mm512_shuffle_f32x4(tmp626, tmp634, 136);
in96 = _mm512_shuffle_f32x4(tmp626, tmp634, 221);
in87 = _mm512_shuffle_f32x4(tmp628, tmp636, 136);
in95 = _mm512_shuffle_f32x4(tmp628, tmp636, 221);
// Second transform pass: the same 8-input combination that uses the
// coefficients 5.25, 4.25, 0.25, 1.25, 4, 5 and 2, now applied to two groups
// of eight vectors; statements alternate between the groups.
// Group 1 inputs d0..d7: tmp575, tmp574, tmp576, in90, tmp573, in86, in88, in87.
// Group 2 inputs d0..d7: in92, tmp579, tmp580, in98, tmp578, in94, in96, in95.
// Results r0..r7 (ignoring fused-multiply-add rounding):
//   r0 = d0 - d6 + 5.25*(d4 - d2)
//   r1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   r2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   r3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   r4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   r5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   r6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7 = d7 - d1 + 5.25*(d3 - d5)
// Group 1 results land in tmp575, tmp583, tmp584, in88, tmp582, tmp576, tmp573, in90.
// Group 2 results land in in92, tmp587, tmp588, in96, tmp586, tmp580, tmp578, in98.
__m512 tmp581 = _mm512_add_ps(tmp574, in86);
__m512 tmp585 = _mm512_add_ps(tmp579, in94);
__m512 tmp582 = _mm512_sub_ps(tmp573, tmp576);
__m512 tmp586 = _mm512_sub_ps(tmp578, tmp580);
__m512 tmp583 = _mm512_add_ps(tmp576, in88);
__m512 tmp587 = _mm512_add_ps(tmp580, in96);
tmp575 = _mm512_sub_ps(tmp575, in88);
in92 = _mm512_sub_ps(in92, in96);
tmp581 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-4.25e+00f), tmp581);
tmp585 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-4.25e+00f), tmp585);
tmp583 = _mm512_fmadd_ps(tmp573, _mm512_set1_ps(-4.25e+00f), tmp583);
tmp587 = _mm512_fmadd_ps(tmp578, _mm512_set1_ps(-4.25e+00f), tmp587);
// r0 is complete here.
tmp575 = _mm512_fmadd_ps(tmp582, _mm512_set1_ps(5.25e+00f), tmp575);
in92 = _mm512_fmadd_ps(tmp586, _mm512_set1_ps(5.25e+00f), in92);
tmp582 = _mm512_fmadd_ps(tmp576, _mm512_set1_ps(2.5e-01f), in88);
tmp586 = _mm512_fmadd_ps(tmp580, _mm512_set1_ps(2.5e-01f), in96);
tmp576 = _mm512_fmadd_ps(tmp576, _mm512_set1_ps(4e+00f), in88);
tmp580 = _mm512_fmadd_ps(tmp580, _mm512_set1_ps(4e+00f), in96);
// r2 (difference) and r1 (sum) of the two -4.25 partial sums.
__m512 tmp584 = _mm512_sub_ps(tmp583, tmp581);
__m512 tmp588 = _mm512_sub_ps(tmp587, tmp585);
tmp583 = _mm512_add_ps(tmp581, tmp583);
tmp587 = _mm512_add_ps(tmp585, tmp587);
tmp581 = _mm512_fmadd_ps(tmp574, _mm512_set1_ps(2.5e-01f), in86);
tmp585 = _mm512_fmadd_ps(tmp579, _mm512_set1_ps(2.5e-01f), in94);
tmp582 = _mm512_fmadd_ps(tmp573, _mm512_set1_ps(-1.25e+00f), tmp582);
tmp586 = _mm512_fmadd_ps(tmp578, _mm512_set1_ps(-1.25e+00f), tmp586);
tmp573 = _mm512_fmadd_ps(tmp573, _mm512_set1_ps(-5e+00f), tmp576);
tmp578 = _mm512_fmadd_ps(tmp578, _mm512_set1_ps(-5e+00f), tmp580);
tmp581 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp581);
tmp585 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-1.25e+00f), tmp585);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in88 = _mm512_fmadd_ps(tmp581, _mm512_set1_ps(2e+00f), tmp582);
in96 = _mm512_fmadd_ps(tmp585, _mm512_set1_ps(2e+00f), tmp586);
tmp582 = _mm512_fnmadd_ps(tmp581, _mm512_set1_ps(2e+00f), tmp582);
tmp586 = _mm512_fnmadd_ps(tmp585, _mm512_set1_ps(2e+00f), tmp586);
tmp581 = _mm512_fmadd_ps(in86, _mm512_set1_ps(2.5e-01f), tmp574);
tmp585 = _mm512_fmadd_ps(in94, _mm512_set1_ps(2.5e-01f), tmp579);
// d7 - d1; tmp574/tmp579 are overwritten only after their last use as d1.
tmp574 = _mm512_sub_ps(in87, tmp574);
tmp579 = _mm512_sub_ps(in95, tmp579);
tmp581 = _mm512_fmadd_ps(in90, _mm512_set1_ps(-1.25e+00f), tmp581);
tmp585 = _mm512_fmadd_ps(in98, _mm512_set1_ps(-1.25e+00f), tmp585);
in90 = _mm512_sub_ps(in90, in86);
in98 = _mm512_sub_ps(in98, in94);
// r7 is complete here.
in90 = _mm512_fmadd_ps(in90, _mm512_set1_ps(5.25e+00f), tmp574);
in98 = _mm512_fmadd_ps(in98, _mm512_set1_ps(5.25e+00f), tmp579);
// r5 (fmadd) and r6 (fnmadd).
tmp576 = _mm512_fmadd_ps(tmp581, _mm512_set1_ps(2e+00f), tmp573);
tmp580 = _mm512_fmadd_ps(tmp585, _mm512_set1_ps(2e+00f), tmp578);
tmp573 = _mm512_fnmadd_ps(tmp581, _mm512_set1_ps(2e+00f), tmp573);
tmp578 = _mm512_fnmadd_ps(tmp585, _mm512_set1_ps(2e+00f), tmp578);
// Regroup the sixteen result vectors into 256-bit halves and store them.
// Immediate 68 concatenates the low 256 bits of both operands; immediate 238
// concatenates the high 256 bits. Each out vector therefore carries the same
// half of two result vectors.
__m512 out103 = _mm512_shuffle_f32x4(tmp575, tmp583, 68);
__m512 out111 = _mm512_shuffle_f32x4(tmp575, tmp583, 238);
__m512 out104 = _mm512_shuffle_f32x4(tmp584, in88, 68);
__m512 out112 = _mm512_shuffle_f32x4(tmp584, in88, 238);
__m512 out105 = _mm512_shuffle_f32x4(tmp582, tmp576, 68);
__m512 out113 = _mm512_shuffle_f32x4(tmp582, tmp576, 238);
__m512 out106 = _mm512_shuffle_f32x4(tmp573, in90, 68);
__m512 out114 = _mm512_shuffle_f32x4(tmp573, in90, 238);
__m512 out107 = _mm512_shuffle_f32x4(in92, tmp587, 68);
__m512 out115 = _mm512_shuffle_f32x4(in92, tmp587, 238);
__m512 out108 = _mm512_shuffle_f32x4(tmp588, in96, 68);
__m512 out116 = _mm512_shuffle_f32x4(tmp588, in96, 238);
__m512 out109 = _mm512_shuffle_f32x4(tmp586, tmp580, 68);
__m512 out117 = _mm512_shuffle_f32x4(tmp586, tmp580, 238);
__m512 out110 = _mm512_shuffle_f32x4(tmp578, in98, 68);
__m512 out118 = _mm512_shuffle_f32x4(tmp578, in98, 238);
// Four destination regions start at 512, 26112, 51712 and 77312 (25600
// apart). Within each region the stores go to +0, +128, +64, +192: low
// halves first, then high halves, for the two vector groups in turn.
// NOTE(review): the 64-unit spacing suggests dfPtr4 is byte-addressed (one
// 512-bit vector per slot) - confirm against its declaration outside this view.
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k52, out103);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k52, out111);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k52, out107);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k52, out115);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k52, out104);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k52, out112);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k52, out108);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k52, out116);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k52, out105);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k52, out113);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k52, out109);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k52, out117);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k52, out106);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k52, out114);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k52, out110);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k52, out118);
}
// NOTE(review): this increment is immediately overwritten by the assignment
// on the next statement, so it has no effect on j11. No exit check (for
// example a comparison of j11 against an end index followed by a return) sits
// between the loop above and this point - confirm against the generator
// whether one is expected here.
++j11;
// Processing continues from index 2.
j11 = 2;
}
// Section handling indices j11 below 15.
if (j11 < 15) {
// Split (j11-2) by 5: the remainder selects which branch of the loop body to
// enter first, the quotient selects the starting row base (18 rows per step,
// offset by 6).
ptrdiff_t rel8 = (size_t)(j11-2)%5;
ptrdiff_t base8 = 6+(size_t)(j11-2)/5*18;
// The loop header has no condition; each pass resets rel8 to 0 and advances
// base8 by 18, so any exit has to come from inside the body (not visible in
// this part of the file).
for (; ; rel8 = 0, base8 += 18) {
if (rel8 < 2) {
if (rel8 < 1) {
// Case rel8 == 0: row index base8, column index 12.
ptrdiff_t h22 = base8+0;
ptrdiff_t w25 = 12;
// Two iterations (k53 = 0, 1); k53 scales the source offset by 25216 and the
// destination offset by 768 in the body below.
ptrdiff_t k53 = 0;
for (; k53 != 2; ++k53) {
// Load stage: eight rows (224 apart in the datPtr5 offset) from two column
// groups (offsets +0 and +48 within a row). Every value is clamped at zero,
// then lanes are rearranged with pm76. Results: in100..in107 (first group)
// and in108..in115 (second group).
// NOTE(review): offsets (4*w25, 224 per row, 48 between groups) look like
// byte offsets - confirm against the declaration of datPtr5, outside this view.
// Mask 16383 (0x3FFF) loads 14 floats; lanes 14-15 are zeroed.
__m512 dat1003 = _mm512_maskz_loadu_ps(16383, datPtr5+0+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
// Clamp negatives to zero (max with 0); repeated after every load below.
dat1003 = _mm512_max_ps(_mm512_setzero_ps(), dat1003);
__m512 dat1004 = _mm512_maskz_loadu_ps(16383, datPtr5+48+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1004 = _mm512_max_ps(_mm512_setzero_ps(), dat1004);
// pm76 (arguments run from lane 15 down to lane 0): lanes 0-7 take source
// lanes 0-7 and lanes 8-15 take source lanes 6-13, i.e. two 8-wide windows
// that overlap by two elements.
__m512i pm76 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in100 = _mm512_permutexvar_ps(pm76, dat1003);
__m512 in108 = _mm512_permutexvar_ps(pm76, dat1004);
__m512 dat1005 = _mm512_maskz_loadu_ps(16383, datPtr5+224+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1005 = _mm512_max_ps(_mm512_setzero_ps(), dat1005);
__m512 dat1006 = _mm512_maskz_loadu_ps(16383, datPtr5+272+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1006 = _mm512_max_ps(_mm512_setzero_ps(), dat1006);
__m512 in101 = _mm512_permutexvar_ps(pm76, dat1005);
__m512 in109 = _mm512_permutexvar_ps(pm76, dat1006);
__m512 dat1007 = _mm512_maskz_loadu_ps(16383, datPtr5+448+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1007 = _mm512_max_ps(_mm512_setzero_ps(), dat1007);
__m512 dat1008 = _mm512_maskz_loadu_ps(16383, datPtr5+496+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1008 = _mm512_max_ps(_mm512_setzero_ps(), dat1008);
__m512 in102 = _mm512_permutexvar_ps(pm76, dat1007);
__m512 in110 = _mm512_permutexvar_ps(pm76, dat1008);
__m512 dat1009 = _mm512_maskz_loadu_ps(16383, datPtr5+672+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1009 = _mm512_max_ps(_mm512_setzero_ps(), dat1009);
__m512 dat1010 = _mm512_maskz_loadu_ps(16383, datPtr5+720+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1010 = _mm512_max_ps(_mm512_setzero_ps(), dat1010);
__m512 in103 = _mm512_permutexvar_ps(pm76, dat1009);
__m512 in111 = _mm512_permutexvar_ps(pm76, dat1010);
__m512 dat1011 = _mm512_maskz_loadu_ps(16383, datPtr5+896+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1011 = _mm512_max_ps(_mm512_setzero_ps(), dat1011);
__m512 dat1012 = _mm512_maskz_loadu_ps(16383, datPtr5+944+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1012 = _mm512_max_ps(_mm512_setzero_ps(), dat1012);
__m512 in104 = _mm512_permutexvar_ps(pm76, dat1011);
__m512 in112 = _mm512_permutexvar_ps(pm76, dat1012);
__m512 dat1013 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1013 = _mm512_max_ps(_mm512_setzero_ps(), dat1013);
__m512 dat1014 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1014 = _mm512_max_ps(_mm512_setzero_ps(), dat1014);
__m512 in105 = _mm512_permutexvar_ps(pm76, dat1013);
__m512 in113 = _mm512_permutexvar_ps(pm76, dat1014);
__m512 dat1015 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1015 = _mm512_max_ps(_mm512_setzero_ps(), dat1015);
__m512 dat1016 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1016 = _mm512_max_ps(_mm512_setzero_ps(), dat1016);
__m512 in106 = _mm512_permutexvar_ps(pm76, dat1015);
__m512 in114 = _mm512_permutexvar_ps(pm76, dat1016);
__m512 dat1017 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1017 = _mm512_max_ps(_mm512_setzero_ps(), dat1017);
__m512 dat1018 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1018 = _mm512_max_ps(_mm512_setzero_ps(), dat1018);
__m512 in107 = _mm512_permutexvar_ps(pm76, dat1017);
__m512 in115 = _mm512_permutexvar_ps(pm76, dat1018);
// First transform pass over the eight loaded rows, done for both column
// groups; statements alternate between in100..in107 and in108..in115.
// Writing d0..d7 for the eight rows of a group, the results are (ignoring
// fused-multiply-add rounding):
//   r0 = d0 - d6 + 5.25*(d4 - d2)
//   r1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   r2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   r3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   r4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   r5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   r6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7 = d7 - d1 + 5.25*(d3 - d5)
// First group results:  in100, tmp639, tmp640, in106, tmp638, in102, in104, in103.
// Second group results: in108, tmp643, tmp644, in114, tmp642, in110, in112, in111.
// NOTE(review): the coefficients look like a Winograd F(6,3)-style input
// transform - confirm against the generator.
__m512 tmp637 = _mm512_add_ps(in101, in105);
__m512 tmp641 = _mm512_add_ps(in109, in113);
__m512 tmp638 = _mm512_sub_ps(in104, in102);
__m512 tmp642 = _mm512_sub_ps(in112, in110);
__m512 tmp639 = _mm512_add_ps(in102, in106);
__m512 tmp643 = _mm512_add_ps(in110, in114);
in100 = _mm512_sub_ps(in100, in106);
in108 = _mm512_sub_ps(in108, in114);
tmp637 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-4.25e+00f), tmp637);
tmp641 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-4.25e+00f), tmp641);
tmp639 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-4.25e+00f), tmp639);
tmp643 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-4.25e+00f), tmp643);
// r0 is complete here.
in100 = _mm512_fmadd_ps(tmp638, _mm512_set1_ps(5.25e+00f), in100);
in108 = _mm512_fmadd_ps(tmp642, _mm512_set1_ps(5.25e+00f), in108);
tmp638 = _mm512_fmadd_ps(in102, _mm512_set1_ps(2.5e-01f), in106);
tmp642 = _mm512_fmadd_ps(in110, _mm512_set1_ps(2.5e-01f), in114);
in102 = _mm512_fmadd_ps(in102, _mm512_set1_ps(4e+00f), in106);
in110 = _mm512_fmadd_ps(in110, _mm512_set1_ps(4e+00f), in114);
// r2 (difference) and r1 (sum) of the two -4.25 partial sums.
__m512 tmp640 = _mm512_sub_ps(tmp639, tmp637);
__m512 tmp644 = _mm512_sub_ps(tmp643, tmp641);
tmp639 = _mm512_add_ps(tmp637, tmp639);
tmp643 = _mm512_add_ps(tmp641, tmp643);
tmp637 = _mm512_fmadd_ps(in101, _mm512_set1_ps(2.5e-01f), in105);
tmp641 = _mm512_fmadd_ps(in109, _mm512_set1_ps(2.5e-01f), in113);
tmp638 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-1.25e+00f), tmp638);
tmp642 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-1.25e+00f), tmp642);
in104 = _mm512_fmadd_ps(in104, _mm512_set1_ps(-5e+00f), in102);
in112 = _mm512_fmadd_ps(in112, _mm512_set1_ps(-5e+00f), in110);
tmp637 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-1.25e+00f), tmp637);
tmp641 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-1.25e+00f), tmp641);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in106 = _mm512_fmadd_ps(tmp637, _mm512_set1_ps(2e+00f), tmp638);
in114 = _mm512_fmadd_ps(tmp641, _mm512_set1_ps(2e+00f), tmp642);
tmp638 = _mm512_fnmadd_ps(tmp637, _mm512_set1_ps(2e+00f), tmp638);
tmp642 = _mm512_fnmadd_ps(tmp641, _mm512_set1_ps(2e+00f), tmp642);
tmp637 = _mm512_fmadd_ps(in105, _mm512_set1_ps(2.5e-01f), in101);
tmp641 = _mm512_fmadd_ps(in113, _mm512_set1_ps(2.5e-01f), in109);
// d7 - d1; in101/in109 are overwritten only after their last use as d1.
in101 = _mm512_sub_ps(in107, in101);
in109 = _mm512_sub_ps(in115, in109);
tmp637 = _mm512_fmadd_ps(in103, _mm512_set1_ps(-1.25e+00f), tmp637);
tmp641 = _mm512_fmadd_ps(in111, _mm512_set1_ps(-1.25e+00f), tmp641);
in103 = _mm512_sub_ps(in103, in105);
in111 = _mm512_sub_ps(in111, in113);
// r7 is complete here.
in103 = _mm512_fmadd_ps(in103, _mm512_set1_ps(5.25e+00f), in101);
in111 = _mm512_fmadd_ps(in111, _mm512_set1_ps(5.25e+00f), in109);
// r5 (fmadd) and r6 (fnmadd).
in102 = _mm512_fmadd_ps(tmp637, _mm512_set1_ps(2e+00f), in104);
in110 = _mm512_fmadd_ps(tmp641, _mm512_set1_ps(2e+00f), in112);
in104 = _mm512_fnmadd_ps(tmp637, _mm512_set1_ps(2e+00f), in104);
in112 = _mm512_fnmadd_ps(tmp641, _mm512_set1_ps(2e+00f), in112);
// 16x16 transpose of sixteen vectors, taken in this order:
//   in100, tmp639, tmp640, in106, tmp638, in102, in104, in103,
//   in108, tmp643, tmp644, in114, tmp642, in110, in112, in111.
// Afterwards the same sixteen names, in the same order, hold element positions
// 0..15 of the inputs: lane r of the k-th name is element k of the r-th input.
// Step 1: interleave 32-bit elements of adjacent vector pairs within each
// 128-bit lane.
__m512 tmp653 = _mm512_unpacklo_ps(in100, tmp639);
__m512 tmp654 = _mm512_unpackhi_ps(in100, tmp639);
__m512 tmp655 = _mm512_unpacklo_ps(tmp640, in106);
__m512 tmp656 = _mm512_unpackhi_ps(tmp640, in106);
__m512 tmp657 = _mm512_unpacklo_ps(tmp638, in102);
__m512 tmp658 = _mm512_unpackhi_ps(tmp638, in102);
__m512 tmp659 = _mm512_unpacklo_ps(in104, in103);
__m512 tmp660 = _mm512_unpackhi_ps(in104, in103);
__m512 tmp661 = _mm512_unpacklo_ps(in108, tmp643);
__m512 tmp662 = _mm512_unpackhi_ps(in108, tmp643);
__m512 tmp663 = _mm512_unpacklo_ps(tmp644, in114);
__m512 tmp664 = _mm512_unpackhi_ps(tmp644, in114);
__m512 tmp665 = _mm512_unpacklo_ps(tmp642, in110);
__m512 tmp666 = _mm512_unpackhi_ps(tmp642, in110);
__m512 tmp667 = _mm512_unpacklo_ps(in112, in111);
__m512 tmp668 = _mm512_unpackhi_ps(in112, in111);
// Step 2: immediate 68 joins the low 64 bits of both operands per 128-bit
// lane, 238 joins the high 64 bits; each 128-bit lane then holds one element
// position taken from four consecutive input vectors.
__m512 tmp669 = _mm512_shuffle_ps(tmp653, tmp655, 68);
__m512 tmp670 = _mm512_shuffle_ps(tmp653, tmp655, 238);
__m512 tmp671 = _mm512_shuffle_ps(tmp654, tmp656, 68);
__m512 tmp672 = _mm512_shuffle_ps(tmp654, tmp656, 238);
__m512 tmp673 = _mm512_shuffle_ps(tmp657, tmp659, 68);
__m512 tmp674 = _mm512_shuffle_ps(tmp657, tmp659, 238);
__m512 tmp675 = _mm512_shuffle_ps(tmp658, tmp660, 68);
__m512 tmp676 = _mm512_shuffle_ps(tmp658, tmp660, 238);
__m512 tmp677 = _mm512_shuffle_ps(tmp661, tmp663, 68);
__m512 tmp678 = _mm512_shuffle_ps(tmp661, tmp663, 238);
__m512 tmp679 = _mm512_shuffle_ps(tmp662, tmp664, 68);
__m512 tmp680 = _mm512_shuffle_ps(tmp662, tmp664, 238);
__m512 tmp681 = _mm512_shuffle_ps(tmp665, tmp667, 68);
__m512 tmp682 = _mm512_shuffle_ps(tmp665, tmp667, 238);
__m512 tmp683 = _mm512_shuffle_ps(tmp666, tmp668, 68);
__m512 tmp684 = _mm512_shuffle_ps(tmp666, tmp668, 238);
// Step 3: 128-bit lane shuffle; immediate 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3.
__m512 tmp685 = _mm512_shuffle_f32x4(tmp669, tmp673, 136);
__m512 tmp686 = _mm512_shuffle_f32x4(tmp669, tmp673, 221);
__m512 tmp687 = _mm512_shuffle_f32x4(tmp670, tmp674, 136);
__m512 tmp688 = _mm512_shuffle_f32x4(tmp670, tmp674, 221);
__m512 tmp689 = _mm512_shuffle_f32x4(tmp671, tmp675, 136);
__m512 tmp690 = _mm512_shuffle_f32x4(tmp671, tmp675, 221);
__m512 tmp691 = _mm512_shuffle_f32x4(tmp672, tmp676, 136);
__m512 tmp692 = _mm512_shuffle_f32x4(tmp672, tmp676, 221);
__m512 tmp693 = _mm512_shuffle_f32x4(tmp677, tmp681, 136);
__m512 tmp694 = _mm512_shuffle_f32x4(tmp677, tmp681, 221);
__m512 tmp695 = _mm512_shuffle_f32x4(tmp678, tmp682, 136);
__m512 tmp696 = _mm512_shuffle_f32x4(tmp678, tmp682, 221);
__m512 tmp697 = _mm512_shuffle_f32x4(tmp679, tmp683, 136);
__m512 tmp698 = _mm512_shuffle_f32x4(tmp679, tmp683, 221);
__m512 tmp699 = _mm512_shuffle_f32x4(tmp680, tmp684, 136);
__m512 tmp700 = _mm512_shuffle_f32x4(tmp680, tmp684, 221);
// Step 4: second 128-bit lane shuffle; results are written back into the
// original variable names (136 -> element positions 0-7, 221 -> 8-15).
in100 = _mm512_shuffle_f32x4(tmp685, tmp693, 136);
in108 = _mm512_shuffle_f32x4(tmp685, tmp693, 221);
tmp639 = _mm512_shuffle_f32x4(tmp687, tmp695, 136);
tmp643 = _mm512_shuffle_f32x4(tmp687, tmp695, 221);
tmp640 = _mm512_shuffle_f32x4(tmp689, tmp697, 136);
tmp644 = _mm512_shuffle_f32x4(tmp689, tmp697, 221);
in106 = _mm512_shuffle_f32x4(tmp691, tmp699, 136);
in114 = _mm512_shuffle_f32x4(tmp691, tmp699, 221);
tmp638 = _mm512_shuffle_f32x4(tmp686, tmp694, 136);
tmp642 = _mm512_shuffle_f32x4(tmp686, tmp694, 221);
in102 = _mm512_shuffle_f32x4(tmp688, tmp696, 136);
in110 = _mm512_shuffle_f32x4(tmp688, tmp696, 221);
in104 = _mm512_shuffle_f32x4(tmp690, tmp698, 136);
in112 = _mm512_shuffle_f32x4(tmp690, tmp698, 221);
in103 = _mm512_shuffle_f32x4(tmp692, tmp700, 136);
in111 = _mm512_shuffle_f32x4(tmp692, tmp700, 221);
// Second transform pass: the same 8-input combination that uses the
// coefficients 5.25, 4.25, 0.25, 1.25, 4, 5 and 2, now applied to two groups
// of eight vectors; statements alternate between the groups.
// Group 1 inputs d0..d7: in100, tmp639, tmp640, in106, tmp638, in102, in104, in103.
// Group 2 inputs d0..d7: in108, tmp643, tmp644, in114, tmp642, in110, in112, in111.
// Results r0..r7 (ignoring fused-multiply-add rounding):
//   r0 = d0 - d6 + 5.25*(d4 - d2)
//   r1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   r2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   r3 = (0.25*d2 - 1.25*d4 + d6) + 2*(0.25*d1 - 1.25*d3 + d5)
//   r4 = (0.25*d2 - 1.25*d4 + d6) - 2*(0.25*d1 - 1.25*d3 + d5)
//   r5 = (4*d2 - 5*d4 + d6) + 2*(d1 - 1.25*d3 + 0.25*d5)
//   r6 = (4*d2 - 5*d4 + d6) - 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7 = d7 - d1 + 5.25*(d3 - d5)
// Group 1 results land in in100, tmp647, tmp648, in104, tmp646, tmp640, tmp638, in106.
// Group 2 results land in in108, tmp651, tmp652, in112, tmp650, tmp644, tmp642, in114.
__m512 tmp645 = _mm512_add_ps(tmp639, in102);
__m512 tmp649 = _mm512_add_ps(tmp643, in110);
__m512 tmp646 = _mm512_sub_ps(tmp638, tmp640);
__m512 tmp650 = _mm512_sub_ps(tmp642, tmp644);
__m512 tmp647 = _mm512_add_ps(tmp640, in104);
__m512 tmp651 = _mm512_add_ps(tmp644, in112);
in100 = _mm512_sub_ps(in100, in104);
in108 = _mm512_sub_ps(in108, in112);
tmp645 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-4.25e+00f), tmp645);
tmp649 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-4.25e+00f), tmp649);
tmp647 = _mm512_fmadd_ps(tmp638, _mm512_set1_ps(-4.25e+00f), tmp647);
tmp651 = _mm512_fmadd_ps(tmp642, _mm512_set1_ps(-4.25e+00f), tmp651);
// r0 is complete here.
in100 = _mm512_fmadd_ps(tmp646, _mm512_set1_ps(5.25e+00f), in100);
in108 = _mm512_fmadd_ps(tmp650, _mm512_set1_ps(5.25e+00f), in108);
tmp646 = _mm512_fmadd_ps(tmp640, _mm512_set1_ps(2.5e-01f), in104);
tmp650 = _mm512_fmadd_ps(tmp644, _mm512_set1_ps(2.5e-01f), in112);
tmp640 = _mm512_fmadd_ps(tmp640, _mm512_set1_ps(4e+00f), in104);
tmp644 = _mm512_fmadd_ps(tmp644, _mm512_set1_ps(4e+00f), in112);
// r2 (difference) and r1 (sum) of the two -4.25 partial sums.
__m512 tmp648 = _mm512_sub_ps(tmp647, tmp645);
__m512 tmp652 = _mm512_sub_ps(tmp651, tmp649);
tmp647 = _mm512_add_ps(tmp645, tmp647);
tmp651 = _mm512_add_ps(tmp649, tmp651);
tmp645 = _mm512_fmadd_ps(tmp639, _mm512_set1_ps(2.5e-01f), in102);
tmp649 = _mm512_fmadd_ps(tmp643, _mm512_set1_ps(2.5e-01f), in110);
tmp646 = _mm512_fmadd_ps(tmp638, _mm512_set1_ps(-1.25e+00f), tmp646);
tmp650 = _mm512_fmadd_ps(tmp642, _mm512_set1_ps(-1.25e+00f), tmp650);
tmp638 = _mm512_fmadd_ps(tmp638, _mm512_set1_ps(-5e+00f), tmp640);
tmp642 = _mm512_fmadd_ps(tmp642, _mm512_set1_ps(-5e+00f), tmp644);
tmp645 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-1.25e+00f), tmp645);
tmp649 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-1.25e+00f), tmp649);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in104 = _mm512_fmadd_ps(tmp645, _mm512_set1_ps(2e+00f), tmp646);
in112 = _mm512_fmadd_ps(tmp649, _mm512_set1_ps(2e+00f), tmp650);
tmp646 = _mm512_fnmadd_ps(tmp645, _mm512_set1_ps(2e+00f), tmp646);
tmp650 = _mm512_fnmadd_ps(tmp649, _mm512_set1_ps(2e+00f), tmp650);
tmp645 = _mm512_fmadd_ps(in102, _mm512_set1_ps(2.5e-01f), tmp639);
tmp649 = _mm512_fmadd_ps(in110, _mm512_set1_ps(2.5e-01f), tmp643);
// d7 - d1; tmp639/tmp643 are overwritten only after their last use as d1.
tmp639 = _mm512_sub_ps(in103, tmp639);
tmp643 = _mm512_sub_ps(in111, tmp643);
tmp645 = _mm512_fmadd_ps(in106, _mm512_set1_ps(-1.25e+00f), tmp645);
tmp649 = _mm512_fmadd_ps(in114, _mm512_set1_ps(-1.25e+00f), tmp649);
in106 = _mm512_sub_ps(in106, in102);
in114 = _mm512_sub_ps(in114, in110);
// r7 is complete here.
in106 = _mm512_fmadd_ps(in106, _mm512_set1_ps(5.25e+00f), tmp639);
in114 = _mm512_fmadd_ps(in114, _mm512_set1_ps(5.25e+00f), tmp643);
// r5 (fmadd) and r6 (fnmadd).
tmp640 = _mm512_fmadd_ps(tmp645, _mm512_set1_ps(2e+00f), tmp638);
tmp644 = _mm512_fmadd_ps(tmp649, _mm512_set1_ps(2e+00f), tmp642);
tmp638 = _mm512_fnmadd_ps(tmp645, _mm512_set1_ps(2e+00f), tmp638);
tmp642 = _mm512_fnmadd_ps(tmp649, _mm512_set1_ps(2e+00f), tmp642);
// Regroup the sixteen result vectors into 256-bit halves and store them.
// Immediate 68 concatenates the low 256 bits of both operands; immediate 238
// concatenates the high 256 bits. Each out vector therefore carries the same
// half of two result vectors.
__m512 out119 = _mm512_shuffle_f32x4(in100, tmp647, 68);
__m512 out127 = _mm512_shuffle_f32x4(in100, tmp647, 238);
__m512 out120 = _mm512_shuffle_f32x4(tmp648, in104, 68);
__m512 out128 = _mm512_shuffle_f32x4(tmp648, in104, 238);
__m512 out121 = _mm512_shuffle_f32x4(tmp646, tmp640, 68);
__m512 out129 = _mm512_shuffle_f32x4(tmp646, tmp640, 238);
__m512 out122 = _mm512_shuffle_f32x4(tmp638, in106, 68);
__m512 out130 = _mm512_shuffle_f32x4(tmp638, in106, 238);
__m512 out123 = _mm512_shuffle_f32x4(in108, tmp651, 68);
__m512 out131 = _mm512_shuffle_f32x4(in108, tmp651, 238);
__m512 out124 = _mm512_shuffle_f32x4(tmp652, in112, 68);
__m512 out132 = _mm512_shuffle_f32x4(tmp652, in112, 238);
__m512 out125 = _mm512_shuffle_f32x4(tmp650, tmp644, 68);
__m512 out133 = _mm512_shuffle_f32x4(tmp650, tmp644, 238);
__m512 out126 = _mm512_shuffle_f32x4(tmp642, in114, 68);
__m512 out134 = _mm512_shuffle_f32x4(tmp642, in114, 238);
// Four destination regions start at 0, 25600, 51200 and 76800 (25600 apart).
// Within each region the stores go to +0, +128, +64, +192: low halves first,
// then high halves, for the two vector groups in turn.
// NOTE(review): the 64-unit spacing suggests dfPtr4 is byte-addressed (one
// 512-bit vector per slot) - confirm against its declaration outside this view.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k53, out119);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k53, out127);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k53, out123);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k53, out131);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k53, out120);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k53, out128);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k53, out124);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k53, out132);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k53, out121);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k53, out129);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k53, out125);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k53, out133);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k53, out122);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k53, out130);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k53, out126);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k53, out134);
// ---- Load / transform / store group writing dfPtr4 slots +256..+448 ----
// (enclosing k53 loop header and pointer declarations are outside this view;
// offsets below are in the pointers' own units -- presumably bytes, TODO confirm).
//
// Load phase: eight rows (row step 224) from two source positions, base offsets
// 96 and 12608. Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
// Each loaded vector is clamped with max(0, x), i.e. negatives become zero (ReLU).
__m512 dat1019 = _mm512_maskz_loadu_ps(16383, datPtr5+96+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1019 = _mm512_max_ps(_mm512_setzero_ps(), dat1019);
__m512 dat1020 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1020 = _mm512_max_ps(_mm512_setzero_ps(), dat1020);
// Permutation (arguments are listed from lane 15 down to lane 0):
// lanes 0..7 take source elements 0..7, lanes 8..15 take source elements 6..13,
// so each vector holds two 8-wide windows that overlap by two elements.
__m512i pm77 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in116 = _mm512_permutexvar_ps(pm77, dat1019);
__m512 in124 = _mm512_permutexvar_ps(pm77, dat1020);
__m512 dat1021 = _mm512_maskz_loadu_ps(16383, datPtr5+320+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1021 = _mm512_max_ps(_mm512_setzero_ps(), dat1021);
__m512 dat1022 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1022 = _mm512_max_ps(_mm512_setzero_ps(), dat1022);
__m512 in117 = _mm512_permutexvar_ps(pm77, dat1021);
__m512 in125 = _mm512_permutexvar_ps(pm77, dat1022);
__m512 dat1023 = _mm512_maskz_loadu_ps(16383, datPtr5+544+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1023 = _mm512_max_ps(_mm512_setzero_ps(), dat1023);
__m512 dat1024 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1024 = _mm512_max_ps(_mm512_setzero_ps(), dat1024);
__m512 in118 = _mm512_permutexvar_ps(pm77, dat1023);
__m512 in126 = _mm512_permutexvar_ps(pm77, dat1024);
__m512 dat1025 = _mm512_maskz_loadu_ps(16383, datPtr5+768+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1025 = _mm512_max_ps(_mm512_setzero_ps(), dat1025);
__m512 dat1026 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1026 = _mm512_max_ps(_mm512_setzero_ps(), dat1026);
__m512 in119 = _mm512_permutexvar_ps(pm77, dat1025);
__m512 in127 = _mm512_permutexvar_ps(pm77, dat1026);
__m512 dat1027 = _mm512_maskz_loadu_ps(16383, datPtr5+992+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1027 = _mm512_max_ps(_mm512_setzero_ps(), dat1027);
__m512 dat1028 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1028 = _mm512_max_ps(_mm512_setzero_ps(), dat1028);
__m512 in120 = _mm512_permutexvar_ps(pm77, dat1027);
__m512 in128 = _mm512_permutexvar_ps(pm77, dat1028);
__m512 dat1029 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1029 = _mm512_max_ps(_mm512_setzero_ps(), dat1029);
__m512 dat1030 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1030 = _mm512_max_ps(_mm512_setzero_ps(), dat1030);
__m512 in121 = _mm512_permutexvar_ps(pm77, dat1029);
__m512 in129 = _mm512_permutexvar_ps(pm77, dat1030);
__m512 dat1031 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1031 = _mm512_max_ps(_mm512_setzero_ps(), dat1031);
__m512 dat1032 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1032 = _mm512_max_ps(_mm512_setzero_ps(), dat1032);
__m512 in122 = _mm512_permutexvar_ps(pm77, dat1031);
__m512 in130 = _mm512_permutexvar_ps(pm77, dat1032);
__m512 dat1033 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1033 = _mm512_max_ps(_mm512_setzero_ps(), dat1033);
__m512 dat1034 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1034 = _mm512_max_ps(_mm512_setzero_ps(), dat1034);
__m512 in123 = _mm512_permutexvar_ps(pm77, dat1033);
__m512 in131 = _mm512_permutexvar_ps(pm77, dat1034);
// First pass: fixed linear combinations across the eight row vectors of each
// source (in116..in123 and, interleaved, in124..in131). Registers are reused
// in place, so statement order matters. For example the first result is
// in116 - in122 + 5.25*(in120 - in118).
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 4, 1.25, 5, 2) match the
// 8-point Winograd F(6,3) input transform -- inferred from the constants, confirm.
__m512 tmp701 = _mm512_add_ps(in117, in121);
__m512 tmp705 = _mm512_add_ps(in125, in129);
__m512 tmp702 = _mm512_sub_ps(in120, in118);
__m512 tmp706 = _mm512_sub_ps(in128, in126);
__m512 tmp703 = _mm512_add_ps(in118, in122);
__m512 tmp707 = _mm512_add_ps(in126, in130);
in116 = _mm512_sub_ps(in116, in122);
in124 = _mm512_sub_ps(in124, in130);
tmp701 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-4.25e+00f), tmp701);
tmp705 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-4.25e+00f), tmp705);
tmp703 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-4.25e+00f), tmp703);
tmp707 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-4.25e+00f), tmp707);
in116 = _mm512_fmadd_ps(tmp702, _mm512_set1_ps(5.25e+00f), in116);
in124 = _mm512_fmadd_ps(tmp706, _mm512_set1_ps(5.25e+00f), in124);
tmp702 = _mm512_fmadd_ps(in118, _mm512_set1_ps(2.5e-01f), in122);
tmp706 = _mm512_fmadd_ps(in126, _mm512_set1_ps(2.5e-01f), in130);
in118 = _mm512_fmadd_ps(in118, _mm512_set1_ps(4e+00f), in122);
in126 = _mm512_fmadd_ps(in126, _mm512_set1_ps(4e+00f), in130);
__m512 tmp704 = _mm512_sub_ps(tmp703, tmp701);
__m512 tmp708 = _mm512_sub_ps(tmp707, tmp705);
tmp703 = _mm512_add_ps(tmp701, tmp703);
tmp707 = _mm512_add_ps(tmp705, tmp707);
tmp701 = _mm512_fmadd_ps(in117, _mm512_set1_ps(2.5e-01f), in121);
tmp705 = _mm512_fmadd_ps(in125, _mm512_set1_ps(2.5e-01f), in129);
tmp702 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-1.25e+00f), tmp702);
tmp706 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-1.25e+00f), tmp706);
in120 = _mm512_fmadd_ps(in120, _mm512_set1_ps(-5e+00f), in118);
in128 = _mm512_fmadd_ps(in128, _mm512_set1_ps(-5e+00f), in126);
tmp701 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-1.25e+00f), tmp701);
tmp705 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-1.25e+00f), tmp705);
in122 = _mm512_fmadd_ps(tmp701, _mm512_set1_ps(2e+00f), tmp702);
in130 = _mm512_fmadd_ps(tmp705, _mm512_set1_ps(2e+00f), tmp706);
tmp702 = _mm512_fnmadd_ps(tmp701, _mm512_set1_ps(2e+00f), tmp702);
tmp706 = _mm512_fnmadd_ps(tmp705, _mm512_set1_ps(2e+00f), tmp706);
tmp701 = _mm512_fmadd_ps(in121, _mm512_set1_ps(2.5e-01f), in117);
tmp705 = _mm512_fmadd_ps(in129, _mm512_set1_ps(2.5e-01f), in125);
in117 = _mm512_sub_ps(in123, in117);
in125 = _mm512_sub_ps(in131, in125);
tmp701 = _mm512_fmadd_ps(in119, _mm512_set1_ps(-1.25e+00f), tmp701);
tmp705 = _mm512_fmadd_ps(in127, _mm512_set1_ps(-1.25e+00f), tmp705);
in119 = _mm512_sub_ps(in119, in121);
in127 = _mm512_sub_ps(in127, in129);
in119 = _mm512_fmadd_ps(in119, _mm512_set1_ps(5.25e+00f), in117);
in127 = _mm512_fmadd_ps(in127, _mm512_set1_ps(5.25e+00f), in125);
in118 = _mm512_fmadd_ps(tmp701, _mm512_set1_ps(2e+00f), in120);
in126 = _mm512_fmadd_ps(tmp705, _mm512_set1_ps(2e+00f), in128);
in120 = _mm512_fnmadd_ps(tmp701, _mm512_set1_ps(2e+00f), in120);
in128 = _mm512_fnmadd_ps(tmp705, _mm512_set1_ps(2e+00f), in128);
// Lane rearrangement of the sixteen first-pass results:
// unpacklo/unpackhi -> shuffle_ps (68 / 238) -> two rounds of shuffle_f32x4
// (136 / 221). NOTE(review): this looks like a transpose so the second pass can
// apply the same combinations along the other axis of each window -- confirm.
__m512 tmp717 = _mm512_unpacklo_ps(in116, tmp703);
__m512 tmp718 = _mm512_unpackhi_ps(in116, tmp703);
__m512 tmp719 = _mm512_unpacklo_ps(tmp704, in122);
__m512 tmp720 = _mm512_unpackhi_ps(tmp704, in122);
__m512 tmp721 = _mm512_unpacklo_ps(tmp702, in118);
__m512 tmp722 = _mm512_unpackhi_ps(tmp702, in118);
__m512 tmp723 = _mm512_unpacklo_ps(in120, in119);
__m512 tmp724 = _mm512_unpackhi_ps(in120, in119);
__m512 tmp725 = _mm512_unpacklo_ps(in124, tmp707);
__m512 tmp726 = _mm512_unpackhi_ps(in124, tmp707);
__m512 tmp727 = _mm512_unpacklo_ps(tmp708, in130);
__m512 tmp728 = _mm512_unpackhi_ps(tmp708, in130);
__m512 tmp729 = _mm512_unpacklo_ps(tmp706, in126);
__m512 tmp730 = _mm512_unpackhi_ps(tmp706, in126);
__m512 tmp731 = _mm512_unpacklo_ps(in128, in127);
__m512 tmp732 = _mm512_unpackhi_ps(in128, in127);
__m512 tmp733 = _mm512_shuffle_ps(tmp717, tmp719, 68);
__m512 tmp734 = _mm512_shuffle_ps(tmp717, tmp719, 238);
__m512 tmp735 = _mm512_shuffle_ps(tmp718, tmp720, 68);
__m512 tmp736 = _mm512_shuffle_ps(tmp718, tmp720, 238);
__m512 tmp737 = _mm512_shuffle_ps(tmp721, tmp723, 68);
__m512 tmp738 = _mm512_shuffle_ps(tmp721, tmp723, 238);
__m512 tmp739 = _mm512_shuffle_ps(tmp722, tmp724, 68);
__m512 tmp740 = _mm512_shuffle_ps(tmp722, tmp724, 238);
__m512 tmp741 = _mm512_shuffle_ps(tmp725, tmp727, 68);
__m512 tmp742 = _mm512_shuffle_ps(tmp725, tmp727, 238);
__m512 tmp743 = _mm512_shuffle_ps(tmp726, tmp728, 68);
__m512 tmp744 = _mm512_shuffle_ps(tmp726, tmp728, 238);
__m512 tmp745 = _mm512_shuffle_ps(tmp729, tmp731, 68);
__m512 tmp746 = _mm512_shuffle_ps(tmp729, tmp731, 238);
__m512 tmp747 = _mm512_shuffle_ps(tmp730, tmp732, 68);
__m512 tmp748 = _mm512_shuffle_ps(tmp730, tmp732, 238);
__m512 tmp749 = _mm512_shuffle_f32x4(tmp733, tmp737, 136);
__m512 tmp750 = _mm512_shuffle_f32x4(tmp733, tmp737, 221);
__m512 tmp751 = _mm512_shuffle_f32x4(tmp734, tmp738, 136);
__m512 tmp752 = _mm512_shuffle_f32x4(tmp734, tmp738, 221);
__m512 tmp753 = _mm512_shuffle_f32x4(tmp735, tmp739, 136);
__m512 tmp754 = _mm512_shuffle_f32x4(tmp735, tmp739, 221);
__m512 tmp755 = _mm512_shuffle_f32x4(tmp736, tmp740, 136);
__m512 tmp756 = _mm512_shuffle_f32x4(tmp736, tmp740, 221);
__m512 tmp757 = _mm512_shuffle_f32x4(tmp741, tmp745, 136);
__m512 tmp758 = _mm512_shuffle_f32x4(tmp741, tmp745, 221);
__m512 tmp759 = _mm512_shuffle_f32x4(tmp742, tmp746, 136);
__m512 tmp760 = _mm512_shuffle_f32x4(tmp742, tmp746, 221);
__m512 tmp761 = _mm512_shuffle_f32x4(tmp743, tmp747, 136);
__m512 tmp762 = _mm512_shuffle_f32x4(tmp743, tmp747, 221);
__m512 tmp763 = _mm512_shuffle_f32x4(tmp744, tmp748, 136);
__m512 tmp764 = _mm512_shuffle_f32x4(tmp744, tmp748, 221);
// Final shuffle round writes back into the first-pass variable names, which
// are reused as the inputs of the second pass.
in116 = _mm512_shuffle_f32x4(tmp749, tmp757, 136);
in124 = _mm512_shuffle_f32x4(tmp749, tmp757, 221);
tmp703 = _mm512_shuffle_f32x4(tmp751, tmp759, 136);
tmp707 = _mm512_shuffle_f32x4(tmp751, tmp759, 221);
tmp704 = _mm512_shuffle_f32x4(tmp753, tmp761, 136);
tmp708 = _mm512_shuffle_f32x4(tmp753, tmp761, 221);
in122 = _mm512_shuffle_f32x4(tmp755, tmp763, 136);
in130 = _mm512_shuffle_f32x4(tmp755, tmp763, 221);
tmp702 = _mm512_shuffle_f32x4(tmp750, tmp758, 136);
tmp706 = _mm512_shuffle_f32x4(tmp750, tmp758, 221);
in118 = _mm512_shuffle_f32x4(tmp752, tmp760, 136);
in126 = _mm512_shuffle_f32x4(tmp752, tmp760, 221);
in120 = _mm512_shuffle_f32x4(tmp754, tmp762, 136);
in128 = _mm512_shuffle_f32x4(tmp754, tmp762, 221);
in119 = _mm512_shuffle_f32x4(tmp756, tmp764, 136);
in127 = _mm512_shuffle_f32x4(tmp756, tmp764, 221);
// Second pass: the same coefficient pattern as the first pass, applied to the
// rearranged vectors (again in place; order of statements matters).
__m512 tmp709 = _mm512_add_ps(tmp703, in118);
__m512 tmp713 = _mm512_add_ps(tmp707, in126);
__m512 tmp710 = _mm512_sub_ps(tmp702, tmp704);
__m512 tmp714 = _mm512_sub_ps(tmp706, tmp708);
__m512 tmp711 = _mm512_add_ps(tmp704, in120);
__m512 tmp715 = _mm512_add_ps(tmp708, in128);
in116 = _mm512_sub_ps(in116, in120);
in124 = _mm512_sub_ps(in124, in128);
tmp709 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-4.25e+00f), tmp709);
tmp713 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-4.25e+00f), tmp713);
tmp711 = _mm512_fmadd_ps(tmp702, _mm512_set1_ps(-4.25e+00f), tmp711);
tmp715 = _mm512_fmadd_ps(tmp706, _mm512_set1_ps(-4.25e+00f), tmp715);
in116 = _mm512_fmadd_ps(tmp710, _mm512_set1_ps(5.25e+00f), in116);
in124 = _mm512_fmadd_ps(tmp714, _mm512_set1_ps(5.25e+00f), in124);
tmp710 = _mm512_fmadd_ps(tmp704, _mm512_set1_ps(2.5e-01f), in120);
tmp714 = _mm512_fmadd_ps(tmp708, _mm512_set1_ps(2.5e-01f), in128);
tmp704 = _mm512_fmadd_ps(tmp704, _mm512_set1_ps(4e+00f), in120);
tmp708 = _mm512_fmadd_ps(tmp708, _mm512_set1_ps(4e+00f), in128);
__m512 tmp712 = _mm512_sub_ps(tmp711, tmp709);
__m512 tmp716 = _mm512_sub_ps(tmp715, tmp713);
tmp711 = _mm512_add_ps(tmp709, tmp711);
tmp715 = _mm512_add_ps(tmp713, tmp715);
tmp709 = _mm512_fmadd_ps(tmp703, _mm512_set1_ps(2.5e-01f), in118);
tmp713 = _mm512_fmadd_ps(tmp707, _mm512_set1_ps(2.5e-01f), in126);
tmp710 = _mm512_fmadd_ps(tmp702, _mm512_set1_ps(-1.25e+00f), tmp710);
tmp714 = _mm512_fmadd_ps(tmp706, _mm512_set1_ps(-1.25e+00f), tmp714);
tmp702 = _mm512_fmadd_ps(tmp702, _mm512_set1_ps(-5e+00f), tmp704);
tmp706 = _mm512_fmadd_ps(tmp706, _mm512_set1_ps(-5e+00f), tmp708);
tmp709 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-1.25e+00f), tmp709);
tmp713 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-1.25e+00f), tmp713);
in120 = _mm512_fmadd_ps(tmp709, _mm512_set1_ps(2e+00f), tmp710);
in128 = _mm512_fmadd_ps(tmp713, _mm512_set1_ps(2e+00f), tmp714);
tmp710 = _mm512_fnmadd_ps(tmp709, _mm512_set1_ps(2e+00f), tmp710);
tmp714 = _mm512_fnmadd_ps(tmp713, _mm512_set1_ps(2e+00f), tmp714);
tmp709 = _mm512_fmadd_ps(in118, _mm512_set1_ps(2.5e-01f), tmp703);
tmp713 = _mm512_fmadd_ps(in126, _mm512_set1_ps(2.5e-01f), tmp707);
tmp703 = _mm512_sub_ps(in119, tmp703);
tmp707 = _mm512_sub_ps(in127, tmp707);
tmp709 = _mm512_fmadd_ps(in122, _mm512_set1_ps(-1.25e+00f), tmp709);
tmp713 = _mm512_fmadd_ps(in130, _mm512_set1_ps(-1.25e+00f), tmp713);
in122 = _mm512_sub_ps(in122, in118);
in130 = _mm512_sub_ps(in130, in126);
in122 = _mm512_fmadd_ps(in122, _mm512_set1_ps(5.25e+00f), tmp703);
in130 = _mm512_fmadd_ps(in130, _mm512_set1_ps(5.25e+00f), tmp707);
tmp704 = _mm512_fmadd_ps(tmp709, _mm512_set1_ps(2e+00f), tmp702);
tmp708 = _mm512_fmadd_ps(tmp713, _mm512_set1_ps(2e+00f), tmp706);
tmp702 = _mm512_fnmadd_ps(tmp709, _mm512_set1_ps(2e+00f), tmp702);
tmp706 = _mm512_fnmadd_ps(tmp713, _mm512_set1_ps(2e+00f), tmp706);
// Pair the results for storing: imm 68 concatenates the low 256 bits of the
// two operands, imm 238 concatenates their high 256 bits.
__m512 out135 = _mm512_shuffle_f32x4(in116, tmp711, 68);
__m512 out143 = _mm512_shuffle_f32x4(in116, tmp711, 238);
__m512 out136 = _mm512_shuffle_f32x4(tmp712, in120, 68);
__m512 out144 = _mm512_shuffle_f32x4(tmp712, in120, 238);
__m512 out137 = _mm512_shuffle_f32x4(tmp710, tmp704, 68);
__m512 out145 = _mm512_shuffle_f32x4(tmp710, tmp704, 238);
__m512 out138 = _mm512_shuffle_f32x4(tmp702, in122, 68);
__m512 out146 = _mm512_shuffle_f32x4(tmp702, in122, 238);
__m512 out139 = _mm512_shuffle_f32x4(in124, tmp715, 68);
__m512 out147 = _mm512_shuffle_f32x4(in124, tmp715, 238);
__m512 out140 = _mm512_shuffle_f32x4(tmp716, in128, 68);
__m512 out148 = _mm512_shuffle_f32x4(tmp716, in128, 238);
__m512 out141 = _mm512_shuffle_f32x4(tmp714, tmp708, 68);
__m512 out149 = _mm512_shuffle_f32x4(tmp714, tmp708, 238);
__m512 out142 = _mm512_shuffle_f32x4(tmp706, in130, 68);
__m512 out150 = _mm512_shuffle_f32x4(tmp706, in130, 238);
// Sixteen unaligned 16-float stores into dfPtr4: four regions 25600 apart
// (bases 256, 25856, 51456, 77056), four consecutive 64-unit slots in each.
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k53, out135);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k53, out143);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k53, out139);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k53, out147);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k53, out136);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k53, out144);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k53, out140);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k53, out148);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k53, out137);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k53, out145);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k53, out141);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k53, out149);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k53, out138);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k53, out146);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k53, out142);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k53, out150);
// ---- Load / transform / store group writing dfPtr4 slots +512..+704 ----
// (enclosing k53 loop header and pointer declarations are outside this view;
// offsets below are in the pointers' own units -- presumably bytes, TODO confirm).
//
// Load phase: eight rows (row step 224) from two source positions, base offsets
// 12656 and 12704. Mask 16383 (0x3FFF) loads lanes 0..13 and zeroes lanes 14..15.
// Each loaded vector is clamped with max(0, x), i.e. negatives become zero (ReLU).
__m512 dat1035 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1035 = _mm512_max_ps(_mm512_setzero_ps(), dat1035);
__m512 dat1036 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1036 = _mm512_max_ps(_mm512_setzero_ps(), dat1036);
// Permutation (arguments are listed from lane 15 down to lane 0):
// lanes 0..7 take source elements 0..7, lanes 8..15 take source elements 6..13,
// so each vector holds two 8-wide windows that overlap by two elements.
__m512i pm78 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in132 = _mm512_permutexvar_ps(pm78, dat1035);
__m512 in140 = _mm512_permutexvar_ps(pm78, dat1036);
__m512 dat1037 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1037 = _mm512_max_ps(_mm512_setzero_ps(), dat1037);
__m512 dat1038 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1038 = _mm512_max_ps(_mm512_setzero_ps(), dat1038);
__m512 in133 = _mm512_permutexvar_ps(pm78, dat1037);
__m512 in141 = _mm512_permutexvar_ps(pm78, dat1038);
__m512 dat1039 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1039 = _mm512_max_ps(_mm512_setzero_ps(), dat1039);
__m512 dat1040 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1040 = _mm512_max_ps(_mm512_setzero_ps(), dat1040);
__m512 in134 = _mm512_permutexvar_ps(pm78, dat1039);
__m512 in142 = _mm512_permutexvar_ps(pm78, dat1040);
__m512 dat1041 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1041 = _mm512_max_ps(_mm512_setzero_ps(), dat1041);
__m512 dat1042 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1042 = _mm512_max_ps(_mm512_setzero_ps(), dat1042);
__m512 in135 = _mm512_permutexvar_ps(pm78, dat1041);
__m512 in143 = _mm512_permutexvar_ps(pm78, dat1042);
__m512 dat1043 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1043 = _mm512_max_ps(_mm512_setzero_ps(), dat1043);
__m512 dat1044 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1044 = _mm512_max_ps(_mm512_setzero_ps(), dat1044);
__m512 in136 = _mm512_permutexvar_ps(pm78, dat1043);
__m512 in144 = _mm512_permutexvar_ps(pm78, dat1044);
__m512 dat1045 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1045 = _mm512_max_ps(_mm512_setzero_ps(), dat1045);
__m512 dat1046 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1046 = _mm512_max_ps(_mm512_setzero_ps(), dat1046);
__m512 in137 = _mm512_permutexvar_ps(pm78, dat1045);
__m512 in145 = _mm512_permutexvar_ps(pm78, dat1046);
__m512 dat1047 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1047 = _mm512_max_ps(_mm512_setzero_ps(), dat1047);
__m512 dat1048 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1048 = _mm512_max_ps(_mm512_setzero_ps(), dat1048);
__m512 in138 = _mm512_permutexvar_ps(pm78, dat1047);
__m512 in146 = _mm512_permutexvar_ps(pm78, dat1048);
__m512 dat1049 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1049 = _mm512_max_ps(_mm512_setzero_ps(), dat1049);
__m512 dat1050 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+50432*i17+224*h22+4*w25+50432*s12+25216*k53);
dat1050 = _mm512_max_ps(_mm512_setzero_ps(), dat1050);
__m512 in139 = _mm512_permutexvar_ps(pm78, dat1049);
__m512 in147 = _mm512_permutexvar_ps(pm78, dat1050);
// First pass: fixed linear combinations across the eight row vectors of each
// source (in132..in139 and, interleaved, in140..in147). Registers are reused
// in place, so statement order matters. For example the first result is
// in132 - in138 + 5.25*(in136 - in134).
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 4, 1.25, 5, 2) match the
// 8-point Winograd F(6,3) input transform -- inferred from the constants, confirm.
__m512 tmp765 = _mm512_add_ps(in133, in137);
__m512 tmp769 = _mm512_add_ps(in141, in145);
__m512 tmp766 = _mm512_sub_ps(in136, in134);
__m512 tmp770 = _mm512_sub_ps(in144, in142);
__m512 tmp767 = _mm512_add_ps(in134, in138);
__m512 tmp771 = _mm512_add_ps(in142, in146);
in132 = _mm512_sub_ps(in132, in138);
in140 = _mm512_sub_ps(in140, in146);
tmp765 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-4.25e+00f), tmp765);
tmp769 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-4.25e+00f), tmp769);
tmp767 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-4.25e+00f), tmp767);
tmp771 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-4.25e+00f), tmp771);
in132 = _mm512_fmadd_ps(tmp766, _mm512_set1_ps(5.25e+00f), in132);
in140 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(5.25e+00f), in140);
tmp766 = _mm512_fmadd_ps(in134, _mm512_set1_ps(2.5e-01f), in138);
tmp770 = _mm512_fmadd_ps(in142, _mm512_set1_ps(2.5e-01f), in146);
in134 = _mm512_fmadd_ps(in134, _mm512_set1_ps(4e+00f), in138);
in142 = _mm512_fmadd_ps(in142, _mm512_set1_ps(4e+00f), in146);
__m512 tmp768 = _mm512_sub_ps(tmp767, tmp765);
__m512 tmp772 = _mm512_sub_ps(tmp771, tmp769);
tmp767 = _mm512_add_ps(tmp765, tmp767);
tmp771 = _mm512_add_ps(tmp769, tmp771);
tmp765 = _mm512_fmadd_ps(in133, _mm512_set1_ps(2.5e-01f), in137);
tmp769 = _mm512_fmadd_ps(in141, _mm512_set1_ps(2.5e-01f), in145);
tmp766 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-1.25e+00f), tmp766);
tmp770 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-1.25e+00f), tmp770);
in136 = _mm512_fmadd_ps(in136, _mm512_set1_ps(-5e+00f), in134);
in144 = _mm512_fmadd_ps(in144, _mm512_set1_ps(-5e+00f), in142);
tmp765 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-1.25e+00f), tmp765);
tmp769 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-1.25e+00f), tmp769);
in138 = _mm512_fmadd_ps(tmp765, _mm512_set1_ps(2e+00f), tmp766);
in146 = _mm512_fmadd_ps(tmp769, _mm512_set1_ps(2e+00f), tmp770);
tmp766 = _mm512_fnmadd_ps(tmp765, _mm512_set1_ps(2e+00f), tmp766);
tmp770 = _mm512_fnmadd_ps(tmp769, _mm512_set1_ps(2e+00f), tmp770);
tmp765 = _mm512_fmadd_ps(in137, _mm512_set1_ps(2.5e-01f), in133);
tmp769 = _mm512_fmadd_ps(in145, _mm512_set1_ps(2.5e-01f), in141);
in133 = _mm512_sub_ps(in139, in133);
in141 = _mm512_sub_ps(in147, in141);
tmp765 = _mm512_fmadd_ps(in135, _mm512_set1_ps(-1.25e+00f), tmp765);
tmp769 = _mm512_fmadd_ps(in143, _mm512_set1_ps(-1.25e+00f), tmp769);
in135 = _mm512_sub_ps(in135, in137);
in143 = _mm512_sub_ps(in143, in145);
in135 = _mm512_fmadd_ps(in135, _mm512_set1_ps(5.25e+00f), in133);
in143 = _mm512_fmadd_ps(in143, _mm512_set1_ps(5.25e+00f), in141);
in134 = _mm512_fmadd_ps(tmp765, _mm512_set1_ps(2e+00f), in136);
in142 = _mm512_fmadd_ps(tmp769, _mm512_set1_ps(2e+00f), in144);
in136 = _mm512_fnmadd_ps(tmp765, _mm512_set1_ps(2e+00f), in136);
in144 = _mm512_fnmadd_ps(tmp769, _mm512_set1_ps(2e+00f), in144);
// Lane rearrangement of the sixteen first-pass results:
// unpacklo/unpackhi -> shuffle_ps (68 / 238) -> two rounds of shuffle_f32x4
// (136 / 221). NOTE(review): this looks like a transpose so the second pass can
// apply the same combinations along the other axis of each window -- confirm.
__m512 tmp781 = _mm512_unpacklo_ps(in132, tmp767);
__m512 tmp782 = _mm512_unpackhi_ps(in132, tmp767);
__m512 tmp783 = _mm512_unpacklo_ps(tmp768, in138);
__m512 tmp784 = _mm512_unpackhi_ps(tmp768, in138);
__m512 tmp785 = _mm512_unpacklo_ps(tmp766, in134);
__m512 tmp786 = _mm512_unpackhi_ps(tmp766, in134);
__m512 tmp787 = _mm512_unpacklo_ps(in136, in135);
__m512 tmp788 = _mm512_unpackhi_ps(in136, in135);
__m512 tmp789 = _mm512_unpacklo_ps(in140, tmp771);
__m512 tmp790 = _mm512_unpackhi_ps(in140, tmp771);
__m512 tmp791 = _mm512_unpacklo_ps(tmp772, in146);
__m512 tmp792 = _mm512_unpackhi_ps(tmp772, in146);
__m512 tmp793 = _mm512_unpacklo_ps(tmp770, in142);
__m512 tmp794 = _mm512_unpackhi_ps(tmp770, in142);
__m512 tmp795 = _mm512_unpacklo_ps(in144, in143);
__m512 tmp796 = _mm512_unpackhi_ps(in144, in143);
__m512 tmp797 = _mm512_shuffle_ps(tmp781, tmp783, 68);
__m512 tmp798 = _mm512_shuffle_ps(tmp781, tmp783, 238);
__m512 tmp799 = _mm512_shuffle_ps(tmp782, tmp784, 68);
__m512 tmp800 = _mm512_shuffle_ps(tmp782, tmp784, 238);
__m512 tmp801 = _mm512_shuffle_ps(tmp785, tmp787, 68);
__m512 tmp802 = _mm512_shuffle_ps(tmp785, tmp787, 238);
__m512 tmp803 = _mm512_shuffle_ps(tmp786, tmp788, 68);
__m512 tmp804 = _mm512_shuffle_ps(tmp786, tmp788, 238);
__m512 tmp805 = _mm512_shuffle_ps(tmp789, tmp791, 68);
__m512 tmp806 = _mm512_shuffle_ps(tmp789, tmp791, 238);
__m512 tmp807 = _mm512_shuffle_ps(tmp790, tmp792, 68);
__m512 tmp808 = _mm512_shuffle_ps(tmp790, tmp792, 238);
__m512 tmp809 = _mm512_shuffle_ps(tmp793, tmp795, 68);
__m512 tmp810 = _mm512_shuffle_ps(tmp793, tmp795, 238);
__m512 tmp811 = _mm512_shuffle_ps(tmp794, tmp796, 68);
__m512 tmp812 = _mm512_shuffle_ps(tmp794, tmp796, 238);
__m512 tmp813 = _mm512_shuffle_f32x4(tmp797, tmp801, 136);
__m512 tmp814 = _mm512_shuffle_f32x4(tmp797, tmp801, 221);
__m512 tmp815 = _mm512_shuffle_f32x4(tmp798, tmp802, 136);
__m512 tmp816 = _mm512_shuffle_f32x4(tmp798, tmp802, 221);
__m512 tmp817 = _mm512_shuffle_f32x4(tmp799, tmp803, 136);
__m512 tmp818 = _mm512_shuffle_f32x4(tmp799, tmp803, 221);
__m512 tmp819 = _mm512_shuffle_f32x4(tmp800, tmp804, 136);
__m512 tmp820 = _mm512_shuffle_f32x4(tmp800, tmp804, 221);
__m512 tmp821 = _mm512_shuffle_f32x4(tmp805, tmp809, 136);
__m512 tmp822 = _mm512_shuffle_f32x4(tmp805, tmp809, 221);
__m512 tmp823 = _mm512_shuffle_f32x4(tmp806, tmp810, 136);
__m512 tmp824 = _mm512_shuffle_f32x4(tmp806, tmp810, 221);
__m512 tmp825 = _mm512_shuffle_f32x4(tmp807, tmp811, 136);
__m512 tmp826 = _mm512_shuffle_f32x4(tmp807, tmp811, 221);
__m512 tmp827 = _mm512_shuffle_f32x4(tmp808, tmp812, 136);
__m512 tmp828 = _mm512_shuffle_f32x4(tmp808, tmp812, 221);
// Final shuffle round writes back into the first-pass variable names, which
// are reused as the inputs of the second pass.
in132 = _mm512_shuffle_f32x4(tmp813, tmp821, 136);
in140 = _mm512_shuffle_f32x4(tmp813, tmp821, 221);
tmp767 = _mm512_shuffle_f32x4(tmp815, tmp823, 136);
tmp771 = _mm512_shuffle_f32x4(tmp815, tmp823, 221);
tmp768 = _mm512_shuffle_f32x4(tmp817, tmp825, 136);
tmp772 = _mm512_shuffle_f32x4(tmp817, tmp825, 221);
in138 = _mm512_shuffle_f32x4(tmp819, tmp827, 136);
in146 = _mm512_shuffle_f32x4(tmp819, tmp827, 221);
tmp766 = _mm512_shuffle_f32x4(tmp814, tmp822, 136);
tmp770 = _mm512_shuffle_f32x4(tmp814, tmp822, 221);
in134 = _mm512_shuffle_f32x4(tmp816, tmp824, 136);
in142 = _mm512_shuffle_f32x4(tmp816, tmp824, 221);
in136 = _mm512_shuffle_f32x4(tmp818, tmp826, 136);
in144 = _mm512_shuffle_f32x4(tmp818, tmp826, 221);
in135 = _mm512_shuffle_f32x4(tmp820, tmp828, 136);
in143 = _mm512_shuffle_f32x4(tmp820, tmp828, 221);
// Second pass: the same coefficient pattern as the first pass, applied to the
// rearranged vectors (again in place; order of statements matters).
__m512 tmp773 = _mm512_add_ps(tmp767, in134);
__m512 tmp777 = _mm512_add_ps(tmp771, in142);
__m512 tmp774 = _mm512_sub_ps(tmp766, tmp768);
__m512 tmp778 = _mm512_sub_ps(tmp770, tmp772);
__m512 tmp775 = _mm512_add_ps(tmp768, in136);
__m512 tmp779 = _mm512_add_ps(tmp772, in144);
in132 = _mm512_sub_ps(in132, in136);
in140 = _mm512_sub_ps(in140, in144);
tmp773 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-4.25e+00f), tmp773);
tmp777 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-4.25e+00f), tmp777);
tmp775 = _mm512_fmadd_ps(tmp766, _mm512_set1_ps(-4.25e+00f), tmp775);
tmp779 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(-4.25e+00f), tmp779);
in132 = _mm512_fmadd_ps(tmp774, _mm512_set1_ps(5.25e+00f), in132);
in140 = _mm512_fmadd_ps(tmp778, _mm512_set1_ps(5.25e+00f), in140);
tmp774 = _mm512_fmadd_ps(tmp768, _mm512_set1_ps(2.5e-01f), in136);
tmp778 = _mm512_fmadd_ps(tmp772, _mm512_set1_ps(2.5e-01f), in144);
tmp768 = _mm512_fmadd_ps(tmp768, _mm512_set1_ps(4e+00f), in136);
tmp772 = _mm512_fmadd_ps(tmp772, _mm512_set1_ps(4e+00f), in144);
__m512 tmp776 = _mm512_sub_ps(tmp775, tmp773);
__m512 tmp780 = _mm512_sub_ps(tmp779, tmp777);
tmp775 = _mm512_add_ps(tmp773, tmp775);
tmp779 = _mm512_add_ps(tmp777, tmp779);
tmp773 = _mm512_fmadd_ps(tmp767, _mm512_set1_ps(2.5e-01f), in134);
tmp777 = _mm512_fmadd_ps(tmp771, _mm512_set1_ps(2.5e-01f), in142);
tmp774 = _mm512_fmadd_ps(tmp766, _mm512_set1_ps(-1.25e+00f), tmp774);
tmp778 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(-1.25e+00f), tmp778);
tmp766 = _mm512_fmadd_ps(tmp766, _mm512_set1_ps(-5e+00f), tmp768);
tmp770 = _mm512_fmadd_ps(tmp770, _mm512_set1_ps(-5e+00f), tmp772);
tmp773 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-1.25e+00f), tmp773);
tmp777 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-1.25e+00f), tmp777);
in136 = _mm512_fmadd_ps(tmp773, _mm512_set1_ps(2e+00f), tmp774);
in144 = _mm512_fmadd_ps(tmp777, _mm512_set1_ps(2e+00f), tmp778);
tmp774 = _mm512_fnmadd_ps(tmp773, _mm512_set1_ps(2e+00f), tmp774);
tmp778 = _mm512_fnmadd_ps(tmp777, _mm512_set1_ps(2e+00f), tmp778);
tmp773 = _mm512_fmadd_ps(in134, _mm512_set1_ps(2.5e-01f), tmp767);
tmp777 = _mm512_fmadd_ps(in142, _mm512_set1_ps(2.5e-01f), tmp771);
tmp767 = _mm512_sub_ps(in135, tmp767);
tmp771 = _mm512_sub_ps(in143, tmp771);
tmp773 = _mm512_fmadd_ps(in138, _mm512_set1_ps(-1.25e+00f), tmp773);
tmp777 = _mm512_fmadd_ps(in146, _mm512_set1_ps(-1.25e+00f), tmp777);
in138 = _mm512_sub_ps(in138, in134);
in146 = _mm512_sub_ps(in146, in142);
in138 = _mm512_fmadd_ps(in138, _mm512_set1_ps(5.25e+00f), tmp767);
in146 = _mm512_fmadd_ps(in146, _mm512_set1_ps(5.25e+00f), tmp771);
tmp768 = _mm512_fmadd_ps(tmp773, _mm512_set1_ps(2e+00f), tmp766);
tmp772 = _mm512_fmadd_ps(tmp777, _mm512_set1_ps(2e+00f), tmp770);
tmp766 = _mm512_fnmadd_ps(tmp773, _mm512_set1_ps(2e+00f), tmp766);
tmp770 = _mm512_fnmadd_ps(tmp777, _mm512_set1_ps(2e+00f), tmp770);
// Pair the results for storing: imm 68 concatenates the low 256 bits of the
// two operands, imm 238 concatenates their high 256 bits.
__m512 out151 = _mm512_shuffle_f32x4(in132, tmp775, 68);
__m512 out159 = _mm512_shuffle_f32x4(in132, tmp775, 238);
__m512 out152 = _mm512_shuffle_f32x4(tmp776, in136, 68);
__m512 out160 = _mm512_shuffle_f32x4(tmp776, in136, 238);
__m512 out153 = _mm512_shuffle_f32x4(tmp774, tmp768, 68);
__m512 out161 = _mm512_shuffle_f32x4(tmp774, tmp768, 238);
__m512 out154 = _mm512_shuffle_f32x4(tmp766, in138, 68);
__m512 out162 = _mm512_shuffle_f32x4(tmp766, in138, 238);
__m512 out155 = _mm512_shuffle_f32x4(in140, tmp779, 68);
__m512 out163 = _mm512_shuffle_f32x4(in140, tmp779, 238);
__m512 out156 = _mm512_shuffle_f32x4(tmp780, in144, 68);
__m512 out164 = _mm512_shuffle_f32x4(tmp780, in144, 238);
__m512 out157 = _mm512_shuffle_f32x4(tmp778, tmp772, 68);
__m512 out165 = _mm512_shuffle_f32x4(tmp778, tmp772, 238);
__m512 out158 = _mm512_shuffle_f32x4(tmp770, in146, 68);
__m512 out166 = _mm512_shuffle_f32x4(tmp770, in146, 238);
// Sixteen unaligned 16-float stores into dfPtr4: four regions 25600 apart
// (bases 512, 26112, 51712, 77312), four consecutive 64-unit slots in each.
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k53, out151);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k53, out159);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k53, out155);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k53, out163);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k53, out152);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k53, out160);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k53, out156);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k53, out164);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k53, out153);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k53, out161);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k53, out157);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k53, out165);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k53, out154);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k53, out162);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k53, out158);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k53, out166);
}
++j11;
rel8 = 1;
}
ptrdiff_t h23 = base8+0;
ptrdiff_t w26 = 48;
ptrdiff_t k54 = 0;
for (; k54 != 2; ++k54) {
__m512 dat1051 = _mm512_maskz_loadu_ps(511, datPtr5+0+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1051 = _mm512_max_ps(_mm512_setzero_ps(), dat1051);
__m512 dat1052 = _mm512_maskz_loadu_ps(8191, datPtr5+1156+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1052 = _mm512_max_ps(_mm512_setzero_ps(), dat1052);
__m512i pm79 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in148 = _mm512_permutexvar_ps(pm79, dat1051);
__m512i pm80 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in156 = _mm512_permutexvar_ps(pm80, dat1052);
__m512 dat1053 = _mm512_maskz_loadu_ps(511, datPtr5+224+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1053 = _mm512_max_ps(_mm512_setzero_ps(), dat1053);
__m512 dat1054 = _mm512_maskz_loadu_ps(8191, datPtr5+1380+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1054 = _mm512_max_ps(_mm512_setzero_ps(), dat1054);
__m512 in149 = _mm512_permutexvar_ps(pm79, dat1053);
__m512 in157 = _mm512_permutexvar_ps(pm80, dat1054);
__m512 dat1055 = _mm512_maskz_loadu_ps(511, datPtr5+448+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1055 = _mm512_max_ps(_mm512_setzero_ps(), dat1055);
__m512 dat1056 = _mm512_maskz_loadu_ps(8191, datPtr5+1604+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1056 = _mm512_max_ps(_mm512_setzero_ps(), dat1056);
__m512 in150 = _mm512_permutexvar_ps(pm79, dat1055);
__m512 in158 = _mm512_permutexvar_ps(pm80, dat1056);
// Load the remaining five register pairs of the current tile (in151..in155 and
// in159..in163); the earlier pairs and the index vectors pm79 / pm80 are set up
// before this point. Each pair is built from two zero-filling masked loads:
// mask 511 keeps lanes 0-8 and mask 8191 keeps lanes 0-12, all other lanes read
// as 0.0f. Every loaded vector is clamped with max(0, x) (negative inputs become
// zero) and then rearranged lane-wise through pm79 / pm80.
// Successive loads advance the datPtr5 offset by 224 (the same multiplier used
// for h23) -- presumably one input row per step; TODO confirm.
__m512 dat1057 = _mm512_maskz_loadu_ps(511, datPtr5+672+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1057 = _mm512_max_ps(_mm512_setzero_ps(), dat1057);
__m512 dat1058 = _mm512_maskz_loadu_ps(8191, datPtr5+1828+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1058 = _mm512_max_ps(_mm512_setzero_ps(), dat1058);
__m512 in151 = _mm512_permutexvar_ps(pm79, dat1057);
__m512 in159 = _mm512_permutexvar_ps(pm80, dat1058);
__m512 dat1059 = _mm512_maskz_loadu_ps(511, datPtr5+896+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1059 = _mm512_max_ps(_mm512_setzero_ps(), dat1059);
__m512 dat1060 = _mm512_maskz_loadu_ps(8191, datPtr5+2052+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1060 = _mm512_max_ps(_mm512_setzero_ps(), dat1060);
__m512 in152 = _mm512_permutexvar_ps(pm79, dat1059);
__m512 in160 = _mm512_permutexvar_ps(pm80, dat1060);
__m512 dat1061 = _mm512_maskz_loadu_ps(511, datPtr5+1120+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1061 = _mm512_max_ps(_mm512_setzero_ps(), dat1061);
__m512 dat1062 = _mm512_maskz_loadu_ps(8191, datPtr5+2276+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1062 = _mm512_max_ps(_mm512_setzero_ps(), dat1062);
__m512 in153 = _mm512_permutexvar_ps(pm79, dat1061);
__m512 in161 = _mm512_permutexvar_ps(pm80, dat1062);
__m512 dat1063 = _mm512_maskz_loadu_ps(511, datPtr5+1344+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1063 = _mm512_max_ps(_mm512_setzero_ps(), dat1063);
__m512 dat1064 = _mm512_maskz_loadu_ps(8191, datPtr5+2500+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1064 = _mm512_max_ps(_mm512_setzero_ps(), dat1064);
__m512 in154 = _mm512_permutexvar_ps(pm79, dat1063);
__m512 in162 = _mm512_permutexvar_ps(pm80, dat1064);
__m512 dat1065 = _mm512_maskz_loadu_ps(511, datPtr5+1568+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1065 = _mm512_max_ps(_mm512_setzero_ps(), dat1065);
__m512 dat1066 = _mm512_maskz_loadu_ps(8191, datPtr5+2724+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1066 = _mm512_max_ps(_mm512_setzero_ps(), dat1066);
__m512 in155 = _mm512_permutexvar_ps(pm79, dat1065);
__m512 in163 = _mm512_permutexvar_ps(pm80, dat1066);
// First arithmetic pass. The eight registers in148..in155 are combined
// lane-wise into eight linear combinations; every statement is immediately
// repeated for the second register set in156..in163 (odd/even statement pairs).
// Results are left in: in148, tmp831, tmp832, in154, tmp830, in150, in152, in151
// (second set:         in156, tmp835, tmp836, in162, tmp834, in158, in160, in159).
// Several inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 1.25, 4, 5, 2) look like the
// Winograd F(6,3) input-transform matrix -- confirm against the generator.
__m512 tmp829 = _mm512_add_ps(in149, in153);
__m512 tmp833 = _mm512_add_ps(in157, in161);
__m512 tmp830 = _mm512_sub_ps(in152, in150);
__m512 tmp834 = _mm512_sub_ps(in160, in158);
__m512 tmp831 = _mm512_add_ps(in150, in154);
__m512 tmp835 = _mm512_add_ps(in158, in162);
in148 = _mm512_sub_ps(in148, in154);
in156 = _mm512_sub_ps(in156, in162);
tmp829 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-4.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-4.25e+00f), tmp833);
tmp831 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-4.25e+00f), tmp831);
tmp835 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-4.25e+00f), tmp835);
// in148 is now final: (in148 - in154) + 5.25 * (in152 - in150).
in148 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(5.25e+00f), in148);
in156 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(5.25e+00f), in156);
tmp830 = _mm512_fmadd_ps(in150, _mm512_set1_ps(2.5e-01f), in154);
tmp834 = _mm512_fmadd_ps(in158, _mm512_set1_ps(2.5e-01f), in162);
in150 = _mm512_fmadd_ps(in150, _mm512_set1_ps(4e+00f), in154);
in158 = _mm512_fmadd_ps(in158, _mm512_set1_ps(4e+00f), in162);
// Difference and sum of the two -4.25 partial results give two outputs.
__m512 tmp832 = _mm512_sub_ps(tmp831, tmp829);
__m512 tmp836 = _mm512_sub_ps(tmp835, tmp833);
tmp831 = _mm512_add_ps(tmp829, tmp831);
tmp835 = _mm512_add_ps(tmp833, tmp835);
tmp829 = _mm512_fmadd_ps(in149, _mm512_set1_ps(2.5e-01f), in153);
tmp833 = _mm512_fmadd_ps(in157, _mm512_set1_ps(2.5e-01f), in161);
tmp830 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-1.25e+00f), tmp830);
tmp834 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-1.25e+00f), tmp834);
in152 = _mm512_fmadd_ps(in152, _mm512_set1_ps(-5e+00f), in150);
in160 = _mm512_fmadd_ps(in160, _mm512_set1_ps(-5e+00f), in158);
tmp829 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-1.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-1.25e+00f), tmp833);
// fmadd / fnmadd pair: tmp830 + 2*tmp829 and tmp830 - 2*tmp829.
in154 = _mm512_fmadd_ps(tmp829, _mm512_set1_ps(2e+00f), tmp830);
in162 = _mm512_fmadd_ps(tmp833, _mm512_set1_ps(2e+00f), tmp834);
tmp830 = _mm512_fnmadd_ps(tmp829, _mm512_set1_ps(2e+00f), tmp830);
tmp834 = _mm512_fnmadd_ps(tmp833, _mm512_set1_ps(2e+00f), tmp834);
tmp829 = _mm512_fmadd_ps(in153, _mm512_set1_ps(2.5e-01f), in149);
tmp833 = _mm512_fmadd_ps(in161, _mm512_set1_ps(2.5e-01f), in157);
in149 = _mm512_sub_ps(in155, in149);
in157 = _mm512_sub_ps(in163, in157);
tmp829 = _mm512_fmadd_ps(in151, _mm512_set1_ps(-1.25e+00f), tmp829);
tmp833 = _mm512_fmadd_ps(in159, _mm512_set1_ps(-1.25e+00f), tmp833);
in151 = _mm512_sub_ps(in151, in153);
in159 = _mm512_sub_ps(in159, in161);
in151 = _mm512_fmadd_ps(in151, _mm512_set1_ps(5.25e+00f), in149);
in159 = _mm512_fmadd_ps(in159, _mm512_set1_ps(5.25e+00f), in157);
// fmadd / fnmadd pair: in152 + 2*tmp829 and in152 - 2*tmp829.
in150 = _mm512_fmadd_ps(tmp829, _mm512_set1_ps(2e+00f), in152);
in158 = _mm512_fmadd_ps(tmp833, _mm512_set1_ps(2e+00f), in160);
in152 = _mm512_fnmadd_ps(tmp829, _mm512_set1_ps(2e+00f), in152);
in160 = _mm512_fnmadd_ps(tmp833, _mm512_set1_ps(2e+00f), in160);
// Rearrangement of the sixteen first-pass results, done in four shuffle stages.
// NOTE(review): the unpack / shuffle_ps / shuffle_f32x4 ladder looks like an
// 8x8 transpose applied to each 8-lane half of the registers -- confirm.
// Stage 1: interleave neighbouring result registers element-wise inside each
// 128-bit lane (unpacklo takes elements 0-1, unpackhi elements 2-3).
__m512 tmp845 = _mm512_unpacklo_ps(in148, tmp831);
__m512 tmp846 = _mm512_unpackhi_ps(in148, tmp831);
__m512 tmp847 = _mm512_unpacklo_ps(tmp832, in154);
__m512 tmp848 = _mm512_unpackhi_ps(tmp832, in154);
__m512 tmp849 = _mm512_unpacklo_ps(tmp830, in150);
__m512 tmp850 = _mm512_unpackhi_ps(tmp830, in150);
__m512 tmp851 = _mm512_unpacklo_ps(in152, in151);
__m512 tmp852 = _mm512_unpackhi_ps(in152, in151);
__m512 tmp853 = _mm512_unpacklo_ps(in156, tmp835);
__m512 tmp854 = _mm512_unpackhi_ps(in156, tmp835);
__m512 tmp855 = _mm512_unpacklo_ps(tmp836, in162);
__m512 tmp856 = _mm512_unpackhi_ps(tmp836, in162);
__m512 tmp857 = _mm512_unpacklo_ps(tmp834, in158);
__m512 tmp858 = _mm512_unpackhi_ps(tmp834, in158);
__m512 tmp859 = _mm512_unpacklo_ps(in160, in159);
__m512 tmp860 = _mm512_unpackhi_ps(in160, in159);
// Stage 2: per 128-bit lane, imm 68 selects {a0, a1, b0, b1} and imm 238
// selects {a2, a3, b2, b3}.
__m512 tmp861 = _mm512_shuffle_ps(tmp845, tmp847, 68);
__m512 tmp862 = _mm512_shuffle_ps(tmp845, tmp847, 238);
__m512 tmp863 = _mm512_shuffle_ps(tmp846, tmp848, 68);
__m512 tmp864 = _mm512_shuffle_ps(tmp846, tmp848, 238);
__m512 tmp865 = _mm512_shuffle_ps(tmp849, tmp851, 68);
__m512 tmp866 = _mm512_shuffle_ps(tmp849, tmp851, 238);
__m512 tmp867 = _mm512_shuffle_ps(tmp850, tmp852, 68);
__m512 tmp868 = _mm512_shuffle_ps(tmp850, tmp852, 238);
__m512 tmp869 = _mm512_shuffle_ps(tmp853, tmp855, 68);
__m512 tmp870 = _mm512_shuffle_ps(tmp853, tmp855, 238);
__m512 tmp871 = _mm512_shuffle_ps(tmp854, tmp856, 68);
__m512 tmp872 = _mm512_shuffle_ps(tmp854, tmp856, 238);
__m512 tmp873 = _mm512_shuffle_ps(tmp857, tmp859, 68);
__m512 tmp874 = _mm512_shuffle_ps(tmp857, tmp859, 238);
__m512 tmp875 = _mm512_shuffle_ps(tmp858, tmp860, 68);
__m512 tmp876 = _mm512_shuffle_ps(tmp858, tmp860, 238);
// Stage 3: on whole 128-bit lanes, imm 136 selects {a.0, a.2, b.0, b.2} and
// imm 221 selects {a.1, a.3, b.1, b.3}.
__m512 tmp877 = _mm512_shuffle_f32x4(tmp861, tmp865, 136);
__m512 tmp878 = _mm512_shuffle_f32x4(tmp861, tmp865, 221);
__m512 tmp879 = _mm512_shuffle_f32x4(tmp862, tmp866, 136);
__m512 tmp880 = _mm512_shuffle_f32x4(tmp862, tmp866, 221);
__m512 tmp881 = _mm512_shuffle_f32x4(tmp863, tmp867, 136);
__m512 tmp882 = _mm512_shuffle_f32x4(tmp863, tmp867, 221);
__m512 tmp883 = _mm512_shuffle_f32x4(tmp864, tmp868, 136);
__m512 tmp884 = _mm512_shuffle_f32x4(tmp864, tmp868, 221);
__m512 tmp885 = _mm512_shuffle_f32x4(tmp869, tmp873, 136);
__m512 tmp886 = _mm512_shuffle_f32x4(tmp869, tmp873, 221);
__m512 tmp887 = _mm512_shuffle_f32x4(tmp870, tmp874, 136);
__m512 tmp888 = _mm512_shuffle_f32x4(tmp870, tmp874, 221);
__m512 tmp889 = _mm512_shuffle_f32x4(tmp871, tmp875, 136);
__m512 tmp890 = _mm512_shuffle_f32x4(tmp871, tmp875, 221);
__m512 tmp891 = _mm512_shuffle_f32x4(tmp872, tmp876, 136);
__m512 tmp892 = _mm512_shuffle_f32x4(tmp872, tmp876, 221);
// Stage 4: same 136 / 221 lane selection, now pairing a first-set intermediate
// with its second-set counterpart. Results are written back into the sixteen
// variable names that held the first-pass results.
in148 = _mm512_shuffle_f32x4(tmp877, tmp885, 136);
in156 = _mm512_shuffle_f32x4(tmp877, tmp885, 221);
tmp831 = _mm512_shuffle_f32x4(tmp879, tmp887, 136);
tmp835 = _mm512_shuffle_f32x4(tmp879, tmp887, 221);
tmp832 = _mm512_shuffle_f32x4(tmp881, tmp889, 136);
tmp836 = _mm512_shuffle_f32x4(tmp881, tmp889, 221);
in154 = _mm512_shuffle_f32x4(tmp883, tmp891, 136);
in162 = _mm512_shuffle_f32x4(tmp883, tmp891, 221);
tmp830 = _mm512_shuffle_f32x4(tmp878, tmp886, 136);
tmp834 = _mm512_shuffle_f32x4(tmp878, tmp886, 221);
in150 = _mm512_shuffle_f32x4(tmp880, tmp888, 136);
in158 = _mm512_shuffle_f32x4(tmp880, tmp888, 221);
in152 = _mm512_shuffle_f32x4(tmp882, tmp890, 136);
in160 = _mm512_shuffle_f32x4(tmp882, tmp890, 221);
in151 = _mm512_shuffle_f32x4(tmp884, tmp892, 136);
in159 = _mm512_shuffle_f32x4(tmp884, tmp892, 221);
// Second arithmetic pass: the same eight linear combinations (coefficients
// 5.25, 4.25, 0.25, 1.25, 4, 5, 2) as the first pass, now applied to the
// rearranged registers. Input order for the first set is
// in148, tmp831, tmp832, in154, tmp830, in150, in152, in151; every statement is
// repeated for the second set (in156, tmp835, tmp836, in162, tmp834, in158,
// in160, in159).
// Results are left in: in148, tmp839, tmp840, in152, tmp838, tmp832, tmp830, in154
// (second set:         in156, tmp843, tmp844, in160, tmp842, tmp836, tmp834, in162).
// Inputs are overwritten in place, so statement order matters.
__m512 tmp837 = _mm512_add_ps(tmp831, in150);
__m512 tmp841 = _mm512_add_ps(tmp835, in158);
__m512 tmp838 = _mm512_sub_ps(tmp830, tmp832);
__m512 tmp842 = _mm512_sub_ps(tmp834, tmp836);
__m512 tmp839 = _mm512_add_ps(tmp832, in152);
__m512 tmp843 = _mm512_add_ps(tmp836, in160);
in148 = _mm512_sub_ps(in148, in152);
in156 = _mm512_sub_ps(in156, in160);
tmp837 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-4.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-4.25e+00f), tmp841);
tmp839 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-4.25e+00f), tmp839);
tmp843 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-4.25e+00f), tmp843);
in148 = _mm512_fmadd_ps(tmp838, _mm512_set1_ps(5.25e+00f), in148);
in156 = _mm512_fmadd_ps(tmp842, _mm512_set1_ps(5.25e+00f), in156);
tmp838 = _mm512_fmadd_ps(tmp832, _mm512_set1_ps(2.5e-01f), in152);
tmp842 = _mm512_fmadd_ps(tmp836, _mm512_set1_ps(2.5e-01f), in160);
tmp832 = _mm512_fmadd_ps(tmp832, _mm512_set1_ps(4e+00f), in152);
tmp836 = _mm512_fmadd_ps(tmp836, _mm512_set1_ps(4e+00f), in160);
// Difference and sum of the two -4.25 partial results give two outputs.
__m512 tmp840 = _mm512_sub_ps(tmp839, tmp837);
__m512 tmp844 = _mm512_sub_ps(tmp843, tmp841);
tmp839 = _mm512_add_ps(tmp837, tmp839);
tmp843 = _mm512_add_ps(tmp841, tmp843);
tmp837 = _mm512_fmadd_ps(tmp831, _mm512_set1_ps(2.5e-01f), in150);
tmp841 = _mm512_fmadd_ps(tmp835, _mm512_set1_ps(2.5e-01f), in158);
tmp838 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-1.25e+00f), tmp838);
tmp842 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-1.25e+00f), tmp842);
tmp830 = _mm512_fmadd_ps(tmp830, _mm512_set1_ps(-5e+00f), tmp832);
tmp834 = _mm512_fmadd_ps(tmp834, _mm512_set1_ps(-5e+00f), tmp836);
tmp837 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-1.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-1.25e+00f), tmp841);
// fmadd / fnmadd pair: tmp838 + 2*tmp837 and tmp838 - 2*tmp837.
in152 = _mm512_fmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp838);
in160 = _mm512_fmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp842);
tmp838 = _mm512_fnmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp838);
tmp842 = _mm512_fnmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp842);
tmp837 = _mm512_fmadd_ps(in150, _mm512_set1_ps(2.5e-01f), tmp831);
tmp841 = _mm512_fmadd_ps(in158, _mm512_set1_ps(2.5e-01f), tmp835);
tmp831 = _mm512_sub_ps(in151, tmp831);
tmp835 = _mm512_sub_ps(in159, tmp835);
tmp837 = _mm512_fmadd_ps(in154, _mm512_set1_ps(-1.25e+00f), tmp837);
tmp841 = _mm512_fmadd_ps(in162, _mm512_set1_ps(-1.25e+00f), tmp841);
in154 = _mm512_sub_ps(in154, in150);
in162 = _mm512_sub_ps(in162, in158);
in154 = _mm512_fmadd_ps(in154, _mm512_set1_ps(5.25e+00f), tmp831);
in162 = _mm512_fmadd_ps(in162, _mm512_set1_ps(5.25e+00f), tmp835);
// fmadd / fnmadd pair: tmp830 + 2*tmp837 and tmp830 - 2*tmp837.
tmp832 = _mm512_fmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp830);
tmp836 = _mm512_fmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp834);
tmp830 = _mm512_fnmadd_ps(tmp837, _mm512_set1_ps(2e+00f), tmp830);
tmp834 = _mm512_fnmadd_ps(tmp841, _mm512_set1_ps(2e+00f), tmp834);
// Pack the sixteen second-pass results pairwise into sixteen output vectors.
// shuffle_f32x4 with imm 68 concatenates the lower 256 bits of both operands
// (128-bit lanes a.0, a.1, b.0, b.1); imm 238 concatenates the upper 256 bits
// (a.2, a.3, b.2, b.3).
__m512 out167 = _mm512_shuffle_f32x4(in148, tmp839, 68);
__m512 out175 = _mm512_shuffle_f32x4(in148, tmp839, 238);
__m512 out168 = _mm512_shuffle_f32x4(tmp840, in152, 68);
__m512 out176 = _mm512_shuffle_f32x4(tmp840, in152, 238);
__m512 out169 = _mm512_shuffle_f32x4(tmp838, tmp832, 68);
__m512 out177 = _mm512_shuffle_f32x4(tmp838, tmp832, 238);
__m512 out170 = _mm512_shuffle_f32x4(tmp830, in154, 68);
__m512 out178 = _mm512_shuffle_f32x4(tmp830, in154, 238);
__m512 out171 = _mm512_shuffle_f32x4(in156, tmp843, 68);
__m512 out179 = _mm512_shuffle_f32x4(in156, tmp843, 238);
__m512 out172 = _mm512_shuffle_f32x4(tmp844, in160, 68);
__m512 out180 = _mm512_shuffle_f32x4(tmp844, in160, 238);
__m512 out173 = _mm512_shuffle_f32x4(tmp842, tmp836, 68);
__m512 out181 = _mm512_shuffle_f32x4(tmp842, tmp836, 238);
__m512 out174 = _mm512_shuffle_f32x4(tmp834, in162, 68);
__m512 out182 = _mm512_shuffle_f32x4(tmp834, in162, 238);
// Unaligned stores to dfPtr4 in four groups whose base offsets are 0, 25600,
// 51200 and 76800. Inside a group the four vectors land at +0, +128, +64, +192
// (store order is not address order).
// NOTE(review): a step of 64 equals one 512-bit vector if dfPtr4 is a byte
// pointer -- its declaration is outside this excerpt; confirm.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k54, out167);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k54, out175);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k54, out171);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k54, out179);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k54, out168);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k54, out176);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k54, out172);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k54, out180);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k54, out169);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k54, out177);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k54, out173);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k54, out181);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k54, out170);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k54, out178);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k54, out174);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k54, out182);
// Load eight register pairs for the next tile (in164..in171 and in172..in179).
// Each pair comes from two zero-filling masked loads: mask 16383 keeps lanes
// 0-13 and mask 511 keeps lanes 0-8; all other lanes read as 0.0f. Every loaded
// vector is clamped with max(0, x) (negative inputs become zero) before being
// permuted. Successive loads advance the datPtr5 offset by 224 (the same
// multiplier used for h23) -- presumably one input row per step; TODO confirm.
__m512 dat1067 = _mm512_maskz_loadu_ps(16383, datPtr5+1200+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1067 = _mm512_max_ps(_mm512_setzero_ps(), dat1067);
__m512 dat1068 = _mm512_maskz_loadu_ps(511, datPtr5+12608+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1068 = _mm512_max_ps(_mm512_setzero_ps(), dat1068);
// pm81 (arguments are listed from lane 15 down to lane 0): result lanes 0-7
// read source lanes 0-7 and result lanes 8-15 read source lanes 6-13, i.e. two
// 8-wide windows that overlap by two elements.
__m512i pm81 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in164 = _mm512_permutexvar_ps(pm81, dat1067);
// pm82: result lanes 0-7 read source lanes 0-7, lanes 8-10 read source lanes
// 6-8, and lanes 11-15 all read source lane 15, which mask 511 left at zero --
// so the upper window is three loaded values followed by zeros.
__m512i pm82 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in172 = _mm512_permutexvar_ps(pm82, dat1068);
__m512 dat1069 = _mm512_maskz_loadu_ps(16383, datPtr5+1424+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1069 = _mm512_max_ps(_mm512_setzero_ps(), dat1069);
__m512 dat1070 = _mm512_maskz_loadu_ps(511, datPtr5+12832+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1070 = _mm512_max_ps(_mm512_setzero_ps(), dat1070);
__m512 in165 = _mm512_permutexvar_ps(pm81, dat1069);
__m512 in173 = _mm512_permutexvar_ps(pm82, dat1070);
__m512 dat1071 = _mm512_maskz_loadu_ps(16383, datPtr5+1648+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1071 = _mm512_max_ps(_mm512_setzero_ps(), dat1071);
__m512 dat1072 = _mm512_maskz_loadu_ps(511, datPtr5+13056+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1072 = _mm512_max_ps(_mm512_setzero_ps(), dat1072);
__m512 in166 = _mm512_permutexvar_ps(pm81, dat1071);
__m512 in174 = _mm512_permutexvar_ps(pm82, dat1072);
__m512 dat1073 = _mm512_maskz_loadu_ps(16383, datPtr5+1872+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1073 = _mm512_max_ps(_mm512_setzero_ps(), dat1073);
__m512 dat1074 = _mm512_maskz_loadu_ps(511, datPtr5+13280+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1074 = _mm512_max_ps(_mm512_setzero_ps(), dat1074);
__m512 in167 = _mm512_permutexvar_ps(pm81, dat1073);
__m512 in175 = _mm512_permutexvar_ps(pm82, dat1074);
__m512 dat1075 = _mm512_maskz_loadu_ps(16383, datPtr5+2096+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1075 = _mm512_max_ps(_mm512_setzero_ps(), dat1075);
__m512 dat1076 = _mm512_maskz_loadu_ps(511, datPtr5+13504+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1076 = _mm512_max_ps(_mm512_setzero_ps(), dat1076);
__m512 in168 = _mm512_permutexvar_ps(pm81, dat1075);
__m512 in176 = _mm512_permutexvar_ps(pm82, dat1076);
__m512 dat1077 = _mm512_maskz_loadu_ps(16383, datPtr5+2320+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1077 = _mm512_max_ps(_mm512_setzero_ps(), dat1077);
__m512 dat1078 = _mm512_maskz_loadu_ps(511, datPtr5+13728+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1078 = _mm512_max_ps(_mm512_setzero_ps(), dat1078);
__m512 in169 = _mm512_permutexvar_ps(pm81, dat1077);
__m512 in177 = _mm512_permutexvar_ps(pm82, dat1078);
__m512 dat1079 = _mm512_maskz_loadu_ps(16383, datPtr5+2544+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1079 = _mm512_max_ps(_mm512_setzero_ps(), dat1079);
__m512 dat1080 = _mm512_maskz_loadu_ps(511, datPtr5+13952+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1080 = _mm512_max_ps(_mm512_setzero_ps(), dat1080);
__m512 in170 = _mm512_permutexvar_ps(pm81, dat1079);
__m512 in178 = _mm512_permutexvar_ps(pm82, dat1080);
__m512 dat1081 = _mm512_maskz_loadu_ps(16383, datPtr5+2768+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1081 = _mm512_max_ps(_mm512_setzero_ps(), dat1081);
__m512 dat1082 = _mm512_maskz_loadu_ps(511, datPtr5+14176+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1082 = _mm512_max_ps(_mm512_setzero_ps(), dat1082);
__m512 in171 = _mm512_permutexvar_ps(pm81, dat1081);
__m512 in179 = _mm512_permutexvar_ps(pm82, dat1082);
// First arithmetic pass. The eight registers in164..in171 are combined
// lane-wise into eight linear combinations; every statement is immediately
// repeated for the second register set in172..in179 (odd/even statement pairs).
// Results are left in: in164, tmp895, tmp896, in170, tmp894, in166, in168, in167
// (second set:         in172, tmp899, tmp900, in178, tmp898, in174, in176, in175).
// Several inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 1.25, 4, 5, 2) look like the
// Winograd F(6,3) input-transform matrix -- confirm against the generator.
__m512 tmp893 = _mm512_add_ps(in165, in169);
__m512 tmp897 = _mm512_add_ps(in173, in177);
__m512 tmp894 = _mm512_sub_ps(in168, in166);
__m512 tmp898 = _mm512_sub_ps(in176, in174);
__m512 tmp895 = _mm512_add_ps(in166, in170);
__m512 tmp899 = _mm512_add_ps(in174, in178);
in164 = _mm512_sub_ps(in164, in170);
in172 = _mm512_sub_ps(in172, in178);
tmp893 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-4.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-4.25e+00f), tmp897);
tmp895 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-4.25e+00f), tmp895);
tmp899 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-4.25e+00f), tmp899);
// in164 is now final: (in164 - in170) + 5.25 * (in168 - in166).
in164 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(5.25e+00f), in164);
in172 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(5.25e+00f), in172);
tmp894 = _mm512_fmadd_ps(in166, _mm512_set1_ps(2.5e-01f), in170);
tmp898 = _mm512_fmadd_ps(in174, _mm512_set1_ps(2.5e-01f), in178);
in166 = _mm512_fmadd_ps(in166, _mm512_set1_ps(4e+00f), in170);
in174 = _mm512_fmadd_ps(in174, _mm512_set1_ps(4e+00f), in178);
// Difference and sum of the two -4.25 partial results give two outputs.
__m512 tmp896 = _mm512_sub_ps(tmp895, tmp893);
__m512 tmp900 = _mm512_sub_ps(tmp899, tmp897);
tmp895 = _mm512_add_ps(tmp893, tmp895);
tmp899 = _mm512_add_ps(tmp897, tmp899);
tmp893 = _mm512_fmadd_ps(in165, _mm512_set1_ps(2.5e-01f), in169);
tmp897 = _mm512_fmadd_ps(in173, _mm512_set1_ps(2.5e-01f), in177);
tmp894 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-1.25e+00f), tmp894);
tmp898 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-1.25e+00f), tmp898);
in168 = _mm512_fmadd_ps(in168, _mm512_set1_ps(-5e+00f), in166);
in176 = _mm512_fmadd_ps(in176, _mm512_set1_ps(-5e+00f), in174);
tmp893 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-1.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-1.25e+00f), tmp897);
// fmadd / fnmadd pair: tmp894 + 2*tmp893 and tmp894 - 2*tmp893.
in170 = _mm512_fmadd_ps(tmp893, _mm512_set1_ps(2e+00f), tmp894);
in178 = _mm512_fmadd_ps(tmp897, _mm512_set1_ps(2e+00f), tmp898);
tmp894 = _mm512_fnmadd_ps(tmp893, _mm512_set1_ps(2e+00f), tmp894);
tmp898 = _mm512_fnmadd_ps(tmp897, _mm512_set1_ps(2e+00f), tmp898);
tmp893 = _mm512_fmadd_ps(in169, _mm512_set1_ps(2.5e-01f), in165);
tmp897 = _mm512_fmadd_ps(in177, _mm512_set1_ps(2.5e-01f), in173);
in165 = _mm512_sub_ps(in171, in165);
in173 = _mm512_sub_ps(in179, in173);
tmp893 = _mm512_fmadd_ps(in167, _mm512_set1_ps(-1.25e+00f), tmp893);
tmp897 = _mm512_fmadd_ps(in175, _mm512_set1_ps(-1.25e+00f), tmp897);
in167 = _mm512_sub_ps(in167, in169);
in175 = _mm512_sub_ps(in175, in177);
in167 = _mm512_fmadd_ps(in167, _mm512_set1_ps(5.25e+00f), in165);
in175 = _mm512_fmadd_ps(in175, _mm512_set1_ps(5.25e+00f), in173);
// fmadd / fnmadd pair: in168 + 2*tmp893 and in168 - 2*tmp893.
in166 = _mm512_fmadd_ps(tmp893, _mm512_set1_ps(2e+00f), in168);
in174 = _mm512_fmadd_ps(tmp897, _mm512_set1_ps(2e+00f), in176);
in168 = _mm512_fnmadd_ps(tmp893, _mm512_set1_ps(2e+00f), in168);
in176 = _mm512_fnmadd_ps(tmp897, _mm512_set1_ps(2e+00f), in176);
// Rearrangement of the sixteen first-pass results, done in four shuffle stages.
// NOTE(review): the unpack / shuffle_ps / shuffle_f32x4 ladder looks like an
// 8x8 transpose applied to each 8-lane half of the registers -- confirm.
// Stage 1: interleave neighbouring result registers element-wise inside each
// 128-bit lane (unpacklo takes elements 0-1, unpackhi elements 2-3).
__m512 tmp909 = _mm512_unpacklo_ps(in164, tmp895);
__m512 tmp910 = _mm512_unpackhi_ps(in164, tmp895);
__m512 tmp911 = _mm512_unpacklo_ps(tmp896, in170);
__m512 tmp912 = _mm512_unpackhi_ps(tmp896, in170);
__m512 tmp913 = _mm512_unpacklo_ps(tmp894, in166);
__m512 tmp914 = _mm512_unpackhi_ps(tmp894, in166);
__m512 tmp915 = _mm512_unpacklo_ps(in168, in167);
__m512 tmp916 = _mm512_unpackhi_ps(in168, in167);
__m512 tmp917 = _mm512_unpacklo_ps(in172, tmp899);
__m512 tmp918 = _mm512_unpackhi_ps(in172, tmp899);
__m512 tmp919 = _mm512_unpacklo_ps(tmp900, in178);
__m512 tmp920 = _mm512_unpackhi_ps(tmp900, in178);
__m512 tmp921 = _mm512_unpacklo_ps(tmp898, in174);
__m512 tmp922 = _mm512_unpackhi_ps(tmp898, in174);
__m512 tmp923 = _mm512_unpacklo_ps(in176, in175);
__m512 tmp924 = _mm512_unpackhi_ps(in176, in175);
// Stage 2: per 128-bit lane, imm 68 selects {a0, a1, b0, b1} and imm 238
// selects {a2, a3, b2, b3}.
__m512 tmp925 = _mm512_shuffle_ps(tmp909, tmp911, 68);
__m512 tmp926 = _mm512_shuffle_ps(tmp909, tmp911, 238);
__m512 tmp927 = _mm512_shuffle_ps(tmp910, tmp912, 68);
__m512 tmp928 = _mm512_shuffle_ps(tmp910, tmp912, 238);
__m512 tmp929 = _mm512_shuffle_ps(tmp913, tmp915, 68);
__m512 tmp930 = _mm512_shuffle_ps(tmp913, tmp915, 238);
__m512 tmp931 = _mm512_shuffle_ps(tmp914, tmp916, 68);
__m512 tmp932 = _mm512_shuffle_ps(tmp914, tmp916, 238);
__m512 tmp933 = _mm512_shuffle_ps(tmp917, tmp919, 68);
__m512 tmp934 = _mm512_shuffle_ps(tmp917, tmp919, 238);
__m512 tmp935 = _mm512_shuffle_ps(tmp918, tmp920, 68);
__m512 tmp936 = _mm512_shuffle_ps(tmp918, tmp920, 238);
__m512 tmp937 = _mm512_shuffle_ps(tmp921, tmp923, 68);
__m512 tmp938 = _mm512_shuffle_ps(tmp921, tmp923, 238);
__m512 tmp939 = _mm512_shuffle_ps(tmp922, tmp924, 68);
__m512 tmp940 = _mm512_shuffle_ps(tmp922, tmp924, 238);
// Stage 3: on whole 128-bit lanes, imm 136 selects {a.0, a.2, b.0, b.2} and
// imm 221 selects {a.1, a.3, b.1, b.3}.
__m512 tmp941 = _mm512_shuffle_f32x4(tmp925, tmp929, 136);
__m512 tmp942 = _mm512_shuffle_f32x4(tmp925, tmp929, 221);
__m512 tmp943 = _mm512_shuffle_f32x4(tmp926, tmp930, 136);
__m512 tmp944 = _mm512_shuffle_f32x4(tmp926, tmp930, 221);
__m512 tmp945 = _mm512_shuffle_f32x4(tmp927, tmp931, 136);
__m512 tmp946 = _mm512_shuffle_f32x4(tmp927, tmp931, 221);
__m512 tmp947 = _mm512_shuffle_f32x4(tmp928, tmp932, 136);
__m512 tmp948 = _mm512_shuffle_f32x4(tmp928, tmp932, 221);
__m512 tmp949 = _mm512_shuffle_f32x4(tmp933, tmp937, 136);
__m512 tmp950 = _mm512_shuffle_f32x4(tmp933, tmp937, 221);
__m512 tmp951 = _mm512_shuffle_f32x4(tmp934, tmp938, 136);
__m512 tmp952 = _mm512_shuffle_f32x4(tmp934, tmp938, 221);
__m512 tmp953 = _mm512_shuffle_f32x4(tmp935, tmp939, 136);
__m512 tmp954 = _mm512_shuffle_f32x4(tmp935, tmp939, 221);
__m512 tmp955 = _mm512_shuffle_f32x4(tmp936, tmp940, 136);
__m512 tmp956 = _mm512_shuffle_f32x4(tmp936, tmp940, 221);
// Stage 4: same 136 / 221 lane selection, now pairing a first-set intermediate
// with its second-set counterpart. Results are written back into the sixteen
// variable names that held the first-pass results.
in164 = _mm512_shuffle_f32x4(tmp941, tmp949, 136);
in172 = _mm512_shuffle_f32x4(tmp941, tmp949, 221);
tmp895 = _mm512_shuffle_f32x4(tmp943, tmp951, 136);
tmp899 = _mm512_shuffle_f32x4(tmp943, tmp951, 221);
tmp896 = _mm512_shuffle_f32x4(tmp945, tmp953, 136);
tmp900 = _mm512_shuffle_f32x4(tmp945, tmp953, 221);
in170 = _mm512_shuffle_f32x4(tmp947, tmp955, 136);
in178 = _mm512_shuffle_f32x4(tmp947, tmp955, 221);
tmp894 = _mm512_shuffle_f32x4(tmp942, tmp950, 136);
tmp898 = _mm512_shuffle_f32x4(tmp942, tmp950, 221);
in166 = _mm512_shuffle_f32x4(tmp944, tmp952, 136);
in174 = _mm512_shuffle_f32x4(tmp944, tmp952, 221);
in168 = _mm512_shuffle_f32x4(tmp946, tmp954, 136);
in176 = _mm512_shuffle_f32x4(tmp946, tmp954, 221);
in167 = _mm512_shuffle_f32x4(tmp948, tmp956, 136);
in175 = _mm512_shuffle_f32x4(tmp948, tmp956, 221);
// Second arithmetic pass: the same eight linear combinations (coefficients
// 5.25, 4.25, 0.25, 1.25, 4, 5, 2) as the first pass, now applied to the
// rearranged registers. Input order for the first set is
// in164, tmp895, tmp896, in170, tmp894, in166, in168, in167; every statement is
// repeated for the second set (in172, tmp899, tmp900, in178, tmp898, in174,
// in176, in175).
// Results are left in: in164, tmp903, tmp904, in168, tmp902, tmp896, tmp894, in170
// (second set:         in172, tmp907, tmp908, in176, tmp906, tmp900, tmp898, in178).
// Inputs are overwritten in place, so statement order matters.
__m512 tmp901 = _mm512_add_ps(tmp895, in166);
__m512 tmp905 = _mm512_add_ps(tmp899, in174);
__m512 tmp902 = _mm512_sub_ps(tmp894, tmp896);
__m512 tmp906 = _mm512_sub_ps(tmp898, tmp900);
__m512 tmp903 = _mm512_add_ps(tmp896, in168);
__m512 tmp907 = _mm512_add_ps(tmp900, in176);
in164 = _mm512_sub_ps(in164, in168);
in172 = _mm512_sub_ps(in172, in176);
tmp901 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-4.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-4.25e+00f), tmp905);
tmp903 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-4.25e+00f), tmp903);
tmp907 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-4.25e+00f), tmp907);
in164 = _mm512_fmadd_ps(tmp902, _mm512_set1_ps(5.25e+00f), in164);
in172 = _mm512_fmadd_ps(tmp906, _mm512_set1_ps(5.25e+00f), in172);
tmp902 = _mm512_fmadd_ps(tmp896, _mm512_set1_ps(2.5e-01f), in168);
tmp906 = _mm512_fmadd_ps(tmp900, _mm512_set1_ps(2.5e-01f), in176);
tmp896 = _mm512_fmadd_ps(tmp896, _mm512_set1_ps(4e+00f), in168);
tmp900 = _mm512_fmadd_ps(tmp900, _mm512_set1_ps(4e+00f), in176);
// Difference and sum of the two -4.25 partial results give two outputs.
__m512 tmp904 = _mm512_sub_ps(tmp903, tmp901);
__m512 tmp908 = _mm512_sub_ps(tmp907, tmp905);
tmp903 = _mm512_add_ps(tmp901, tmp903);
tmp907 = _mm512_add_ps(tmp905, tmp907);
tmp901 = _mm512_fmadd_ps(tmp895, _mm512_set1_ps(2.5e-01f), in166);
tmp905 = _mm512_fmadd_ps(tmp899, _mm512_set1_ps(2.5e-01f), in174);
tmp902 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-1.25e+00f), tmp902);
tmp906 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-1.25e+00f), tmp906);
tmp894 = _mm512_fmadd_ps(tmp894, _mm512_set1_ps(-5e+00f), tmp896);
tmp898 = _mm512_fmadd_ps(tmp898, _mm512_set1_ps(-5e+00f), tmp900);
tmp901 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-1.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-1.25e+00f), tmp905);
// fmadd / fnmadd pair: tmp902 + 2*tmp901 and tmp902 - 2*tmp901.
in168 = _mm512_fmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp902);
in176 = _mm512_fmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp906);
tmp902 = _mm512_fnmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp902);
tmp906 = _mm512_fnmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp906);
tmp901 = _mm512_fmadd_ps(in166, _mm512_set1_ps(2.5e-01f), tmp895);
tmp905 = _mm512_fmadd_ps(in174, _mm512_set1_ps(2.5e-01f), tmp899);
tmp895 = _mm512_sub_ps(in167, tmp895);
tmp899 = _mm512_sub_ps(in175, tmp899);
tmp901 = _mm512_fmadd_ps(in170, _mm512_set1_ps(-1.25e+00f), tmp901);
tmp905 = _mm512_fmadd_ps(in178, _mm512_set1_ps(-1.25e+00f), tmp905);
in170 = _mm512_sub_ps(in170, in166);
in178 = _mm512_sub_ps(in178, in174);
in170 = _mm512_fmadd_ps(in170, _mm512_set1_ps(5.25e+00f), tmp895);
in178 = _mm512_fmadd_ps(in178, _mm512_set1_ps(5.25e+00f), tmp899);
// fmadd / fnmadd pair: tmp894 + 2*tmp901 and tmp894 - 2*tmp901.
tmp896 = _mm512_fmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp894);
tmp900 = _mm512_fmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp898);
tmp894 = _mm512_fnmadd_ps(tmp901, _mm512_set1_ps(2e+00f), tmp894);
tmp898 = _mm512_fnmadd_ps(tmp905, _mm512_set1_ps(2e+00f), tmp898);
// Pack the sixteen second-pass results pairwise into sixteen output vectors.
// shuffle_f32x4 with imm 68 concatenates the lower 256 bits of both operands
// (128-bit lanes a.0, a.1, b.0, b.1); imm 238 concatenates the upper 256 bits
// (a.2, a.3, b.2, b.3).
__m512 out183 = _mm512_shuffle_f32x4(in164, tmp903, 68);
__m512 out191 = _mm512_shuffle_f32x4(in164, tmp903, 238);
__m512 out184 = _mm512_shuffle_f32x4(tmp904, in168, 68);
__m512 out192 = _mm512_shuffle_f32x4(tmp904, in168, 238);
__m512 out185 = _mm512_shuffle_f32x4(tmp902, tmp896, 68);
__m512 out193 = _mm512_shuffle_f32x4(tmp902, tmp896, 238);
__m512 out186 = _mm512_shuffle_f32x4(tmp894, in170, 68);
__m512 out194 = _mm512_shuffle_f32x4(tmp894, in170, 238);
__m512 out187 = _mm512_shuffle_f32x4(in172, tmp907, 68);
__m512 out195 = _mm512_shuffle_f32x4(in172, tmp907, 238);
__m512 out188 = _mm512_shuffle_f32x4(tmp908, in176, 68);
__m512 out196 = _mm512_shuffle_f32x4(tmp908, in176, 238);
__m512 out189 = _mm512_shuffle_f32x4(tmp906, tmp900, 68);
__m512 out197 = _mm512_shuffle_f32x4(tmp906, tmp900, 238);
__m512 out190 = _mm512_shuffle_f32x4(tmp898, in178, 68);
__m512 out198 = _mm512_shuffle_f32x4(tmp898, in178, 238);
// Unaligned stores to dfPtr4 in four groups whose base offsets are 256, 25856,
// 51456 and 77056 (i.e. 256 past the 0 / 25600 / 51200 / 76800 group bases).
// Inside a group the four vectors land at +0, +128, +64, +192 (store order is
// not address order).
// NOTE(review): a step of 64 equals one 512-bit vector if dfPtr4 is a byte
// pointer -- its declaration is outside this excerpt; confirm.
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k54, out183);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k54, out191);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k54, out187);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k54, out195);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k54, out184);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k54, out192);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k54, out188);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k54, out196);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k54, out185);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k54, out193);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k54, out189);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k54, out197);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k54, out186);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k54, out194);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k54, out190);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k54, out198);
// Load eight register pairs for the next tile (in180..in187 and in188..in195).
// Each pair comes from two zero-filling masked loads: mask 8191 keeps lanes
// 0-12 and mask 16383 keeps lanes 0-13; all other lanes read as 0.0f. Every
// loaded vector is clamped with max(0, x) (negative inputs become zero) before
// being permuted. Successive loads advance the datPtr5 offset by 224 (the same
// multiplier used for h23) -- presumably one input row per step; TODO confirm.
__m512 dat1083 = _mm512_maskz_loadu_ps(8191, datPtr5+13764+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1083 = _mm512_max_ps(_mm512_setzero_ps(), dat1083);
__m512 dat1084 = _mm512_maskz_loadu_ps(16383, datPtr5+13808+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1084 = _mm512_max_ps(_mm512_setzero_ps(), dat1084);
// pm83 (arguments are listed from lane 15 down to lane 0): result lane 0 reads
// source lane 15, which mask 8191 left at zero; lanes 1-7 read source lanes
// 0-6; lanes 8-15 read source lanes 5-12. The lower window therefore starts
// with one zero element.
__m512i pm83 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in180 = _mm512_permutexvar_ps(pm83, dat1083);
// pm84: result lanes 0-7 read source lanes 0-7 and lanes 8-15 read source
// lanes 6-13 (two 8-wide windows overlapping by two elements).
__m512i pm84 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in188 = _mm512_permutexvar_ps(pm84, dat1084);
__m512 dat1085 = _mm512_maskz_loadu_ps(8191, datPtr5+13988+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1085 = _mm512_max_ps(_mm512_setzero_ps(), dat1085);
__m512 dat1086 = _mm512_maskz_loadu_ps(16383, datPtr5+14032+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1086 = _mm512_max_ps(_mm512_setzero_ps(), dat1086);
__m512 in181 = _mm512_permutexvar_ps(pm83, dat1085);
__m512 in189 = _mm512_permutexvar_ps(pm84, dat1086);
__m512 dat1087 = _mm512_maskz_loadu_ps(8191, datPtr5+14212+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1087 = _mm512_max_ps(_mm512_setzero_ps(), dat1087);
__m512 dat1088 = _mm512_maskz_loadu_ps(16383, datPtr5+14256+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1088 = _mm512_max_ps(_mm512_setzero_ps(), dat1088);
__m512 in182 = _mm512_permutexvar_ps(pm83, dat1087);
__m512 in190 = _mm512_permutexvar_ps(pm84, dat1088);
__m512 dat1089 = _mm512_maskz_loadu_ps(8191, datPtr5+14436+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1089 = _mm512_max_ps(_mm512_setzero_ps(), dat1089);
__m512 dat1090 = _mm512_maskz_loadu_ps(16383, datPtr5+14480+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1090 = _mm512_max_ps(_mm512_setzero_ps(), dat1090);
__m512 in183 = _mm512_permutexvar_ps(pm83, dat1089);
__m512 in191 = _mm512_permutexvar_ps(pm84, dat1090);
__m512 dat1091 = _mm512_maskz_loadu_ps(8191, datPtr5+14660+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1091 = _mm512_max_ps(_mm512_setzero_ps(), dat1091);
__m512 dat1092 = _mm512_maskz_loadu_ps(16383, datPtr5+14704+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1092 = _mm512_max_ps(_mm512_setzero_ps(), dat1092);
__m512 in184 = _mm512_permutexvar_ps(pm83, dat1091);
__m512 in192 = _mm512_permutexvar_ps(pm84, dat1092);
__m512 dat1093 = _mm512_maskz_loadu_ps(8191, datPtr5+14884+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1093 = _mm512_max_ps(_mm512_setzero_ps(), dat1093);
__m512 dat1094 = _mm512_maskz_loadu_ps(16383, datPtr5+14928+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1094 = _mm512_max_ps(_mm512_setzero_ps(), dat1094);
__m512 in185 = _mm512_permutexvar_ps(pm83, dat1093);
__m512 in193 = _mm512_permutexvar_ps(pm84, dat1094);
__m512 dat1095 = _mm512_maskz_loadu_ps(8191, datPtr5+15108+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1095 = _mm512_max_ps(_mm512_setzero_ps(), dat1095);
__m512 dat1096 = _mm512_maskz_loadu_ps(16383, datPtr5+15152+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1096 = _mm512_max_ps(_mm512_setzero_ps(), dat1096);
__m512 in186 = _mm512_permutexvar_ps(pm83, dat1095);
__m512 in194 = _mm512_permutexvar_ps(pm84, dat1096);
__m512 dat1097 = _mm512_maskz_loadu_ps(8191, datPtr5+15332+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1097 = _mm512_max_ps(_mm512_setzero_ps(), dat1097);
__m512 dat1098 = _mm512_maskz_loadu_ps(16383, datPtr5+15376+50432*i17+224*h23+4*w26+50432*s12+25216*k54);
dat1098 = _mm512_max_ps(_mm512_setzero_ps(), dat1098);
__m512 in187 = _mm512_permutexvar_ps(pm83, dat1097);
__m512 in195 = _mm512_permutexvar_ps(pm84, dat1098);
// First arithmetic pass. The eight registers in180..in187 are combined
// lane-wise into eight linear combinations; every statement is immediately
// repeated for the second register set in188..in195 (odd/even statement pairs).
// Results are left in: in180, tmp959, tmp960, in186, tmp958, in182, in184, in183
// (second set:         in188, tmp963, tmp964, in194, tmp962, in190, in192, in191).
// Several inputs are overwritten in place, so statement order matters.
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 1.25, 4, 5, 2) look like the
// Winograd F(6,3) input-transform matrix -- confirm against the generator.
__m512 tmp957 = _mm512_add_ps(in181, in185);
__m512 tmp961 = _mm512_add_ps(in189, in193);
__m512 tmp958 = _mm512_sub_ps(in184, in182);
__m512 tmp962 = _mm512_sub_ps(in192, in190);
__m512 tmp959 = _mm512_add_ps(in182, in186);
__m512 tmp963 = _mm512_add_ps(in190, in194);
in180 = _mm512_sub_ps(in180, in186);
in188 = _mm512_sub_ps(in188, in194);
tmp957 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-4.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-4.25e+00f), tmp961);
tmp959 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-4.25e+00f), tmp959);
tmp963 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-4.25e+00f), tmp963);
// in180 is now final: (in180 - in186) + 5.25 * (in184 - in182).
in180 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(5.25e+00f), in180);
in188 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(5.25e+00f), in188);
tmp958 = _mm512_fmadd_ps(in182, _mm512_set1_ps(2.5e-01f), in186);
tmp962 = _mm512_fmadd_ps(in190, _mm512_set1_ps(2.5e-01f), in194);
in182 = _mm512_fmadd_ps(in182, _mm512_set1_ps(4e+00f), in186);
in190 = _mm512_fmadd_ps(in190, _mm512_set1_ps(4e+00f), in194);
// Difference and sum of the two -4.25 partial results give two outputs.
__m512 tmp960 = _mm512_sub_ps(tmp959, tmp957);
__m512 tmp964 = _mm512_sub_ps(tmp963, tmp961);
tmp959 = _mm512_add_ps(tmp957, tmp959);
tmp963 = _mm512_add_ps(tmp961, tmp963);
tmp957 = _mm512_fmadd_ps(in181, _mm512_set1_ps(2.5e-01f), in185);
tmp961 = _mm512_fmadd_ps(in189, _mm512_set1_ps(2.5e-01f), in193);
tmp958 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-1.25e+00f), tmp958);
tmp962 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-1.25e+00f), tmp962);
in184 = _mm512_fmadd_ps(in184, _mm512_set1_ps(-5e+00f), in182);
in192 = _mm512_fmadd_ps(in192, _mm512_set1_ps(-5e+00f), in190);
tmp957 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-1.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-1.25e+00f), tmp961);
// fmadd / fnmadd pair: tmp958 + 2*tmp957 and tmp958 - 2*tmp957.
in186 = _mm512_fmadd_ps(tmp957, _mm512_set1_ps(2e+00f), tmp958);
in194 = _mm512_fmadd_ps(tmp961, _mm512_set1_ps(2e+00f), tmp962);
tmp958 = _mm512_fnmadd_ps(tmp957, _mm512_set1_ps(2e+00f), tmp958);
tmp962 = _mm512_fnmadd_ps(tmp961, _mm512_set1_ps(2e+00f), tmp962);
tmp957 = _mm512_fmadd_ps(in185, _mm512_set1_ps(2.5e-01f), in181);
tmp961 = _mm512_fmadd_ps(in193, _mm512_set1_ps(2.5e-01f), in189);
in181 = _mm512_sub_ps(in187, in181);
in189 = _mm512_sub_ps(in195, in189);
tmp957 = _mm512_fmadd_ps(in183, _mm512_set1_ps(-1.25e+00f), tmp957);
tmp961 = _mm512_fmadd_ps(in191, _mm512_set1_ps(-1.25e+00f), tmp961);
in183 = _mm512_sub_ps(in183, in185);
in191 = _mm512_sub_ps(in191, in193);
in183 = _mm512_fmadd_ps(in183, _mm512_set1_ps(5.25e+00f), in181);
in191 = _mm512_fmadd_ps(in191, _mm512_set1_ps(5.25e+00f), in189);
// fmadd / fnmadd pair: in184 + 2*tmp957 and in184 - 2*tmp957.
in182 = _mm512_fmadd_ps(tmp957, _mm512_set1_ps(2e+00f), in184);
in190 = _mm512_fmadd_ps(tmp961, _mm512_set1_ps(2e+00f), in192);
in184 = _mm512_fnmadd_ps(tmp957, _mm512_set1_ps(2e+00f), in184);
in192 = _mm512_fnmadd_ps(tmp961, _mm512_set1_ps(2e+00f), in192);
// Transpose stage. Treating each group's eight vectors as rows r0..r7 of
// 16 lanes, the three shuffle layers below regroup the data so that each
// result vector holds one column index (0..7) taken from all eight rows.
//
// Layer 1: interleave adjacent row pairs inside every 128-bit block.
__m512 tmp973 = _mm512_unpacklo_ps(in180, tmp959);
__m512 tmp974 = _mm512_unpackhi_ps(in180, tmp959);
__m512 tmp975 = _mm512_unpacklo_ps(tmp960, in186);
__m512 tmp976 = _mm512_unpackhi_ps(tmp960, in186);
__m512 tmp977 = _mm512_unpacklo_ps(tmp958, in182);
__m512 tmp978 = _mm512_unpackhi_ps(tmp958, in182);
__m512 tmp979 = _mm512_unpacklo_ps(in184, in183);
__m512 tmp980 = _mm512_unpackhi_ps(in184, in183);
__m512 tmp981 = _mm512_unpacklo_ps(in188, tmp963);
__m512 tmp982 = _mm512_unpackhi_ps(in188, tmp963);
__m512 tmp983 = _mm512_unpacklo_ps(tmp964, in194);
__m512 tmp984 = _mm512_unpackhi_ps(tmp964, in194);
__m512 tmp985 = _mm512_unpacklo_ps(tmp962, in190);
__m512 tmp986 = _mm512_unpackhi_ps(tmp962, in190);
__m512 tmp987 = _mm512_unpacklo_ps(in192, in191);
__m512 tmp988 = _mm512_unpackhi_ps(in192, in191);
// Layer 2: imm 68 keeps elements {0,1} of each operand, imm 238 keeps {2,3};
// each 128-bit block now holds one column of four consecutive rows.
__m512 tmp989 = _mm512_shuffle_ps(tmp973, tmp975, 68);
__m512 tmp990 = _mm512_shuffle_ps(tmp973, tmp975, 238);
__m512 tmp991 = _mm512_shuffle_ps(tmp974, tmp976, 68);
__m512 tmp992 = _mm512_shuffle_ps(tmp974, tmp976, 238);
__m512 tmp993 = _mm512_shuffle_ps(tmp977, tmp979, 68);
__m512 tmp994 = _mm512_shuffle_ps(tmp977, tmp979, 238);
__m512 tmp995 = _mm512_shuffle_ps(tmp978, tmp980, 68);
__m512 tmp996 = _mm512_shuffle_ps(tmp978, tmp980, 238);
__m512 tmp997 = _mm512_shuffle_ps(tmp981, tmp983, 68);
__m512 tmp998 = _mm512_shuffle_ps(tmp981, tmp983, 238);
__m512 tmp999 = _mm512_shuffle_ps(tmp982, tmp984, 68);
__m512 tmp1000 = _mm512_shuffle_ps(tmp982, tmp984, 238);
__m512 tmp1001 = _mm512_shuffle_ps(tmp985, tmp987, 68);
__m512 tmp1002 = _mm512_shuffle_ps(tmp985, tmp987, 238);
__m512 tmp1003 = _mm512_shuffle_ps(tmp986, tmp988, 68);
__m512 tmp1004 = _mm512_shuffle_ps(tmp986, tmp988, 238);
// Layer 3: imm 136 picks 128-bit blocks {0,2} of each operand, imm 221 picks
// {1,3}; this joins rows 0-3 with rows 4-7 of the same group.
__m512 tmp1005 = _mm512_shuffle_f32x4(tmp989, tmp993, 136);
__m512 tmp1006 = _mm512_shuffle_f32x4(tmp989, tmp993, 221);
__m512 tmp1007 = _mm512_shuffle_f32x4(tmp990, tmp994, 136);
__m512 tmp1008 = _mm512_shuffle_f32x4(tmp990, tmp994, 221);
__m512 tmp1009 = _mm512_shuffle_f32x4(tmp991, tmp995, 136);
__m512 tmp1010 = _mm512_shuffle_f32x4(tmp991, tmp995, 221);
__m512 tmp1011 = _mm512_shuffle_f32x4(tmp992, tmp996, 136);
__m512 tmp1012 = _mm512_shuffle_f32x4(tmp992, tmp996, 221);
__m512 tmp1013 = _mm512_shuffle_f32x4(tmp997, tmp1001, 136);
__m512 tmp1014 = _mm512_shuffle_f32x4(tmp997, tmp1001, 221);
__m512 tmp1015 = _mm512_shuffle_f32x4(tmp998, tmp1002, 136);
__m512 tmp1016 = _mm512_shuffle_f32x4(tmp998, tmp1002, 221);
__m512 tmp1017 = _mm512_shuffle_f32x4(tmp999, tmp1003, 136);
__m512 tmp1018 = _mm512_shuffle_f32x4(tmp999, tmp1003, 221);
__m512 tmp1019 = _mm512_shuffle_f32x4(tmp1000, tmp1004, 136);
__m512 tmp1020 = _mm512_shuffle_f32x4(tmp1000, tmp1004, 221);
// Layer 4: merge group 1 (lanes 0-7 of the result) with group 2 (lanes 8-15).
// The imm-136 result takes columns from the low 8 lanes of the original rows,
// the imm-221 result takes them from the high 8 lanes. The names written here
// are reused as the column inputs of the second pass, in column order:
//   in180/in188, tmp959/tmp963, tmp960/tmp964, in186/in194,
//   tmp958/tmp962, in182/in190, in184/in192, in183/in191.
in180 = _mm512_shuffle_f32x4(tmp1005, tmp1013, 136);
in188 = _mm512_shuffle_f32x4(tmp1005, tmp1013, 221);
tmp959 = _mm512_shuffle_f32x4(tmp1007, tmp1015, 136);
tmp963 = _mm512_shuffle_f32x4(tmp1007, tmp1015, 221);
tmp960 = _mm512_shuffle_f32x4(tmp1009, tmp1017, 136);
tmp964 = _mm512_shuffle_f32x4(tmp1009, tmp1017, 221);
in186 = _mm512_shuffle_f32x4(tmp1011, tmp1019, 136);
in194 = _mm512_shuffle_f32x4(tmp1011, tmp1019, 221);
tmp958 = _mm512_shuffle_f32x4(tmp1006, tmp1014, 136);
tmp962 = _mm512_shuffle_f32x4(tmp1006, tmp1014, 221);
in182 = _mm512_shuffle_f32x4(tmp1008, tmp1016, 136);
in190 = _mm512_shuffle_f32x4(tmp1008, tmp1016, 221);
in184 = _mm512_shuffle_f32x4(tmp1010, tmp1018, 136);
in192 = _mm512_shuffle_f32x4(tmp1010, tmp1018, 221);
in183 = _mm512_shuffle_f32x4(tmp1012, tmp1020, 136);
in191 = _mm512_shuffle_f32x4(tmp1012, tmp1020, 221);
// Second 8-point transform pass, applied to the transposed data. With
// c0..c7 = in180, tmp959, tmp960, in186, tmp958, in182, in184, in183
// (and in lock-step the second set in188, tmp963, tmp964, in194, tmp962,
// in190, in192, in191), the results are:
//   u0 = c0 - 5.25*c2 + 5.25*c4 - c6                          -> in180 / in188
//   u1 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6               -> tmp967 / tmp971
//   u2 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6               -> tmp968 / tmp972
//   u3 =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6     -> in184 / in192
//   u4 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6     -> tmp966 / tmp970
//   u5 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6           -> tmp960 / tmp964
//   u6 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6           -> tmp958 / tmp962
//   u7 = -c1 + 5.25*c3 - 5.25*c5 + c7                         -> in186 / in194
// These coefficients match the 8x8 input-transform matrix of Winograd F(6,3).
__m512 tmp965 = _mm512_add_ps(tmp959, in182);
__m512 tmp969 = _mm512_add_ps(tmp963, in190);
__m512 tmp966 = _mm512_sub_ps(tmp958, tmp960);
__m512 tmp970 = _mm512_sub_ps(tmp962, tmp964);
__m512 tmp967 = _mm512_add_ps(tmp960, in184);
__m512 tmp971 = _mm512_add_ps(tmp964, in192);
in180 = _mm512_sub_ps(in180, in184);
in188 = _mm512_sub_ps(in188, in192);
tmp965 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-4.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-4.25e+00f), tmp969);
tmp967 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-4.25e+00f), tmp967);
tmp971 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-4.25e+00f), tmp971);
in180 = _mm512_fmadd_ps(tmp966, _mm512_set1_ps(5.25e+00f), in180);
in188 = _mm512_fmadd_ps(tmp970, _mm512_set1_ps(5.25e+00f), in188);
tmp966 = _mm512_fmadd_ps(tmp960, _mm512_set1_ps(2.5e-01f), in184);
tmp970 = _mm512_fmadd_ps(tmp964, _mm512_set1_ps(2.5e-01f), in192);
tmp960 = _mm512_fmadd_ps(tmp960, _mm512_set1_ps(4e+00f), in184);
tmp964 = _mm512_fmadd_ps(tmp964, _mm512_set1_ps(4e+00f), in192);
// u2 = even part - odd part, u1 = even part + odd part.
__m512 tmp968 = _mm512_sub_ps(tmp967, tmp965);
__m512 tmp972 = _mm512_sub_ps(tmp971, tmp969);
tmp967 = _mm512_add_ps(tmp965, tmp967);
tmp971 = _mm512_add_ps(tmp969, tmp971);
tmp965 = _mm512_fmadd_ps(tmp959, _mm512_set1_ps(2.5e-01f), in182);
tmp969 = _mm512_fmadd_ps(tmp963, _mm512_set1_ps(2.5e-01f), in190);
tmp966 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-1.25e+00f), tmp966);
tmp970 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-1.25e+00f), tmp970);
tmp958 = _mm512_fmadd_ps(tmp958, _mm512_set1_ps(-5e+00f), tmp960);
tmp962 = _mm512_fmadd_ps(tmp962, _mm512_set1_ps(-5e+00f), tmp964);
tmp965 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-1.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-1.25e+00f), tmp969);
// u3 / u4.
in184 = _mm512_fmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp966);
in192 = _mm512_fmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp970);
tmp966 = _mm512_fnmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp966);
tmp970 = _mm512_fnmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp970);
tmp965 = _mm512_fmadd_ps(in182, _mm512_set1_ps(2.5e-01f), tmp959);
tmp969 = _mm512_fmadd_ps(in190, _mm512_set1_ps(2.5e-01f), tmp963);
tmp959 = _mm512_sub_ps(in183, tmp959);
tmp963 = _mm512_sub_ps(in191, tmp963);
tmp965 = _mm512_fmadd_ps(in186, _mm512_set1_ps(-1.25e+00f), tmp965);
tmp969 = _mm512_fmadd_ps(in194, _mm512_set1_ps(-1.25e+00f), tmp969);
// u7.
in186 = _mm512_sub_ps(in186, in182);
in194 = _mm512_sub_ps(in194, in190);
in186 = _mm512_fmadd_ps(in186, _mm512_set1_ps(5.25e+00f), tmp959);
in194 = _mm512_fmadd_ps(in194, _mm512_set1_ps(5.25e+00f), tmp963);
// u5 / u6.
tmp960 = _mm512_fmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp958);
tmp964 = _mm512_fmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp962);
tmp958 = _mm512_fnmadd_ps(tmp965, _mm512_set1_ps(2e+00f), tmp958);
tmp962 = _mm512_fnmadd_ps(tmp969, _mm512_set1_ps(2e+00f), tmp962);
// Pack the second-pass results for storing. imm 68 concatenates the low
// 256 bits of both operands, imm 238 the high 256 bits, so every out* vector
// carries two consecutive result rows (0/1, 2/3, 4/5 or 6/7) for one 8-lane
// half of the source vectors.
__m512 out199 = _mm512_shuffle_f32x4(in180, tmp967, 68);
__m512 out207 = _mm512_shuffle_f32x4(in180, tmp967, 238);
__m512 out200 = _mm512_shuffle_f32x4(tmp968, in184, 68);
__m512 out208 = _mm512_shuffle_f32x4(tmp968, in184, 238);
__m512 out201 = _mm512_shuffle_f32x4(tmp966, tmp960, 68);
__m512 out209 = _mm512_shuffle_f32x4(tmp966, tmp960, 238);
__m512 out202 = _mm512_shuffle_f32x4(tmp958, in186, 68);
__m512 out210 = _mm512_shuffle_f32x4(tmp958, in186, 238);
__m512 out203 = _mm512_shuffle_f32x4(in188, tmp971, 68);
__m512 out211 = _mm512_shuffle_f32x4(in188, tmp971, 238);
__m512 out204 = _mm512_shuffle_f32x4(tmp972, in192, 68);
__m512 out212 = _mm512_shuffle_f32x4(tmp972, in192, 238);
__m512 out205 = _mm512_shuffle_f32x4(tmp970, tmp964, 68);
__m512 out213 = _mm512_shuffle_f32x4(tmp970, tmp964, 238);
__m512 out206 = _mm512_shuffle_f32x4(tmp962, in194, 68);
__m512 out214 = _mm512_shuffle_f32x4(tmp962, in194, 238);
// Sixteen unaligned stores into dfPtr4. The constant offsets form four
// groups (512.., 26112.., 51712.., 77312.., i.e. 25600 apart), one group per
// row pair, each with four slots spaced 64 apart.
// NOTE(review): the 64-spacing equals one __m512 only if dfPtr4 is a byte
// pointer; its declaration is outside this excerpt -- confirm.
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k54, out199);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k54, out207);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k54, out203);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k54, out211);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k54, out200);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k54, out208);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k54, out204);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k54, out212);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k54, out201);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k54, out209);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k54, out205);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k54, out213);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k54, out202);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k54, out210);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k54, out206);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k54, out214);
// Closes a scope opened above this excerpt (presumably the loop over k54).
}
// Advance the j11 index used in the store addresses and set rel8 to 2, which
// makes the `rel8 < 3` test that follows succeed.
++j11;
rel8 = 2;
// Closes an enclosing scope opened above this excerpt.
}
// Case rel8 < 3: process the position h24 = base8+6, w27 = 24. The body of
// this branch continues past the end of this excerpt.
if (rel8 < 3) {
ptrdiff_t h24 = base8+6;
ptrdiff_t w27 = 24;
ptrdiff_t k55 = 0;
// Two iterations; k55 advances the source address by 25216 and the
// destination address by 768 per step.
for (; k55 != 2; ++k55) {
// Load eight source rows (constant offsets 0, 224, ..., 1568) as two vectors
// each (second vector 48 further on). Mask 16383 = 0x3FFF reads lanes 0..13
// and zeroes lanes 14..15, so no memory past the 14th float is touched.
// Each loaded vector is clamped to >= 0 with max(0, x) (ReLU-style).
// NOTE(review): offsets look like byte offsets (224 = 56 floats per row,
// 48 = 12 floats); datPtr5's type is declared outside this excerpt -- confirm.
__m512 dat1099 = _mm512_maskz_loadu_ps(16383, datPtr5+0+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1099 = _mm512_max_ps(_mm512_setzero_ps(), dat1099);
__m512 dat1100 = _mm512_maskz_loadu_ps(16383, datPtr5+48+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1100 = _mm512_max_ps(_mm512_setzero_ps(), dat1100);
// pm85 maps result lanes 0..7 <- source lanes 0..7 and result lanes 8..15 <-
// source lanes 6..13: two 8-wide windows that overlap by two elements.
__m512i pm85 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in196 = _mm512_permutexvar_ps(pm85, dat1099);
__m512 in204 = _mm512_permutexvar_ps(pm85, dat1100);
__m512 dat1101 = _mm512_maskz_loadu_ps(16383, datPtr5+224+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1101 = _mm512_max_ps(_mm512_setzero_ps(), dat1101);
__m512 dat1102 = _mm512_maskz_loadu_ps(16383, datPtr5+272+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1102 = _mm512_max_ps(_mm512_setzero_ps(), dat1102);
__m512 in197 = _mm512_permutexvar_ps(pm85, dat1101);
__m512 in205 = _mm512_permutexvar_ps(pm85, dat1102);
__m512 dat1103 = _mm512_maskz_loadu_ps(16383, datPtr5+448+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1103 = _mm512_max_ps(_mm512_setzero_ps(), dat1103);
__m512 dat1104 = _mm512_maskz_loadu_ps(16383, datPtr5+496+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1104 = _mm512_max_ps(_mm512_setzero_ps(), dat1104);
__m512 in198 = _mm512_permutexvar_ps(pm85, dat1103);
__m512 in206 = _mm512_permutexvar_ps(pm85, dat1104);
__m512 dat1105 = _mm512_maskz_loadu_ps(16383, datPtr5+672+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1105 = _mm512_max_ps(_mm512_setzero_ps(), dat1105);
__m512 dat1106 = _mm512_maskz_loadu_ps(16383, datPtr5+720+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1106 = _mm512_max_ps(_mm512_setzero_ps(), dat1106);
__m512 in199 = _mm512_permutexvar_ps(pm85, dat1105);
__m512 in207 = _mm512_permutexvar_ps(pm85, dat1106);
__m512 dat1107 = _mm512_maskz_loadu_ps(16383, datPtr5+896+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1107 = _mm512_max_ps(_mm512_setzero_ps(), dat1107);
__m512 dat1108 = _mm512_maskz_loadu_ps(16383, datPtr5+944+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1108 = _mm512_max_ps(_mm512_setzero_ps(), dat1108);
__m512 in200 = _mm512_permutexvar_ps(pm85, dat1107);
__m512 in208 = _mm512_permutexvar_ps(pm85, dat1108);
__m512 dat1109 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1109 = _mm512_max_ps(_mm512_setzero_ps(), dat1109);
__m512 dat1110 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1110 = _mm512_max_ps(_mm512_setzero_ps(), dat1110);
__m512 in201 = _mm512_permutexvar_ps(pm85, dat1109);
__m512 in209 = _mm512_permutexvar_ps(pm85, dat1110);
__m512 dat1111 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1111 = _mm512_max_ps(_mm512_setzero_ps(), dat1111);
__m512 dat1112 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1112 = _mm512_max_ps(_mm512_setzero_ps(), dat1112);
__m512 in202 = _mm512_permutexvar_ps(pm85, dat1111);
__m512 in210 = _mm512_permutexvar_ps(pm85, dat1112);
__m512 dat1113 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1113 = _mm512_max_ps(_mm512_setzero_ps(), dat1113);
__m512 dat1114 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1114 = _mm512_max_ps(_mm512_setzero_ps(), dat1114);
__m512 in203 = _mm512_permutexvar_ps(pm85, dat1113);
__m512 in211 = _mm512_permutexvar_ps(pm85, dat1114);
// First 8-point transform pass across the eight row vectors. With
// d0..d7 = in196..in203 (and in lock-step in204..in211) the results are:
//   t0 = d0 - 5.25*d2 + 5.25*d4 - d6                          -> in196 / in204
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6               -> tmp1023 / tmp1027
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6               -> tmp1024 / tmp1028
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6     -> in202 / in210
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6     -> tmp1022 / tmp1026
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6           -> in198 / in206
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6           -> in200 / in208
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7                         -> in199 / in207
// These coefficients match the 8x8 input-transform matrix of Winograd F(6,3).
// Input registers are overwritten as soon as their value is dead, so the
// statement order below must not be changed.
__m512 tmp1021 = _mm512_add_ps(in197, in201);
__m512 tmp1025 = _mm512_add_ps(in205, in209);
__m512 tmp1022 = _mm512_sub_ps(in200, in198);
__m512 tmp1026 = _mm512_sub_ps(in208, in206);
__m512 tmp1023 = _mm512_add_ps(in198, in202);
__m512 tmp1027 = _mm512_add_ps(in206, in210);
in196 = _mm512_sub_ps(in196, in202);
in204 = _mm512_sub_ps(in204, in210);
tmp1021 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-4.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-4.25e+00f), tmp1025);
tmp1023 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-4.25e+00f), tmp1023);
tmp1027 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-4.25e+00f), tmp1027);
// t0 complete.
in196 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(5.25e+00f), in196);
in204 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(5.25e+00f), in204);
tmp1022 = _mm512_fmadd_ps(in198, _mm512_set1_ps(2.5e-01f), in202);
tmp1026 = _mm512_fmadd_ps(in206, _mm512_set1_ps(2.5e-01f), in210);
in198 = _mm512_fmadd_ps(in198, _mm512_set1_ps(4e+00f), in202);
in206 = _mm512_fmadd_ps(in206, _mm512_set1_ps(4e+00f), in210);
// t2 = even part - odd part, t1 = even part + odd part.
__m512 tmp1024 = _mm512_sub_ps(tmp1023, tmp1021);
__m512 tmp1028 = _mm512_sub_ps(tmp1027, tmp1025);
tmp1023 = _mm512_add_ps(tmp1021, tmp1023);
tmp1027 = _mm512_add_ps(tmp1025, tmp1027);
tmp1021 = _mm512_fmadd_ps(in197, _mm512_set1_ps(2.5e-01f), in201);
tmp1025 = _mm512_fmadd_ps(in205, _mm512_set1_ps(2.5e-01f), in209);
tmp1022 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-1.25e+00f), tmp1022);
tmp1026 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-1.25e+00f), tmp1026);
in200 = _mm512_fmadd_ps(in200, _mm512_set1_ps(-5e+00f), in198);
in208 = _mm512_fmadd_ps(in208, _mm512_set1_ps(-5e+00f), in206);
tmp1021 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-1.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-1.25e+00f), tmp1025);
// t3 / t4.
in202 = _mm512_fmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), tmp1022);
in210 = _mm512_fmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), tmp1026);
tmp1022 = _mm512_fnmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), tmp1022);
tmp1026 = _mm512_fnmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), tmp1026);
tmp1021 = _mm512_fmadd_ps(in201, _mm512_set1_ps(2.5e-01f), in197);
tmp1025 = _mm512_fmadd_ps(in209, _mm512_set1_ps(2.5e-01f), in205);
in197 = _mm512_sub_ps(in203, in197);
in205 = _mm512_sub_ps(in211, in205);
tmp1021 = _mm512_fmadd_ps(in199, _mm512_set1_ps(-1.25e+00f), tmp1021);
tmp1025 = _mm512_fmadd_ps(in207, _mm512_set1_ps(-1.25e+00f), tmp1025);
// t7.
in199 = _mm512_sub_ps(in199, in201);
in207 = _mm512_sub_ps(in207, in209);
in199 = _mm512_fmadd_ps(in199, _mm512_set1_ps(5.25e+00f), in197);
in207 = _mm512_fmadd_ps(in207, _mm512_set1_ps(5.25e+00f), in205);
// t5 / t6.
in198 = _mm512_fmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), in200);
in206 = _mm512_fmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), in208);
in200 = _mm512_fnmadd_ps(tmp1021, _mm512_set1_ps(2e+00f), in200);
in208 = _mm512_fnmadd_ps(tmp1025, _mm512_set1_ps(2e+00f), in208);
// Transpose stage. The first-pass results t0..t7 of each group are
// (in196, tmp1023, tmp1024, in202, tmp1022, in198, in200, in199) and
// (in204, tmp1027, tmp1028, in210, tmp1026, in206, in208, in207). Each vector
// holds two 8-lane windows; the shuffles below transpose every 8x8 window so
// the same 8-point transform can then run along the other axis.
//
// Layer 1: interleave adjacent row pairs inside every 128-bit block.
__m512 tmp1037 = _mm512_unpacklo_ps(in196, tmp1023);
__m512 tmp1038 = _mm512_unpackhi_ps(in196, tmp1023);
__m512 tmp1039 = _mm512_unpacklo_ps(tmp1024, in202);
__m512 tmp1040 = _mm512_unpackhi_ps(tmp1024, in202);
__m512 tmp1041 = _mm512_unpacklo_ps(tmp1022, in198);
__m512 tmp1042 = _mm512_unpackhi_ps(tmp1022, in198);
__m512 tmp1043 = _mm512_unpacklo_ps(in200, in199);
__m512 tmp1044 = _mm512_unpackhi_ps(in200, in199);
__m512 tmp1045 = _mm512_unpacklo_ps(in204, tmp1027);
__m512 tmp1046 = _mm512_unpackhi_ps(in204, tmp1027);
__m512 tmp1047 = _mm512_unpacklo_ps(tmp1028, in210);
__m512 tmp1048 = _mm512_unpackhi_ps(tmp1028, in210);
__m512 tmp1049 = _mm512_unpacklo_ps(tmp1026, in206);
__m512 tmp1050 = _mm512_unpackhi_ps(tmp1026, in206);
__m512 tmp1051 = _mm512_unpacklo_ps(in208, in207);
__m512 tmp1052 = _mm512_unpackhi_ps(in208, in207);
// Layer 2: imm 68 keeps elements {0,1} of each operand, imm 238 keeps {2,3};
// each 128-bit block now holds one column of four consecutive rows.
__m512 tmp1053 = _mm512_shuffle_ps(tmp1037, tmp1039, 68);
__m512 tmp1054 = _mm512_shuffle_ps(tmp1037, tmp1039, 238);
__m512 tmp1055 = _mm512_shuffle_ps(tmp1038, tmp1040, 68);
__m512 tmp1056 = _mm512_shuffle_ps(tmp1038, tmp1040, 238);
__m512 tmp1057 = _mm512_shuffle_ps(tmp1041, tmp1043, 68);
__m512 tmp1058 = _mm512_shuffle_ps(tmp1041, tmp1043, 238);
__m512 tmp1059 = _mm512_shuffle_ps(tmp1042, tmp1044, 68);
__m512 tmp1060 = _mm512_shuffle_ps(tmp1042, tmp1044, 238);
__m512 tmp1061 = _mm512_shuffle_ps(tmp1045, tmp1047, 68);
__m512 tmp1062 = _mm512_shuffle_ps(tmp1045, tmp1047, 238);
__m512 tmp1063 = _mm512_shuffle_ps(tmp1046, tmp1048, 68);
__m512 tmp1064 = _mm512_shuffle_ps(tmp1046, tmp1048, 238);
__m512 tmp1065 = _mm512_shuffle_ps(tmp1049, tmp1051, 68);
__m512 tmp1066 = _mm512_shuffle_ps(tmp1049, tmp1051, 238);
__m512 tmp1067 = _mm512_shuffle_ps(tmp1050, tmp1052, 68);
__m512 tmp1068 = _mm512_shuffle_ps(tmp1050, tmp1052, 238);
// Layer 3: imm 136 picks 128-bit blocks {0,2} of each operand, imm 221 picks
// {1,3}; this joins rows 0-3 with rows 4-7 of the same group.
__m512 tmp1069 = _mm512_shuffle_f32x4(tmp1053, tmp1057, 136);
__m512 tmp1070 = _mm512_shuffle_f32x4(tmp1053, tmp1057, 221);
__m512 tmp1071 = _mm512_shuffle_f32x4(tmp1054, tmp1058, 136);
__m512 tmp1072 = _mm512_shuffle_f32x4(tmp1054, tmp1058, 221);
__m512 tmp1073 = _mm512_shuffle_f32x4(tmp1055, tmp1059, 136);
__m512 tmp1074 = _mm512_shuffle_f32x4(tmp1055, tmp1059, 221);
__m512 tmp1075 = _mm512_shuffle_f32x4(tmp1056, tmp1060, 136);
__m512 tmp1076 = _mm512_shuffle_f32x4(tmp1056, tmp1060, 221);
__m512 tmp1077 = _mm512_shuffle_f32x4(tmp1061, tmp1065, 136);
__m512 tmp1078 = _mm512_shuffle_f32x4(tmp1061, tmp1065, 221);
__m512 tmp1079 = _mm512_shuffle_f32x4(tmp1062, tmp1066, 136);
__m512 tmp1080 = _mm512_shuffle_f32x4(tmp1062, tmp1066, 221);
__m512 tmp1081 = _mm512_shuffle_f32x4(tmp1063, tmp1067, 136);
__m512 tmp1082 = _mm512_shuffle_f32x4(tmp1063, tmp1067, 221);
__m512 tmp1083 = _mm512_shuffle_f32x4(tmp1064, tmp1068, 136);
__m512 tmp1084 = _mm512_shuffle_f32x4(tmp1064, tmp1068, 221);
// Layer 4: merge the two groups. The imm-136 result holds one column of the
// low-lane window of group 1 (lanes 0-7) and of group 2 (lanes 8-15); the
// imm-221 result does the same for the high-lane windows. Columns 0..7 land in
//   in196/in204, tmp1023/tmp1027, tmp1024/tmp1028, in202/in210,
//   tmp1022/tmp1026, in198/in206, in200/in208, in199/in207.
in196 = _mm512_shuffle_f32x4(tmp1069, tmp1077, 136);
in204 = _mm512_shuffle_f32x4(tmp1069, tmp1077, 221);
tmp1023 = _mm512_shuffle_f32x4(tmp1071, tmp1079, 136);
tmp1027 = _mm512_shuffle_f32x4(tmp1071, tmp1079, 221);
tmp1024 = _mm512_shuffle_f32x4(tmp1073, tmp1081, 136);
tmp1028 = _mm512_shuffle_f32x4(tmp1073, tmp1081, 221);
in202 = _mm512_shuffle_f32x4(tmp1075, tmp1083, 136);
in210 = _mm512_shuffle_f32x4(tmp1075, tmp1083, 221);
tmp1022 = _mm512_shuffle_f32x4(tmp1070, tmp1078, 136);
tmp1026 = _mm512_shuffle_f32x4(tmp1070, tmp1078, 221);
in198 = _mm512_shuffle_f32x4(tmp1072, tmp1080, 136);
in206 = _mm512_shuffle_f32x4(tmp1072, tmp1080, 221);
in200 = _mm512_shuffle_f32x4(tmp1074, tmp1082, 136);
in208 = _mm512_shuffle_f32x4(tmp1074, tmp1082, 221);
in199 = _mm512_shuffle_f32x4(tmp1076, tmp1084, 136);
in207 = _mm512_shuffle_f32x4(tmp1076, tmp1084, 221);
// Second 8-point transform pass, applied to the transposed data. With
// c0..c7 = in196, tmp1023, tmp1024, in202, tmp1022, in198, in200, in199
// (and in lock-step in204, tmp1027, tmp1028, in210, tmp1026, in206, in208,
// in207) the results are:
//   u0 = c0 - 5.25*c2 + 5.25*c4 - c6                          -> in196 / in204
//   u1 =  c1 + c2 - 4.25*c3 - 4.25*c4 + c5 + c6               -> tmp1031 / tmp1035
//   u2 = -c1 + c2 + 4.25*c3 - 4.25*c4 - c5 + c6               -> tmp1032 / tmp1036
//   u3 =  0.5*c1 + 0.25*c2 - 2.5*c3 - 1.25*c4 + 2*c5 + c6     -> in200 / in208
//   u4 = -0.5*c1 + 0.25*c2 + 2.5*c3 - 1.25*c4 - 2*c5 + c6     -> tmp1030 / tmp1034
//   u5 =  2*c1 + 4*c2 - 2.5*c3 - 5*c4 + 0.5*c5 + c6           -> tmp1024 / tmp1028
//   u6 = -2*c1 + 4*c2 + 2.5*c3 - 5*c4 - 0.5*c5 + c6           -> tmp1022 / tmp1026
//   u7 = -c1 + 5.25*c3 - 5.25*c5 + c7                         -> in202 / in210
// Same coefficients as the first pass; registers are reused in place, so the
// statement order is significant.
__m512 tmp1029 = _mm512_add_ps(tmp1023, in198);
__m512 tmp1033 = _mm512_add_ps(tmp1027, in206);
__m512 tmp1030 = _mm512_sub_ps(tmp1022, tmp1024);
__m512 tmp1034 = _mm512_sub_ps(tmp1026, tmp1028);
__m512 tmp1031 = _mm512_add_ps(tmp1024, in200);
__m512 tmp1035 = _mm512_add_ps(tmp1028, in208);
in196 = _mm512_sub_ps(in196, in200);
in204 = _mm512_sub_ps(in204, in208);
tmp1029 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-4.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-4.25e+00f), tmp1033);
tmp1031 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-4.25e+00f), tmp1031);
tmp1035 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-4.25e+00f), tmp1035);
// u0 complete.
in196 = _mm512_fmadd_ps(tmp1030, _mm512_set1_ps(5.25e+00f), in196);
in204 = _mm512_fmadd_ps(tmp1034, _mm512_set1_ps(5.25e+00f), in204);
tmp1030 = _mm512_fmadd_ps(tmp1024, _mm512_set1_ps(2.5e-01f), in200);
tmp1034 = _mm512_fmadd_ps(tmp1028, _mm512_set1_ps(2.5e-01f), in208);
tmp1024 = _mm512_fmadd_ps(tmp1024, _mm512_set1_ps(4e+00f), in200);
tmp1028 = _mm512_fmadd_ps(tmp1028, _mm512_set1_ps(4e+00f), in208);
// u2 = even part - odd part, u1 = even part + odd part.
__m512 tmp1032 = _mm512_sub_ps(tmp1031, tmp1029);
__m512 tmp1036 = _mm512_sub_ps(tmp1035, tmp1033);
tmp1031 = _mm512_add_ps(tmp1029, tmp1031);
tmp1035 = _mm512_add_ps(tmp1033, tmp1035);
tmp1029 = _mm512_fmadd_ps(tmp1023, _mm512_set1_ps(2.5e-01f), in198);
tmp1033 = _mm512_fmadd_ps(tmp1027, _mm512_set1_ps(2.5e-01f), in206);
tmp1030 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-1.25e+00f), tmp1030);
tmp1034 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-1.25e+00f), tmp1034);
tmp1022 = _mm512_fmadd_ps(tmp1022, _mm512_set1_ps(-5e+00f), tmp1024);
tmp1026 = _mm512_fmadd_ps(tmp1026, _mm512_set1_ps(-5e+00f), tmp1028);
tmp1029 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-1.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-1.25e+00f), tmp1033);
// u3 / u4.
in200 = _mm512_fmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1030);
in208 = _mm512_fmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1034);
tmp1030 = _mm512_fnmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1030);
tmp1034 = _mm512_fnmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1034);
tmp1029 = _mm512_fmadd_ps(in198, _mm512_set1_ps(2.5e-01f), tmp1023);
tmp1033 = _mm512_fmadd_ps(in206, _mm512_set1_ps(2.5e-01f), tmp1027);
tmp1023 = _mm512_sub_ps(in199, tmp1023);
tmp1027 = _mm512_sub_ps(in207, tmp1027);
tmp1029 = _mm512_fmadd_ps(in202, _mm512_set1_ps(-1.25e+00f), tmp1029);
tmp1033 = _mm512_fmadd_ps(in210, _mm512_set1_ps(-1.25e+00f), tmp1033);
// u7.
in202 = _mm512_sub_ps(in202, in198);
in210 = _mm512_sub_ps(in210, in206);
in202 = _mm512_fmadd_ps(in202, _mm512_set1_ps(5.25e+00f), tmp1023);
in210 = _mm512_fmadd_ps(in210, _mm512_set1_ps(5.25e+00f), tmp1027);
// u5 / u6.
tmp1024 = _mm512_fmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1022);
tmp1028 = _mm512_fmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1026);
tmp1022 = _mm512_fnmadd_ps(tmp1029, _mm512_set1_ps(2e+00f), tmp1022);
tmp1026 = _mm512_fnmadd_ps(tmp1033, _mm512_set1_ps(2e+00f), tmp1026);
// Pack the second-pass results for storing. imm 68 concatenates the low
// 256 bits of both operands, imm 238 the high 256 bits. Each out* vector thus
// carries two consecutive result rows (u0/u1, u2/u3, u4/u5, u6/u7) of one
// 8-lane window: out215..out218 and out223..out226 come from the first
// register set, out219..out222 and out227..out230 from the second.
__m512 out215 = _mm512_shuffle_f32x4(in196, tmp1031, 68);
__m512 out223 = _mm512_shuffle_f32x4(in196, tmp1031, 238);
__m512 out216 = _mm512_shuffle_f32x4(tmp1032, in200, 68);
__m512 out224 = _mm512_shuffle_f32x4(tmp1032, in200, 238);
__m512 out217 = _mm512_shuffle_f32x4(tmp1030, tmp1024, 68);
__m512 out225 = _mm512_shuffle_f32x4(tmp1030, tmp1024, 238);
__m512 out218 = _mm512_shuffle_f32x4(tmp1022, in202, 68);
__m512 out226 = _mm512_shuffle_f32x4(tmp1022, in202, 238);
__m512 out219 = _mm512_shuffle_f32x4(in204, tmp1035, 68);
__m512 out227 = _mm512_shuffle_f32x4(in204, tmp1035, 238);
__m512 out220 = _mm512_shuffle_f32x4(tmp1036, in208, 68);
__m512 out228 = _mm512_shuffle_f32x4(tmp1036, in208, 238);
__m512 out221 = _mm512_shuffle_f32x4(tmp1034, tmp1028, 68);
__m512 out229 = _mm512_shuffle_f32x4(tmp1034, tmp1028, 238);
__m512 out222 = _mm512_shuffle_f32x4(tmp1026, in210, 68);
__m512 out230 = _mm512_shuffle_f32x4(tmp1026, in210, 238);
// Sixteen unaligned stores into dfPtr4. Constant offsets form four groups
// 25600 apart (0, 25600, 51200, 76800), one per result-row pair, each with
// four slots at +0, +64, +128, +192.
// NOTE(review): the 64-spacing equals one __m512 only if dfPtr4 is a byte
// pointer; its declaration is outside this excerpt -- confirm.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k55, out215);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k55, out223);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k55, out219);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k55, out227);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k55, out216);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k55, out224);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k55, out220);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k55, out228);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k55, out217);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k55, out225);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k55, out221);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k55, out229);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k55, out218);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k55, out226);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k55, out222);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k55, out230);
// Second load group of the same k55 iteration. Each of the eight rows
// (constant offsets 224 apart) is loaded as two vectors:
//   * offset 96 + 224*r with mask 511 = 0x1FF: only lanes 0..8 are read,
//     lanes 9..15 are zero;
//   * offset 12608 + 224*r with mask 16383 = 0x3FFF: lanes 0..13 are read.
// Every loaded vector is clamped to >= 0 with max(0, x) (ReLU-style).
// NOTE(review): the narrower mask presumably handles the right edge of the
// row, and +12608 presumably addresses a different plane of the source tensor;
// neither is established by the visible code -- confirm.
__m512 dat1115 = _mm512_maskz_loadu_ps(511, datPtr5+96+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1115 = _mm512_max_ps(_mm512_setzero_ps(), dat1115);
__m512 dat1116 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1116 = _mm512_max_ps(_mm512_setzero_ps(), dat1116);
// pm86: result lanes 0..7 <- source lanes 0..7, lanes 8..10 <- source lanes
// 6..8, lanes 11..15 <- source lane 15, which the 511 mask has zeroed. The
// second 8-wide window therefore has three data values followed by zeros.
__m512i pm86 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in212 = _mm512_permutexvar_ps(pm86, dat1115);
// pm87: result lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes
// 6..13 (two full windows overlapping by two elements).
__m512i pm87 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in220 = _mm512_permutexvar_ps(pm87, dat1116);
__m512 dat1117 = _mm512_maskz_loadu_ps(511, datPtr5+320+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1117 = _mm512_max_ps(_mm512_setzero_ps(), dat1117);
__m512 dat1118 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1118 = _mm512_max_ps(_mm512_setzero_ps(), dat1118);
__m512 in213 = _mm512_permutexvar_ps(pm86, dat1117);
__m512 in221 = _mm512_permutexvar_ps(pm87, dat1118);
__m512 dat1119 = _mm512_maskz_loadu_ps(511, datPtr5+544+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1119 = _mm512_max_ps(_mm512_setzero_ps(), dat1119);
__m512 dat1120 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1120 = _mm512_max_ps(_mm512_setzero_ps(), dat1120);
__m512 in214 = _mm512_permutexvar_ps(pm86, dat1119);
__m512 in222 = _mm512_permutexvar_ps(pm87, dat1120);
__m512 dat1121 = _mm512_maskz_loadu_ps(511, datPtr5+768+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1121 = _mm512_max_ps(_mm512_setzero_ps(), dat1121);
__m512 dat1122 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1122 = _mm512_max_ps(_mm512_setzero_ps(), dat1122);
__m512 in215 = _mm512_permutexvar_ps(pm86, dat1121);
__m512 in223 = _mm512_permutexvar_ps(pm87, dat1122);
__m512 dat1123 = _mm512_maskz_loadu_ps(511, datPtr5+992+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1123 = _mm512_max_ps(_mm512_setzero_ps(), dat1123);
__m512 dat1124 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1124 = _mm512_max_ps(_mm512_setzero_ps(), dat1124);
__m512 in216 = _mm512_permutexvar_ps(pm86, dat1123);
__m512 in224 = _mm512_permutexvar_ps(pm87, dat1124);
__m512 dat1125 = _mm512_maskz_loadu_ps(511, datPtr5+1216+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1125 = _mm512_max_ps(_mm512_setzero_ps(), dat1125);
__m512 dat1126 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1126 = _mm512_max_ps(_mm512_setzero_ps(), dat1126);
__m512 in217 = _mm512_permutexvar_ps(pm86, dat1125);
__m512 in225 = _mm512_permutexvar_ps(pm87, dat1126);
__m512 dat1127 = _mm512_maskz_loadu_ps(511, datPtr5+1440+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1127 = _mm512_max_ps(_mm512_setzero_ps(), dat1127);
__m512 dat1128 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1128 = _mm512_max_ps(_mm512_setzero_ps(), dat1128);
__m512 in218 = _mm512_permutexvar_ps(pm86, dat1127);
__m512 in226 = _mm512_permutexvar_ps(pm87, dat1128);
__m512 dat1129 = _mm512_maskz_loadu_ps(511, datPtr5+1664+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1129 = _mm512_max_ps(_mm512_setzero_ps(), dat1129);
__m512 dat1130 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1130 = _mm512_max_ps(_mm512_setzero_ps(), dat1130);
__m512 in219 = _mm512_permutexvar_ps(pm86, dat1129);
__m512 in227 = _mm512_permutexvar_ps(pm87, dat1130);
// First 8-point transform pass for the second load group. With
// d0..d7 = in212..in219 (and in lock-step in220..in227) the results are:
//   t0 = d0 - 5.25*d2 + 5.25*d4 - d6                          -> in212 / in220
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6               -> tmp1087 / tmp1091
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6               -> tmp1088 / tmp1092
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6     -> in218 / in226
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6     -> tmp1086 / tmp1090
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6           -> in214 / in222
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6           -> in216 / in224
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7                         -> in215 / in223
// These coefficients match the 8x8 input-transform matrix of Winograd F(6,3).
// Input registers are overwritten as soon as their value is dead, so the
// statement order below must not be changed.
__m512 tmp1085 = _mm512_add_ps(in213, in217);
__m512 tmp1089 = _mm512_add_ps(in221, in225);
__m512 tmp1086 = _mm512_sub_ps(in216, in214);
__m512 tmp1090 = _mm512_sub_ps(in224, in222);
__m512 tmp1087 = _mm512_add_ps(in214, in218);
__m512 tmp1091 = _mm512_add_ps(in222, in226);
in212 = _mm512_sub_ps(in212, in218);
in220 = _mm512_sub_ps(in220, in226);
tmp1085 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-4.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-4.25e+00f), tmp1089);
tmp1087 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-4.25e+00f), tmp1087);
tmp1091 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-4.25e+00f), tmp1091);
// t0 complete.
in212 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(5.25e+00f), in212);
in220 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(5.25e+00f), in220);
tmp1086 = _mm512_fmadd_ps(in214, _mm512_set1_ps(2.5e-01f), in218);
tmp1090 = _mm512_fmadd_ps(in222, _mm512_set1_ps(2.5e-01f), in226);
in214 = _mm512_fmadd_ps(in214, _mm512_set1_ps(4e+00f), in218);
in222 = _mm512_fmadd_ps(in222, _mm512_set1_ps(4e+00f), in226);
// t2 = even part - odd part, t1 = even part + odd part.
__m512 tmp1088 = _mm512_sub_ps(tmp1087, tmp1085);
__m512 tmp1092 = _mm512_sub_ps(tmp1091, tmp1089);
tmp1087 = _mm512_add_ps(tmp1085, tmp1087);
tmp1091 = _mm512_add_ps(tmp1089, tmp1091);
tmp1085 = _mm512_fmadd_ps(in213, _mm512_set1_ps(2.5e-01f), in217);
tmp1089 = _mm512_fmadd_ps(in221, _mm512_set1_ps(2.5e-01f), in225);
tmp1086 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-1.25e+00f), tmp1086);
tmp1090 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-1.25e+00f), tmp1090);
in216 = _mm512_fmadd_ps(in216, _mm512_set1_ps(-5e+00f), in214);
in224 = _mm512_fmadd_ps(in224, _mm512_set1_ps(-5e+00f), in222);
tmp1085 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-1.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-1.25e+00f), tmp1089);
// t3 / t4.
in218 = _mm512_fmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), tmp1086);
in226 = _mm512_fmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), tmp1090);
tmp1086 = _mm512_fnmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), tmp1086);
tmp1090 = _mm512_fnmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), tmp1090);
tmp1085 = _mm512_fmadd_ps(in217, _mm512_set1_ps(2.5e-01f), in213);
tmp1089 = _mm512_fmadd_ps(in225, _mm512_set1_ps(2.5e-01f), in221);
in213 = _mm512_sub_ps(in219, in213);
in221 = _mm512_sub_ps(in227, in221);
tmp1085 = _mm512_fmadd_ps(in215, _mm512_set1_ps(-1.25e+00f), tmp1085);
tmp1089 = _mm512_fmadd_ps(in223, _mm512_set1_ps(-1.25e+00f), tmp1089);
// t7.
in215 = _mm512_sub_ps(in215, in217);
in223 = _mm512_sub_ps(in223, in225);
in215 = _mm512_fmadd_ps(in215, _mm512_set1_ps(5.25e+00f), in213);
in223 = _mm512_fmadd_ps(in223, _mm512_set1_ps(5.25e+00f), in221);
// t5 / t6.
in214 = _mm512_fmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), in216);
in222 = _mm512_fmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), in224);
in216 = _mm512_fnmadd_ps(tmp1085, _mm512_set1_ps(2e+00f), in216);
in224 = _mm512_fnmadd_ps(tmp1089, _mm512_set1_ps(2e+00f), in224);
// Transpose stage for the second load group. The first-pass results t0..t7
// are (in212, tmp1087, tmp1088, in218, tmp1086, in214, in216, in215) and
// (in220, tmp1091, tmp1092, in226, tmp1090, in222, in224, in223). The final
// layer of this stage continues past the end of this excerpt.
//
// Layer 1: interleave adjacent row pairs inside every 128-bit block.
__m512 tmp1101 = _mm512_unpacklo_ps(in212, tmp1087);
__m512 tmp1102 = _mm512_unpackhi_ps(in212, tmp1087);
__m512 tmp1103 = _mm512_unpacklo_ps(tmp1088, in218);
__m512 tmp1104 = _mm512_unpackhi_ps(tmp1088, in218);
__m512 tmp1105 = _mm512_unpacklo_ps(tmp1086, in214);
__m512 tmp1106 = _mm512_unpackhi_ps(tmp1086, in214);
__m512 tmp1107 = _mm512_unpacklo_ps(in216, in215);
__m512 tmp1108 = _mm512_unpackhi_ps(in216, in215);
__m512 tmp1109 = _mm512_unpacklo_ps(in220, tmp1091);
__m512 tmp1110 = _mm512_unpackhi_ps(in220, tmp1091);
__m512 tmp1111 = _mm512_unpacklo_ps(tmp1092, in226);
__m512 tmp1112 = _mm512_unpackhi_ps(tmp1092, in226);
__m512 tmp1113 = _mm512_unpacklo_ps(tmp1090, in222);
__m512 tmp1114 = _mm512_unpackhi_ps(tmp1090, in222);
__m512 tmp1115 = _mm512_unpacklo_ps(in224, in223);
__m512 tmp1116 = _mm512_unpackhi_ps(in224, in223);
// Layer 2: imm 68 keeps elements {0,1} of each operand, imm 238 keeps {2,3};
// each 128-bit block now holds one column of four consecutive rows.
__m512 tmp1117 = _mm512_shuffle_ps(tmp1101, tmp1103, 68);
__m512 tmp1118 = _mm512_shuffle_ps(tmp1101, tmp1103, 238);
__m512 tmp1119 = _mm512_shuffle_ps(tmp1102, tmp1104, 68);
__m512 tmp1120 = _mm512_shuffle_ps(tmp1102, tmp1104, 238);
__m512 tmp1121 = _mm512_shuffle_ps(tmp1105, tmp1107, 68);
__m512 tmp1122 = _mm512_shuffle_ps(tmp1105, tmp1107, 238);
__m512 tmp1123 = _mm512_shuffle_ps(tmp1106, tmp1108, 68);
__m512 tmp1124 = _mm512_shuffle_ps(tmp1106, tmp1108, 238);
__m512 tmp1125 = _mm512_shuffle_ps(tmp1109, tmp1111, 68);
__m512 tmp1126 = _mm512_shuffle_ps(tmp1109, tmp1111, 238);
__m512 tmp1127 = _mm512_shuffle_ps(tmp1110, tmp1112, 68);
__m512 tmp1128 = _mm512_shuffle_ps(tmp1110, tmp1112, 238);
__m512 tmp1129 = _mm512_shuffle_ps(tmp1113, tmp1115, 68);
__m512 tmp1130 = _mm512_shuffle_ps(tmp1113, tmp1115, 238);
__m512 tmp1131 = _mm512_shuffle_ps(tmp1114, tmp1116, 68);
__m512 tmp1132 = _mm512_shuffle_ps(tmp1114, tmp1116, 238);
// Layer 3: imm 136 picks 128-bit blocks {0,2} of each operand, imm 221 picks
// {1,3}; this joins rows 0-3 with rows 4-7 of the same group.
__m512 tmp1133 = _mm512_shuffle_f32x4(tmp1117, tmp1121, 136);
__m512 tmp1134 = _mm512_shuffle_f32x4(tmp1117, tmp1121, 221);
__m512 tmp1135 = _mm512_shuffle_f32x4(tmp1118, tmp1122, 136);
__m512 tmp1136 = _mm512_shuffle_f32x4(tmp1118, tmp1122, 221);
__m512 tmp1137 = _mm512_shuffle_f32x4(tmp1119, tmp1123, 136);
__m512 tmp1138 = _mm512_shuffle_f32x4(tmp1119, tmp1123, 221);
__m512 tmp1139 = _mm512_shuffle_f32x4(tmp1120, tmp1124, 136);
__m512 tmp1140 = _mm512_shuffle_f32x4(tmp1120, tmp1124, 221);
__m512 tmp1141 = _mm512_shuffle_f32x4(tmp1125, tmp1129, 136);
__m512 tmp1142 = _mm512_shuffle_f32x4(tmp1125, tmp1129, 221);
__m512 tmp1143 = _mm512_shuffle_f32x4(tmp1126, tmp1130, 136);
__m512 tmp1144 = _mm512_shuffle_f32x4(tmp1126, tmp1130, 221);
__m512 tmp1145 = _mm512_shuffle_f32x4(tmp1127, tmp1131, 136);
__m512 tmp1146 = _mm512_shuffle_f32x4(tmp1127, tmp1131, 221);
__m512 tmp1147 = _mm512_shuffle_f32x4(tmp1128, tmp1132, 136);
__m512 tmp1148 = _mm512_shuffle_f32x4(tmp1128, tmp1132, 221);
// Layer 4 (first six of eight column pairs are visible here): merge the two
// groups so lanes 0-7 come from the first group and lanes 8-15 from the
// second; imm 136 yields the low-lane windows, imm 221 the high-lane windows.
in212 = _mm512_shuffle_f32x4(tmp1133, tmp1141, 136);
in220 = _mm512_shuffle_f32x4(tmp1133, tmp1141, 221);
tmp1087 = _mm512_shuffle_f32x4(tmp1135, tmp1143, 136);
tmp1091 = _mm512_shuffle_f32x4(tmp1135, tmp1143, 221);
tmp1088 = _mm512_shuffle_f32x4(tmp1137, tmp1145, 136);
tmp1092 = _mm512_shuffle_f32x4(tmp1137, tmp1145, 221);
in218 = _mm512_shuffle_f32x4(tmp1139, tmp1147, 136);
in226 = _mm512_shuffle_f32x4(tmp1139, tmp1147, 221);
tmp1086 = _mm512_shuffle_f32x4(tmp1134, tmp1142, 136);
tmp1090 = _mm512_shuffle_f32x4(tmp1134, tmp1142, 221);
in214 = _mm512_shuffle_f32x4(tmp1136, tmp1144, 136);
in222 = _mm512_shuffle_f32x4(tmp1136, tmp1144, 221);
in216 = _mm512_shuffle_f32x4(tmp1138, tmp1146, 136);
in224 = _mm512_shuffle_f32x4(tmp1138, tmp1146, 221);
in215 = _mm512_shuffle_f32x4(tmp1140, tmp1148, 136);
in223 = _mm512_shuffle_f32x4(tmp1140, tmp1148, 221);
__m512 tmp1093 = _mm512_add_ps(tmp1087, in214);
__m512 tmp1097 = _mm512_add_ps(tmp1091, in222);
__m512 tmp1094 = _mm512_sub_ps(tmp1086, tmp1088);
__m512 tmp1098 = _mm512_sub_ps(tmp1090, tmp1092);
__m512 tmp1095 = _mm512_add_ps(tmp1088, in216);
__m512 tmp1099 = _mm512_add_ps(tmp1092, in224);
in212 = _mm512_sub_ps(in212, in216);
in220 = _mm512_sub_ps(in220, in224);
tmp1093 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-4.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-4.25e+00f), tmp1097);
tmp1095 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-4.25e+00f), tmp1095);
tmp1099 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-4.25e+00f), tmp1099);
in212 = _mm512_fmadd_ps(tmp1094, _mm512_set1_ps(5.25e+00f), in212);
in220 = _mm512_fmadd_ps(tmp1098, _mm512_set1_ps(5.25e+00f), in220);
tmp1094 = _mm512_fmadd_ps(tmp1088, _mm512_set1_ps(2.5e-01f), in216);
tmp1098 = _mm512_fmadd_ps(tmp1092, _mm512_set1_ps(2.5e-01f), in224);
tmp1088 = _mm512_fmadd_ps(tmp1088, _mm512_set1_ps(4e+00f), in216);
tmp1092 = _mm512_fmadd_ps(tmp1092, _mm512_set1_ps(4e+00f), in224);
__m512 tmp1096 = _mm512_sub_ps(tmp1095, tmp1093);
__m512 tmp1100 = _mm512_sub_ps(tmp1099, tmp1097);
tmp1095 = _mm512_add_ps(tmp1093, tmp1095);
tmp1099 = _mm512_add_ps(tmp1097, tmp1099);
tmp1093 = _mm512_fmadd_ps(tmp1087, _mm512_set1_ps(2.5e-01f), in214);
tmp1097 = _mm512_fmadd_ps(tmp1091, _mm512_set1_ps(2.5e-01f), in222);
tmp1094 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-1.25e+00f), tmp1094);
tmp1098 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-1.25e+00f), tmp1098);
tmp1086 = _mm512_fmadd_ps(tmp1086, _mm512_set1_ps(-5e+00f), tmp1088);
tmp1090 = _mm512_fmadd_ps(tmp1090, _mm512_set1_ps(-5e+00f), tmp1092);
tmp1093 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-1.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-1.25e+00f), tmp1097);
in216 = _mm512_fmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1094);
in224 = _mm512_fmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1098);
tmp1094 = _mm512_fnmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1094);
tmp1098 = _mm512_fnmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1098);
tmp1093 = _mm512_fmadd_ps(in214, _mm512_set1_ps(2.5e-01f), tmp1087);
tmp1097 = _mm512_fmadd_ps(in222, _mm512_set1_ps(2.5e-01f), tmp1091);
tmp1087 = _mm512_sub_ps(in215, tmp1087);
tmp1091 = _mm512_sub_ps(in223, tmp1091);
tmp1093 = _mm512_fmadd_ps(in218, _mm512_set1_ps(-1.25e+00f), tmp1093);
tmp1097 = _mm512_fmadd_ps(in226, _mm512_set1_ps(-1.25e+00f), tmp1097);
in218 = _mm512_sub_ps(in218, in214);
in226 = _mm512_sub_ps(in226, in222);
in218 = _mm512_fmadd_ps(in218, _mm512_set1_ps(5.25e+00f), tmp1087);
in226 = _mm512_fmadd_ps(in226, _mm512_set1_ps(5.25e+00f), tmp1091);
tmp1088 = _mm512_fmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1086);
tmp1092 = _mm512_fmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1090);
tmp1086 = _mm512_fnmadd_ps(tmp1093, _mm512_set1_ps(2e+00f), tmp1086);
tmp1090 = _mm512_fnmadd_ps(tmp1097, _mm512_set1_ps(2e+00f), tmp1090);
__m512 out231 = _mm512_shuffle_f32x4(in212, tmp1095, 68);
__m512 out239 = _mm512_shuffle_f32x4(in212, tmp1095, 238);
__m512 out232 = _mm512_shuffle_f32x4(tmp1096, in216, 68);
__m512 out240 = _mm512_shuffle_f32x4(tmp1096, in216, 238);
__m512 out233 = _mm512_shuffle_f32x4(tmp1094, tmp1088, 68);
__m512 out241 = _mm512_shuffle_f32x4(tmp1094, tmp1088, 238);
__m512 out234 = _mm512_shuffle_f32x4(tmp1086, in218, 68);
__m512 out242 = _mm512_shuffle_f32x4(tmp1086, in218, 238);
__m512 out235 = _mm512_shuffle_f32x4(in220, tmp1099, 68);
__m512 out243 = _mm512_shuffle_f32x4(in220, tmp1099, 238);
__m512 out236 = _mm512_shuffle_f32x4(tmp1100, in224, 68);
__m512 out244 = _mm512_shuffle_f32x4(tmp1100, in224, 238);
__m512 out237 = _mm512_shuffle_f32x4(tmp1098, tmp1092, 68);
__m512 out245 = _mm512_shuffle_f32x4(tmp1098, tmp1092, 238);
__m512 out238 = _mm512_shuffle_f32x4(tmp1090, in226, 68);
__m512 out246 = _mm512_shuffle_f32x4(tmp1090, in226, 238);
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k55, out231);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k55, out239);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k55, out235);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k55, out243);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k55, out232);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k55, out240);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k55, out236);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k55, out244);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k55, out233);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k55, out241);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k55, out237);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k55, out245);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k55, out234);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k55, out242);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k55, out238);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k55, out246);
// Load stage: eight rows are read, each with one pair of masked loads.
// The constant offset grows by 224 from one pair to the next, the same factor
// that multiplies h24, so every pair is one h step after the previous pair.
// NOTE(review): the offsets look like byte offsets (4*w27 per float); the
// declaration of datPtr5 is outside this excerpt -- confirm.
//
// First load of a pair:  mask 16383 (0x3FFF) keeps lanes 0..13, zeroes 14..15.
// Second load of a pair: mask 511 (0x1FF) keeps lanes 0..8, zeroes 9..15; its
// constant offset is 48 larger than the first load's.
// Every loaded vector is passed through max(0, x), clamping negatives to zero.
__m512 dat1131 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1131 = _mm512_max_ps(_mm512_setzero_ps(), dat1131);
__m512 dat1132 = _mm512_maskz_loadu_ps(511, datPtr5+12704+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1132 = _mm512_max_ps(_mm512_setzero_ps(), dat1132);
// pm88 (arguments are listed from lane 15 down to lane 0):
//   result lanes 0..7  <- source lanes 0..7
//   result lanes 8..15 <- source lanes 6..13
// i.e. two 8-wide windows that start 6 lanes apart and overlap by 2 lanes.
__m512i pm88 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in228 = _mm512_permutexvar_ps(pm88, dat1131);
// pm89:
//   result lanes 0..7   <- source lanes 0..7
//   result lanes 8..10  <- source lanes 6..8
//   result lanes 11..15 <- source lane 15, which the 511 mask has zeroed,
//                          so the tail of the second window is zero-filled.
__m512i pm89 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in236 = _mm512_permutexvar_ps(pm89, dat1132);
// Rows 1..7 repeat the same load / clamp / permute sequence, reusing pm88 for
// the first vector of each pair (in229..in235) and pm89 for the second
// (in237..in243).
__m512 dat1133 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1133 = _mm512_max_ps(_mm512_setzero_ps(), dat1133);
__m512 dat1134 = _mm512_maskz_loadu_ps(511, datPtr5+12928+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1134 = _mm512_max_ps(_mm512_setzero_ps(), dat1134);
__m512 in229 = _mm512_permutexvar_ps(pm88, dat1133);
__m512 in237 = _mm512_permutexvar_ps(pm89, dat1134);
__m512 dat1135 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1135 = _mm512_max_ps(_mm512_setzero_ps(), dat1135);
__m512 dat1136 = _mm512_maskz_loadu_ps(511, datPtr5+13152+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1136 = _mm512_max_ps(_mm512_setzero_ps(), dat1136);
__m512 in230 = _mm512_permutexvar_ps(pm88, dat1135);
__m512 in238 = _mm512_permutexvar_ps(pm89, dat1136);
__m512 dat1137 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1137 = _mm512_max_ps(_mm512_setzero_ps(), dat1137);
__m512 dat1138 = _mm512_maskz_loadu_ps(511, datPtr5+13376+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1138 = _mm512_max_ps(_mm512_setzero_ps(), dat1138);
__m512 in231 = _mm512_permutexvar_ps(pm88, dat1137);
__m512 in239 = _mm512_permutexvar_ps(pm89, dat1138);
__m512 dat1139 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1139 = _mm512_max_ps(_mm512_setzero_ps(), dat1139);
__m512 dat1140 = _mm512_maskz_loadu_ps(511, datPtr5+13600+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1140 = _mm512_max_ps(_mm512_setzero_ps(), dat1140);
__m512 in232 = _mm512_permutexvar_ps(pm88, dat1139);
__m512 in240 = _mm512_permutexvar_ps(pm89, dat1140);
__m512 dat1141 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1141 = _mm512_max_ps(_mm512_setzero_ps(), dat1141);
__m512 dat1142 = _mm512_maskz_loadu_ps(511, datPtr5+13824+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1142 = _mm512_max_ps(_mm512_setzero_ps(), dat1142);
__m512 in233 = _mm512_permutexvar_ps(pm88, dat1141);
__m512 in241 = _mm512_permutexvar_ps(pm89, dat1142);
__m512 dat1143 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1143 = _mm512_max_ps(_mm512_setzero_ps(), dat1143);
__m512 dat1144 = _mm512_maskz_loadu_ps(511, datPtr5+14048+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1144 = _mm512_max_ps(_mm512_setzero_ps(), dat1144);
__m512 in234 = _mm512_permutexvar_ps(pm88, dat1143);
__m512 in242 = _mm512_permutexvar_ps(pm89, dat1144);
__m512 dat1145 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1145 = _mm512_max_ps(_mm512_setzero_ps(), dat1145);
__m512 dat1146 = _mm512_maskz_loadu_ps(511, datPtr5+14272+50432*i17+224*h24+4*w27+50432*s12+25216*k55);
dat1146 = _mm512_max_ps(_mm512_setzero_ps(), dat1146);
__m512 in235 = _mm512_permutexvar_ps(pm88, dat1145);
__m512 in243 = _mm512_permutexvar_ps(pm89, dat1146);
// First transform pass: an 8-point linear combination taken across the eight
// row vectors, lane by lane. Statements come in interleaved pairs: the first
// of each pair works on in228..in235, the second applies the identical
// arithmetic to in236..in243.
//
// Writing d0..d7 for the values of in228..in235 on entry, the statements
// below evaluate (shared sub-sums are reused, so operands are overwritten):
//   in228   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp1151 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp1152 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in234   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp1150 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in230   =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in232   = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in231   = -d1 + 5.25*d3 - 5.25*d5 + d7
// These coefficients match the 8x8 input-transform matrix of Winograd
// F(6,3) convolution. in229, in233 and in235 are scratch after this pass.
__m512 tmp1149 = _mm512_add_ps(in229, in233);
__m512 tmp1153 = _mm512_add_ps(in237, in241);
__m512 tmp1150 = _mm512_sub_ps(in232, in230);
__m512 tmp1154 = _mm512_sub_ps(in240, in238);
__m512 tmp1151 = _mm512_add_ps(in230, in234);
__m512 tmp1155 = _mm512_add_ps(in238, in242);
in228 = _mm512_sub_ps(in228, in234);
in236 = _mm512_sub_ps(in236, in242);
tmp1149 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-4.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-4.25e+00f), tmp1153);
tmp1151 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-4.25e+00f), tmp1151);
tmp1155 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-4.25e+00f), tmp1155);
// in228 / in236 are final here (first formula above).
in228 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(5.25e+00f), in228);
in236 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(5.25e+00f), in236);
tmp1150 = _mm512_fmadd_ps(in230, _mm512_set1_ps(2.5e-01f), in234);
tmp1154 = _mm512_fmadd_ps(in238, _mm512_set1_ps(2.5e-01f), in242);
in230 = _mm512_fmadd_ps(in230, _mm512_set1_ps(4e+00f), in234);
in238 = _mm512_fmadd_ps(in238, _mm512_set1_ps(4e+00f), in242);
// Difference and sum of the even-index and odd-index partial sums give the
// third and second formulas.
__m512 tmp1152 = _mm512_sub_ps(tmp1151, tmp1149);
__m512 tmp1156 = _mm512_sub_ps(tmp1155, tmp1153);
tmp1151 = _mm512_add_ps(tmp1149, tmp1151);
tmp1155 = _mm512_add_ps(tmp1153, tmp1155);
tmp1149 = _mm512_fmadd_ps(in229, _mm512_set1_ps(2.5e-01f), in233);
tmp1153 = _mm512_fmadd_ps(in237, _mm512_set1_ps(2.5e-01f), in241);
tmp1150 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-1.25e+00f), tmp1150);
tmp1154 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-1.25e+00f), tmp1154);
in232 = _mm512_fmadd_ps(in232, _mm512_set1_ps(-5e+00f), in230);
in240 = _mm512_fmadd_ps(in240, _mm512_set1_ps(-5e+00f), in238);
tmp1149 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-1.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-1.25e+00f), tmp1153);
// fmadd / fnmadd with the same operands yield the +/- pair (fourth and fifth
// formulas).
in234 = _mm512_fmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), tmp1150);
in242 = _mm512_fmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), tmp1154);
tmp1150 = _mm512_fnmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), tmp1150);
tmp1154 = _mm512_fnmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), tmp1154);
tmp1149 = _mm512_fmadd_ps(in233, _mm512_set1_ps(2.5e-01f), in229);
tmp1153 = _mm512_fmadd_ps(in241, _mm512_set1_ps(2.5e-01f), in237);
in229 = _mm512_sub_ps(in235, in229);
in237 = _mm512_sub_ps(in243, in237);
tmp1149 = _mm512_fmadd_ps(in231, _mm512_set1_ps(-1.25e+00f), tmp1149);
tmp1153 = _mm512_fmadd_ps(in239, _mm512_set1_ps(-1.25e+00f), tmp1153);
in231 = _mm512_sub_ps(in231, in233);
in239 = _mm512_sub_ps(in239, in241);
// in231 / in239 are final here (last formula above).
in231 = _mm512_fmadd_ps(in231, _mm512_set1_ps(5.25e+00f), in229);
in239 = _mm512_fmadd_ps(in239, _mm512_set1_ps(5.25e+00f), in237);
// Second +/- pair (sixth and seventh formulas).
in230 = _mm512_fmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), in232);
in238 = _mm512_fmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), in240);
in232 = _mm512_fnmadd_ps(tmp1149, _mm512_set1_ps(2e+00f), in232);
in240 = _mm512_fnmadd_ps(tmp1153, _mm512_set1_ps(2e+00f), in240);
// Transposition stage. Inputs are two sets of eight vectors, taken in this
// order:
//   set A: in228, tmp1151, tmp1152, in234, tmp1150, in230, in232, in231
//   set B: in236, tmp1155, tmp1156, in242, tmp1154, in238, in240, in239
// On exit the same names are reused so that, for c = 0..7, the c-th name of
// the set A list holds lane c of every input vector (lanes 0..7 = the eight
// set A vectors in order, lanes 8..15 = the eight set B vectors in order),
// and the c-th name of the set B list holds lane 8+c in the same layout.
//
// Step 1: interleave neighbouring vectors within each 128-bit lane.
__m512 tmp1165 = _mm512_unpacklo_ps(in228, tmp1151);
__m512 tmp1166 = _mm512_unpackhi_ps(in228, tmp1151);
__m512 tmp1167 = _mm512_unpacklo_ps(tmp1152, in234);
__m512 tmp1168 = _mm512_unpackhi_ps(tmp1152, in234);
__m512 tmp1169 = _mm512_unpacklo_ps(tmp1150, in230);
__m512 tmp1170 = _mm512_unpackhi_ps(tmp1150, in230);
__m512 tmp1171 = _mm512_unpacklo_ps(in232, in231);
__m512 tmp1172 = _mm512_unpackhi_ps(in232, in231);
__m512 tmp1173 = _mm512_unpacklo_ps(in236, tmp1155);
__m512 tmp1174 = _mm512_unpackhi_ps(in236, tmp1155);
__m512 tmp1175 = _mm512_unpacklo_ps(tmp1156, in242);
__m512 tmp1176 = _mm512_unpackhi_ps(tmp1156, in242);
__m512 tmp1177 = _mm512_unpacklo_ps(tmp1154, in238);
__m512 tmp1178 = _mm512_unpackhi_ps(tmp1154, in238);
__m512 tmp1179 = _mm512_unpacklo_ps(in240, in239);
__m512 tmp1180 = _mm512_unpackhi_ps(in240, in239);
// Step 2: immediate 68 takes the low two floats of each operand's 128-bit
// lane, 238 the high two. Each 128-bit lane now holds one source lane from
// four consecutive input vectors.
__m512 tmp1181 = _mm512_shuffle_ps(tmp1165, tmp1167, 68);
__m512 tmp1182 = _mm512_shuffle_ps(tmp1165, tmp1167, 238);
__m512 tmp1183 = _mm512_shuffle_ps(tmp1166, tmp1168, 68);
__m512 tmp1184 = _mm512_shuffle_ps(tmp1166, tmp1168, 238);
__m512 tmp1185 = _mm512_shuffle_ps(tmp1169, tmp1171, 68);
__m512 tmp1186 = _mm512_shuffle_ps(tmp1169, tmp1171, 238);
__m512 tmp1187 = _mm512_shuffle_ps(tmp1170, tmp1172, 68);
__m512 tmp1188 = _mm512_shuffle_ps(tmp1170, tmp1172, 238);
__m512 tmp1189 = _mm512_shuffle_ps(tmp1173, tmp1175, 68);
__m512 tmp1190 = _mm512_shuffle_ps(tmp1173, tmp1175, 238);
__m512 tmp1191 = _mm512_shuffle_ps(tmp1174, tmp1176, 68);
__m512 tmp1192 = _mm512_shuffle_ps(tmp1174, tmp1176, 238);
__m512 tmp1193 = _mm512_shuffle_ps(tmp1177, tmp1179, 68);
__m512 tmp1194 = _mm512_shuffle_ps(tmp1177, tmp1179, 238);
__m512 tmp1195 = _mm512_shuffle_ps(tmp1178, tmp1180, 68);
__m512 tmp1196 = _mm512_shuffle_ps(tmp1178, tmp1180, 238);
// Step 3: immediate 136 gathers 128-bit lanes 0 and 2 of each operand, 221
// gathers lanes 1 and 3. This combines vectors 0..3 with vectors 4..7 of the
// same set.
__m512 tmp1197 = _mm512_shuffle_f32x4(tmp1181, tmp1185, 136);
__m512 tmp1198 = _mm512_shuffle_f32x4(tmp1181, tmp1185, 221);
__m512 tmp1199 = _mm512_shuffle_f32x4(tmp1182, tmp1186, 136);
__m512 tmp1200 = _mm512_shuffle_f32x4(tmp1182, tmp1186, 221);
__m512 tmp1201 = _mm512_shuffle_f32x4(tmp1183, tmp1187, 136);
__m512 tmp1202 = _mm512_shuffle_f32x4(tmp1183, tmp1187, 221);
__m512 tmp1203 = _mm512_shuffle_f32x4(tmp1184, tmp1188, 136);
__m512 tmp1204 = _mm512_shuffle_f32x4(tmp1184, tmp1188, 221);
__m512 tmp1205 = _mm512_shuffle_f32x4(tmp1189, tmp1193, 136);
__m512 tmp1206 = _mm512_shuffle_f32x4(tmp1189, tmp1193, 221);
__m512 tmp1207 = _mm512_shuffle_f32x4(tmp1190, tmp1194, 136);
__m512 tmp1208 = _mm512_shuffle_f32x4(tmp1190, tmp1194, 221);
__m512 tmp1209 = _mm512_shuffle_f32x4(tmp1191, tmp1195, 136);
__m512 tmp1210 = _mm512_shuffle_f32x4(tmp1191, tmp1195, 221);
__m512 tmp1211 = _mm512_shuffle_f32x4(tmp1192, tmp1196, 136);
__m512 tmp1212 = _mm512_shuffle_f32x4(tmp1192, tmp1196, 221);
// Step 4: the same 136 / 221 selection, now pairing a set A intermediate with
// the matching set B intermediate. The 136 result carries source lane c
// (0..7), the 221 result carries source lane 8+c.
in228 = _mm512_shuffle_f32x4(tmp1197, tmp1205, 136);
in236 = _mm512_shuffle_f32x4(tmp1197, tmp1205, 221);
tmp1151 = _mm512_shuffle_f32x4(tmp1199, tmp1207, 136);
tmp1155 = _mm512_shuffle_f32x4(tmp1199, tmp1207, 221);
tmp1152 = _mm512_shuffle_f32x4(tmp1201, tmp1209, 136);
tmp1156 = _mm512_shuffle_f32x4(tmp1201, tmp1209, 221);
in234 = _mm512_shuffle_f32x4(tmp1203, tmp1211, 136);
in242 = _mm512_shuffle_f32x4(tmp1203, tmp1211, 221);
tmp1150 = _mm512_shuffle_f32x4(tmp1198, tmp1206, 136);
tmp1154 = _mm512_shuffle_f32x4(tmp1198, tmp1206, 221);
in230 = _mm512_shuffle_f32x4(tmp1200, tmp1208, 136);
in238 = _mm512_shuffle_f32x4(tmp1200, tmp1208, 221);
in232 = _mm512_shuffle_f32x4(tmp1202, tmp1210, 136);
in240 = _mm512_shuffle_f32x4(tmp1202, tmp1210, 221);
in231 = _mm512_shuffle_f32x4(tmp1204, tmp1212, 136);
in239 = _mm512_shuffle_f32x4(tmp1204, tmp1212, 221);
// Second transform pass: the same 8-point linear combination as the pass
// before the transposition, now taken across the transposed vectors.
// Statements are again interleaved in pairs; the second of each pair repeats
// the arithmetic on in236, tmp1155, tmp1156, in242, tmp1154, in238, in240,
// in239.
//
// Writing d0..d7 for the entry values of
//   in228, tmp1151, tmp1152, in234, tmp1150, in230, in232, in231
// the results are:
//   in228   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp1159 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp1160 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in232   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp1158 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   tmp1152 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   tmp1150 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in234   = -d1 + 5.25*d3 - 5.25*d5 + d7
// (coefficients match the Winograd F(6,3) input-transform matrix).
__m512 tmp1157 = _mm512_add_ps(tmp1151, in230);
__m512 tmp1161 = _mm512_add_ps(tmp1155, in238);
__m512 tmp1158 = _mm512_sub_ps(tmp1150, tmp1152);
__m512 tmp1162 = _mm512_sub_ps(tmp1154, tmp1156);
__m512 tmp1159 = _mm512_add_ps(tmp1152, in232);
__m512 tmp1163 = _mm512_add_ps(tmp1156, in240);
in228 = _mm512_sub_ps(in228, in232);
in236 = _mm512_sub_ps(in236, in240);
tmp1157 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-4.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-4.25e+00f), tmp1161);
tmp1159 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-4.25e+00f), tmp1159);
tmp1163 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-4.25e+00f), tmp1163);
// in228 / in236 are final here (first formula above).
in228 = _mm512_fmadd_ps(tmp1158, _mm512_set1_ps(5.25e+00f), in228);
in236 = _mm512_fmadd_ps(tmp1162, _mm512_set1_ps(5.25e+00f), in236);
tmp1158 = _mm512_fmadd_ps(tmp1152, _mm512_set1_ps(2.5e-01f), in232);
tmp1162 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(2.5e-01f), in240);
tmp1152 = _mm512_fmadd_ps(tmp1152, _mm512_set1_ps(4e+00f), in232);
tmp1156 = _mm512_fmadd_ps(tmp1156, _mm512_set1_ps(4e+00f), in240);
// Difference and sum of the two partial sums give the third and second
// formulas.
__m512 tmp1160 = _mm512_sub_ps(tmp1159, tmp1157);
__m512 tmp1164 = _mm512_sub_ps(tmp1163, tmp1161);
tmp1159 = _mm512_add_ps(tmp1157, tmp1159);
tmp1163 = _mm512_add_ps(tmp1161, tmp1163);
tmp1157 = _mm512_fmadd_ps(tmp1151, _mm512_set1_ps(2.5e-01f), in230);
tmp1161 = _mm512_fmadd_ps(tmp1155, _mm512_set1_ps(2.5e-01f), in238);
tmp1158 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-1.25e+00f), tmp1158);
tmp1162 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-1.25e+00f), tmp1162);
tmp1150 = _mm512_fmadd_ps(tmp1150, _mm512_set1_ps(-5e+00f), tmp1152);
tmp1154 = _mm512_fmadd_ps(tmp1154, _mm512_set1_ps(-5e+00f), tmp1156);
tmp1157 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-1.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-1.25e+00f), tmp1161);
// fmadd / fnmadd with the same operands yield the +/- pair (fourth and fifth
// formulas).
in232 = _mm512_fmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1158);
in240 = _mm512_fmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1162);
tmp1158 = _mm512_fnmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1158);
tmp1162 = _mm512_fnmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1162);
tmp1157 = _mm512_fmadd_ps(in230, _mm512_set1_ps(2.5e-01f), tmp1151);
tmp1161 = _mm512_fmadd_ps(in238, _mm512_set1_ps(2.5e-01f), tmp1155);
tmp1151 = _mm512_sub_ps(in231, tmp1151);
tmp1155 = _mm512_sub_ps(in239, tmp1155);
tmp1157 = _mm512_fmadd_ps(in234, _mm512_set1_ps(-1.25e+00f), tmp1157);
tmp1161 = _mm512_fmadd_ps(in242, _mm512_set1_ps(-1.25e+00f), tmp1161);
in234 = _mm512_sub_ps(in234, in230);
in242 = _mm512_sub_ps(in242, in238);
// in234 / in242 are final here (last formula above).
in234 = _mm512_fmadd_ps(in234, _mm512_set1_ps(5.25e+00f), tmp1151);
in242 = _mm512_fmadd_ps(in242, _mm512_set1_ps(5.25e+00f), tmp1155);
// Second +/- pair (sixth and seventh formulas).
tmp1152 = _mm512_fmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1150);
tmp1156 = _mm512_fmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1154);
tmp1150 = _mm512_fnmadd_ps(tmp1157, _mm512_set1_ps(2e+00f), tmp1150);
tmp1154 = _mm512_fnmadd_ps(tmp1161, _mm512_set1_ps(2e+00f), tmp1154);
// Pack stage: each pair of result vectors is regrouped by 256-bit halves.
// Immediate 68 selects 128-bit lanes 0,1 of both operands, so the output is
// [low half of first operand | low half of second operand]; immediate 238
// selects lanes 2,3, giving [high half | high half].
__m512 out247 = _mm512_shuffle_f32x4(in228, tmp1159, 68);
__m512 out255 = _mm512_shuffle_f32x4(in228, tmp1159, 238);
__m512 out248 = _mm512_shuffle_f32x4(tmp1160, in232, 68);
__m512 out256 = _mm512_shuffle_f32x4(tmp1160, in232, 238);
__m512 out249 = _mm512_shuffle_f32x4(tmp1158, tmp1152, 68);
__m512 out257 = _mm512_shuffle_f32x4(tmp1158, tmp1152, 238);
__m512 out250 = _mm512_shuffle_f32x4(tmp1150, in234, 68);
__m512 out258 = _mm512_shuffle_f32x4(tmp1150, in234, 238);
__m512 out251 = _mm512_shuffle_f32x4(in236, tmp1163, 68);
__m512 out259 = _mm512_shuffle_f32x4(in236, tmp1163, 238);
__m512 out252 = _mm512_shuffle_f32x4(tmp1164, in240, 68);
__m512 out260 = _mm512_shuffle_f32x4(tmp1164, in240, 238);
__m512 out253 = _mm512_shuffle_f32x4(tmp1162, tmp1156, 68);
__m512 out261 = _mm512_shuffle_f32x4(tmp1162, tmp1156, 238);
__m512 out254 = _mm512_shuffle_f32x4(tmp1154, in242, 68);
__m512 out262 = _mm512_shuffle_f32x4(tmp1154, in242, 238);
// Store stage: sixteen unaligned 512-bit stores into dfPtr4. The four source
// pairs land 25600 apart (constant offsets 512, 26112, 51712, 77312). Within
// each group the order at +0, +64, +128, +192 is: low halves of the first
// vector set, low halves of the second set, high halves of the first set,
// high halves of the second set.
// NOTE(review): offsets look like byte offsets (64 per 16-float vector);
// dfPtr4's declaration is outside this excerpt -- confirm.
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k55, out247);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k55, out255);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k55, out251);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k55, out259);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k55, out248);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k55, out256);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k55, out252);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k55, out260);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k55, out249);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k55, out257);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k55, out253);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k55, out261);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k55, out250);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k55, out258);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k55, out254);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k55, out262);
}
++j11;
if (j11 >= 15) break;
rel8 = 3;
}
if (rel8 < 4) {
ptrdiff_t h25 = base8+12;
ptrdiff_t w28 = 0;
ptrdiff_t k56 = 0;
for (; k56 != 2; ++k56) {
// Load stage for the w28 == 0 case: eight rows, one pair of masked loads per
// row, with the constant offset growing by 224 per row (the same factor that
// multiplies h25).
// First load of a pair:  mask 8191 (0x1FFF) keeps lanes 0..12, zeroes 13..15.
// Second load of a pair: mask 16383 (0x3FFF) keeps lanes 0..13, zeroes 14..15;
// its constant offset is 44 larger than the first load's.
// Every loaded vector is passed through max(0, x), clamping negatives to zero.
// NOTE(review): offsets look like byte offsets (4*w28 per float); datPtr5's
// declaration is outside this excerpt -- confirm.
__m512 dat1147 = _mm512_maskz_loadu_ps(8191, datPtr5+4+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1147 = _mm512_max_ps(_mm512_setzero_ps(), dat1147);
__m512 dat1148 = _mm512_maskz_loadu_ps(16383, datPtr5+48+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1148 = _mm512_max_ps(_mm512_setzero_ps(), dat1148);
// pm90 (arguments are listed from lane 15 down to lane 0):
//   result lane 0      <- source lane 15, zeroed by the 8191 mask, so the
//                         first window starts with a zero
//   result lanes 1..7  <- source lanes 0..6
//   result lanes 8..15 <- source lanes 5..12
__m512i pm90 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in244 = _mm512_permutexvar_ps(pm90, dat1147);
// pm91:
//   result lanes 0..7  <- source lanes 0..7
//   result lanes 8..15 <- source lanes 6..13
__m512i pm91 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in252 = _mm512_permutexvar_ps(pm91, dat1148);
// Rows 1..7 repeat the same load / clamp / permute sequence, reusing pm90 for
// the first vector of each pair (in245..in251) and pm91 for the second
// (in253..in259).
__m512 dat1149 = _mm512_maskz_loadu_ps(8191, datPtr5+228+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1149 = _mm512_max_ps(_mm512_setzero_ps(), dat1149);
__m512 dat1150 = _mm512_maskz_loadu_ps(16383, datPtr5+272+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1150 = _mm512_max_ps(_mm512_setzero_ps(), dat1150);
__m512 in245 = _mm512_permutexvar_ps(pm90, dat1149);
__m512 in253 = _mm512_permutexvar_ps(pm91, dat1150);
__m512 dat1151 = _mm512_maskz_loadu_ps(8191, datPtr5+452+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1151 = _mm512_max_ps(_mm512_setzero_ps(), dat1151);
__m512 dat1152 = _mm512_maskz_loadu_ps(16383, datPtr5+496+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1152 = _mm512_max_ps(_mm512_setzero_ps(), dat1152);
__m512 in246 = _mm512_permutexvar_ps(pm90, dat1151);
__m512 in254 = _mm512_permutexvar_ps(pm91, dat1152);
__m512 dat1153 = _mm512_maskz_loadu_ps(8191, datPtr5+676+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1153 = _mm512_max_ps(_mm512_setzero_ps(), dat1153);
__m512 dat1154 = _mm512_maskz_loadu_ps(16383, datPtr5+720+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1154 = _mm512_max_ps(_mm512_setzero_ps(), dat1154);
__m512 in247 = _mm512_permutexvar_ps(pm90, dat1153);
__m512 in255 = _mm512_permutexvar_ps(pm91, dat1154);
__m512 dat1155 = _mm512_maskz_loadu_ps(8191, datPtr5+900+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1155 = _mm512_max_ps(_mm512_setzero_ps(), dat1155);
__m512 dat1156 = _mm512_maskz_loadu_ps(16383, datPtr5+944+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1156 = _mm512_max_ps(_mm512_setzero_ps(), dat1156);
__m512 in248 = _mm512_permutexvar_ps(pm90, dat1155);
__m512 in256 = _mm512_permutexvar_ps(pm91, dat1156);
__m512 dat1157 = _mm512_maskz_loadu_ps(8191, datPtr5+1124+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1157 = _mm512_max_ps(_mm512_setzero_ps(), dat1157);
__m512 dat1158 = _mm512_maskz_loadu_ps(16383, datPtr5+1168+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1158 = _mm512_max_ps(_mm512_setzero_ps(), dat1158);
__m512 in249 = _mm512_permutexvar_ps(pm90, dat1157);
__m512 in257 = _mm512_permutexvar_ps(pm91, dat1158);
__m512 dat1159 = _mm512_maskz_loadu_ps(8191, datPtr5+1348+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1159 = _mm512_max_ps(_mm512_setzero_ps(), dat1159);
__m512 dat1160 = _mm512_maskz_loadu_ps(16383, datPtr5+1392+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1160 = _mm512_max_ps(_mm512_setzero_ps(), dat1160);
__m512 in250 = _mm512_permutexvar_ps(pm90, dat1159);
__m512 in258 = _mm512_permutexvar_ps(pm91, dat1160);
__m512 dat1161 = _mm512_maskz_loadu_ps(8191, datPtr5+1572+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1161 = _mm512_max_ps(_mm512_setzero_ps(), dat1161);
__m512 dat1162 = _mm512_maskz_loadu_ps(16383, datPtr5+1616+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1162 = _mm512_max_ps(_mm512_setzero_ps(), dat1162);
__m512 in251 = _mm512_permutexvar_ps(pm90, dat1161);
__m512 in259 = _mm512_permutexvar_ps(pm91, dat1162);
__m512 tmp1213 = _mm512_add_ps(in245, in249);
__m512 tmp1217 = _mm512_add_ps(in253, in257);
__m512 tmp1214 = _mm512_sub_ps(in248, in246);
__m512 tmp1218 = _mm512_sub_ps(in256, in254);
__m512 tmp1215 = _mm512_add_ps(in246, in250);
__m512 tmp1219 = _mm512_add_ps(in254, in258);
in244 = _mm512_sub_ps(in244, in250);
in252 = _mm512_sub_ps(in252, in258);
tmp1213 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-4.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-4.25e+00f), tmp1217);
tmp1215 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-4.25e+00f), tmp1215);
tmp1219 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-4.25e+00f), tmp1219);
in244 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(5.25e+00f), in244);
in252 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(5.25e+00f), in252);
tmp1214 = _mm512_fmadd_ps(in246, _mm512_set1_ps(2.5e-01f), in250);
tmp1218 = _mm512_fmadd_ps(in254, _mm512_set1_ps(2.5e-01f), in258);
in246 = _mm512_fmadd_ps(in246, _mm512_set1_ps(4e+00f), in250);
in254 = _mm512_fmadd_ps(in254, _mm512_set1_ps(4e+00f), in258);
__m512 tmp1216 = _mm512_sub_ps(tmp1215, tmp1213);
__m512 tmp1220 = _mm512_sub_ps(tmp1219, tmp1217);
tmp1215 = _mm512_add_ps(tmp1213, tmp1215);
tmp1219 = _mm512_add_ps(tmp1217, tmp1219);
tmp1213 = _mm512_fmadd_ps(in245, _mm512_set1_ps(2.5e-01f), in249);
tmp1217 = _mm512_fmadd_ps(in253, _mm512_set1_ps(2.5e-01f), in257);
tmp1214 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-1.25e+00f), tmp1214);
tmp1218 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-1.25e+00f), tmp1218);
in248 = _mm512_fmadd_ps(in248, _mm512_set1_ps(-5e+00f), in246);
in256 = _mm512_fmadd_ps(in256, _mm512_set1_ps(-5e+00f), in254);
tmp1213 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-1.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-1.25e+00f), tmp1217);
in250 = _mm512_fmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), tmp1214);
in258 = _mm512_fmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), tmp1218);
tmp1214 = _mm512_fnmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), tmp1214);
tmp1218 = _mm512_fnmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), tmp1218);
tmp1213 = _mm512_fmadd_ps(in249, _mm512_set1_ps(2.5e-01f), in245);
tmp1217 = _mm512_fmadd_ps(in257, _mm512_set1_ps(2.5e-01f), in253);
in245 = _mm512_sub_ps(in251, in245);
in253 = _mm512_sub_ps(in259, in253);
tmp1213 = _mm512_fmadd_ps(in247, _mm512_set1_ps(-1.25e+00f), tmp1213);
tmp1217 = _mm512_fmadd_ps(in255, _mm512_set1_ps(-1.25e+00f), tmp1217);
in247 = _mm512_sub_ps(in247, in249);
in255 = _mm512_sub_ps(in255, in257);
in247 = _mm512_fmadd_ps(in247, _mm512_set1_ps(5.25e+00f), in245);
in255 = _mm512_fmadd_ps(in255, _mm512_set1_ps(5.25e+00f), in253);
in246 = _mm512_fmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), in248);
in254 = _mm512_fmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), in256);
in248 = _mm512_fnmadd_ps(tmp1213, _mm512_set1_ps(2e+00f), in248);
in256 = _mm512_fnmadd_ps(tmp1217, _mm512_set1_ps(2e+00f), in256);
__m512 tmp1229 = _mm512_unpacklo_ps(in244, tmp1215);
__m512 tmp1230 = _mm512_unpackhi_ps(in244, tmp1215);
__m512 tmp1231 = _mm512_unpacklo_ps(tmp1216, in250);
__m512 tmp1232 = _mm512_unpackhi_ps(tmp1216, in250);
__m512 tmp1233 = _mm512_unpacklo_ps(tmp1214, in246);
__m512 tmp1234 = _mm512_unpackhi_ps(tmp1214, in246);
__m512 tmp1235 = _mm512_unpacklo_ps(in248, in247);
__m512 tmp1236 = _mm512_unpackhi_ps(in248, in247);
__m512 tmp1237 = _mm512_unpacklo_ps(in252, tmp1219);
__m512 tmp1238 = _mm512_unpackhi_ps(in252, tmp1219);
__m512 tmp1239 = _mm512_unpacklo_ps(tmp1220, in258);
__m512 tmp1240 = _mm512_unpackhi_ps(tmp1220, in258);
__m512 tmp1241 = _mm512_unpacklo_ps(tmp1218, in254);
__m512 tmp1242 = _mm512_unpackhi_ps(tmp1218, in254);
__m512 tmp1243 = _mm512_unpacklo_ps(in256, in255);
__m512 tmp1244 = _mm512_unpackhi_ps(in256, in255);
__m512 tmp1245 = _mm512_shuffle_ps(tmp1229, tmp1231, 68);
// Remaining stages of a register shuffle network; its earlier stages
// (tmp1229..tmp1245) precede this point in the file.
// _mm512_shuffle_ps: imm 68 keeps elements 0-1 of each operand within every
// 128-bit lane, imm 238 keeps elements 2-3.
__m512 tmp1246 = _mm512_shuffle_ps(tmp1229, tmp1231, 238);
__m512 tmp1247 = _mm512_shuffle_ps(tmp1230, tmp1232, 68);
__m512 tmp1248 = _mm512_shuffle_ps(tmp1230, tmp1232, 238);
__m512 tmp1249 = _mm512_shuffle_ps(tmp1233, tmp1235, 68);
__m512 tmp1250 = _mm512_shuffle_ps(tmp1233, tmp1235, 238);
__m512 tmp1251 = _mm512_shuffle_ps(tmp1234, tmp1236, 68);
__m512 tmp1252 = _mm512_shuffle_ps(tmp1234, tmp1236, 238);
__m512 tmp1253 = _mm512_shuffle_ps(tmp1237, tmp1239, 68);
__m512 tmp1254 = _mm512_shuffle_ps(tmp1237, tmp1239, 238);
__m512 tmp1255 = _mm512_shuffle_ps(tmp1238, tmp1240, 68);
__m512 tmp1256 = _mm512_shuffle_ps(tmp1238, tmp1240, 238);
__m512 tmp1257 = _mm512_shuffle_ps(tmp1241, tmp1243, 68);
__m512 tmp1258 = _mm512_shuffle_ps(tmp1241, tmp1243, 238);
__m512 tmp1259 = _mm512_shuffle_ps(tmp1242, tmp1244, 68);
__m512 tmp1260 = _mm512_shuffle_ps(tmp1242, tmp1244, 238);
// _mm512_shuffle_f32x4: imm 136 gathers 128-bit lanes 0 and 2 of each operand,
// imm 221 gathers lanes 1 and 3.
__m512 tmp1261 = _mm512_shuffle_f32x4(tmp1245, tmp1249, 136);
__m512 tmp1262 = _mm512_shuffle_f32x4(tmp1245, tmp1249, 221);
__m512 tmp1263 = _mm512_shuffle_f32x4(tmp1246, tmp1250, 136);
__m512 tmp1264 = _mm512_shuffle_f32x4(tmp1246, tmp1250, 221);
__m512 tmp1265 = _mm512_shuffle_f32x4(tmp1247, tmp1251, 136);
__m512 tmp1266 = _mm512_shuffle_f32x4(tmp1247, tmp1251, 221);
__m512 tmp1267 = _mm512_shuffle_f32x4(tmp1248, tmp1252, 136);
__m512 tmp1268 = _mm512_shuffle_f32x4(tmp1248, tmp1252, 221);
__m512 tmp1269 = _mm512_shuffle_f32x4(tmp1253, tmp1257, 136);
__m512 tmp1270 = _mm512_shuffle_f32x4(tmp1253, tmp1257, 221);
__m512 tmp1271 = _mm512_shuffle_f32x4(tmp1254, tmp1258, 136);
__m512 tmp1272 = _mm512_shuffle_f32x4(tmp1254, tmp1258, 221);
__m512 tmp1273 = _mm512_shuffle_f32x4(tmp1255, tmp1259, 136);
__m512 tmp1274 = _mm512_shuffle_f32x4(tmp1255, tmp1259, 221);
__m512 tmp1275 = _mm512_shuffle_f32x4(tmp1256, tmp1260, 136);
__m512 tmp1276 = _mm512_shuffle_f32x4(tmp1256, tmp1260, 221);
// Last lane gather. The results are written back into previously declared
// variables (in244.., tmp1214..) instead of fresh temporaries.
in244 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 136);
in252 = _mm512_shuffle_f32x4(tmp1261, tmp1269, 221);
tmp1215 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 136);
tmp1219 = _mm512_shuffle_f32x4(tmp1263, tmp1271, 221);
tmp1216 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 136);
tmp1220 = _mm512_shuffle_f32x4(tmp1265, tmp1273, 221);
in250 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 136);
in258 = _mm512_shuffle_f32x4(tmp1267, tmp1275, 221);
tmp1214 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 136);
tmp1218 = _mm512_shuffle_f32x4(tmp1262, tmp1270, 221);
in246 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 136);
in254 = _mm512_shuffle_f32x4(tmp1264, tmp1272, 221);
in248 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 136);
in256 = _mm512_shuffle_f32x4(tmp1266, tmp1274, 221);
in247 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 136);
in255 = _mm512_shuffle_f32x4(tmp1268, tmp1276, 221);
// 8-point linear transform, applied lane-wise to two independent sets of eight
// vectors; the statements alternate between the two sets.
// Set 1 inputs  y0..y7 = in244, tmp1215, tmp1216, in250, tmp1214, in246, in248, in247
// Set 2 inputs  y0..y7 = in252, tmp1219, tmp1220, in258, tmp1218, in254, in256, in255
// Algebraically (ignoring FMA rounding) the results are:
//   r0 =  y0 - 5.25*y2 + 5.25*y4 - y6
//   r1 =  y1 + y2 - 4.25*y3 - 4.25*y4 + y5 + y6
//   r2 = -y1 + y2 + 4.25*y3 - 4.25*y4 - y5 + y6
//   r3 =  0.5*y1 + 0.25*y2 - 2.5*y3 - 1.25*y4 + 2*y5 + y6
//   r4 = -0.5*y1 + 0.25*y2 + 2.5*y3 - 1.25*y4 - 2*y5 + y6
//   r5 =  2*y1 + 4*y2 - 2.5*y3 - 5*y4 + 0.5*y5 + y6
//   r6 = -2*y1 + 4*y2 + 2.5*y3 - 5*y4 - 0.5*y5 + y6
//   r7 = -y1 + 5.25*y3 - 5.25*y5 + y7
// These coefficients match the Winograd F(6,3) input-transform matrix.
// Set 1 results r0..r7 end up in: in244, tmp1223, tmp1224, in248, tmp1222, tmp1216, tmp1214, in250
// Set 2 results r0..r7 end up in: in252, tmp1227, tmp1228, in256, tmp1226, tmp1220, tmp1218, in258
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1221 = _mm512_add_ps(tmp1215, in246);
__m512 tmp1225 = _mm512_add_ps(tmp1219, in254);
__m512 tmp1222 = _mm512_sub_ps(tmp1214, tmp1216);
__m512 tmp1226 = _mm512_sub_ps(tmp1218, tmp1220);
__m512 tmp1223 = _mm512_add_ps(tmp1216, in248);
__m512 tmp1227 = _mm512_add_ps(tmp1220, in256);
in244 = _mm512_sub_ps(in244, in248);
in252 = _mm512_sub_ps(in252, in256);
tmp1221 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-4.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-4.25e+00f), tmp1225);
tmp1223 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-4.25e+00f), tmp1223);
tmp1227 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-4.25e+00f), tmp1227);
// r0 complete (in244 / in252).
in244 = _mm512_fmadd_ps(tmp1222, _mm512_set1_ps(5.25e+00f), in244);
in252 = _mm512_fmadd_ps(tmp1226, _mm512_set1_ps(5.25e+00f), in252);
tmp1222 = _mm512_fmadd_ps(tmp1216, _mm512_set1_ps(2.5e-01f), in248);
tmp1226 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(2.5e-01f), in256);
tmp1216 = _mm512_fmadd_ps(tmp1216, _mm512_set1_ps(4e+00f), in248);
tmp1220 = _mm512_fmadd_ps(tmp1220, _mm512_set1_ps(4e+00f), in256);
// r2 (difference) and r1 (sum) of the odd-index and even-index partial sums.
__m512 tmp1224 = _mm512_sub_ps(tmp1223, tmp1221);
__m512 tmp1228 = _mm512_sub_ps(tmp1227, tmp1225);
tmp1223 = _mm512_add_ps(tmp1221, tmp1223);
tmp1227 = _mm512_add_ps(tmp1225, tmp1227);
tmp1221 = _mm512_fmadd_ps(tmp1215, _mm512_set1_ps(2.5e-01f), in246);
tmp1225 = _mm512_fmadd_ps(tmp1219, _mm512_set1_ps(2.5e-01f), in254);
tmp1222 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-1.25e+00f), tmp1222);
tmp1226 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-1.25e+00f), tmp1226);
tmp1214 = _mm512_fmadd_ps(tmp1214, _mm512_set1_ps(-5e+00f), tmp1216);
tmp1218 = _mm512_fmadd_ps(tmp1218, _mm512_set1_ps(-5e+00f), tmp1220);
tmp1221 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-1.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-1.25e+00f), tmp1225);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in248 = _mm512_fmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1222);
in256 = _mm512_fmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1226);
tmp1222 = _mm512_fnmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1222);
tmp1226 = _mm512_fnmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1226);
tmp1221 = _mm512_fmadd_ps(in246, _mm512_set1_ps(2.5e-01f), tmp1215);
tmp1225 = _mm512_fmadd_ps(in254, _mm512_set1_ps(2.5e-01f), tmp1219);
tmp1215 = _mm512_sub_ps(in247, tmp1215);
tmp1219 = _mm512_sub_ps(in255, tmp1219);
tmp1221 = _mm512_fmadd_ps(in250, _mm512_set1_ps(-1.25e+00f), tmp1221);
tmp1225 = _mm512_fmadd_ps(in258, _mm512_set1_ps(-1.25e+00f), tmp1225);
in250 = _mm512_sub_ps(in250, in246);
in258 = _mm512_sub_ps(in258, in254);
// r7 complete (in250 / in258).
in250 = _mm512_fmadd_ps(in250, _mm512_set1_ps(5.25e+00f), tmp1215);
in258 = _mm512_fmadd_ps(in258, _mm512_set1_ps(5.25e+00f), tmp1219);
// r5 (fmadd) and r6 (fnmadd).
tmp1216 = _mm512_fmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1214);
tmp1220 = _mm512_fmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1218);
tmp1214 = _mm512_fnmadd_ps(tmp1221, _mm512_set1_ps(2e+00f), tmp1214);
tmp1218 = _mm512_fnmadd_ps(tmp1225, _mm512_set1_ps(2e+00f), tmp1218);
// Pack pairs of result vectors for storing. _mm512_shuffle_f32x4 with imm 68
// concatenates the low 256 bits of both operands; imm 238 concatenates the
// high 256 bits of both operands.
__m512 out263 = _mm512_shuffle_f32x4(in244, tmp1223, 68);
__m512 out271 = _mm512_shuffle_f32x4(in244, tmp1223, 238);
__m512 out264 = _mm512_shuffle_f32x4(tmp1224, in248, 68);
__m512 out272 = _mm512_shuffle_f32x4(tmp1224, in248, 238);
__m512 out265 = _mm512_shuffle_f32x4(tmp1222, tmp1216, 68);
__m512 out273 = _mm512_shuffle_f32x4(tmp1222, tmp1216, 238);
__m512 out266 = _mm512_shuffle_f32x4(tmp1214, in250, 68);
__m512 out274 = _mm512_shuffle_f32x4(tmp1214, in250, 238);
__m512 out267 = _mm512_shuffle_f32x4(in252, tmp1227, 68);
__m512 out275 = _mm512_shuffle_f32x4(in252, tmp1227, 238);
__m512 out268 = _mm512_shuffle_f32x4(tmp1228, in256, 68);
__m512 out276 = _mm512_shuffle_f32x4(tmp1228, in256, 238);
__m512 out269 = _mm512_shuffle_f32x4(tmp1226, tmp1220, 68);
__m512 out277 = _mm512_shuffle_f32x4(tmp1226, tmp1220, 238);
__m512 out270 = _mm512_shuffle_f32x4(tmp1218, in258, 68);
__m512 out278 = _mm512_shuffle_f32x4(tmp1218, in258, 238);
// Sixteen unaligned 512-bit stores into dfPtr4: four groups whose constant
// offsets are 25600 apart, each group covering relative offsets 0/64/128/192.
// NOTE(review): offsets are presumably bytes (64 = one __m512); dfPtr4's
// pointer type is declared outside this excerpt - confirm.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k56, out263);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k56, out271);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k56, out267);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k56, out275);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k56, out264);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k56, out272);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k56, out268);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k56, out276);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k56, out265);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k56, out273);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k56, out269);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k56, out277);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k56, out266);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k56, out274);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k56, out270);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k56, out278);
// Load eight consecutive rows (constant offset grows by 224 per row) from two
// source regions of datPtr5, clamp negatives to zero (ReLU via max with 0),
// and rearrange each row into two overlapping 8-element windows.
//   Region 1 (dat1163, dat1165, ...): mask 16383 loads lanes 0-13, rest zeroed.
//   Region 2 (dat1164, dat1166, ...): mask 8191 loads lanes 0-12, rest zeroed.
// NOTE(review): offsets are presumably bytes (224 = 56 floats per row);
// datPtr5's pointer type is declared outside this excerpt - confirm.
__m512 dat1163 = _mm512_maskz_loadu_ps(16383, datPtr5+96+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1163 = _mm512_max_ps(_mm512_setzero_ps(), dat1163);
__m512 dat1164 = _mm512_maskz_loadu_ps(8191, datPtr5+12612+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1164 = _mm512_max_ps(_mm512_setzero_ps(), dat1164);
// pm92: low half <- source lanes 0-7, high half <- source lanes 6-13.
__m512i pm92 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in260 = _mm512_permutexvar_ps(pm92, dat1163);
// pm93: low half <- source lane 15 (zero, because the load mask clears it)
// followed by lanes 0-6; high half <- source lanes 5-12. The leading zero
// presumably acts as left-edge padding - confirm.
__m512i pm93 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in268 = _mm512_permutexvar_ps(pm93, dat1164);
__m512 dat1165 = _mm512_maskz_loadu_ps(16383, datPtr5+320+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1165 = _mm512_max_ps(_mm512_setzero_ps(), dat1165);
__m512 dat1166 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1166 = _mm512_max_ps(_mm512_setzero_ps(), dat1166);
__m512 in261 = _mm512_permutexvar_ps(pm92, dat1165);
__m512 in269 = _mm512_permutexvar_ps(pm93, dat1166);
__m512 dat1167 = _mm512_maskz_loadu_ps(16383, datPtr5+544+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1167 = _mm512_max_ps(_mm512_setzero_ps(), dat1167);
__m512 dat1168 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1168 = _mm512_max_ps(_mm512_setzero_ps(), dat1168);
__m512 in262 = _mm512_permutexvar_ps(pm92, dat1167);
__m512 in270 = _mm512_permutexvar_ps(pm93, dat1168);
__m512 dat1169 = _mm512_maskz_loadu_ps(16383, datPtr5+768+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1169 = _mm512_max_ps(_mm512_setzero_ps(), dat1169);
__m512 dat1170 = _mm512_maskz_loadu_ps(8191, datPtr5+13284+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1170 = _mm512_max_ps(_mm512_setzero_ps(), dat1170);
__m512 in263 = _mm512_permutexvar_ps(pm92, dat1169);
__m512 in271 = _mm512_permutexvar_ps(pm93, dat1170);
__m512 dat1171 = _mm512_maskz_loadu_ps(16383, datPtr5+992+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1171 = _mm512_max_ps(_mm512_setzero_ps(), dat1171);
__m512 dat1172 = _mm512_maskz_loadu_ps(8191, datPtr5+13508+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1172 = _mm512_max_ps(_mm512_setzero_ps(), dat1172);
__m512 in264 = _mm512_permutexvar_ps(pm92, dat1171);
__m512 in272 = _mm512_permutexvar_ps(pm93, dat1172);
__m512 dat1173 = _mm512_maskz_loadu_ps(16383, datPtr5+1216+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1173 = _mm512_max_ps(_mm512_setzero_ps(), dat1173);
__m512 dat1174 = _mm512_maskz_loadu_ps(8191, datPtr5+13732+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1174 = _mm512_max_ps(_mm512_setzero_ps(), dat1174);
__m512 in265 = _mm512_permutexvar_ps(pm92, dat1173);
__m512 in273 = _mm512_permutexvar_ps(pm93, dat1174);
__m512 dat1175 = _mm512_maskz_loadu_ps(16383, datPtr5+1440+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1175 = _mm512_max_ps(_mm512_setzero_ps(), dat1175);
__m512 dat1176 = _mm512_maskz_loadu_ps(8191, datPtr5+13956+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1176 = _mm512_max_ps(_mm512_setzero_ps(), dat1176);
__m512 in266 = _mm512_permutexvar_ps(pm92, dat1175);
__m512 in274 = _mm512_permutexvar_ps(pm93, dat1176);
__m512 dat1177 = _mm512_maskz_loadu_ps(16383, datPtr5+1664+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1177 = _mm512_max_ps(_mm512_setzero_ps(), dat1177);
__m512 dat1178 = _mm512_maskz_loadu_ps(8191, datPtr5+14180+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1178 = _mm512_max_ps(_mm512_setzero_ps(), dat1178);
__m512 in267 = _mm512_permutexvar_ps(pm92, dat1177);
__m512 in275 = _mm512_permutexvar_ps(pm93, dat1178);
// First 8-point linear transform pass, combining the eight row vectors
// lane-wise. Two independent sets are processed with alternating statements.
// Set 1 inputs  x0..x7 = in260..in267
// Set 2 inputs  x0..x7 = in268..in275
// Algebraically (ignoring FMA rounding) the results are:
//   r0 =  x0 - 5.25*x2 + 5.25*x4 - x6
//   r1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   r2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   r3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   r4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   r5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   r6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   r7 = -x1 + 5.25*x3 - 5.25*x5 + x7
// These coefficients match the Winograd F(6,3) input-transform matrix.
// Set 1 results r0..r7 end up in: in260, tmp1279, tmp1280, in266, tmp1278, in262, in264, in263
// Set 2 results r0..r7 end up in: in268, tmp1283, tmp1284, in274, tmp1282, in270, in272, in271
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1277 = _mm512_add_ps(in261, in265);
__m512 tmp1281 = _mm512_add_ps(in269, in273);
__m512 tmp1278 = _mm512_sub_ps(in264, in262);
__m512 tmp1282 = _mm512_sub_ps(in272, in270);
__m512 tmp1279 = _mm512_add_ps(in262, in266);
__m512 tmp1283 = _mm512_add_ps(in270, in274);
in260 = _mm512_sub_ps(in260, in266);
in268 = _mm512_sub_ps(in268, in274);
tmp1277 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-4.25e+00f), tmp1277);
tmp1281 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-4.25e+00f), tmp1281);
tmp1279 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-4.25e+00f), tmp1279);
tmp1283 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-4.25e+00f), tmp1283);
// r0 complete (in260 / in268).
in260 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(5.25e+00f), in260);
in268 = _mm512_fmadd_ps(tmp1282, _mm512_set1_ps(5.25e+00f), in268);
tmp1278 = _mm512_fmadd_ps(in262, _mm512_set1_ps(2.5e-01f), in266);
tmp1282 = _mm512_fmadd_ps(in270, _mm512_set1_ps(2.5e-01f), in274);
in262 = _mm512_fmadd_ps(in262, _mm512_set1_ps(4e+00f), in266);
in270 = _mm512_fmadd_ps(in270, _mm512_set1_ps(4e+00f), in274);
// r2 (difference) and r1 (sum) of the odd-index and even-index partial sums.
__m512 tmp1280 = _mm512_sub_ps(tmp1279, tmp1277);
__m512 tmp1284 = _mm512_sub_ps(tmp1283, tmp1281);
tmp1279 = _mm512_add_ps(tmp1277, tmp1279);
tmp1283 = _mm512_add_ps(tmp1281, tmp1283);
tmp1277 = _mm512_fmadd_ps(in261, _mm512_set1_ps(2.5e-01f), in265);
tmp1281 = _mm512_fmadd_ps(in269, _mm512_set1_ps(2.5e-01f), in273);
tmp1278 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-1.25e+00f), tmp1278);
tmp1282 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-1.25e+00f), tmp1282);
in264 = _mm512_fmadd_ps(in264, _mm512_set1_ps(-5e+00f), in262);
in272 = _mm512_fmadd_ps(in272, _mm512_set1_ps(-5e+00f), in270);
tmp1277 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-1.25e+00f), tmp1277);
tmp1281 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-1.25e+00f), tmp1281);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in266 = _mm512_fmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), tmp1278);
in274 = _mm512_fmadd_ps(tmp1281, _mm512_set1_ps(2e+00f), tmp1282);
tmp1278 = _mm512_fnmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), tmp1278);
tmp1282 = _mm512_fnmadd_ps(tmp1281, _mm512_set1_ps(2e+00f), tmp1282);
tmp1277 = _mm512_fmadd_ps(in265, _mm512_set1_ps(2.5e-01f), in261);
tmp1281 = _mm512_fmadd_ps(in273, _mm512_set1_ps(2.5e-01f), in269);
in261 = _mm512_sub_ps(in267, in261);
in269 = _mm512_sub_ps(in275, in269);
tmp1277 = _mm512_fmadd_ps(in263, _mm512_set1_ps(-1.25e+00f), tmp1277);
tmp1281 = _mm512_fmadd_ps(in271, _mm512_set1_ps(-1.25e+00f), tmp1281);
in263 = _mm512_sub_ps(in263, in265);
in271 = _mm512_sub_ps(in271, in273);
// r7 complete (in263 / in271).
in263 = _mm512_fmadd_ps(in263, _mm512_set1_ps(5.25e+00f), in261);
in271 = _mm512_fmadd_ps(in271, _mm512_set1_ps(5.25e+00f), in269);
// r5 (fmadd) and r6 (fnmadd).
in262 = _mm512_fmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), in264);
in270 = _mm512_fmadd_ps(tmp1281, _mm512_set1_ps(2e+00f), in272);
in264 = _mm512_fnmadd_ps(tmp1277, _mm512_set1_ps(2e+00f), in264);
in272 = _mm512_fnmadd_ps(tmp1281, _mm512_set1_ps(2e+00f), in272);
// Four-stage 16x16 float transpose of the sixteen vectors
//   in260, tmp1279, tmp1280, in266, tmp1278, in262, in264, in263   (rows 0-7)
//   in268, tmp1283, tmp1284, in274, tmp1282, in270, in272, in271   (rows 8-15)
// After the last stage, destination k holds lane k of every source row:
// the first list above receives lanes 0-7, the second list lanes 8-15.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp1293 = _mm512_unpacklo_ps(in260, tmp1279);
__m512 tmp1294 = _mm512_unpackhi_ps(in260, tmp1279);
__m512 tmp1295 = _mm512_unpacklo_ps(tmp1280, in266);
__m512 tmp1296 = _mm512_unpackhi_ps(tmp1280, in266);
__m512 tmp1297 = _mm512_unpacklo_ps(tmp1278, in262);
__m512 tmp1298 = _mm512_unpackhi_ps(tmp1278, in262);
__m512 tmp1299 = _mm512_unpacklo_ps(in264, in263);
__m512 tmp1300 = _mm512_unpackhi_ps(in264, in263);
__m512 tmp1301 = _mm512_unpacklo_ps(in268, tmp1283);
__m512 tmp1302 = _mm512_unpackhi_ps(in268, tmp1283);
__m512 tmp1303 = _mm512_unpacklo_ps(tmp1284, in274);
__m512 tmp1304 = _mm512_unpackhi_ps(tmp1284, in274);
__m512 tmp1305 = _mm512_unpacklo_ps(tmp1282, in270);
__m512 tmp1306 = _mm512_unpackhi_ps(tmp1282, in270);
__m512 tmp1307 = _mm512_unpacklo_ps(in272, in271);
__m512 tmp1308 = _mm512_unpackhi_ps(in272, in271);
// Stage 2: combine 64-bit pairs within each 128-bit lane
// (imm 68 = elements 0-1 of each operand, imm 238 = elements 2-3).
__m512 tmp1309 = _mm512_shuffle_ps(tmp1293, tmp1295, 68);
__m512 tmp1310 = _mm512_shuffle_ps(tmp1293, tmp1295, 238);
__m512 tmp1311 = _mm512_shuffle_ps(tmp1294, tmp1296, 68);
__m512 tmp1312 = _mm512_shuffle_ps(tmp1294, tmp1296, 238);
__m512 tmp1313 = _mm512_shuffle_ps(tmp1297, tmp1299, 68);
__m512 tmp1314 = _mm512_shuffle_ps(tmp1297, tmp1299, 238);
__m512 tmp1315 = _mm512_shuffle_ps(tmp1298, tmp1300, 68);
__m512 tmp1316 = _mm512_shuffle_ps(tmp1298, tmp1300, 238);
__m512 tmp1317 = _mm512_shuffle_ps(tmp1301, tmp1303, 68);
__m512 tmp1318 = _mm512_shuffle_ps(tmp1301, tmp1303, 238);
__m512 tmp1319 = _mm512_shuffle_ps(tmp1302, tmp1304, 68);
__m512 tmp1320 = _mm512_shuffle_ps(tmp1302, tmp1304, 238);
__m512 tmp1321 = _mm512_shuffle_ps(tmp1305, tmp1307, 68);
__m512 tmp1322 = _mm512_shuffle_ps(tmp1305, tmp1307, 238);
__m512 tmp1323 = _mm512_shuffle_ps(tmp1306, tmp1308, 68);
__m512 tmp1324 = _mm512_shuffle_ps(tmp1306, tmp1308, 238);
// Stage 3: gather whole 128-bit lanes
// (imm 136 = lanes 0,2 of each operand, imm 221 = lanes 1,3).
__m512 tmp1325 = _mm512_shuffle_f32x4(tmp1309, tmp1313, 136);
__m512 tmp1326 = _mm512_shuffle_f32x4(tmp1309, tmp1313, 221);
__m512 tmp1327 = _mm512_shuffle_f32x4(tmp1310, tmp1314, 136);
__m512 tmp1328 = _mm512_shuffle_f32x4(tmp1310, tmp1314, 221);
__m512 tmp1329 = _mm512_shuffle_f32x4(tmp1311, tmp1315, 136);
__m512 tmp1330 = _mm512_shuffle_f32x4(tmp1311, tmp1315, 221);
__m512 tmp1331 = _mm512_shuffle_f32x4(tmp1312, tmp1316, 136);
__m512 tmp1332 = _mm512_shuffle_f32x4(tmp1312, tmp1316, 221);
__m512 tmp1333 = _mm512_shuffle_f32x4(tmp1317, tmp1321, 136);
__m512 tmp1334 = _mm512_shuffle_f32x4(tmp1317, tmp1321, 221);
__m512 tmp1335 = _mm512_shuffle_f32x4(tmp1318, tmp1322, 136);
__m512 tmp1336 = _mm512_shuffle_f32x4(tmp1318, tmp1322, 221);
__m512 tmp1337 = _mm512_shuffle_f32x4(tmp1319, tmp1323, 136);
__m512 tmp1338 = _mm512_shuffle_f32x4(tmp1319, tmp1323, 221);
__m512 tmp1339 = _mm512_shuffle_f32x4(tmp1320, tmp1324, 136);
__m512 tmp1340 = _mm512_shuffle_f32x4(tmp1320, tmp1324, 221);
// Stage 4: final lane gather, written back into the source variables.
in260 = _mm512_shuffle_f32x4(tmp1325, tmp1333, 136);
in268 = _mm512_shuffle_f32x4(tmp1325, tmp1333, 221);
tmp1279 = _mm512_shuffle_f32x4(tmp1327, tmp1335, 136);
tmp1283 = _mm512_shuffle_f32x4(tmp1327, tmp1335, 221);
tmp1280 = _mm512_shuffle_f32x4(tmp1329, tmp1337, 136);
tmp1284 = _mm512_shuffle_f32x4(tmp1329, tmp1337, 221);
in266 = _mm512_shuffle_f32x4(tmp1331, tmp1339, 136);
in274 = _mm512_shuffle_f32x4(tmp1331, tmp1339, 221);
tmp1278 = _mm512_shuffle_f32x4(tmp1326, tmp1334, 136);
tmp1282 = _mm512_shuffle_f32x4(tmp1326, tmp1334, 221);
in262 = _mm512_shuffle_f32x4(tmp1328, tmp1336, 136);
in270 = _mm512_shuffle_f32x4(tmp1328, tmp1336, 221);
in264 = _mm512_shuffle_f32x4(tmp1330, tmp1338, 136);
in272 = _mm512_shuffle_f32x4(tmp1330, tmp1338, 221);
in263 = _mm512_shuffle_f32x4(tmp1332, tmp1340, 136);
in271 = _mm512_shuffle_f32x4(tmp1332, tmp1340, 221);
// Second 8-point linear transform pass, applied lane-wise to two independent
// sets of eight vectors; the statements alternate between the two sets.
// Set 1 inputs  y0..y7 = in260, tmp1279, tmp1280, in266, tmp1278, in262, in264, in263
// Set 2 inputs  y0..y7 = in268, tmp1283, tmp1284, in274, tmp1282, in270, in272, in271
// Algebraically (ignoring FMA rounding) the results are:
//   r0 =  y0 - 5.25*y2 + 5.25*y4 - y6
//   r1 =  y1 + y2 - 4.25*y3 - 4.25*y4 + y5 + y6
//   r2 = -y1 + y2 + 4.25*y3 - 4.25*y4 - y5 + y6
//   r3 =  0.5*y1 + 0.25*y2 - 2.5*y3 - 1.25*y4 + 2*y5 + y6
//   r4 = -0.5*y1 + 0.25*y2 + 2.5*y3 - 1.25*y4 - 2*y5 + y6
//   r5 =  2*y1 + 4*y2 - 2.5*y3 - 5*y4 + 0.5*y5 + y6
//   r6 = -2*y1 + 4*y2 + 2.5*y3 - 5*y4 - 0.5*y5 + y6
//   r7 = -y1 + 5.25*y3 - 5.25*y5 + y7
// These coefficients match the Winograd F(6,3) input-transform matrix.
// Set 1 results r0..r7 end up in: in260, tmp1287, tmp1288, in264, tmp1286, tmp1280, tmp1278, in266
// Set 2 results r0..r7 end up in: in268, tmp1291, tmp1292, in272, tmp1290, tmp1284, tmp1282, in274
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1285 = _mm512_add_ps(tmp1279, in262);
__m512 tmp1289 = _mm512_add_ps(tmp1283, in270);
__m512 tmp1286 = _mm512_sub_ps(tmp1278, tmp1280);
__m512 tmp1290 = _mm512_sub_ps(tmp1282, tmp1284);
__m512 tmp1287 = _mm512_add_ps(tmp1280, in264);
__m512 tmp1291 = _mm512_add_ps(tmp1284, in272);
in260 = _mm512_sub_ps(in260, in264);
in268 = _mm512_sub_ps(in268, in272);
tmp1285 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-4.25e+00f), tmp1285);
tmp1289 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-4.25e+00f), tmp1289);
tmp1287 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-4.25e+00f), tmp1287);
tmp1291 = _mm512_fmadd_ps(tmp1282, _mm512_set1_ps(-4.25e+00f), tmp1291);
// r0 complete (in260 / in268).
in260 = _mm512_fmadd_ps(tmp1286, _mm512_set1_ps(5.25e+00f), in260);
in268 = _mm512_fmadd_ps(tmp1290, _mm512_set1_ps(5.25e+00f), in268);
tmp1286 = _mm512_fmadd_ps(tmp1280, _mm512_set1_ps(2.5e-01f), in264);
tmp1290 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(2.5e-01f), in272);
tmp1280 = _mm512_fmadd_ps(tmp1280, _mm512_set1_ps(4e+00f), in264);
tmp1284 = _mm512_fmadd_ps(tmp1284, _mm512_set1_ps(4e+00f), in272);
// r2 (difference) and r1 (sum) of the odd-index and even-index partial sums.
__m512 tmp1288 = _mm512_sub_ps(tmp1287, tmp1285);
__m512 tmp1292 = _mm512_sub_ps(tmp1291, tmp1289);
tmp1287 = _mm512_add_ps(tmp1285, tmp1287);
tmp1291 = _mm512_add_ps(tmp1289, tmp1291);
tmp1285 = _mm512_fmadd_ps(tmp1279, _mm512_set1_ps(2.5e-01f), in262);
tmp1289 = _mm512_fmadd_ps(tmp1283, _mm512_set1_ps(2.5e-01f), in270);
tmp1286 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-1.25e+00f), tmp1286);
tmp1290 = _mm512_fmadd_ps(tmp1282, _mm512_set1_ps(-1.25e+00f), tmp1290);
tmp1278 = _mm512_fmadd_ps(tmp1278, _mm512_set1_ps(-5e+00f), tmp1280);
tmp1282 = _mm512_fmadd_ps(tmp1282, _mm512_set1_ps(-5e+00f), tmp1284);
tmp1285 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-1.25e+00f), tmp1285);
tmp1289 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-1.25e+00f), tmp1289);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in264 = _mm512_fmadd_ps(tmp1285, _mm512_set1_ps(2e+00f), tmp1286);
in272 = _mm512_fmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1290);
tmp1286 = _mm512_fnmadd_ps(tmp1285, _mm512_set1_ps(2e+00f), tmp1286);
tmp1290 = _mm512_fnmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1290);
tmp1285 = _mm512_fmadd_ps(in262, _mm512_set1_ps(2.5e-01f), tmp1279);
tmp1289 = _mm512_fmadd_ps(in270, _mm512_set1_ps(2.5e-01f), tmp1283);
tmp1279 = _mm512_sub_ps(in263, tmp1279);
tmp1283 = _mm512_sub_ps(in271, tmp1283);
tmp1285 = _mm512_fmadd_ps(in266, _mm512_set1_ps(-1.25e+00f), tmp1285);
tmp1289 = _mm512_fmadd_ps(in274, _mm512_set1_ps(-1.25e+00f), tmp1289);
in266 = _mm512_sub_ps(in266, in262);
in274 = _mm512_sub_ps(in274, in270);
// r7 complete (in266 / in274).
in266 = _mm512_fmadd_ps(in266, _mm512_set1_ps(5.25e+00f), tmp1279);
in274 = _mm512_fmadd_ps(in274, _mm512_set1_ps(5.25e+00f), tmp1283);
// r5 (fmadd) and r6 (fnmadd).
tmp1280 = _mm512_fmadd_ps(tmp1285, _mm512_set1_ps(2e+00f), tmp1278);
tmp1284 = _mm512_fmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1282);
tmp1278 = _mm512_fnmadd_ps(tmp1285, _mm512_set1_ps(2e+00f), tmp1278);
tmp1282 = _mm512_fnmadd_ps(tmp1289, _mm512_set1_ps(2e+00f), tmp1282);
// Pack pairs of result vectors for storing. _mm512_shuffle_f32x4 with imm 68
// concatenates the low 256 bits of both operands; imm 238 concatenates the
// high 256 bits of both operands.
__m512 out279 = _mm512_shuffle_f32x4(in260, tmp1287, 68);
__m512 out287 = _mm512_shuffle_f32x4(in260, tmp1287, 238);
__m512 out280 = _mm512_shuffle_f32x4(tmp1288, in264, 68);
__m512 out288 = _mm512_shuffle_f32x4(tmp1288, in264, 238);
__m512 out281 = _mm512_shuffle_f32x4(tmp1286, tmp1280, 68);
__m512 out289 = _mm512_shuffle_f32x4(tmp1286, tmp1280, 238);
__m512 out282 = _mm512_shuffle_f32x4(tmp1278, in266, 68);
__m512 out290 = _mm512_shuffle_f32x4(tmp1278, in266, 238);
__m512 out283 = _mm512_shuffle_f32x4(in268, tmp1291, 68);
__m512 out291 = _mm512_shuffle_f32x4(in268, tmp1291, 238);
__m512 out284 = _mm512_shuffle_f32x4(tmp1292, in272, 68);
__m512 out292 = _mm512_shuffle_f32x4(tmp1292, in272, 238);
__m512 out285 = _mm512_shuffle_f32x4(tmp1290, tmp1284, 68);
__m512 out293 = _mm512_shuffle_f32x4(tmp1290, tmp1284, 238);
__m512 out286 = _mm512_shuffle_f32x4(tmp1282, in274, 68);
__m512 out294 = _mm512_shuffle_f32x4(tmp1282, in274, 238);
// Sixteen unaligned 512-bit stores into dfPtr4: four groups whose constant
// offsets are 25600 apart, each group covering relative offsets
// 256/320/384/448.
// NOTE(review): offsets are presumably bytes (64 = one __m512); dfPtr4's
// pointer type is declared outside this excerpt - confirm.
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k56, out279);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k56, out287);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k56, out283);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k56, out291);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k56, out280);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k56, out288);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k56, out284);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k56, out292);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k56, out281);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k56, out289);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k56, out285);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k56, out293);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k56, out282);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k56, out290);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k56, out286);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k56, out294);
// Load eight consecutive rows (constant offset grows by 224 per row) at two
// starting offsets 48 apart (12656 and 12704), clamp negatives to zero
// (ReLU via max with 0), and rearrange each row into two overlapping
// 8-element windows. Both loads use mask 16383: lanes 0-13 are read, lanes
// 14-15 are zeroed.
// NOTE(review): offsets are presumably bytes (224 = 56 floats per row);
// datPtr5's pointer type is declared outside this excerpt - confirm.
__m512 dat1179 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1179 = _mm512_max_ps(_mm512_setzero_ps(), dat1179);
__m512 dat1180 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1180 = _mm512_max_ps(_mm512_setzero_ps(), dat1180);
// pm94: low half <- source lanes 0-7, high half <- source lanes 6-13.
__m512i pm94 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in276 = _mm512_permutexvar_ps(pm94, dat1179);
__m512 in284 = _mm512_permutexvar_ps(pm94, dat1180);
__m512 dat1181 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1181 = _mm512_max_ps(_mm512_setzero_ps(), dat1181);
__m512 dat1182 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1182 = _mm512_max_ps(_mm512_setzero_ps(), dat1182);
__m512 in277 = _mm512_permutexvar_ps(pm94, dat1181);
__m512 in285 = _mm512_permutexvar_ps(pm94, dat1182);
__m512 dat1183 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1183 = _mm512_max_ps(_mm512_setzero_ps(), dat1183);
__m512 dat1184 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1184 = _mm512_max_ps(_mm512_setzero_ps(), dat1184);
__m512 in278 = _mm512_permutexvar_ps(pm94, dat1183);
__m512 in286 = _mm512_permutexvar_ps(pm94, dat1184);
__m512 dat1185 = _mm512_maskz_loadu_ps(16383, datPtr5+13328+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1185 = _mm512_max_ps(_mm512_setzero_ps(), dat1185);
__m512 dat1186 = _mm512_maskz_loadu_ps(16383, datPtr5+13376+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1186 = _mm512_max_ps(_mm512_setzero_ps(), dat1186);
__m512 in279 = _mm512_permutexvar_ps(pm94, dat1185);
__m512 in287 = _mm512_permutexvar_ps(pm94, dat1186);
__m512 dat1187 = _mm512_maskz_loadu_ps(16383, datPtr5+13552+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1187 = _mm512_max_ps(_mm512_setzero_ps(), dat1187);
__m512 dat1188 = _mm512_maskz_loadu_ps(16383, datPtr5+13600+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1188 = _mm512_max_ps(_mm512_setzero_ps(), dat1188);
__m512 in280 = _mm512_permutexvar_ps(pm94, dat1187);
__m512 in288 = _mm512_permutexvar_ps(pm94, dat1188);
__m512 dat1189 = _mm512_maskz_loadu_ps(16383, datPtr5+13776+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1189 = _mm512_max_ps(_mm512_setzero_ps(), dat1189);
__m512 dat1190 = _mm512_maskz_loadu_ps(16383, datPtr5+13824+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1190 = _mm512_max_ps(_mm512_setzero_ps(), dat1190);
__m512 in281 = _mm512_permutexvar_ps(pm94, dat1189);
__m512 in289 = _mm512_permutexvar_ps(pm94, dat1190);
__m512 dat1191 = _mm512_maskz_loadu_ps(16383, datPtr5+14000+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1191 = _mm512_max_ps(_mm512_setzero_ps(), dat1191);
__m512 dat1192 = _mm512_maskz_loadu_ps(16383, datPtr5+14048+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1192 = _mm512_max_ps(_mm512_setzero_ps(), dat1192);
__m512 in282 = _mm512_permutexvar_ps(pm94, dat1191);
__m512 in290 = _mm512_permutexvar_ps(pm94, dat1192);
__m512 dat1193 = _mm512_maskz_loadu_ps(16383, datPtr5+14224+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1193 = _mm512_max_ps(_mm512_setzero_ps(), dat1193);
__m512 dat1194 = _mm512_maskz_loadu_ps(16383, datPtr5+14272+50432*i17+224*h25+4*w28+50432*s12+25216*k56);
dat1194 = _mm512_max_ps(_mm512_setzero_ps(), dat1194);
__m512 in283 = _mm512_permutexvar_ps(pm94, dat1193);
__m512 in291 = _mm512_permutexvar_ps(pm94, dat1194);
// First 8-point linear transform pass, combining the eight row vectors
// lane-wise. Two independent sets are processed with alternating statements.
// Set 1 inputs  x0..x7 = in276..in283
// Set 2 inputs  x0..x7 = in284..in291
// Algebraically (ignoring FMA rounding) the results are:
//   r0 =  x0 - 5.25*x2 + 5.25*x4 - x6
//   r1 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   r2 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   r3 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   r4 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   r5 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   r6 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   r7 = -x1 + 5.25*x3 - 5.25*x5 + x7
// These coefficients match the Winograd F(6,3) input-transform matrix.
// Set 1 results r0..r7 end up in: in276, tmp1343, tmp1344, in282, tmp1342, in278, in280, in279
// Set 2 results r0..r7 end up in: in284, tmp1347, tmp1348, in290, tmp1346, in286, in288, in287
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1341 = _mm512_add_ps(in277, in281);
__m512 tmp1345 = _mm512_add_ps(in285, in289);
__m512 tmp1342 = _mm512_sub_ps(in280, in278);
__m512 tmp1346 = _mm512_sub_ps(in288, in286);
__m512 tmp1343 = _mm512_add_ps(in278, in282);
__m512 tmp1347 = _mm512_add_ps(in286, in290);
in276 = _mm512_sub_ps(in276, in282);
in284 = _mm512_sub_ps(in284, in290);
tmp1341 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-4.25e+00f), tmp1341);
tmp1345 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-4.25e+00f), tmp1345);
tmp1343 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-4.25e+00f), tmp1343);
tmp1347 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-4.25e+00f), tmp1347);
// r0 complete (in276 / in284).
in276 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(5.25e+00f), in276);
in284 = _mm512_fmadd_ps(tmp1346, _mm512_set1_ps(5.25e+00f), in284);
tmp1342 = _mm512_fmadd_ps(in278, _mm512_set1_ps(2.5e-01f), in282);
tmp1346 = _mm512_fmadd_ps(in286, _mm512_set1_ps(2.5e-01f), in290);
in278 = _mm512_fmadd_ps(in278, _mm512_set1_ps(4e+00f), in282);
in286 = _mm512_fmadd_ps(in286, _mm512_set1_ps(4e+00f), in290);
// r2 (difference) and r1 (sum) of the odd-index and even-index partial sums.
__m512 tmp1344 = _mm512_sub_ps(tmp1343, tmp1341);
__m512 tmp1348 = _mm512_sub_ps(tmp1347, tmp1345);
tmp1343 = _mm512_add_ps(tmp1341, tmp1343);
tmp1347 = _mm512_add_ps(tmp1345, tmp1347);
tmp1341 = _mm512_fmadd_ps(in277, _mm512_set1_ps(2.5e-01f), in281);
tmp1345 = _mm512_fmadd_ps(in285, _mm512_set1_ps(2.5e-01f), in289);
tmp1342 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-1.25e+00f), tmp1342);
tmp1346 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-1.25e+00f), tmp1346);
in280 = _mm512_fmadd_ps(in280, _mm512_set1_ps(-5e+00f), in278);
in288 = _mm512_fmadd_ps(in288, _mm512_set1_ps(-5e+00f), in286);
tmp1341 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-1.25e+00f), tmp1341);
tmp1345 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-1.25e+00f), tmp1345);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in282 = _mm512_fmadd_ps(tmp1341, _mm512_set1_ps(2e+00f), tmp1342);
in290 = _mm512_fmadd_ps(tmp1345, _mm512_set1_ps(2e+00f), tmp1346);
tmp1342 = _mm512_fnmadd_ps(tmp1341, _mm512_set1_ps(2e+00f), tmp1342);
tmp1346 = _mm512_fnmadd_ps(tmp1345, _mm512_set1_ps(2e+00f), tmp1346);
tmp1341 = _mm512_fmadd_ps(in281, _mm512_set1_ps(2.5e-01f), in277);
tmp1345 = _mm512_fmadd_ps(in289, _mm512_set1_ps(2.5e-01f), in285);
in277 = _mm512_sub_ps(in283, in277);
in285 = _mm512_sub_ps(in291, in285);
tmp1341 = _mm512_fmadd_ps(in279, _mm512_set1_ps(-1.25e+00f), tmp1341);
tmp1345 = _mm512_fmadd_ps(in287, _mm512_set1_ps(-1.25e+00f), tmp1345);
in279 = _mm512_sub_ps(in279, in281);
in287 = _mm512_sub_ps(in287, in289);
// r7 complete (in279 / in287).
in279 = _mm512_fmadd_ps(in279, _mm512_set1_ps(5.25e+00f), in277);
in287 = _mm512_fmadd_ps(in287, _mm512_set1_ps(5.25e+00f), in285);
// r5 (fmadd) and r6 (fnmadd).
in278 = _mm512_fmadd_ps(tmp1341, _mm512_set1_ps(2e+00f), in280);
in286 = _mm512_fmadd_ps(tmp1345, _mm512_set1_ps(2e+00f), in288);
in280 = _mm512_fnmadd_ps(tmp1341, _mm512_set1_ps(2e+00f), in280);
in288 = _mm512_fnmadd_ps(tmp1345, _mm512_set1_ps(2e+00f), in288);
// Four-stage 16x16 float transpose of the sixteen vectors
//   in276, tmp1343, tmp1344, in282, tmp1342, in278, in280, in279   (rows 0-7)
//   in284, tmp1347, tmp1348, in290, tmp1346, in286, in288, in287   (rows 8-15)
// After the last stage, destination k holds lane k of every source row:
// the first list above receives lanes 0-7, the second list lanes 8-15.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs.
__m512 tmp1357 = _mm512_unpacklo_ps(in276, tmp1343);
__m512 tmp1358 = _mm512_unpackhi_ps(in276, tmp1343);
__m512 tmp1359 = _mm512_unpacklo_ps(tmp1344, in282);
__m512 tmp1360 = _mm512_unpackhi_ps(tmp1344, in282);
__m512 tmp1361 = _mm512_unpacklo_ps(tmp1342, in278);
__m512 tmp1362 = _mm512_unpackhi_ps(tmp1342, in278);
__m512 tmp1363 = _mm512_unpacklo_ps(in280, in279);
__m512 tmp1364 = _mm512_unpackhi_ps(in280, in279);
__m512 tmp1365 = _mm512_unpacklo_ps(in284, tmp1347);
__m512 tmp1366 = _mm512_unpackhi_ps(in284, tmp1347);
__m512 tmp1367 = _mm512_unpacklo_ps(tmp1348, in290);
__m512 tmp1368 = _mm512_unpackhi_ps(tmp1348, in290);
__m512 tmp1369 = _mm512_unpacklo_ps(tmp1346, in286);
__m512 tmp1370 = _mm512_unpackhi_ps(tmp1346, in286);
__m512 tmp1371 = _mm512_unpacklo_ps(in288, in287);
__m512 tmp1372 = _mm512_unpackhi_ps(in288, in287);
// Stage 2: combine 64-bit pairs within each 128-bit lane
// (imm 68 = elements 0-1 of each operand, imm 238 = elements 2-3).
__m512 tmp1373 = _mm512_shuffle_ps(tmp1357, tmp1359, 68);
__m512 tmp1374 = _mm512_shuffle_ps(tmp1357, tmp1359, 238);
__m512 tmp1375 = _mm512_shuffle_ps(tmp1358, tmp1360, 68);
__m512 tmp1376 = _mm512_shuffle_ps(tmp1358, tmp1360, 238);
__m512 tmp1377 = _mm512_shuffle_ps(tmp1361, tmp1363, 68);
__m512 tmp1378 = _mm512_shuffle_ps(tmp1361, tmp1363, 238);
__m512 tmp1379 = _mm512_shuffle_ps(tmp1362, tmp1364, 68);
__m512 tmp1380 = _mm512_shuffle_ps(tmp1362, tmp1364, 238);
__m512 tmp1381 = _mm512_shuffle_ps(tmp1365, tmp1367, 68);
__m512 tmp1382 = _mm512_shuffle_ps(tmp1365, tmp1367, 238);
__m512 tmp1383 = _mm512_shuffle_ps(tmp1366, tmp1368, 68);
__m512 tmp1384 = _mm512_shuffle_ps(tmp1366, tmp1368, 238);
__m512 tmp1385 = _mm512_shuffle_ps(tmp1369, tmp1371, 68);
__m512 tmp1386 = _mm512_shuffle_ps(tmp1369, tmp1371, 238);
__m512 tmp1387 = _mm512_shuffle_ps(tmp1370, tmp1372, 68);
__m512 tmp1388 = _mm512_shuffle_ps(tmp1370, tmp1372, 238);
// Stage 3: gather whole 128-bit lanes
// (imm 136 = lanes 0,2 of each operand, imm 221 = lanes 1,3).
__m512 tmp1389 = _mm512_shuffle_f32x4(tmp1373, tmp1377, 136);
__m512 tmp1390 = _mm512_shuffle_f32x4(tmp1373, tmp1377, 221);
__m512 tmp1391 = _mm512_shuffle_f32x4(tmp1374, tmp1378, 136);
__m512 tmp1392 = _mm512_shuffle_f32x4(tmp1374, tmp1378, 221);
__m512 tmp1393 = _mm512_shuffle_f32x4(tmp1375, tmp1379, 136);
__m512 tmp1394 = _mm512_shuffle_f32x4(tmp1375, tmp1379, 221);
__m512 tmp1395 = _mm512_shuffle_f32x4(tmp1376, tmp1380, 136);
__m512 tmp1396 = _mm512_shuffle_f32x4(tmp1376, tmp1380, 221);
__m512 tmp1397 = _mm512_shuffle_f32x4(tmp1381, tmp1385, 136);
__m512 tmp1398 = _mm512_shuffle_f32x4(tmp1381, tmp1385, 221);
__m512 tmp1399 = _mm512_shuffle_f32x4(tmp1382, tmp1386, 136);
__m512 tmp1400 = _mm512_shuffle_f32x4(tmp1382, tmp1386, 221);
__m512 tmp1401 = _mm512_shuffle_f32x4(tmp1383, tmp1387, 136);
__m512 tmp1402 = _mm512_shuffle_f32x4(tmp1383, tmp1387, 221);
__m512 tmp1403 = _mm512_shuffle_f32x4(tmp1384, tmp1388, 136);
__m512 tmp1404 = _mm512_shuffle_f32x4(tmp1384, tmp1388, 221);
// Stage 4: final lane gather, written back into the source variables.
in276 = _mm512_shuffle_f32x4(tmp1389, tmp1397, 136);
in284 = _mm512_shuffle_f32x4(tmp1389, tmp1397, 221);
tmp1343 = _mm512_shuffle_f32x4(tmp1391, tmp1399, 136);
tmp1347 = _mm512_shuffle_f32x4(tmp1391, tmp1399, 221);
tmp1344 = _mm512_shuffle_f32x4(tmp1393, tmp1401, 136);
tmp1348 = _mm512_shuffle_f32x4(tmp1393, tmp1401, 221);
in282 = _mm512_shuffle_f32x4(tmp1395, tmp1403, 136);
in290 = _mm512_shuffle_f32x4(tmp1395, tmp1403, 221);
tmp1342 = _mm512_shuffle_f32x4(tmp1390, tmp1398, 136);
tmp1346 = _mm512_shuffle_f32x4(tmp1390, tmp1398, 221);
in278 = _mm512_shuffle_f32x4(tmp1392, tmp1400, 136);
in286 = _mm512_shuffle_f32x4(tmp1392, tmp1400, 221);
in280 = _mm512_shuffle_f32x4(tmp1394, tmp1402, 136);
in288 = _mm512_shuffle_f32x4(tmp1394, tmp1402, 221);
in279 = _mm512_shuffle_f32x4(tmp1396, tmp1404, 136);
in287 = _mm512_shuffle_f32x4(tmp1396, tmp1404, 221);
// Second 8-point linear transform pass, applied lane-wise to two independent
// sets of eight vectors; the statements alternate between the two sets.
// Set 1 inputs  y0..y7 = in276, tmp1343, tmp1344, in282, tmp1342, in278, in280, in279
// Set 2 inputs  y0..y7 = in284, tmp1347, tmp1348, in290, tmp1346, in286, in288, in287
// Algebraically (ignoring FMA rounding) the results are:
//   r0 =  y0 - 5.25*y2 + 5.25*y4 - y6
//   r1 =  y1 + y2 - 4.25*y3 - 4.25*y4 + y5 + y6
//   r2 = -y1 + y2 + 4.25*y3 - 4.25*y4 - y5 + y6
//   r3 =  0.5*y1 + 0.25*y2 - 2.5*y3 - 1.25*y4 + 2*y5 + y6
//   r4 = -0.5*y1 + 0.25*y2 + 2.5*y3 - 1.25*y4 - 2*y5 + y6
//   r5 =  2*y1 + 4*y2 - 2.5*y3 - 5*y4 + 0.5*y5 + y6
//   r6 = -2*y1 + 4*y2 + 2.5*y3 - 5*y4 - 0.5*y5 + y6
//   r7 = -y1 + 5.25*y3 - 5.25*y5 + y7
// These coefficients match the Winograd F(6,3) input-transform matrix.
// Set 1 results r0..r7 end up in: in276, tmp1351, tmp1352, in280, tmp1350, tmp1344, tmp1342, in282
// Set 2 results r0..r7 end up in: in284, tmp1355, tmp1356, in288, tmp1354, tmp1348, tmp1346, in290
// Inputs are overwritten in place, so statement order matters.
__m512 tmp1349 = _mm512_add_ps(tmp1343, in278);
__m512 tmp1353 = _mm512_add_ps(tmp1347, in286);
__m512 tmp1350 = _mm512_sub_ps(tmp1342, tmp1344);
__m512 tmp1354 = _mm512_sub_ps(tmp1346, tmp1348);
__m512 tmp1351 = _mm512_add_ps(tmp1344, in280);
__m512 tmp1355 = _mm512_add_ps(tmp1348, in288);
in276 = _mm512_sub_ps(in276, in280);
in284 = _mm512_sub_ps(in284, in288);
tmp1349 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-4.25e+00f), tmp1349);
tmp1353 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-4.25e+00f), tmp1353);
tmp1351 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(-4.25e+00f), tmp1351);
tmp1355 = _mm512_fmadd_ps(tmp1346, _mm512_set1_ps(-4.25e+00f), tmp1355);
// r0 complete (in276 / in284).
in276 = _mm512_fmadd_ps(tmp1350, _mm512_set1_ps(5.25e+00f), in276);
in284 = _mm512_fmadd_ps(tmp1354, _mm512_set1_ps(5.25e+00f), in284);
tmp1350 = _mm512_fmadd_ps(tmp1344, _mm512_set1_ps(2.5e-01f), in280);
tmp1354 = _mm512_fmadd_ps(tmp1348, _mm512_set1_ps(2.5e-01f), in288);
tmp1344 = _mm512_fmadd_ps(tmp1344, _mm512_set1_ps(4e+00f), in280);
tmp1348 = _mm512_fmadd_ps(tmp1348, _mm512_set1_ps(4e+00f), in288);
// r2 (difference) and r1 (sum) of the odd-index and even-index partial sums.
__m512 tmp1352 = _mm512_sub_ps(tmp1351, tmp1349);
__m512 tmp1356 = _mm512_sub_ps(tmp1355, tmp1353);
tmp1351 = _mm512_add_ps(tmp1349, tmp1351);
tmp1355 = _mm512_add_ps(tmp1353, tmp1355);
tmp1349 = _mm512_fmadd_ps(tmp1343, _mm512_set1_ps(2.5e-01f), in278);
tmp1353 = _mm512_fmadd_ps(tmp1347, _mm512_set1_ps(2.5e-01f), in286);
tmp1350 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(-1.25e+00f), tmp1350);
tmp1354 = _mm512_fmadd_ps(tmp1346, _mm512_set1_ps(-1.25e+00f), tmp1354);
tmp1342 = _mm512_fmadd_ps(tmp1342, _mm512_set1_ps(-5e+00f), tmp1344);
tmp1346 = _mm512_fmadd_ps(tmp1346, _mm512_set1_ps(-5e+00f), tmp1348);
tmp1349 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-1.25e+00f), tmp1349);
tmp1353 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-1.25e+00f), tmp1353);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in280 = _mm512_fmadd_ps(tmp1349, _mm512_set1_ps(2e+00f), tmp1350);
in288 = _mm512_fmadd_ps(tmp1353, _mm512_set1_ps(2e+00f), tmp1354);
tmp1350 = _mm512_fnmadd_ps(tmp1349, _mm512_set1_ps(2e+00f), tmp1350);
tmp1354 = _mm512_fnmadd_ps(tmp1353, _mm512_set1_ps(2e+00f), tmp1354);
tmp1349 = _mm512_fmadd_ps(in278, _mm512_set1_ps(2.5e-01f), tmp1343);
tmp1353 = _mm512_fmadd_ps(in286, _mm512_set1_ps(2.5e-01f), tmp1347);
tmp1343 = _mm512_sub_ps(in279, tmp1343);
tmp1347 = _mm512_sub_ps(in287, tmp1347);
tmp1349 = _mm512_fmadd_ps(in282, _mm512_set1_ps(-1.25e+00f), tmp1349);
tmp1353 = _mm512_fmadd_ps(in290, _mm512_set1_ps(-1.25e+00f), tmp1353);
in282 = _mm512_sub_ps(in282, in278);
in290 = _mm512_sub_ps(in290, in286);
// r7 complete (in282 / in290).
in282 = _mm512_fmadd_ps(in282, _mm512_set1_ps(5.25e+00f), tmp1343);
in290 = _mm512_fmadd_ps(in290, _mm512_set1_ps(5.25e+00f), tmp1347);
// r5 (fmadd) and r6 (fnmadd).
tmp1344 = _mm512_fmadd_ps(tmp1349, _mm512_set1_ps(2e+00f), tmp1342);
tmp1348 = _mm512_fmadd_ps(tmp1353, _mm512_set1_ps(2e+00f), tmp1346);
tmp1342 = _mm512_fnmadd_ps(tmp1349, _mm512_set1_ps(2e+00f), tmp1342);
tmp1346 = _mm512_fnmadd_ps(tmp1353, _mm512_set1_ps(2e+00f), tmp1346);
// Start packing result pairs: imm 68 concatenates the low 256 bits of both
// operands, imm 238 the high 256 bits.
__m512 out295 = _mm512_shuffle_f32x4(in276, tmp1351, 68);
__m512 out303 = _mm512_shuffle_f32x4(in276, tmp1351, 238);
__m512 out296 = _mm512_shuffle_f32x4(tmp1352, in280, 68);
__m512 out304 = _mm512_shuffle_f32x4(tmp1352, in280, 238);
__m512 out297 = _mm512_shuffle_f32x4(tmp1350, tmp1344, 68);
__m512 out305 = _mm512_shuffle_f32x4(tmp1350, tmp1344, 238);
__m512 out298 = _mm512_shuffle_f32x4(tmp1342, in282, 68);
__m512 out306 = _mm512_shuffle_f32x4(tmp1342, in282, 238);
__m512 out299 = _mm512_shuffle_f32x4(in284, tmp1355, 68);
__m512 out307 = _mm512_shuffle_f32x4(in284, tmp1355, 238);
__m512 out300 = _mm512_shuffle_f32x4(tmp1356, in288, 68);
__m512 out308 = _mm512_shuffle_f32x4(tmp1356, in288, 238);
__m512 out301 = _mm512_shuffle_f32x4(tmp1354, tmp1348, 68);
__m512 out309 = _mm512_shuffle_f32x4(tmp1354, tmp1348, 238);
__m512 out302 = _mm512_shuffle_f32x4(tmp1346, in290, 68);
__m512 out310 = _mm512_shuffle_f32x4(tmp1346, in290, 238);
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k56, out295);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k56, out303);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k56, out299);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k56, out307);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k56, out296);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k56, out304);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k56, out300);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k56, out308);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k56, out297);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k56, out305);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k56, out301);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k56, out309);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k56, out298);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k56, out306);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k56, out302);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k56, out310);
}
++j11;
rel8 = 4;
}
// Indices for the k57 loop that follows. h26 and w29 enter every load address
// there as 224*h26+4*w29; h26 is taken 12 past base8 and w29 is fixed at 36.
ptrdiff_t h26 = base8+12;
ptrdiff_t w29 = 36;
// Loop counter for the following for-loop (iterates k57 = 0, 1).
ptrdiff_t k57 = 0;
for (; k57 != 2; ++k57) {
// Load phase for the first tile of this k57 iteration.
// Eight consecutive rows are read (constant offsets advance by 224 per row),
// each as two masked loads whose start offsets differ by 48:
//   mask 16383 -> lanes 0..13 loaded, lanes 14..15 zeroed
//   mask 511   -> lanes 0..8 loaded,  lanes 9..15 zeroed
// Every loaded vector is clamped with max(0, x) and then lane-permuted into
// in292..in299 (first load) and in300..in307 (second load).
// NOTE(review): the offsets look like byte offsets (4 per w29 step, 224 per
// h26 step), which would make datPtr5 a byte pointer — confirm at its declaration.
__m512 dat1195 = _mm512_maskz_loadu_ps(16383, datPtr5+0+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1195 = _mm512_max_ps(_mm512_setzero_ps(), dat1195);
__m512 dat1196 = _mm512_maskz_loadu_ps(511, datPtr5+48+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1196 = _mm512_max_ps(_mm512_setzero_ps(), dat1196);
// pm95: output lanes 0..7 take source lanes 0..7, output lanes 8..15 take
// source lanes 6..13 (the two 8-lane halves overlap by two source lanes).
__m512i pm95 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in292 = _mm512_permutexvar_ps(pm95, dat1195);
// pm96: output lanes 0..7 take source lanes 0..7, lanes 8..10 take source
// lanes 6..8, and lanes 11..15 take source lane 15, which the 9-lane masked
// load left at zero.
__m512i pm96 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in300 = _mm512_permutexvar_ps(pm96, dat1196);
// Rows 1..7: same load / clamp / permute pattern at row offsets +224 each.
__m512 dat1197 = _mm512_maskz_loadu_ps(16383, datPtr5+224+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1197 = _mm512_max_ps(_mm512_setzero_ps(), dat1197);
__m512 dat1198 = _mm512_maskz_loadu_ps(511, datPtr5+272+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1198 = _mm512_max_ps(_mm512_setzero_ps(), dat1198);
__m512 in293 = _mm512_permutexvar_ps(pm95, dat1197);
__m512 in301 = _mm512_permutexvar_ps(pm96, dat1198);
__m512 dat1199 = _mm512_maskz_loadu_ps(16383, datPtr5+448+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1199 = _mm512_max_ps(_mm512_setzero_ps(), dat1199);
__m512 dat1200 = _mm512_maskz_loadu_ps(511, datPtr5+496+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1200 = _mm512_max_ps(_mm512_setzero_ps(), dat1200);
__m512 in294 = _mm512_permutexvar_ps(pm95, dat1199);
__m512 in302 = _mm512_permutexvar_ps(pm96, dat1200);
__m512 dat1201 = _mm512_maskz_loadu_ps(16383, datPtr5+672+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1201 = _mm512_max_ps(_mm512_setzero_ps(), dat1201);
__m512 dat1202 = _mm512_maskz_loadu_ps(511, datPtr5+720+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1202 = _mm512_max_ps(_mm512_setzero_ps(), dat1202);
__m512 in295 = _mm512_permutexvar_ps(pm95, dat1201);
__m512 in303 = _mm512_permutexvar_ps(pm96, dat1202);
__m512 dat1203 = _mm512_maskz_loadu_ps(16383, datPtr5+896+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1203 = _mm512_max_ps(_mm512_setzero_ps(), dat1203);
__m512 dat1204 = _mm512_maskz_loadu_ps(511, datPtr5+944+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1204 = _mm512_max_ps(_mm512_setzero_ps(), dat1204);
__m512 in296 = _mm512_permutexvar_ps(pm95, dat1203);
__m512 in304 = _mm512_permutexvar_ps(pm96, dat1204);
__m512 dat1205 = _mm512_maskz_loadu_ps(16383, datPtr5+1120+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1205 = _mm512_max_ps(_mm512_setzero_ps(), dat1205);
__m512 dat1206 = _mm512_maskz_loadu_ps(511, datPtr5+1168+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1206 = _mm512_max_ps(_mm512_setzero_ps(), dat1206);
__m512 in297 = _mm512_permutexvar_ps(pm95, dat1205);
__m512 in305 = _mm512_permutexvar_ps(pm96, dat1206);
__m512 dat1207 = _mm512_maskz_loadu_ps(16383, datPtr5+1344+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1207 = _mm512_max_ps(_mm512_setzero_ps(), dat1207);
__m512 dat1208 = _mm512_maskz_loadu_ps(511, datPtr5+1392+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1208 = _mm512_max_ps(_mm512_setzero_ps(), dat1208);
__m512 in298 = _mm512_permutexvar_ps(pm95, dat1207);
__m512 in306 = _mm512_permutexvar_ps(pm96, dat1208);
__m512 dat1209 = _mm512_maskz_loadu_ps(16383, datPtr5+1568+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1209 = _mm512_max_ps(_mm512_setzero_ps(), dat1209);
__m512 dat1210 = _mm512_maskz_loadu_ps(511, datPtr5+1616+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1210 = _mm512_max_ps(_mm512_setzero_ps(), dat1210);
__m512 in299 = _mm512_permutexvar_ps(pm95, dat1209);
__m512 in307 = _mm512_permutexvar_ps(pm96, dat1210);
// First arithmetic pass over the eight row registers, done twice in lockstep:
// once for in292..in299 and once for in300..in307 (statements alternate
// between the two sets). Rows are combined with the fixed coefficients
// 5.25, -4.25, 0.25, 4, -1.25, -5 and +/-2 using add/sub/fmadd/fnmadd.
// Registers are overwritten in place and read again later, so the statement
// order must not be changed. The eight results per set end up in
// in292, tmp1407, tmp1408, in298, tmp1406, in294, in296, in295
// (and in300, tmp1411, tmp1412, in306, tmp1410, in302, in304, in303),
// which is the order the unpack stage that follows consumes them in.
// NOTE(review): the coefficients match a Winograd-style 8-point input
// transform — confirm against the generator before relying on that.
__m512 tmp1405 = _mm512_add_ps(in293, in297);
__m512 tmp1409 = _mm512_add_ps(in301, in305);
__m512 tmp1406 = _mm512_sub_ps(in296, in294);
__m512 tmp1410 = _mm512_sub_ps(in304, in302);
__m512 tmp1407 = _mm512_add_ps(in294, in298);
__m512 tmp1411 = _mm512_add_ps(in302, in306);
in292 = _mm512_sub_ps(in292, in298);
in300 = _mm512_sub_ps(in300, in306);
tmp1405 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-4.25e+00f), tmp1405);
tmp1409 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-4.25e+00f), tmp1409);
tmp1407 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-4.25e+00f), tmp1407);
tmp1411 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-4.25e+00f), tmp1411);
in292 = _mm512_fmadd_ps(tmp1406, _mm512_set1_ps(5.25e+00f), in292);
in300 = _mm512_fmadd_ps(tmp1410, _mm512_set1_ps(5.25e+00f), in300);
tmp1406 = _mm512_fmadd_ps(in294, _mm512_set1_ps(2.5e-01f), in298);
tmp1410 = _mm512_fmadd_ps(in302, _mm512_set1_ps(2.5e-01f), in306);
in294 = _mm512_fmadd_ps(in294, _mm512_set1_ps(4e+00f), in298);
in302 = _mm512_fmadd_ps(in302, _mm512_set1_ps(4e+00f), in306);
// Difference and sum of the two partial results computed so far.
__m512 tmp1408 = _mm512_sub_ps(tmp1407, tmp1405);
__m512 tmp1412 = _mm512_sub_ps(tmp1411, tmp1409);
tmp1407 = _mm512_add_ps(tmp1405, tmp1407);
tmp1411 = _mm512_add_ps(tmp1409, tmp1411);
// tmp1405/tmp1409 are recycled from here on as scratch accumulators.
tmp1405 = _mm512_fmadd_ps(in293, _mm512_set1_ps(2.5e-01f), in297);
tmp1409 = _mm512_fmadd_ps(in301, _mm512_set1_ps(2.5e-01f), in305);
tmp1406 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-1.25e+00f), tmp1406);
tmp1410 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-1.25e+00f), tmp1410);
in296 = _mm512_fmadd_ps(in296, _mm512_set1_ps(-5e+00f), in294);
in304 = _mm512_fmadd_ps(in304, _mm512_set1_ps(-5e+00f), in302);
tmp1405 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-1.25e+00f), tmp1405);
tmp1409 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-1.25e+00f), tmp1409);
// Paired results: base + 2*scratch and base - 2*scratch.
in298 = _mm512_fmadd_ps(tmp1405, _mm512_set1_ps(2e+00f), tmp1406);
in306 = _mm512_fmadd_ps(tmp1409, _mm512_set1_ps(2e+00f), tmp1410);
tmp1406 = _mm512_fnmadd_ps(tmp1405, _mm512_set1_ps(2e+00f), tmp1406);
tmp1410 = _mm512_fnmadd_ps(tmp1409, _mm512_set1_ps(2e+00f), tmp1410);
tmp1405 = _mm512_fmadd_ps(in297, _mm512_set1_ps(2.5e-01f), in293);
tmp1409 = _mm512_fmadd_ps(in305, _mm512_set1_ps(2.5e-01f), in301);
in293 = _mm512_sub_ps(in299, in293);
in301 = _mm512_sub_ps(in307, in301);
tmp1405 = _mm512_fmadd_ps(in295, _mm512_set1_ps(-1.25e+00f), tmp1405);
tmp1409 = _mm512_fmadd_ps(in303, _mm512_set1_ps(-1.25e+00f), tmp1409);
in295 = _mm512_sub_ps(in295, in297);
in303 = _mm512_sub_ps(in303, in305);
in295 = _mm512_fmadd_ps(in295, _mm512_set1_ps(5.25e+00f), in293);
in303 = _mm512_fmadd_ps(in303, _mm512_set1_ps(5.25e+00f), in301);
// Second +/- pair, again base +/- 2*scratch.
in294 = _mm512_fmadd_ps(tmp1405, _mm512_set1_ps(2e+00f), in296);
in302 = _mm512_fmadd_ps(tmp1409, _mm512_set1_ps(2e+00f), in304);
in296 = _mm512_fnmadd_ps(tmp1405, _mm512_set1_ps(2e+00f), in296);
in304 = _mm512_fnmadd_ps(tmp1409, _mm512_set1_ps(2e+00f), in304);
// Transpose network over the 16 first-pass result registers.
// Stage 1 (unpacklo/unpackhi): interleave adjacent register pairs within each
// 128-bit lane.
__m512 tmp1421 = _mm512_unpacklo_ps(in292, tmp1407);
__m512 tmp1422 = _mm512_unpackhi_ps(in292, tmp1407);
__m512 tmp1423 = _mm512_unpacklo_ps(tmp1408, in298);
__m512 tmp1424 = _mm512_unpackhi_ps(tmp1408, in298);
__m512 tmp1425 = _mm512_unpacklo_ps(tmp1406, in294);
__m512 tmp1426 = _mm512_unpackhi_ps(tmp1406, in294);
__m512 tmp1427 = _mm512_unpacklo_ps(in296, in295);
__m512 tmp1428 = _mm512_unpackhi_ps(in296, in295);
__m512 tmp1429 = _mm512_unpacklo_ps(in300, tmp1411);
__m512 tmp1430 = _mm512_unpackhi_ps(in300, tmp1411);
__m512 tmp1431 = _mm512_unpacklo_ps(tmp1412, in306);
__m512 tmp1432 = _mm512_unpackhi_ps(tmp1412, in306);
__m512 tmp1433 = _mm512_unpacklo_ps(tmp1410, in302);
__m512 tmp1434 = _mm512_unpackhi_ps(tmp1410, in302);
__m512 tmp1435 = _mm512_unpacklo_ps(in304, in303);
__m512 tmp1436 = _mm512_unpackhi_ps(in304, in303);
// Stage 2 (shuffle_ps): immediate 68 keeps elements 0,1 of each operand and
// 238 keeps elements 2,3, completing a 4x4 transpose inside every 128-bit lane.
__m512 tmp1437 = _mm512_shuffle_ps(tmp1421, tmp1423, 68);
__m512 tmp1438 = _mm512_shuffle_ps(tmp1421, tmp1423, 238);
__m512 tmp1439 = _mm512_shuffle_ps(tmp1422, tmp1424, 68);
__m512 tmp1440 = _mm512_shuffle_ps(tmp1422, tmp1424, 238);
__m512 tmp1441 = _mm512_shuffle_ps(tmp1425, tmp1427, 68);
__m512 tmp1442 = _mm512_shuffle_ps(tmp1425, tmp1427, 238);
__m512 tmp1443 = _mm512_shuffle_ps(tmp1426, tmp1428, 68);
__m512 tmp1444 = _mm512_shuffle_ps(tmp1426, tmp1428, 238);
__m512 tmp1445 = _mm512_shuffle_ps(tmp1429, tmp1431, 68);
__m512 tmp1446 = _mm512_shuffle_ps(tmp1429, tmp1431, 238);
__m512 tmp1447 = _mm512_shuffle_ps(tmp1430, tmp1432, 68);
__m512 tmp1448 = _mm512_shuffle_ps(tmp1430, tmp1432, 238);
__m512 tmp1449 = _mm512_shuffle_ps(tmp1433, tmp1435, 68);
__m512 tmp1450 = _mm512_shuffle_ps(tmp1433, tmp1435, 238);
__m512 tmp1451 = _mm512_shuffle_ps(tmp1434, tmp1436, 68);
__m512 tmp1452 = _mm512_shuffle_ps(tmp1434, tmp1436, 238);
// Stage 3 (shuffle_f32x4): immediate 136 selects 128-bit lanes 0 and 2 of each
// operand, 221 selects lanes 1 and 3; this merges rows 0..3 with rows 4..7.
__m512 tmp1453 = _mm512_shuffle_f32x4(tmp1437, tmp1441, 136);
__m512 tmp1454 = _mm512_shuffle_f32x4(tmp1437, tmp1441, 221);
__m512 tmp1455 = _mm512_shuffle_f32x4(tmp1438, tmp1442, 136);
__m512 tmp1456 = _mm512_shuffle_f32x4(tmp1438, tmp1442, 221);
__m512 tmp1457 = _mm512_shuffle_f32x4(tmp1439, tmp1443, 136);
__m512 tmp1458 = _mm512_shuffle_f32x4(tmp1439, tmp1443, 221);
__m512 tmp1459 = _mm512_shuffle_f32x4(tmp1440, tmp1444, 136);
__m512 tmp1460 = _mm512_shuffle_f32x4(tmp1440, tmp1444, 221);
__m512 tmp1461 = _mm512_shuffle_f32x4(tmp1445, tmp1449, 136);
__m512 tmp1462 = _mm512_shuffle_f32x4(tmp1445, tmp1449, 221);
__m512 tmp1463 = _mm512_shuffle_f32x4(tmp1446, tmp1450, 136);
__m512 tmp1464 = _mm512_shuffle_f32x4(tmp1446, tmp1450, 221);
__m512 tmp1465 = _mm512_shuffle_f32x4(tmp1447, tmp1451, 136);
__m512 tmp1466 = _mm512_shuffle_f32x4(tmp1447, tmp1451, 221);
__m512 tmp1467 = _mm512_shuffle_f32x4(tmp1448, tmp1452, 136);
__m512 tmp1468 = _mm512_shuffle_f32x4(tmp1448, tmp1452, 221);
// Stage 4 (shuffle_f32x4 again): pair the first register set with the second.
// The results are written back into the pass-1 register names, so each
// register now holds eight values gathered from one lane position of the
// first set (low 256 bits) and of the second set (high 256 bits).
in292 = _mm512_shuffle_f32x4(tmp1453, tmp1461, 136);
in300 = _mm512_shuffle_f32x4(tmp1453, tmp1461, 221);
tmp1407 = _mm512_shuffle_f32x4(tmp1455, tmp1463, 136);
tmp1411 = _mm512_shuffle_f32x4(tmp1455, tmp1463, 221);
tmp1408 = _mm512_shuffle_f32x4(tmp1457, tmp1465, 136);
tmp1412 = _mm512_shuffle_f32x4(tmp1457, tmp1465, 221);
in298 = _mm512_shuffle_f32x4(tmp1459, tmp1467, 136);
in306 = _mm512_shuffle_f32x4(tmp1459, tmp1467, 221);
tmp1406 = _mm512_shuffle_f32x4(tmp1454, tmp1462, 136);
tmp1410 = _mm512_shuffle_f32x4(tmp1454, tmp1462, 221);
in294 = _mm512_shuffle_f32x4(tmp1456, tmp1464, 136);
in302 = _mm512_shuffle_f32x4(tmp1456, tmp1464, 221);
in296 = _mm512_shuffle_f32x4(tmp1458, tmp1466, 136);
in304 = _mm512_shuffle_f32x4(tmp1458, tmp1466, 221);
in295 = _mm512_shuffle_f32x4(tmp1460, tmp1468, 136);
in303 = _mm512_shuffle_f32x4(tmp1460, tmp1468, 221);
// Second arithmetic pass: the same coefficient sequence as the first pass
// (5.25, -4.25, 0.25, 4, -1.25, -5, +/-2), now applied to the transposed
// registers in292, tmp1407, tmp1408, in298, tmp1406, in294, in296, in295 and
// their counterparts in300, tmp1411, tmp1412, in306, tmp1410, in302, in304,
// in303. Registers are again updated in place, so statement order matters.
// Final values are left in in292, tmp1415, tmp1416, in296, tmp1414, tmp1408,
// tmp1406, in298 (and in300, tmp1419, tmp1420, in304, tmp1418, tmp1412,
// tmp1410, in306), which the output shuffles that follow read.
__m512 tmp1413 = _mm512_add_ps(tmp1407, in294);
__m512 tmp1417 = _mm512_add_ps(tmp1411, in302);
__m512 tmp1414 = _mm512_sub_ps(tmp1406, tmp1408);
__m512 tmp1418 = _mm512_sub_ps(tmp1410, tmp1412);
__m512 tmp1415 = _mm512_add_ps(tmp1408, in296);
__m512 tmp1419 = _mm512_add_ps(tmp1412, in304);
in292 = _mm512_sub_ps(in292, in296);
in300 = _mm512_sub_ps(in300, in304);
tmp1413 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-4.25e+00f), tmp1413);
tmp1417 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-4.25e+00f), tmp1417);
tmp1415 = _mm512_fmadd_ps(tmp1406, _mm512_set1_ps(-4.25e+00f), tmp1415);
tmp1419 = _mm512_fmadd_ps(tmp1410, _mm512_set1_ps(-4.25e+00f), tmp1419);
in292 = _mm512_fmadd_ps(tmp1414, _mm512_set1_ps(5.25e+00f), in292);
in300 = _mm512_fmadd_ps(tmp1418, _mm512_set1_ps(5.25e+00f), in300);
tmp1414 = _mm512_fmadd_ps(tmp1408, _mm512_set1_ps(2.5e-01f), in296);
tmp1418 = _mm512_fmadd_ps(tmp1412, _mm512_set1_ps(2.5e-01f), in304);
tmp1408 = _mm512_fmadd_ps(tmp1408, _mm512_set1_ps(4e+00f), in296);
tmp1412 = _mm512_fmadd_ps(tmp1412, _mm512_set1_ps(4e+00f), in304);
// Difference and sum of the two partial results computed so far.
__m512 tmp1416 = _mm512_sub_ps(tmp1415, tmp1413);
__m512 tmp1420 = _mm512_sub_ps(tmp1419, tmp1417);
tmp1415 = _mm512_add_ps(tmp1413, tmp1415);
tmp1419 = _mm512_add_ps(tmp1417, tmp1419);
// tmp1413/tmp1417 are recycled from here on as scratch accumulators.
tmp1413 = _mm512_fmadd_ps(tmp1407, _mm512_set1_ps(2.5e-01f), in294);
tmp1417 = _mm512_fmadd_ps(tmp1411, _mm512_set1_ps(2.5e-01f), in302);
tmp1414 = _mm512_fmadd_ps(tmp1406, _mm512_set1_ps(-1.25e+00f), tmp1414);
tmp1418 = _mm512_fmadd_ps(tmp1410, _mm512_set1_ps(-1.25e+00f), tmp1418);
tmp1406 = _mm512_fmadd_ps(tmp1406, _mm512_set1_ps(-5e+00f), tmp1408);
tmp1410 = _mm512_fmadd_ps(tmp1410, _mm512_set1_ps(-5e+00f), tmp1412);
tmp1413 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-1.25e+00f), tmp1413);
tmp1417 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-1.25e+00f), tmp1417);
// Paired results: base + 2*scratch and base - 2*scratch.
in296 = _mm512_fmadd_ps(tmp1413, _mm512_set1_ps(2e+00f), tmp1414);
in304 = _mm512_fmadd_ps(tmp1417, _mm512_set1_ps(2e+00f), tmp1418);
tmp1414 = _mm512_fnmadd_ps(tmp1413, _mm512_set1_ps(2e+00f), tmp1414);
tmp1418 = _mm512_fnmadd_ps(tmp1417, _mm512_set1_ps(2e+00f), tmp1418);
tmp1413 = _mm512_fmadd_ps(in294, _mm512_set1_ps(2.5e-01f), tmp1407);
tmp1417 = _mm512_fmadd_ps(in302, _mm512_set1_ps(2.5e-01f), tmp1411);
tmp1407 = _mm512_sub_ps(in295, tmp1407);
tmp1411 = _mm512_sub_ps(in303, tmp1411);
tmp1413 = _mm512_fmadd_ps(in298, _mm512_set1_ps(-1.25e+00f), tmp1413);
tmp1417 = _mm512_fmadd_ps(in306, _mm512_set1_ps(-1.25e+00f), tmp1417);
in298 = _mm512_sub_ps(in298, in294);
in306 = _mm512_sub_ps(in306, in302);
in298 = _mm512_fmadd_ps(in298, _mm512_set1_ps(5.25e+00f), tmp1407);
in306 = _mm512_fmadd_ps(in306, _mm512_set1_ps(5.25e+00f), tmp1411);
// Second +/- pair, again base +/- 2*scratch.
tmp1408 = _mm512_fmadd_ps(tmp1413, _mm512_set1_ps(2e+00f), tmp1406);
tmp1412 = _mm512_fmadd_ps(tmp1417, _mm512_set1_ps(2e+00f), tmp1410);
tmp1406 = _mm512_fnmadd_ps(tmp1413, _mm512_set1_ps(2e+00f), tmp1406);
tmp1410 = _mm512_fnmadd_ps(tmp1417, _mm512_set1_ps(2e+00f), tmp1410);
// Pack the 16 second-pass results pairwise into 16 output vectors.
// shuffle_f32x4 with immediate 68 concatenates the low 256 bits of both
// operands; 238 concatenates their high 256 bits. Each pair of result
// registers therefore yields one "low halves" and one "high halves" vector.
__m512 out311 = _mm512_shuffle_f32x4(in292, tmp1415, 68);
__m512 out319 = _mm512_shuffle_f32x4(in292, tmp1415, 238);
__m512 out312 = _mm512_shuffle_f32x4(tmp1416, in296, 68);
__m512 out320 = _mm512_shuffle_f32x4(tmp1416, in296, 238);
__m512 out313 = _mm512_shuffle_f32x4(tmp1414, tmp1408, 68);
__m512 out321 = _mm512_shuffle_f32x4(tmp1414, tmp1408, 238);
__m512 out314 = _mm512_shuffle_f32x4(tmp1406, in298, 68);
__m512 out322 = _mm512_shuffle_f32x4(tmp1406, in298, 238);
__m512 out315 = _mm512_shuffle_f32x4(in300, tmp1419, 68);
__m512 out323 = _mm512_shuffle_f32x4(in300, tmp1419, 238);
__m512 out316 = _mm512_shuffle_f32x4(tmp1420, in304, 68);
__m512 out324 = _mm512_shuffle_f32x4(tmp1420, in304, 238);
__m512 out317 = _mm512_shuffle_f32x4(tmp1418, tmp1412, 68);
__m512 out325 = _mm512_shuffle_f32x4(tmp1418, tmp1412, 238);
__m512 out318 = _mm512_shuffle_f32x4(tmp1410, in306, 68);
__m512 out326 = _mm512_shuffle_f32x4(tmp1410, in306, 238);
// Write the 16 vectors to dfPtr4. They land in four groups whose constant
// offsets start at 0, 25600, 51200 and 76800; inside a group the four vectors
// go to +0, +128, +64 and +192 (written in that order). The variable part of
// the address is 102400*i17+1536*j11+1536*s12+768*k57.
// NOTE(review): offsets presumably count bytes (64 per 512-bit vector) —
// confirm dfPtr4's pointer type at its declaration.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k57, out311);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k57, out319);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k57, out315);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k57, out323);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k57, out312);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k57, out320);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k57, out316);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k57, out324);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k57, out313);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k57, out321);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k57, out317);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k57, out325);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k57, out314);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k57, out322);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k57, out318);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k57, out326);
__m512 dat1211 = _mm512_maskz_loadu_ps(8191, datPtr5+1204+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1211 = _mm512_max_ps(_mm512_setzero_ps(), dat1211);
__m512 dat1212 = _mm512_maskz_loadu_ps(16383, datPtr5+12608+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1212 = _mm512_max_ps(_mm512_setzero_ps(), dat1212);
__m512i pm97 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in308 = _mm512_permutexvar_ps(pm97, dat1211);
__m512i pm98 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in316 = _mm512_permutexvar_ps(pm98, dat1212);
__m512 dat1213 = _mm512_maskz_loadu_ps(8191, datPtr5+1428+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1213 = _mm512_max_ps(_mm512_setzero_ps(), dat1213);
__m512 dat1214 = _mm512_maskz_loadu_ps(16383, datPtr5+12832+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1214 = _mm512_max_ps(_mm512_setzero_ps(), dat1214);
__m512 in309 = _mm512_permutexvar_ps(pm97, dat1213);
__m512 in317 = _mm512_permutexvar_ps(pm98, dat1214);
__m512 dat1215 = _mm512_maskz_loadu_ps(8191, datPtr5+1652+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1215 = _mm512_max_ps(_mm512_setzero_ps(), dat1215);
__m512 dat1216 = _mm512_maskz_loadu_ps(16383, datPtr5+13056+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1216 = _mm512_max_ps(_mm512_setzero_ps(), dat1216);
__m512 in310 = _mm512_permutexvar_ps(pm97, dat1215);
__m512 in318 = _mm512_permutexvar_ps(pm98, dat1216);
__m512 dat1217 = _mm512_maskz_loadu_ps(8191, datPtr5+1876+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1217 = _mm512_max_ps(_mm512_setzero_ps(), dat1217);
__m512 dat1218 = _mm512_maskz_loadu_ps(16383, datPtr5+13280+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1218 = _mm512_max_ps(_mm512_setzero_ps(), dat1218);
__m512 in311 = _mm512_permutexvar_ps(pm97, dat1217);
__m512 in319 = _mm512_permutexvar_ps(pm98, dat1218);
__m512 dat1219 = _mm512_maskz_loadu_ps(8191, datPtr5+2100+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1219 = _mm512_max_ps(_mm512_setzero_ps(), dat1219);
__m512 dat1220 = _mm512_maskz_loadu_ps(16383, datPtr5+13504+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1220 = _mm512_max_ps(_mm512_setzero_ps(), dat1220);
__m512 in312 = _mm512_permutexvar_ps(pm97, dat1219);
__m512 in320 = _mm512_permutexvar_ps(pm98, dat1220);
__m512 dat1221 = _mm512_maskz_loadu_ps(8191, datPtr5+2324+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1221 = _mm512_max_ps(_mm512_setzero_ps(), dat1221);
__m512 dat1222 = _mm512_maskz_loadu_ps(16383, datPtr5+13728+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1222 = _mm512_max_ps(_mm512_setzero_ps(), dat1222);
__m512 in313 = _mm512_permutexvar_ps(pm97, dat1221);
__m512 in321 = _mm512_permutexvar_ps(pm98, dat1222);
__m512 dat1223 = _mm512_maskz_loadu_ps(8191, datPtr5+2548+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1223 = _mm512_max_ps(_mm512_setzero_ps(), dat1223);
__m512 dat1224 = _mm512_maskz_loadu_ps(16383, datPtr5+13952+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1224 = _mm512_max_ps(_mm512_setzero_ps(), dat1224);
__m512 in314 = _mm512_permutexvar_ps(pm97, dat1223);
__m512 in322 = _mm512_permutexvar_ps(pm98, dat1224);
__m512 dat1225 = _mm512_maskz_loadu_ps(8191, datPtr5+2772+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1225 = _mm512_max_ps(_mm512_setzero_ps(), dat1225);
__m512 dat1226 = _mm512_maskz_loadu_ps(16383, datPtr5+14176+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1226 = _mm512_max_ps(_mm512_setzero_ps(), dat1226);
__m512 in315 = _mm512_permutexvar_ps(pm97, dat1225);
__m512 in323 = _mm512_permutexvar_ps(pm98, dat1226);
__m512 tmp1469 = _mm512_add_ps(in309, in313);
__m512 tmp1473 = _mm512_add_ps(in317, in321);
__m512 tmp1470 = _mm512_sub_ps(in312, in310);
__m512 tmp1474 = _mm512_sub_ps(in320, in318);
__m512 tmp1471 = _mm512_add_ps(in310, in314);
__m512 tmp1475 = _mm512_add_ps(in318, in322);
in308 = _mm512_sub_ps(in308, in314);
in316 = _mm512_sub_ps(in316, in322);
tmp1469 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-4.25e+00f), tmp1469);
tmp1473 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-4.25e+00f), tmp1473);
tmp1471 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-4.25e+00f), tmp1471);
tmp1475 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-4.25e+00f), tmp1475);
in308 = _mm512_fmadd_ps(tmp1470, _mm512_set1_ps(5.25e+00f), in308);
in316 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(5.25e+00f), in316);
tmp1470 = _mm512_fmadd_ps(in310, _mm512_set1_ps(2.5e-01f), in314);
tmp1474 = _mm512_fmadd_ps(in318, _mm512_set1_ps(2.5e-01f), in322);
in310 = _mm512_fmadd_ps(in310, _mm512_set1_ps(4e+00f), in314);
in318 = _mm512_fmadd_ps(in318, _mm512_set1_ps(4e+00f), in322);
__m512 tmp1472 = _mm512_sub_ps(tmp1471, tmp1469);
__m512 tmp1476 = _mm512_sub_ps(tmp1475, tmp1473);
tmp1471 = _mm512_add_ps(tmp1469, tmp1471);
tmp1475 = _mm512_add_ps(tmp1473, tmp1475);
tmp1469 = _mm512_fmadd_ps(in309, _mm512_set1_ps(2.5e-01f), in313);
tmp1473 = _mm512_fmadd_ps(in317, _mm512_set1_ps(2.5e-01f), in321);
tmp1470 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-1.25e+00f), tmp1470);
tmp1474 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-1.25e+00f), tmp1474);
in312 = _mm512_fmadd_ps(in312, _mm512_set1_ps(-5e+00f), in310);
in320 = _mm512_fmadd_ps(in320, _mm512_set1_ps(-5e+00f), in318);
tmp1469 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-1.25e+00f), tmp1469);
tmp1473 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-1.25e+00f), tmp1473);
in314 = _mm512_fmadd_ps(tmp1469, _mm512_set1_ps(2e+00f), tmp1470);
in322 = _mm512_fmadd_ps(tmp1473, _mm512_set1_ps(2e+00f), tmp1474);
tmp1470 = _mm512_fnmadd_ps(tmp1469, _mm512_set1_ps(2e+00f), tmp1470);
tmp1474 = _mm512_fnmadd_ps(tmp1473, _mm512_set1_ps(2e+00f), tmp1474);
tmp1469 = _mm512_fmadd_ps(in313, _mm512_set1_ps(2.5e-01f), in309);
tmp1473 = _mm512_fmadd_ps(in321, _mm512_set1_ps(2.5e-01f), in317);
in309 = _mm512_sub_ps(in315, in309);
in317 = _mm512_sub_ps(in323, in317);
tmp1469 = _mm512_fmadd_ps(in311, _mm512_set1_ps(-1.25e+00f), tmp1469);
tmp1473 = _mm512_fmadd_ps(in319, _mm512_set1_ps(-1.25e+00f), tmp1473);
in311 = _mm512_sub_ps(in311, in313);
in319 = _mm512_sub_ps(in319, in321);
in311 = _mm512_fmadd_ps(in311, _mm512_set1_ps(5.25e+00f), in309);
in319 = _mm512_fmadd_ps(in319, _mm512_set1_ps(5.25e+00f), in317);
in310 = _mm512_fmadd_ps(tmp1469, _mm512_set1_ps(2e+00f), in312);
in318 = _mm512_fmadd_ps(tmp1473, _mm512_set1_ps(2e+00f), in320);
in312 = _mm512_fnmadd_ps(tmp1469, _mm512_set1_ps(2e+00f), in312);
in320 = _mm512_fnmadd_ps(tmp1473, _mm512_set1_ps(2e+00f), in320);
__m512 tmp1485 = _mm512_unpacklo_ps(in308, tmp1471);
__m512 tmp1486 = _mm512_unpackhi_ps(in308, tmp1471);
__m512 tmp1487 = _mm512_unpacklo_ps(tmp1472, in314);
__m512 tmp1488 = _mm512_unpackhi_ps(tmp1472, in314);
__m512 tmp1489 = _mm512_unpacklo_ps(tmp1470, in310);
__m512 tmp1490 = _mm512_unpackhi_ps(tmp1470, in310);
__m512 tmp1491 = _mm512_unpacklo_ps(in312, in311);
__m512 tmp1492 = _mm512_unpackhi_ps(in312, in311);
__m512 tmp1493 = _mm512_unpacklo_ps(in316, tmp1475);
__m512 tmp1494 = _mm512_unpackhi_ps(in316, tmp1475);
__m512 tmp1495 = _mm512_unpacklo_ps(tmp1476, in322);
__m512 tmp1496 = _mm512_unpackhi_ps(tmp1476, in322);
__m512 tmp1497 = _mm512_unpacklo_ps(tmp1474, in318);
__m512 tmp1498 = _mm512_unpackhi_ps(tmp1474, in318);
__m512 tmp1499 = _mm512_unpacklo_ps(in320, in319);
__m512 tmp1500 = _mm512_unpackhi_ps(in320, in319);
__m512 tmp1501 = _mm512_shuffle_ps(tmp1485, tmp1487, 68);
__m512 tmp1502 = _mm512_shuffle_ps(tmp1485, tmp1487, 238);
__m512 tmp1503 = _mm512_shuffle_ps(tmp1486, tmp1488, 68);
__m512 tmp1504 = _mm512_shuffle_ps(tmp1486, tmp1488, 238);
__m512 tmp1505 = _mm512_shuffle_ps(tmp1489, tmp1491, 68);
__m512 tmp1506 = _mm512_shuffle_ps(tmp1489, tmp1491, 238);
__m512 tmp1507 = _mm512_shuffle_ps(tmp1490, tmp1492, 68);
__m512 tmp1508 = _mm512_shuffle_ps(tmp1490, tmp1492, 238);
__m512 tmp1509 = _mm512_shuffle_ps(tmp1493, tmp1495, 68);
__m512 tmp1510 = _mm512_shuffle_ps(tmp1493, tmp1495, 238);
__m512 tmp1511 = _mm512_shuffle_ps(tmp1494, tmp1496, 68);
__m512 tmp1512 = _mm512_shuffle_ps(tmp1494, tmp1496, 238);
__m512 tmp1513 = _mm512_shuffle_ps(tmp1497, tmp1499, 68);
__m512 tmp1514 = _mm512_shuffle_ps(tmp1497, tmp1499, 238);
__m512 tmp1515 = _mm512_shuffle_ps(tmp1498, tmp1500, 68);
__m512 tmp1516 = _mm512_shuffle_ps(tmp1498, tmp1500, 238);
__m512 tmp1517 = _mm512_shuffle_f32x4(tmp1501, tmp1505, 136);
__m512 tmp1518 = _mm512_shuffle_f32x4(tmp1501, tmp1505, 221);
__m512 tmp1519 = _mm512_shuffle_f32x4(tmp1502, tmp1506, 136);
__m512 tmp1520 = _mm512_shuffle_f32x4(tmp1502, tmp1506, 221);
__m512 tmp1521 = _mm512_shuffle_f32x4(tmp1503, tmp1507, 136);
__m512 tmp1522 = _mm512_shuffle_f32x4(tmp1503, tmp1507, 221);
__m512 tmp1523 = _mm512_shuffle_f32x4(tmp1504, tmp1508, 136);
__m512 tmp1524 = _mm512_shuffle_f32x4(tmp1504, tmp1508, 221);
__m512 tmp1525 = _mm512_shuffle_f32x4(tmp1509, tmp1513, 136);
__m512 tmp1526 = _mm512_shuffle_f32x4(tmp1509, tmp1513, 221);
__m512 tmp1527 = _mm512_shuffle_f32x4(tmp1510, tmp1514, 136);
__m512 tmp1528 = _mm512_shuffle_f32x4(tmp1510, tmp1514, 221);
__m512 tmp1529 = _mm512_shuffle_f32x4(tmp1511, tmp1515, 136);
__m512 tmp1530 = _mm512_shuffle_f32x4(tmp1511, tmp1515, 221);
__m512 tmp1531 = _mm512_shuffle_f32x4(tmp1512, tmp1516, 136);
__m512 tmp1532 = _mm512_shuffle_f32x4(tmp1512, tmp1516, 221);
in308 = _mm512_shuffle_f32x4(tmp1517, tmp1525, 136);
in316 = _mm512_shuffle_f32x4(tmp1517, tmp1525, 221);
tmp1471 = _mm512_shuffle_f32x4(tmp1519, tmp1527, 136);
tmp1475 = _mm512_shuffle_f32x4(tmp1519, tmp1527, 221);
tmp1472 = _mm512_shuffle_f32x4(tmp1521, tmp1529, 136);
tmp1476 = _mm512_shuffle_f32x4(tmp1521, tmp1529, 221);
in314 = _mm512_shuffle_f32x4(tmp1523, tmp1531, 136);
in322 = _mm512_shuffle_f32x4(tmp1523, tmp1531, 221);
tmp1470 = _mm512_shuffle_f32x4(tmp1518, tmp1526, 136);
tmp1474 = _mm512_shuffle_f32x4(tmp1518, tmp1526, 221);
in310 = _mm512_shuffle_f32x4(tmp1520, tmp1528, 136);
in318 = _mm512_shuffle_f32x4(tmp1520, tmp1528, 221);
in312 = _mm512_shuffle_f32x4(tmp1522, tmp1530, 136);
in320 = _mm512_shuffle_f32x4(tmp1522, tmp1530, 221);
in311 = _mm512_shuffle_f32x4(tmp1524, tmp1532, 136);
in319 = _mm512_shuffle_f32x4(tmp1524, tmp1532, 221);
__m512 tmp1477 = _mm512_add_ps(tmp1471, in310);
__m512 tmp1481 = _mm512_add_ps(tmp1475, in318);
__m512 tmp1478 = _mm512_sub_ps(tmp1470, tmp1472);
__m512 tmp1482 = _mm512_sub_ps(tmp1474, tmp1476);
__m512 tmp1479 = _mm512_add_ps(tmp1472, in312);
__m512 tmp1483 = _mm512_add_ps(tmp1476, in320);
in308 = _mm512_sub_ps(in308, in312);
in316 = _mm512_sub_ps(in316, in320);
tmp1477 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-4.25e+00f), tmp1477);
tmp1481 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-4.25e+00f), tmp1481);
tmp1479 = _mm512_fmadd_ps(tmp1470, _mm512_set1_ps(-4.25e+00f), tmp1479);
tmp1483 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(-4.25e+00f), tmp1483);
in308 = _mm512_fmadd_ps(tmp1478, _mm512_set1_ps(5.25e+00f), in308);
in316 = _mm512_fmadd_ps(tmp1482, _mm512_set1_ps(5.25e+00f), in316);
tmp1478 = _mm512_fmadd_ps(tmp1472, _mm512_set1_ps(2.5e-01f), in312);
tmp1482 = _mm512_fmadd_ps(tmp1476, _mm512_set1_ps(2.5e-01f), in320);
tmp1472 = _mm512_fmadd_ps(tmp1472, _mm512_set1_ps(4e+00f), in312);
tmp1476 = _mm512_fmadd_ps(tmp1476, _mm512_set1_ps(4e+00f), in320);
__m512 tmp1480 = _mm512_sub_ps(tmp1479, tmp1477);
__m512 tmp1484 = _mm512_sub_ps(tmp1483, tmp1481);
tmp1479 = _mm512_add_ps(tmp1477, tmp1479);
tmp1483 = _mm512_add_ps(tmp1481, tmp1483);
tmp1477 = _mm512_fmadd_ps(tmp1471, _mm512_set1_ps(2.5e-01f), in310);
tmp1481 = _mm512_fmadd_ps(tmp1475, _mm512_set1_ps(2.5e-01f), in318);
tmp1478 = _mm512_fmadd_ps(tmp1470, _mm512_set1_ps(-1.25e+00f), tmp1478);
tmp1482 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(-1.25e+00f), tmp1482);
tmp1470 = _mm512_fmadd_ps(tmp1470, _mm512_set1_ps(-5e+00f), tmp1472);
tmp1474 = _mm512_fmadd_ps(tmp1474, _mm512_set1_ps(-5e+00f), tmp1476);
tmp1477 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-1.25e+00f), tmp1477);
tmp1481 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-1.25e+00f), tmp1481);
in312 = _mm512_fmadd_ps(tmp1477, _mm512_set1_ps(2e+00f), tmp1478);
in320 = _mm512_fmadd_ps(tmp1481, _mm512_set1_ps(2e+00f), tmp1482);
tmp1478 = _mm512_fnmadd_ps(tmp1477, _mm512_set1_ps(2e+00f), tmp1478);
tmp1482 = _mm512_fnmadd_ps(tmp1481, _mm512_set1_ps(2e+00f), tmp1482);
tmp1477 = _mm512_fmadd_ps(in310, _mm512_set1_ps(2.5e-01f), tmp1471);
tmp1481 = _mm512_fmadd_ps(in318, _mm512_set1_ps(2.5e-01f), tmp1475);
tmp1471 = _mm512_sub_ps(in311, tmp1471);
tmp1475 = _mm512_sub_ps(in319, tmp1475);
tmp1477 = _mm512_fmadd_ps(in314, _mm512_set1_ps(-1.25e+00f), tmp1477);
tmp1481 = _mm512_fmadd_ps(in322, _mm512_set1_ps(-1.25e+00f), tmp1481);
in314 = _mm512_sub_ps(in314, in310);
in322 = _mm512_sub_ps(in322, in318);
in314 = _mm512_fmadd_ps(in314, _mm512_set1_ps(5.25e+00f), tmp1471);
in322 = _mm512_fmadd_ps(in322, _mm512_set1_ps(5.25e+00f), tmp1475);
tmp1472 = _mm512_fmadd_ps(tmp1477, _mm512_set1_ps(2e+00f), tmp1470);
tmp1476 = _mm512_fmadd_ps(tmp1481, _mm512_set1_ps(2e+00f), tmp1474);
tmp1470 = _mm512_fnmadd_ps(tmp1477, _mm512_set1_ps(2e+00f), tmp1470);
tmp1474 = _mm512_fnmadd_ps(tmp1481, _mm512_set1_ps(2e+00f), tmp1474);
__m512 out327 = _mm512_shuffle_f32x4(in308, tmp1479, 68);
__m512 out335 = _mm512_shuffle_f32x4(in308, tmp1479, 238);
__m512 out328 = _mm512_shuffle_f32x4(tmp1480, in312, 68);
__m512 out336 = _mm512_shuffle_f32x4(tmp1480, in312, 238);
__m512 out329 = _mm512_shuffle_f32x4(tmp1478, tmp1472, 68);
__m512 out337 = _mm512_shuffle_f32x4(tmp1478, tmp1472, 238);
__m512 out330 = _mm512_shuffle_f32x4(tmp1470, in314, 68);
__m512 out338 = _mm512_shuffle_f32x4(tmp1470, in314, 238);
__m512 out331 = _mm512_shuffle_f32x4(in316, tmp1483, 68);
__m512 out339 = _mm512_shuffle_f32x4(in316, tmp1483, 238);
__m512 out332 = _mm512_shuffle_f32x4(tmp1484, in320, 68);
__m512 out340 = _mm512_shuffle_f32x4(tmp1484, in320, 238);
__m512 out333 = _mm512_shuffle_f32x4(tmp1482, tmp1476, 68);
__m512 out341 = _mm512_shuffle_f32x4(tmp1482, tmp1476, 238);
__m512 out334 = _mm512_shuffle_f32x4(tmp1474, in322, 68);
__m512 out342 = _mm512_shuffle_f32x4(tmp1474, in322, 238);
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k57, out327);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k57, out335);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k57, out331);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k57, out339);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k57, out328);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k57, out336);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k57, out332);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k57, out340);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k57, out329);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k57, out337);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k57, out333);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k57, out341);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k57, out330);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k57, out338);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k57, out334);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k57, out342);
// ---- One input tile: load 8 rows, clamp at zero, two transform passes, 16 stores ----
// Eight rows are read, each advancing the datPtr5 offset by 224 (presumably bytes,
// i.e. 56 floats per row -- confirm against datPtr5's declaration). Every row is
// fetched as two masked vectors: mask 511 keeps lanes 0..8, mask 8191 keeps lanes
// 0..12; all other lanes are zeroed by the maskz load.
// Each loaded vector is immediately replaced by max(0, x), i.e. ReLU applied on load.
__m512 dat1227 = _mm512_maskz_loadu_ps(511, datPtr5+12656+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1227 = _mm512_max_ps(_mm512_setzero_ps(), dat1227);
__m512 dat1228 = _mm512_maskz_loadu_ps(8191, datPtr5+13812+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1228 = _mm512_max_ps(_mm512_setzero_ps(), dat1228);
// pm99: lanes 0..7 <- elements 0..7, lanes 8..10 <- elements 6..8, lanes 11..15 <-
// element 15 (masked off by the 9-lane load, hence zero). The result is two 8-wide
// windows that overlap by two elements, the second one zero-filled past element 8.
__m512i pm99 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in324 = _mm512_permutexvar_ps(pm99, dat1227);
// pm100: lane 0 <- element 15 (zero after the 13-lane load), lanes 1..7 <- elements
// 0..6, lanes 8..15 <- elements 5..12. Again two overlapping 8-wide windows, the
// first one with a leading zero.
__m512i pm100 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in332 = _mm512_permutexvar_ps(pm100, dat1228);
__m512 dat1229 = _mm512_maskz_loadu_ps(511, datPtr5+12880+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1229 = _mm512_max_ps(_mm512_setzero_ps(), dat1229);
__m512 dat1230 = _mm512_maskz_loadu_ps(8191, datPtr5+14036+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1230 = _mm512_max_ps(_mm512_setzero_ps(), dat1230);
__m512 in325 = _mm512_permutexvar_ps(pm99, dat1229);
__m512 in333 = _mm512_permutexvar_ps(pm100, dat1230);
__m512 dat1231 = _mm512_maskz_loadu_ps(511, datPtr5+13104+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1231 = _mm512_max_ps(_mm512_setzero_ps(), dat1231);
__m512 dat1232 = _mm512_maskz_loadu_ps(8191, datPtr5+14260+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1232 = _mm512_max_ps(_mm512_setzero_ps(), dat1232);
__m512 in326 = _mm512_permutexvar_ps(pm99, dat1231);
__m512 in334 = _mm512_permutexvar_ps(pm100, dat1232);
__m512 dat1233 = _mm512_maskz_loadu_ps(511, datPtr5+13328+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1233 = _mm512_max_ps(_mm512_setzero_ps(), dat1233);
__m512 dat1234 = _mm512_maskz_loadu_ps(8191, datPtr5+14484+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1234 = _mm512_max_ps(_mm512_setzero_ps(), dat1234);
__m512 in327 = _mm512_permutexvar_ps(pm99, dat1233);
__m512 in335 = _mm512_permutexvar_ps(pm100, dat1234);
__m512 dat1235 = _mm512_maskz_loadu_ps(511, datPtr5+13552+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1235 = _mm512_max_ps(_mm512_setzero_ps(), dat1235);
__m512 dat1236 = _mm512_maskz_loadu_ps(8191, datPtr5+14708+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1236 = _mm512_max_ps(_mm512_setzero_ps(), dat1236);
__m512 in328 = _mm512_permutexvar_ps(pm99, dat1235);
__m512 in336 = _mm512_permutexvar_ps(pm100, dat1236);
__m512 dat1237 = _mm512_maskz_loadu_ps(511, datPtr5+13776+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1237 = _mm512_max_ps(_mm512_setzero_ps(), dat1237);
__m512 dat1238 = _mm512_maskz_loadu_ps(8191, datPtr5+14932+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1238 = _mm512_max_ps(_mm512_setzero_ps(), dat1238);
__m512 in329 = _mm512_permutexvar_ps(pm99, dat1237);
__m512 in337 = _mm512_permutexvar_ps(pm100, dat1238);
__m512 dat1239 = _mm512_maskz_loadu_ps(511, datPtr5+14000+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1239 = _mm512_max_ps(_mm512_setzero_ps(), dat1239);
__m512 dat1240 = _mm512_maskz_loadu_ps(8191, datPtr5+15156+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1240 = _mm512_max_ps(_mm512_setzero_ps(), dat1240);
__m512 in330 = _mm512_permutexvar_ps(pm99, dat1239);
__m512 in338 = _mm512_permutexvar_ps(pm100, dat1240);
__m512 dat1241 = _mm512_maskz_loadu_ps(511, datPtr5+14224+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1241 = _mm512_max_ps(_mm512_setzero_ps(), dat1241);
__m512 dat1242 = _mm512_maskz_loadu_ps(8191, datPtr5+15380+50432*i17+224*h26+4*w29+50432*s12+25216*k57);
dat1242 = _mm512_max_ps(_mm512_setzero_ps(), dat1242);
__m512 in331 = _mm512_permutexvar_ps(pm99, dat1241);
__m512 in339 = _mm512_permutexvar_ps(pm100, dat1242);
// ---- First pass: linear combinations across the eight row vectors ----
// Two independent vector sets are processed in lock-step (in324..in331 and
// in332..in339), every statement appearing once per set. Inputs are overwritten in
// place, so statement order matters. The coefficients (5.25, 4.25, 4, 0.25, 1.25,
// 5, 2) look like the 8-point Winograd input-transform constants -- TODO confirm
// against the generator.
// After this pass the eight results of the first set live in, in order:
// in324, tmp1535, tmp1536, in330, tmp1534, in326, in328, in327.
__m512 tmp1533 = _mm512_add_ps(in325, in329);
__m512 tmp1537 = _mm512_add_ps(in333, in337);
__m512 tmp1534 = _mm512_sub_ps(in328, in326);
__m512 tmp1538 = _mm512_sub_ps(in336, in334);
__m512 tmp1535 = _mm512_add_ps(in326, in330);
__m512 tmp1539 = _mm512_add_ps(in334, in338);
in324 = _mm512_sub_ps(in324, in330);
in332 = _mm512_sub_ps(in332, in338);
tmp1533 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-4.25e+00f), tmp1533);
tmp1537 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-4.25e+00f), tmp1537);
tmp1535 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-4.25e+00f), tmp1535);
tmp1539 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-4.25e+00f), tmp1539);
in324 = _mm512_fmadd_ps(tmp1534, _mm512_set1_ps(5.25e+00f), in324);
in332 = _mm512_fmadd_ps(tmp1538, _mm512_set1_ps(5.25e+00f), in332);
tmp1534 = _mm512_fmadd_ps(in326, _mm512_set1_ps(2.5e-01f), in330);
tmp1538 = _mm512_fmadd_ps(in334, _mm512_set1_ps(2.5e-01f), in338);
in326 = _mm512_fmadd_ps(in326, _mm512_set1_ps(4e+00f), in330);
in334 = _mm512_fmadd_ps(in334, _mm512_set1_ps(4e+00f), in338);
__m512 tmp1536 = _mm512_sub_ps(tmp1535, tmp1533);
__m512 tmp1540 = _mm512_sub_ps(tmp1539, tmp1537);
tmp1535 = _mm512_add_ps(tmp1533, tmp1535);
tmp1539 = _mm512_add_ps(tmp1537, tmp1539);
tmp1533 = _mm512_fmadd_ps(in325, _mm512_set1_ps(2.5e-01f), in329);
tmp1537 = _mm512_fmadd_ps(in333, _mm512_set1_ps(2.5e-01f), in337);
tmp1534 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-1.25e+00f), tmp1534);
tmp1538 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-1.25e+00f), tmp1538);
in328 = _mm512_fmadd_ps(in328, _mm512_set1_ps(-5e+00f), in326);
in336 = _mm512_fmadd_ps(in336, _mm512_set1_ps(-5e+00f), in334);
tmp1533 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-1.25e+00f), tmp1533);
tmp1537 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-1.25e+00f), tmp1537);
in330 = _mm512_fmadd_ps(tmp1533, _mm512_set1_ps(2e+00f), tmp1534);
in338 = _mm512_fmadd_ps(tmp1537, _mm512_set1_ps(2e+00f), tmp1538);
tmp1534 = _mm512_fnmadd_ps(tmp1533, _mm512_set1_ps(2e+00f), tmp1534);
tmp1538 = _mm512_fnmadd_ps(tmp1537, _mm512_set1_ps(2e+00f), tmp1538);
tmp1533 = _mm512_fmadd_ps(in329, _mm512_set1_ps(2.5e-01f), in325);
tmp1537 = _mm512_fmadd_ps(in337, _mm512_set1_ps(2.5e-01f), in333);
in325 = _mm512_sub_ps(in331, in325);
in333 = _mm512_sub_ps(in339, in333);
tmp1533 = _mm512_fmadd_ps(in327, _mm512_set1_ps(-1.25e+00f), tmp1533);
tmp1537 = _mm512_fmadd_ps(in335, _mm512_set1_ps(-1.25e+00f), tmp1537);
in327 = _mm512_sub_ps(in327, in329);
in335 = _mm512_sub_ps(in335, in337);
in327 = _mm512_fmadd_ps(in327, _mm512_set1_ps(5.25e+00f), in325);
in335 = _mm512_fmadd_ps(in335, _mm512_set1_ps(5.25e+00f), in333);
in326 = _mm512_fmadd_ps(tmp1533, _mm512_set1_ps(2e+00f), in328);
in334 = _mm512_fmadd_ps(tmp1537, _mm512_set1_ps(2e+00f), in336);
in328 = _mm512_fnmadd_ps(tmp1533, _mm512_set1_ps(2e+00f), in328);
in336 = _mm512_fnmadd_ps(tmp1537, _mm512_set1_ps(2e+00f), in336);
// ---- Rearrangement between the two passes ----
// Stage 1: interleave 32-bit elements of neighbouring result vectors.
// NOTE(review): stages 1-4 follow the usual unpack / shuffle_ps / shuffle_f32x4
// transpose pattern, so the second pass presumably runs along the other tile axis;
// this was not verified lane by lane.
__m512 tmp1549 = _mm512_unpacklo_ps(in324, tmp1535);
__m512 tmp1550 = _mm512_unpackhi_ps(in324, tmp1535);
__m512 tmp1551 = _mm512_unpacklo_ps(tmp1536, in330);
__m512 tmp1552 = _mm512_unpackhi_ps(tmp1536, in330);
__m512 tmp1553 = _mm512_unpacklo_ps(tmp1534, in326);
__m512 tmp1554 = _mm512_unpackhi_ps(tmp1534, in326);
__m512 tmp1555 = _mm512_unpacklo_ps(in328, in327);
__m512 tmp1556 = _mm512_unpackhi_ps(in328, in327);
__m512 tmp1557 = _mm512_unpacklo_ps(in332, tmp1539);
__m512 tmp1558 = _mm512_unpackhi_ps(in332, tmp1539);
__m512 tmp1559 = _mm512_unpacklo_ps(tmp1540, in338);
__m512 tmp1560 = _mm512_unpackhi_ps(tmp1540, in338);
__m512 tmp1561 = _mm512_unpacklo_ps(tmp1538, in334);
__m512 tmp1562 = _mm512_unpackhi_ps(tmp1538, in334);
__m512 tmp1563 = _mm512_unpacklo_ps(in336, in335);
__m512 tmp1564 = _mm512_unpackhi_ps(in336, in335);
// Stage 2: within each 128-bit lane, 68 picks the low element pair of both operands
// and 238 picks the high element pair.
__m512 tmp1565 = _mm512_shuffle_ps(tmp1549, tmp1551, 68);
__m512 tmp1566 = _mm512_shuffle_ps(tmp1549, tmp1551, 238);
__m512 tmp1567 = _mm512_shuffle_ps(tmp1550, tmp1552, 68);
__m512 tmp1568 = _mm512_shuffle_ps(tmp1550, tmp1552, 238);
__m512 tmp1569 = _mm512_shuffle_ps(tmp1553, tmp1555, 68);
__m512 tmp1570 = _mm512_shuffle_ps(tmp1553, tmp1555, 238);
__m512 tmp1571 = _mm512_shuffle_ps(tmp1554, tmp1556, 68);
__m512 tmp1572 = _mm512_shuffle_ps(tmp1554, tmp1556, 238);
__m512 tmp1573 = _mm512_shuffle_ps(tmp1557, tmp1559, 68);
__m512 tmp1574 = _mm512_shuffle_ps(tmp1557, tmp1559, 238);
__m512 tmp1575 = _mm512_shuffle_ps(tmp1558, tmp1560, 68);
__m512 tmp1576 = _mm512_shuffle_ps(tmp1558, tmp1560, 238);
__m512 tmp1577 = _mm512_shuffle_ps(tmp1561, tmp1563, 68);
__m512 tmp1578 = _mm512_shuffle_ps(tmp1561, tmp1563, 238);
__m512 tmp1579 = _mm512_shuffle_ps(tmp1562, tmp1564, 68);
__m512 tmp1580 = _mm512_shuffle_ps(tmp1562, tmp1564, 238);
// Stage 3: 128-bit lane shuffle; 136 gathers lanes 0 and 2 of each operand,
// 221 gathers lanes 1 and 3.
__m512 tmp1581 = _mm512_shuffle_f32x4(tmp1565, tmp1569, 136);
__m512 tmp1582 = _mm512_shuffle_f32x4(tmp1565, tmp1569, 221);
__m512 tmp1583 = _mm512_shuffle_f32x4(tmp1566, tmp1570, 136);
__m512 tmp1584 = _mm512_shuffle_f32x4(tmp1566, tmp1570, 221);
__m512 tmp1585 = _mm512_shuffle_f32x4(tmp1567, tmp1571, 136);
__m512 tmp1586 = _mm512_shuffle_f32x4(tmp1567, tmp1571, 221);
__m512 tmp1587 = _mm512_shuffle_f32x4(tmp1568, tmp1572, 136);
__m512 tmp1588 = _mm512_shuffle_f32x4(tmp1568, tmp1572, 221);
__m512 tmp1589 = _mm512_shuffle_f32x4(tmp1573, tmp1577, 136);
__m512 tmp1590 = _mm512_shuffle_f32x4(tmp1573, tmp1577, 221);
__m512 tmp1591 = _mm512_shuffle_f32x4(tmp1574, tmp1578, 136);
__m512 tmp1592 = _mm512_shuffle_f32x4(tmp1574, tmp1578, 221);
__m512 tmp1593 = _mm512_shuffle_f32x4(tmp1575, tmp1579, 136);
__m512 tmp1594 = _mm512_shuffle_f32x4(tmp1575, tmp1579, 221);
__m512 tmp1595 = _mm512_shuffle_f32x4(tmp1576, tmp1580, 136);
__m512 tmp1596 = _mm512_shuffle_f32x4(tmp1576, tmp1580, 221);
// Stage 4: same lane selectors again, merging the two vector sets; the results are
// written back into the first-pass variable names so the second pass can reuse them.
in324 = _mm512_shuffle_f32x4(tmp1581, tmp1589, 136);
in332 = _mm512_shuffle_f32x4(tmp1581, tmp1589, 221);
tmp1535 = _mm512_shuffle_f32x4(tmp1583, tmp1591, 136);
tmp1539 = _mm512_shuffle_f32x4(tmp1583, tmp1591, 221);
tmp1536 = _mm512_shuffle_f32x4(tmp1585, tmp1593, 136);
tmp1540 = _mm512_shuffle_f32x4(tmp1585, tmp1593, 221);
in330 = _mm512_shuffle_f32x4(tmp1587, tmp1595, 136);
in338 = _mm512_shuffle_f32x4(tmp1587, tmp1595, 221);
tmp1534 = _mm512_shuffle_f32x4(tmp1582, tmp1590, 136);
tmp1538 = _mm512_shuffle_f32x4(tmp1582, tmp1590, 221);
in326 = _mm512_shuffle_f32x4(tmp1584, tmp1592, 136);
in334 = _mm512_shuffle_f32x4(tmp1584, tmp1592, 221);
in328 = _mm512_shuffle_f32x4(tmp1586, tmp1594, 136);
in336 = _mm512_shuffle_f32x4(tmp1586, tmp1594, 221);
in327 = _mm512_shuffle_f32x4(tmp1588, tmp1596, 136);
in335 = _mm512_shuffle_f32x4(tmp1588, tmp1596, 221);
// ---- Second pass: the same linear combinations as the first pass ----
// Operand roles, in order: in324, tmp1535, tmp1536, in330, tmp1534, in326, in328,
// in327 (and the parallel set in332, tmp1539, tmp1540, in338, tmp1538, in334,
// in336, in335). Values are again updated in place.
__m512 tmp1541 = _mm512_add_ps(tmp1535, in326);
__m512 tmp1545 = _mm512_add_ps(tmp1539, in334);
__m512 tmp1542 = _mm512_sub_ps(tmp1534, tmp1536);
__m512 tmp1546 = _mm512_sub_ps(tmp1538, tmp1540);
__m512 tmp1543 = _mm512_add_ps(tmp1536, in328);
__m512 tmp1547 = _mm512_add_ps(tmp1540, in336);
in324 = _mm512_sub_ps(in324, in328);
in332 = _mm512_sub_ps(in332, in336);
tmp1541 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-4.25e+00f), tmp1541);
tmp1545 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-4.25e+00f), tmp1545);
tmp1543 = _mm512_fmadd_ps(tmp1534, _mm512_set1_ps(-4.25e+00f), tmp1543);
tmp1547 = _mm512_fmadd_ps(tmp1538, _mm512_set1_ps(-4.25e+00f), tmp1547);
in324 = _mm512_fmadd_ps(tmp1542, _mm512_set1_ps(5.25e+00f), in324);
in332 = _mm512_fmadd_ps(tmp1546, _mm512_set1_ps(5.25e+00f), in332);
tmp1542 = _mm512_fmadd_ps(tmp1536, _mm512_set1_ps(2.5e-01f), in328);
tmp1546 = _mm512_fmadd_ps(tmp1540, _mm512_set1_ps(2.5e-01f), in336);
tmp1536 = _mm512_fmadd_ps(tmp1536, _mm512_set1_ps(4e+00f), in328);
tmp1540 = _mm512_fmadd_ps(tmp1540, _mm512_set1_ps(4e+00f), in336);
__m512 tmp1544 = _mm512_sub_ps(tmp1543, tmp1541);
__m512 tmp1548 = _mm512_sub_ps(tmp1547, tmp1545);
tmp1543 = _mm512_add_ps(tmp1541, tmp1543);
tmp1547 = _mm512_add_ps(tmp1545, tmp1547);
tmp1541 = _mm512_fmadd_ps(tmp1535, _mm512_set1_ps(2.5e-01f), in326);
tmp1545 = _mm512_fmadd_ps(tmp1539, _mm512_set1_ps(2.5e-01f), in334);
tmp1542 = _mm512_fmadd_ps(tmp1534, _mm512_set1_ps(-1.25e+00f), tmp1542);
tmp1546 = _mm512_fmadd_ps(tmp1538, _mm512_set1_ps(-1.25e+00f), tmp1546);
tmp1534 = _mm512_fmadd_ps(tmp1534, _mm512_set1_ps(-5e+00f), tmp1536);
tmp1538 = _mm512_fmadd_ps(tmp1538, _mm512_set1_ps(-5e+00f), tmp1540);
tmp1541 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-1.25e+00f), tmp1541);
tmp1545 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-1.25e+00f), tmp1545);
in328 = _mm512_fmadd_ps(tmp1541, _mm512_set1_ps(2e+00f), tmp1542);
in336 = _mm512_fmadd_ps(tmp1545, _mm512_set1_ps(2e+00f), tmp1546);
tmp1542 = _mm512_fnmadd_ps(tmp1541, _mm512_set1_ps(2e+00f), tmp1542);
tmp1546 = _mm512_fnmadd_ps(tmp1545, _mm512_set1_ps(2e+00f), tmp1546);
tmp1541 = _mm512_fmadd_ps(in326, _mm512_set1_ps(2.5e-01f), tmp1535);
tmp1545 = _mm512_fmadd_ps(in334, _mm512_set1_ps(2.5e-01f), tmp1539);
tmp1535 = _mm512_sub_ps(in327, tmp1535);
tmp1539 = _mm512_sub_ps(in335, tmp1539);
tmp1541 = _mm512_fmadd_ps(in330, _mm512_set1_ps(-1.25e+00f), tmp1541);
tmp1545 = _mm512_fmadd_ps(in338, _mm512_set1_ps(-1.25e+00f), tmp1545);
in330 = _mm512_sub_ps(in330, in326);
in338 = _mm512_sub_ps(in338, in334);
in330 = _mm512_fmadd_ps(in330, _mm512_set1_ps(5.25e+00f), tmp1535);
in338 = _mm512_fmadd_ps(in338, _mm512_set1_ps(5.25e+00f), tmp1539);
tmp1536 = _mm512_fmadd_ps(tmp1541, _mm512_set1_ps(2e+00f), tmp1534);
tmp1540 = _mm512_fmadd_ps(tmp1545, _mm512_set1_ps(2e+00f), tmp1538);
tmp1534 = _mm512_fnmadd_ps(tmp1541, _mm512_set1_ps(2e+00f), tmp1534);
tmp1538 = _mm512_fnmadd_ps(tmp1545, _mm512_set1_ps(2e+00f), tmp1538);
// ---- Pack results for storing ----
// Selector 68 concatenates the low 256 bits of both operands, 238 the high 256 bits,
// so each pair of second-pass results yields one "low halves" and one "high halves"
// vector.
__m512 out343 = _mm512_shuffle_f32x4(in324, tmp1543, 68);
__m512 out351 = _mm512_shuffle_f32x4(in324, tmp1543, 238);
__m512 out344 = _mm512_shuffle_f32x4(tmp1544, in328, 68);
__m512 out352 = _mm512_shuffle_f32x4(tmp1544, in328, 238);
__m512 out345 = _mm512_shuffle_f32x4(tmp1542, tmp1536, 68);
__m512 out353 = _mm512_shuffle_f32x4(tmp1542, tmp1536, 238);
__m512 out346 = _mm512_shuffle_f32x4(tmp1534, in330, 68);
__m512 out354 = _mm512_shuffle_f32x4(tmp1534, in330, 238);
__m512 out347 = _mm512_shuffle_f32x4(in332, tmp1547, 68);
__m512 out355 = _mm512_shuffle_f32x4(in332, tmp1547, 238);
__m512 out348 = _mm512_shuffle_f32x4(tmp1548, in336, 68);
__m512 out356 = _mm512_shuffle_f32x4(tmp1548, in336, 238);
__m512 out349 = _mm512_shuffle_f32x4(tmp1546, tmp1540, 68);
__m512 out357 = _mm512_shuffle_f32x4(tmp1546, tmp1540, 238);
__m512 out350 = _mm512_shuffle_f32x4(tmp1538, in338, 68);
__m512 out358 = _mm512_shuffle_f32x4(tmp1538, in338, 238);
// ---- Store 16 vectors to dfPtr4 ----
// Four groups whose base offsets (512, 26112, 51712, 77312) are 25600 apart; within
// a group the four slots are 64 apart and are written in the order +0, +128, +64,
// +192. The slot at +512 follows the +256..+448 slots used by the preceding tile.
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k57, out343);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k57, out351);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k57, out347);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k57, out355);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k57, out344);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k57, out352);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k57, out348);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k57, out356);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k57, out345);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k57, out353);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k57, out349);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k57, out357);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k57, out346);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k57, out354);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k57, out350);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k57, out358);
}
++j11;
}
j11 = 15;
}
ptrdiff_t rel9 = j11-15;
ptrdiff_t base9 = 54;
if (rel9 < 1) {
ptrdiff_t h27 = base9+0;
ptrdiff_t w30 = 0;
ptrdiff_t k58 = 0;
for (; k58 != 2; ++k58) {
// ---- One input tile with only three rows loaded: clamp, two passes, 16 stores ----
// Three rows are read (datPtr5 offset step 224 per row -- presumably bytes, confirm
// against datPtr5's declaration), each as two masked vectors: mask 8191 keeps lanes
// 0..12, mask 16383 keeps lanes 0..13; the remaining lanes are zeroed by the maskz
// load. Each loaded vector is replaced by max(0, x), i.e. ReLU applied on load.
__m512 dat1243 = _mm512_maskz_loadu_ps(8191, datPtr5+4+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1243 = _mm512_max_ps(_mm512_setzero_ps(), dat1243);
__m512 dat1244 = _mm512_maskz_loadu_ps(16383, datPtr5+48+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1244 = _mm512_max_ps(_mm512_setzero_ps(), dat1244);
// pm101: lane 0 <- element 15 (zero after the 13-lane load), lanes 1..7 <- elements
// 0..6, lanes 8..15 <- elements 5..12: two overlapping 8-wide windows, the first one
// with a leading zero.
__m512i pm101 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in340 = _mm512_permutexvar_ps(pm101, dat1243);
// pm102: lanes 0..7 <- elements 0..7, lanes 8..15 <- elements 6..13: two 8-wide
// windows overlapping by two elements.
__m512i pm102 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in343 = _mm512_permutexvar_ps(pm102, dat1244);
__m512 dat1245 = _mm512_maskz_loadu_ps(8191, datPtr5+228+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1245 = _mm512_max_ps(_mm512_setzero_ps(), dat1245);
__m512 dat1246 = _mm512_maskz_loadu_ps(16383, datPtr5+272+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1246 = _mm512_max_ps(_mm512_setzero_ps(), dat1246);
__m512 in341 = _mm512_permutexvar_ps(pm101, dat1245);
__m512 in344 = _mm512_permutexvar_ps(pm102, dat1246);
__m512 dat1247 = _mm512_maskz_loadu_ps(8191, datPtr5+452+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1247 = _mm512_max_ps(_mm512_setzero_ps(), dat1247);
__m512 dat1248 = _mm512_maskz_loadu_ps(16383, datPtr5+496+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1248 = _mm512_max_ps(_mm512_setzero_ps(), dat1248);
__m512 in342 = _mm512_permutexvar_ps(pm101, dat1247);
__m512 in345 = _mm512_permutexvar_ps(pm102, dat1248);
// ---- First pass, specialised for three rows ----
// The arithmetic equals the general 8-row pass with rows 3..7 taken as zero
// (presumably rows past the image edge -- confirm): fmadd degenerates to mul,
// sums degenerate to copies, and the self-assignments (x = x) are no-ops left
// where every term vanished. Two vector sets (in340..in342 and in343..in345) are
// processed in lock-step.
// The eight results of the first set end up in, in order:
// in340, tmp1599, tmp1600, tmp1602, tmp1598, in342, tmp1601, tmp1603.
__m512 tmp1597 = in341;
__m512 tmp1604 = in344;
__m512 tmp1598 = _mm512_sub_ps(_mm512_setzero_ps(), in342);
__m512 tmp1605 = _mm512_sub_ps(_mm512_setzero_ps(), in345);
__m512 tmp1599 = in342;
__m512 tmp1606 = in345;
in340 = in340;
in343 = in343;
tmp1597 = tmp1597;
tmp1604 = tmp1604;
tmp1599 = tmp1599;
tmp1606 = tmp1606;
in340 = _mm512_fmadd_ps(tmp1598, _mm512_set1_ps(5.25e+00f), in340);
in343 = _mm512_fmadd_ps(tmp1605, _mm512_set1_ps(5.25e+00f), in343);
tmp1598 = _mm512_mul_ps(in342, _mm512_set1_ps(2.5e-01f));
tmp1605 = _mm512_mul_ps(in345, _mm512_set1_ps(2.5e-01f));
in342 = _mm512_mul_ps(in342, _mm512_set1_ps(4e+00f));
in345 = _mm512_mul_ps(in345, _mm512_set1_ps(4e+00f));
__m512 tmp1600 = _mm512_sub_ps(tmp1599, tmp1597);
__m512 tmp1607 = _mm512_sub_ps(tmp1606, tmp1604);
tmp1599 = _mm512_add_ps(tmp1597, tmp1599);
tmp1606 = _mm512_add_ps(tmp1604, tmp1606);
tmp1597 = _mm512_mul_ps(in341, _mm512_set1_ps(2.5e-01f));
tmp1604 = _mm512_mul_ps(in344, _mm512_set1_ps(2.5e-01f));
tmp1598 = tmp1598;
tmp1605 = tmp1605;
__m512 tmp1601 = in342;
__m512 tmp1608 = in345;
tmp1597 = tmp1597;
tmp1604 = tmp1604;
__m512 tmp1602 = _mm512_fmadd_ps(tmp1597, _mm512_set1_ps(2e+00f), tmp1598);
__m512 tmp1609 = _mm512_fmadd_ps(tmp1604, _mm512_set1_ps(2e+00f), tmp1605);
tmp1598 = _mm512_fnmadd_ps(tmp1597, _mm512_set1_ps(2e+00f), tmp1598);
tmp1605 = _mm512_fnmadd_ps(tmp1604, _mm512_set1_ps(2e+00f), tmp1605);
tmp1597 = in341;
tmp1604 = in344;
in341 = _mm512_sub_ps(_mm512_setzero_ps(), in341);
in344 = _mm512_sub_ps(_mm512_setzero_ps(), in344);
tmp1597 = tmp1597;
tmp1604 = tmp1604;
__m512 tmp1603 = in341;
__m512 tmp1610 = in344;
in342 = _mm512_fmadd_ps(tmp1597, _mm512_set1_ps(2e+00f), tmp1601);
in345 = _mm512_fmadd_ps(tmp1604, _mm512_set1_ps(2e+00f), tmp1608);
tmp1601 = _mm512_fnmadd_ps(tmp1597, _mm512_set1_ps(2e+00f), tmp1601);
tmp1608 = _mm512_fnmadd_ps(tmp1604, _mm512_set1_ps(2e+00f), tmp1608);
// ---- Rearrangement between the two passes ----
// Stage 1: interleave 32-bit elements of neighbouring result vectors.
// NOTE(review): stages 1-4 follow the usual unpack / shuffle_ps / shuffle_f32x4
// transpose pattern, so the second pass presumably runs along the other tile axis;
// this was not verified lane by lane.
__m512 tmp1619 = _mm512_unpacklo_ps(in340, tmp1599);
__m512 tmp1620 = _mm512_unpackhi_ps(in340, tmp1599);
__m512 tmp1621 = _mm512_unpacklo_ps(tmp1600, tmp1602);
__m512 tmp1622 = _mm512_unpackhi_ps(tmp1600, tmp1602);
__m512 tmp1623 = _mm512_unpacklo_ps(tmp1598, in342);
__m512 tmp1624 = _mm512_unpackhi_ps(tmp1598, in342);
__m512 tmp1625 = _mm512_unpacklo_ps(tmp1601, tmp1603);
__m512 tmp1626 = _mm512_unpackhi_ps(tmp1601, tmp1603);
__m512 tmp1627 = _mm512_unpacklo_ps(in343, tmp1606);
__m512 tmp1628 = _mm512_unpackhi_ps(in343, tmp1606);
__m512 tmp1629 = _mm512_unpacklo_ps(tmp1607, tmp1609);
__m512 tmp1630 = _mm512_unpackhi_ps(tmp1607, tmp1609);
__m512 tmp1631 = _mm512_unpacklo_ps(tmp1605, in345);
__m512 tmp1632 = _mm512_unpackhi_ps(tmp1605, in345);
__m512 tmp1633 = _mm512_unpacklo_ps(tmp1608, tmp1610);
__m512 tmp1634 = _mm512_unpackhi_ps(tmp1608, tmp1610);
// Stage 2: within each 128-bit lane, 68 picks the low element pair of both operands
// and 238 picks the high element pair.
__m512 tmp1635 = _mm512_shuffle_ps(tmp1619, tmp1621, 68);
__m512 tmp1636 = _mm512_shuffle_ps(tmp1619, tmp1621, 238);
__m512 tmp1637 = _mm512_shuffle_ps(tmp1620, tmp1622, 68);
__m512 tmp1638 = _mm512_shuffle_ps(tmp1620, tmp1622, 238);
__m512 tmp1639 = _mm512_shuffle_ps(tmp1623, tmp1625, 68);
__m512 tmp1640 = _mm512_shuffle_ps(tmp1623, tmp1625, 238);
__m512 tmp1641 = _mm512_shuffle_ps(tmp1624, tmp1626, 68);
__m512 tmp1642 = _mm512_shuffle_ps(tmp1624, tmp1626, 238);
__m512 tmp1643 = _mm512_shuffle_ps(tmp1627, tmp1629, 68);
__m512 tmp1644 = _mm512_shuffle_ps(tmp1627, tmp1629, 238);
__m512 tmp1645 = _mm512_shuffle_ps(tmp1628, tmp1630, 68);
__m512 tmp1646 = _mm512_shuffle_ps(tmp1628, tmp1630, 238);
__m512 tmp1647 = _mm512_shuffle_ps(tmp1631, tmp1633, 68);
__m512 tmp1648 = _mm512_shuffle_ps(tmp1631, tmp1633, 238);
__m512 tmp1649 = _mm512_shuffle_ps(tmp1632, tmp1634, 68);
__m512 tmp1650 = _mm512_shuffle_ps(tmp1632, tmp1634, 238);
// Stage 3: 128-bit lane shuffle; 136 gathers lanes 0 and 2 of each operand,
// 221 gathers lanes 1 and 3.
__m512 tmp1651 = _mm512_shuffle_f32x4(tmp1635, tmp1639, 136);
__m512 tmp1652 = _mm512_shuffle_f32x4(tmp1635, tmp1639, 221);
__m512 tmp1653 = _mm512_shuffle_f32x4(tmp1636, tmp1640, 136);
__m512 tmp1654 = _mm512_shuffle_f32x4(tmp1636, tmp1640, 221);
__m512 tmp1655 = _mm512_shuffle_f32x4(tmp1637, tmp1641, 136);
__m512 tmp1656 = _mm512_shuffle_f32x4(tmp1637, tmp1641, 221);
__m512 tmp1657 = _mm512_shuffle_f32x4(tmp1638, tmp1642, 136);
__m512 tmp1658 = _mm512_shuffle_f32x4(tmp1638, tmp1642, 221);
__m512 tmp1659 = _mm512_shuffle_f32x4(tmp1643, tmp1647, 136);
__m512 tmp1660 = _mm512_shuffle_f32x4(tmp1643, tmp1647, 221);
__m512 tmp1661 = _mm512_shuffle_f32x4(tmp1644, tmp1648, 136);
__m512 tmp1662 = _mm512_shuffle_f32x4(tmp1644, tmp1648, 221);
__m512 tmp1663 = _mm512_shuffle_f32x4(tmp1645, tmp1649, 136);
__m512 tmp1664 = _mm512_shuffle_f32x4(tmp1645, tmp1649, 221);
__m512 tmp1665 = _mm512_shuffle_f32x4(tmp1646, tmp1650, 136);
__m512 tmp1666 = _mm512_shuffle_f32x4(tmp1646, tmp1650, 221);
// Stage 4: same lane selectors again, merging the two vector sets; the results are
// written back into the first-pass variable names so the second pass can reuse them.
in340 = _mm512_shuffle_f32x4(tmp1651, tmp1659, 136);
in343 = _mm512_shuffle_f32x4(tmp1651, tmp1659, 221);
tmp1599 = _mm512_shuffle_f32x4(tmp1653, tmp1661, 136);
tmp1606 = _mm512_shuffle_f32x4(tmp1653, tmp1661, 221);
tmp1600 = _mm512_shuffle_f32x4(tmp1655, tmp1663, 136);
tmp1607 = _mm512_shuffle_f32x4(tmp1655, tmp1663, 221);
tmp1602 = _mm512_shuffle_f32x4(tmp1657, tmp1665, 136);
tmp1609 = _mm512_shuffle_f32x4(tmp1657, tmp1665, 221);
tmp1598 = _mm512_shuffle_f32x4(tmp1652, tmp1660, 136);
tmp1605 = _mm512_shuffle_f32x4(tmp1652, tmp1660, 221);
in342 = _mm512_shuffle_f32x4(tmp1654, tmp1662, 136);
in345 = _mm512_shuffle_f32x4(tmp1654, tmp1662, 221);
tmp1601 = _mm512_shuffle_f32x4(tmp1656, tmp1664, 136);
tmp1608 = _mm512_shuffle_f32x4(tmp1656, tmp1664, 221);
tmp1603 = _mm512_shuffle_f32x4(tmp1658, tmp1666, 136);
tmp1610 = _mm512_shuffle_f32x4(tmp1658, tmp1666, 221);
// ---- Second pass: full 8-operand linear combinations ----
// Unlike the first pass nothing is specialised away here: all eight operands
// (in340, tmp1599, tmp1600, tmp1602, tmp1598, in342, tmp1601, tmp1603 and the
// parallel set) take part. Coefficients 5.25, 4.25, 4, 0.25, 1.25, 5, 2 look like
// the 8-point Winograd input-transform constants -- TODO confirm. Values are
// updated in place, so statement order matters.
__m512 tmp1611 = _mm512_add_ps(tmp1599, in342);
__m512 tmp1615 = _mm512_add_ps(tmp1606, in345);
__m512 tmp1612 = _mm512_sub_ps(tmp1598, tmp1600);
__m512 tmp1616 = _mm512_sub_ps(tmp1605, tmp1607);
__m512 tmp1613 = _mm512_add_ps(tmp1600, tmp1601);
__m512 tmp1617 = _mm512_add_ps(tmp1607, tmp1608);
in340 = _mm512_sub_ps(in340, tmp1601);
in343 = _mm512_sub_ps(in343, tmp1608);
tmp1611 = _mm512_fmadd_ps(tmp1602, _mm512_set1_ps(-4.25e+00f), tmp1611);
tmp1615 = _mm512_fmadd_ps(tmp1609, _mm512_set1_ps(-4.25e+00f), tmp1615);
tmp1613 = _mm512_fmadd_ps(tmp1598, _mm512_set1_ps(-4.25e+00f), tmp1613);
tmp1617 = _mm512_fmadd_ps(tmp1605, _mm512_set1_ps(-4.25e+00f), tmp1617);
in340 = _mm512_fmadd_ps(tmp1612, _mm512_set1_ps(5.25e+00f), in340);
in343 = _mm512_fmadd_ps(tmp1616, _mm512_set1_ps(5.25e+00f), in343);
tmp1612 = _mm512_fmadd_ps(tmp1600, _mm512_set1_ps(2.5e-01f), tmp1601);
tmp1616 = _mm512_fmadd_ps(tmp1607, _mm512_set1_ps(2.5e-01f), tmp1608);
tmp1600 = _mm512_fmadd_ps(tmp1600, _mm512_set1_ps(4e+00f), tmp1601);
tmp1607 = _mm512_fmadd_ps(tmp1607, _mm512_set1_ps(4e+00f), tmp1608);
__m512 tmp1614 = _mm512_sub_ps(tmp1613, tmp1611);
__m512 tmp1618 = _mm512_sub_ps(tmp1617, tmp1615);
tmp1613 = _mm512_add_ps(tmp1611, tmp1613);
tmp1617 = _mm512_add_ps(tmp1615, tmp1617);
tmp1611 = _mm512_fmadd_ps(tmp1599, _mm512_set1_ps(2.5e-01f), in342);
tmp1615 = _mm512_fmadd_ps(tmp1606, _mm512_set1_ps(2.5e-01f), in345);
tmp1612 = _mm512_fmadd_ps(tmp1598, _mm512_set1_ps(-1.25e+00f), tmp1612);
tmp1616 = _mm512_fmadd_ps(tmp1605, _mm512_set1_ps(-1.25e+00f), tmp1616);
tmp1598 = _mm512_fmadd_ps(tmp1598, _mm512_set1_ps(-5e+00f), tmp1600);
tmp1605 = _mm512_fmadd_ps(tmp1605, _mm512_set1_ps(-5e+00f), tmp1607);
tmp1611 = _mm512_fmadd_ps(tmp1602, _mm512_set1_ps(-1.25e+00f), tmp1611);
tmp1615 = _mm512_fmadd_ps(tmp1609, _mm512_set1_ps(-1.25e+00f), tmp1615);
tmp1601 = _mm512_fmadd_ps(tmp1611, _mm512_set1_ps(2e+00f), tmp1612);
tmp1608 = _mm512_fmadd_ps(tmp1615, _mm512_set1_ps(2e+00f), tmp1616);
tmp1612 = _mm512_fnmadd_ps(tmp1611, _mm512_set1_ps(2e+00f), tmp1612);
tmp1616 = _mm512_fnmadd_ps(tmp1615, _mm512_set1_ps(2e+00f), tmp1616);
tmp1611 = _mm512_fmadd_ps(in342, _mm512_set1_ps(2.5e-01f), tmp1599);
tmp1615 = _mm512_fmadd_ps(in345, _mm512_set1_ps(2.5e-01f), tmp1606);
tmp1599 = _mm512_sub_ps(tmp1603, tmp1599);
tmp1606 = _mm512_sub_ps(tmp1610, tmp1606);
tmp1611 = _mm512_fmadd_ps(tmp1602, _mm512_set1_ps(-1.25e+00f), tmp1611);
tmp1615 = _mm512_fmadd_ps(tmp1609, _mm512_set1_ps(-1.25e+00f), tmp1615);
tmp1602 = _mm512_sub_ps(tmp1602, in342);
tmp1609 = _mm512_sub_ps(tmp1609, in345);
tmp1602 = _mm512_fmadd_ps(tmp1602, _mm512_set1_ps(5.25e+00f), tmp1599);
tmp1609 = _mm512_fmadd_ps(tmp1609, _mm512_set1_ps(5.25e+00f), tmp1606);
tmp1600 = _mm512_fmadd_ps(tmp1611, _mm512_set1_ps(2e+00f), tmp1598);
tmp1607 = _mm512_fmadd_ps(tmp1615, _mm512_set1_ps(2e+00f), tmp1605);
tmp1598 = _mm512_fnmadd_ps(tmp1611, _mm512_set1_ps(2e+00f), tmp1598);
tmp1605 = _mm512_fnmadd_ps(tmp1615, _mm512_set1_ps(2e+00f), tmp1605);
// ---- Pack results for storing ----
// Selector 68 concatenates the low 256 bits of both operands, 238 the high 256 bits.
__m512 out359 = _mm512_shuffle_f32x4(in340, tmp1613, 68);
__m512 out367 = _mm512_shuffle_f32x4(in340, tmp1613, 238);
__m512 out360 = _mm512_shuffle_f32x4(tmp1614, tmp1601, 68);
__m512 out368 = _mm512_shuffle_f32x4(tmp1614, tmp1601, 238);
__m512 out361 = _mm512_shuffle_f32x4(tmp1612, tmp1600, 68);
__m512 out369 = _mm512_shuffle_f32x4(tmp1612, tmp1600, 238);
__m512 out362 = _mm512_shuffle_f32x4(tmp1598, tmp1602, 68);
__m512 out370 = _mm512_shuffle_f32x4(tmp1598, tmp1602, 238);
__m512 out363 = _mm512_shuffle_f32x4(in343, tmp1617, 68);
__m512 out371 = _mm512_shuffle_f32x4(in343, tmp1617, 238);
__m512 out364 = _mm512_shuffle_f32x4(tmp1618, tmp1608, 68);
__m512 out372 = _mm512_shuffle_f32x4(tmp1618, tmp1608, 238);
__m512 out365 = _mm512_shuffle_f32x4(tmp1616, tmp1607, 68);
__m512 out373 = _mm512_shuffle_f32x4(tmp1616, tmp1607, 238);
__m512 out366 = _mm512_shuffle_f32x4(tmp1605, tmp1609, 68);
__m512 out374 = _mm512_shuffle_f32x4(tmp1605, tmp1609, 238);
// ---- Store 16 vectors to dfPtr4 ----
// Four groups whose base offsets (0, 25600, 51200, 76800) are 25600 apart; within a
// group the four slots are 64 apart and are written in the order +0, +128, +64, +192.
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1536*s12+768*k58, out359);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1536*s12+768*k58, out367);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1536*s12+768*k58, out363);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1536*s12+768*k58, out371);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1536*s12+768*k58, out360);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1536*s12+768*k58, out368);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1536*s12+768*k58, out364);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1536*s12+768*k58, out372);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1536*s12+768*k58, out361);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1536*s12+768*k58, out369);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1536*s12+768*k58, out365);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1536*s12+768*k58, out373);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1536*s12+768*k58, out362);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1536*s12+768*k58, out370);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1536*s12+768*k58, out366);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1536*s12+768*k58, out374);
__m512 dat1249 = _mm512_maskz_loadu_ps(16383, datPtr5+96+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1249 = _mm512_max_ps(_mm512_setzero_ps(), dat1249);
__m512 dat1250 = _mm512_maskz_loadu_ps(8191, datPtr5+12612+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1250 = _mm512_max_ps(_mm512_setzero_ps(), dat1250);
__m512i pm103 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in346 = _mm512_permutexvar_ps(pm103, dat1249);
__m512i pm104 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in349 = _mm512_permutexvar_ps(pm104, dat1250);
__m512 dat1251 = _mm512_maskz_loadu_ps(16383, datPtr5+320+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1251 = _mm512_max_ps(_mm512_setzero_ps(), dat1251);
__m512 dat1252 = _mm512_maskz_loadu_ps(8191, datPtr5+12836+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1252 = _mm512_max_ps(_mm512_setzero_ps(), dat1252);
__m512 in347 = _mm512_permutexvar_ps(pm103, dat1251);
__m512 in350 = _mm512_permutexvar_ps(pm104, dat1252);
__m512 dat1253 = _mm512_maskz_loadu_ps(16383, datPtr5+544+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1253 = _mm512_max_ps(_mm512_setzero_ps(), dat1253);
__m512 dat1254 = _mm512_maskz_loadu_ps(8191, datPtr5+13060+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1254 = _mm512_max_ps(_mm512_setzero_ps(), dat1254);
__m512 in348 = _mm512_permutexvar_ps(pm103, dat1253);
__m512 in351 = _mm512_permutexvar_ps(pm104, dat1254);
__m512 tmp1667 = in347;
__m512 tmp1674 = in350;
__m512 tmp1668 = _mm512_sub_ps(_mm512_setzero_ps(), in348);
__m512 tmp1675 = _mm512_sub_ps(_mm512_setzero_ps(), in351);
__m512 tmp1669 = in348;
__m512 tmp1676 = in351;
in346 = in346;
in349 = in349;
tmp1667 = tmp1667;
tmp1674 = tmp1674;
tmp1669 = tmp1669;
tmp1676 = tmp1676;
in346 = _mm512_fmadd_ps(tmp1668, _mm512_set1_ps(5.25e+00f), in346);
in349 = _mm512_fmadd_ps(tmp1675, _mm512_set1_ps(5.25e+00f), in349);
tmp1668 = _mm512_mul_ps(in348, _mm512_set1_ps(2.5e-01f));
tmp1675 = _mm512_mul_ps(in351, _mm512_set1_ps(2.5e-01f));
in348 = _mm512_mul_ps(in348, _mm512_set1_ps(4e+00f));
in351 = _mm512_mul_ps(in351, _mm512_set1_ps(4e+00f));
__m512 tmp1670 = _mm512_sub_ps(tmp1669, tmp1667);
__m512 tmp1677 = _mm512_sub_ps(tmp1676, tmp1674);
tmp1669 = _mm512_add_ps(tmp1667, tmp1669);
tmp1676 = _mm512_add_ps(tmp1674, tmp1676);
tmp1667 = _mm512_mul_ps(in347, _mm512_set1_ps(2.5e-01f));
tmp1674 = _mm512_mul_ps(in350, _mm512_set1_ps(2.5e-01f));
tmp1668 = tmp1668;
tmp1675 = tmp1675;
__m512 tmp1671 = in348;
__m512 tmp1678 = in351;
tmp1667 = tmp1667;
tmp1674 = tmp1674;
__m512 tmp1672 = _mm512_fmadd_ps(tmp1667, _mm512_set1_ps(2e+00f), tmp1668);
__m512 tmp1679 = _mm512_fmadd_ps(tmp1674, _mm512_set1_ps(2e+00f), tmp1675);
tmp1668 = _mm512_fnmadd_ps(tmp1667, _mm512_set1_ps(2e+00f), tmp1668);
tmp1675 = _mm512_fnmadd_ps(tmp1674, _mm512_set1_ps(2e+00f), tmp1675);
tmp1667 = in347;
tmp1674 = in350;
in347 = _mm512_sub_ps(_mm512_setzero_ps(), in347);
in350 = _mm512_sub_ps(_mm512_setzero_ps(), in350);
tmp1667 = tmp1667;
tmp1674 = tmp1674;
__m512 tmp1673 = in347;
__m512 tmp1680 = in350;
in348 = _mm512_fmadd_ps(tmp1667, _mm512_set1_ps(2e+00f), tmp1671);
in351 = _mm512_fmadd_ps(tmp1674, _mm512_set1_ps(2e+00f), tmp1678);
tmp1671 = _mm512_fnmadd_ps(tmp1667, _mm512_set1_ps(2e+00f), tmp1671);
tmp1678 = _mm512_fnmadd_ps(tmp1674, _mm512_set1_ps(2e+00f), tmp1678);
__m512 tmp1689 = _mm512_unpacklo_ps(in346, tmp1669);
__m512 tmp1690 = _mm512_unpackhi_ps(in346, tmp1669);
__m512 tmp1691 = _mm512_unpacklo_ps(tmp1670, tmp1672);
__m512 tmp1692 = _mm512_unpackhi_ps(tmp1670, tmp1672);
__m512 tmp1693 = _mm512_unpacklo_ps(tmp1668, in348);
__m512 tmp1694 = _mm512_unpackhi_ps(tmp1668, in348);
__m512 tmp1695 = _mm512_unpacklo_ps(tmp1671, tmp1673);
__m512 tmp1696 = _mm512_unpackhi_ps(tmp1671, tmp1673);
__m512 tmp1697 = _mm512_unpacklo_ps(in349, tmp1676);
__m512 tmp1698 = _mm512_unpackhi_ps(in349, tmp1676);
__m512 tmp1699 = _mm512_unpacklo_ps(tmp1677, tmp1679);
__m512 tmp1700 = _mm512_unpackhi_ps(tmp1677, tmp1679);
__m512 tmp1701 = _mm512_unpacklo_ps(tmp1675, in351);
__m512 tmp1702 = _mm512_unpackhi_ps(tmp1675, in351);
__m512 tmp1703 = _mm512_unpacklo_ps(tmp1678, tmp1680);
__m512 tmp1704 = _mm512_unpackhi_ps(tmp1678, tmp1680);
__m512 tmp1705 = _mm512_shuffle_ps(tmp1689, tmp1691, 68);
__m512 tmp1706 = _mm512_shuffle_ps(tmp1689, tmp1691, 238);
__m512 tmp1707 = _mm512_shuffle_ps(tmp1690, tmp1692, 68);
__m512 tmp1708 = _mm512_shuffle_ps(tmp1690, tmp1692, 238);
__m512 tmp1709 = _mm512_shuffle_ps(tmp1693, tmp1695, 68);
__m512 tmp1710 = _mm512_shuffle_ps(tmp1693, tmp1695, 238);
__m512 tmp1711 = _mm512_shuffle_ps(tmp1694, tmp1696, 68);
__m512 tmp1712 = _mm512_shuffle_ps(tmp1694, tmp1696, 238);
__m512 tmp1713 = _mm512_shuffle_ps(tmp1697, tmp1699, 68);
__m512 tmp1714 = _mm512_shuffle_ps(tmp1697, tmp1699, 238);
__m512 tmp1715 = _mm512_shuffle_ps(tmp1698, tmp1700, 68);
__m512 tmp1716 = _mm512_shuffle_ps(tmp1698, tmp1700, 238);
__m512 tmp1717 = _mm512_shuffle_ps(tmp1701, tmp1703, 68);
__m512 tmp1718 = _mm512_shuffle_ps(tmp1701, tmp1703, 238);
__m512 tmp1719 = _mm512_shuffle_ps(tmp1702, tmp1704, 68);
__m512 tmp1720 = _mm512_shuffle_ps(tmp1702, tmp1704, 238);
__m512 tmp1721 = _mm512_shuffle_f32x4(tmp1705, tmp1709, 136);
__m512 tmp1722 = _mm512_shuffle_f32x4(tmp1705, tmp1709, 221);
__m512 tmp1723 = _mm512_shuffle_f32x4(tmp1706, tmp1710, 136);
__m512 tmp1724 = _mm512_shuffle_f32x4(tmp1706, tmp1710, 221);
__m512 tmp1725 = _mm512_shuffle_f32x4(tmp1707, tmp1711, 136);
__m512 tmp1726 = _mm512_shuffle_f32x4(tmp1707, tmp1711, 221);
__m512 tmp1727 = _mm512_shuffle_f32x4(tmp1708, tmp1712, 136);
__m512 tmp1728 = _mm512_shuffle_f32x4(tmp1708, tmp1712, 221);
__m512 tmp1729 = _mm512_shuffle_f32x4(tmp1713, tmp1717, 136);
__m512 tmp1730 = _mm512_shuffle_f32x4(tmp1713, tmp1717, 221);
__m512 tmp1731 = _mm512_shuffle_f32x4(tmp1714, tmp1718, 136);
__m512 tmp1732 = _mm512_shuffle_f32x4(tmp1714, tmp1718, 221);
__m512 tmp1733 = _mm512_shuffle_f32x4(tmp1715, tmp1719, 136);
__m512 tmp1734 = _mm512_shuffle_f32x4(tmp1715, tmp1719, 221);
__m512 tmp1735 = _mm512_shuffle_f32x4(tmp1716, tmp1720, 136);
__m512 tmp1736 = _mm512_shuffle_f32x4(tmp1716, tmp1720, 221);
in346 = _mm512_shuffle_f32x4(tmp1721, tmp1729, 136);
in349 = _mm512_shuffle_f32x4(tmp1721, tmp1729, 221);
tmp1669 = _mm512_shuffle_f32x4(tmp1723, tmp1731, 136);
tmp1676 = _mm512_shuffle_f32x4(tmp1723, tmp1731, 221);
tmp1670 = _mm512_shuffle_f32x4(tmp1725, tmp1733, 136);
tmp1677 = _mm512_shuffle_f32x4(tmp1725, tmp1733, 221);
tmp1672 = _mm512_shuffle_f32x4(tmp1727, tmp1735, 136);
tmp1679 = _mm512_shuffle_f32x4(tmp1727, tmp1735, 221);
tmp1668 = _mm512_shuffle_f32x4(tmp1722, tmp1730, 136);
tmp1675 = _mm512_shuffle_f32x4(tmp1722, tmp1730, 221);
in348 = _mm512_shuffle_f32x4(tmp1724, tmp1732, 136);
in351 = _mm512_shuffle_f32x4(tmp1724, tmp1732, 221);
tmp1671 = _mm512_shuffle_f32x4(tmp1726, tmp1734, 136);
tmp1678 = _mm512_shuffle_f32x4(tmp1726, tmp1734, 221);
tmp1673 = _mm512_shuffle_f32x4(tmp1728, tmp1736, 136);
tmp1680 = _mm512_shuffle_f32x4(tmp1728, tmp1736, 221);
__m512 tmp1681 = _mm512_add_ps(tmp1669, in348);
__m512 tmp1685 = _mm512_add_ps(tmp1676, in351);
__m512 tmp1682 = _mm512_sub_ps(tmp1668, tmp1670);
__m512 tmp1686 = _mm512_sub_ps(tmp1675, tmp1677);
__m512 tmp1683 = _mm512_add_ps(tmp1670, tmp1671);
__m512 tmp1687 = _mm512_add_ps(tmp1677, tmp1678);
in346 = _mm512_sub_ps(in346, tmp1671);
in349 = _mm512_sub_ps(in349, tmp1678);
tmp1681 = _mm512_fmadd_ps(tmp1672, _mm512_set1_ps(-4.25e+00f), tmp1681);
tmp1685 = _mm512_fmadd_ps(tmp1679, _mm512_set1_ps(-4.25e+00f), tmp1685);
tmp1683 = _mm512_fmadd_ps(tmp1668, _mm512_set1_ps(-4.25e+00f), tmp1683);
tmp1687 = _mm512_fmadd_ps(tmp1675, _mm512_set1_ps(-4.25e+00f), tmp1687);
in346 = _mm512_fmadd_ps(tmp1682, _mm512_set1_ps(5.25e+00f), in346);
in349 = _mm512_fmadd_ps(tmp1686, _mm512_set1_ps(5.25e+00f), in349);
tmp1682 = _mm512_fmadd_ps(tmp1670, _mm512_set1_ps(2.5e-01f), tmp1671);
tmp1686 = _mm512_fmadd_ps(tmp1677, _mm512_set1_ps(2.5e-01f), tmp1678);
tmp1670 = _mm512_fmadd_ps(tmp1670, _mm512_set1_ps(4e+00f), tmp1671);
tmp1677 = _mm512_fmadd_ps(tmp1677, _mm512_set1_ps(4e+00f), tmp1678);
__m512 tmp1684 = _mm512_sub_ps(tmp1683, tmp1681);
__m512 tmp1688 = _mm512_sub_ps(tmp1687, tmp1685);
tmp1683 = _mm512_add_ps(tmp1681, tmp1683);
tmp1687 = _mm512_add_ps(tmp1685, tmp1687);
tmp1681 = _mm512_fmadd_ps(tmp1669, _mm512_set1_ps(2.5e-01f), in348);
tmp1685 = _mm512_fmadd_ps(tmp1676, _mm512_set1_ps(2.5e-01f), in351);
tmp1682 = _mm512_fmadd_ps(tmp1668, _mm512_set1_ps(-1.25e+00f), tmp1682);
tmp1686 = _mm512_fmadd_ps(tmp1675, _mm512_set1_ps(-1.25e+00f), tmp1686);
tmp1668 = _mm512_fmadd_ps(tmp1668, _mm512_set1_ps(-5e+00f), tmp1670);
tmp1675 = _mm512_fmadd_ps(tmp1675, _mm512_set1_ps(-5e+00f), tmp1677);
tmp1681 = _mm512_fmadd_ps(tmp1672, _mm512_set1_ps(-1.25e+00f), tmp1681);
tmp1685 = _mm512_fmadd_ps(tmp1679, _mm512_set1_ps(-1.25e+00f), tmp1685);
tmp1671 = _mm512_fmadd_ps(tmp1681, _mm512_set1_ps(2e+00f), tmp1682);
tmp1678 = _mm512_fmadd_ps(tmp1685, _mm512_set1_ps(2e+00f), tmp1686);
tmp1682 = _mm512_fnmadd_ps(tmp1681, _mm512_set1_ps(2e+00f), tmp1682);
tmp1686 = _mm512_fnmadd_ps(tmp1685, _mm512_set1_ps(2e+00f), tmp1686);
tmp1681 = _mm512_fmadd_ps(in348, _mm512_set1_ps(2.5e-01f), tmp1669);
tmp1685 = _mm512_fmadd_ps(in351, _mm512_set1_ps(2.5e-01f), tmp1676);
tmp1669 = _mm512_sub_ps(tmp1673, tmp1669);
tmp1676 = _mm512_sub_ps(tmp1680, tmp1676);
tmp1681 = _mm512_fmadd_ps(tmp1672, _mm512_set1_ps(-1.25e+00f), tmp1681);
tmp1685 = _mm512_fmadd_ps(tmp1679, _mm512_set1_ps(-1.25e+00f), tmp1685);
tmp1672 = _mm512_sub_ps(tmp1672, in348);
tmp1679 = _mm512_sub_ps(tmp1679, in351);
tmp1672 = _mm512_fmadd_ps(tmp1672, _mm512_set1_ps(5.25e+00f), tmp1669);
tmp1679 = _mm512_fmadd_ps(tmp1679, _mm512_set1_ps(5.25e+00f), tmp1676);
tmp1670 = _mm512_fmadd_ps(tmp1681, _mm512_set1_ps(2e+00f), tmp1668);
tmp1677 = _mm512_fmadd_ps(tmp1685, _mm512_set1_ps(2e+00f), tmp1675);
tmp1668 = _mm512_fnmadd_ps(tmp1681, _mm512_set1_ps(2e+00f), tmp1668);
tmp1675 = _mm512_fnmadd_ps(tmp1685, _mm512_set1_ps(2e+00f), tmp1675);
__m512 out375 = _mm512_shuffle_f32x4(in346, tmp1683, 68);
__m512 out383 = _mm512_shuffle_f32x4(in346, tmp1683, 238);
__m512 out376 = _mm512_shuffle_f32x4(tmp1684, tmp1671, 68);
__m512 out384 = _mm512_shuffle_f32x4(tmp1684, tmp1671, 238);
__m512 out377 = _mm512_shuffle_f32x4(tmp1682, tmp1670, 68);
__m512 out385 = _mm512_shuffle_f32x4(tmp1682, tmp1670, 238);
__m512 out378 = _mm512_shuffle_f32x4(tmp1668, tmp1672, 68);
__m512 out386 = _mm512_shuffle_f32x4(tmp1668, tmp1672, 238);
__m512 out379 = _mm512_shuffle_f32x4(in349, tmp1687, 68);
__m512 out387 = _mm512_shuffle_f32x4(in349, tmp1687, 238);
__m512 out380 = _mm512_shuffle_f32x4(tmp1688, tmp1678, 68);
__m512 out388 = _mm512_shuffle_f32x4(tmp1688, tmp1678, 238);
__m512 out381 = _mm512_shuffle_f32x4(tmp1686, tmp1677, 68);
__m512 out389 = _mm512_shuffle_f32x4(tmp1686, tmp1677, 238);
__m512 out382 = _mm512_shuffle_f32x4(tmp1675, tmp1679, 68);
__m512 out390 = _mm512_shuffle_f32x4(tmp1675, tmp1679, 238);
_mm512_storeu_ps(dfPtr4+256+102400*i17+1536*j11+1536*s12+768*k58, out375);
_mm512_storeu_ps(dfPtr4+384+102400*i17+1536*j11+1536*s12+768*k58, out383);
_mm512_storeu_ps(dfPtr4+320+102400*i17+1536*j11+1536*s12+768*k58, out379);
_mm512_storeu_ps(dfPtr4+448+102400*i17+1536*j11+1536*s12+768*k58, out387);
_mm512_storeu_ps(dfPtr4+25856+102400*i17+1536*j11+1536*s12+768*k58, out376);
_mm512_storeu_ps(dfPtr4+25984+102400*i17+1536*j11+1536*s12+768*k58, out384);
_mm512_storeu_ps(dfPtr4+25920+102400*i17+1536*j11+1536*s12+768*k58, out380);
_mm512_storeu_ps(dfPtr4+26048+102400*i17+1536*j11+1536*s12+768*k58, out388);
_mm512_storeu_ps(dfPtr4+51456+102400*i17+1536*j11+1536*s12+768*k58, out377);
_mm512_storeu_ps(dfPtr4+51584+102400*i17+1536*j11+1536*s12+768*k58, out385);
_mm512_storeu_ps(dfPtr4+51520+102400*i17+1536*j11+1536*s12+768*k58, out381);
_mm512_storeu_ps(dfPtr4+51648+102400*i17+1536*j11+1536*s12+768*k58, out389);
_mm512_storeu_ps(dfPtr4+77056+102400*i17+1536*j11+1536*s12+768*k58, out378);
_mm512_storeu_ps(dfPtr4+77184+102400*i17+1536*j11+1536*s12+768*k58, out386);
_mm512_storeu_ps(dfPtr4+77120+102400*i17+1536*j11+1536*s12+768*k58, out382);
_mm512_storeu_ps(dfPtr4+77248+102400*i17+1536*j11+1536*s12+768*k58, out390);
__m512 dat1255 = _mm512_maskz_loadu_ps(16383, datPtr5+12656+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1255 = _mm512_max_ps(_mm512_setzero_ps(), dat1255);
__m512 dat1256 = _mm512_maskz_loadu_ps(16383, datPtr5+12704+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1256 = _mm512_max_ps(_mm512_setzero_ps(), dat1256);
__m512i pm105 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in352 = _mm512_permutexvar_ps(pm105, dat1255);
__m512 in355 = _mm512_permutexvar_ps(pm105, dat1256);
__m512 dat1257 = _mm512_maskz_loadu_ps(16383, datPtr5+12880+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1257 = _mm512_max_ps(_mm512_setzero_ps(), dat1257);
__m512 dat1258 = _mm512_maskz_loadu_ps(16383, datPtr5+12928+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1258 = _mm512_max_ps(_mm512_setzero_ps(), dat1258);
__m512 in353 = _mm512_permutexvar_ps(pm105, dat1257);
__m512 in356 = _mm512_permutexvar_ps(pm105, dat1258);
__m512 dat1259 = _mm512_maskz_loadu_ps(16383, datPtr5+13104+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1259 = _mm512_max_ps(_mm512_setzero_ps(), dat1259);
__m512 dat1260 = _mm512_maskz_loadu_ps(16383, datPtr5+13152+50432*i17+224*h27+4*w30+50432*s12+25216*k58);
dat1260 = _mm512_max_ps(_mm512_setzero_ps(), dat1260);
__m512 in354 = _mm512_permutexvar_ps(pm105, dat1259);
__m512 in357 = _mm512_permutexvar_ps(pm105, dat1260);
__m512 tmp1737 = in353;
__m512 tmp1744 = in356;
__m512 tmp1738 = _mm512_sub_ps(_mm512_setzero_ps(), in354);
__m512 tmp1745 = _mm512_sub_ps(_mm512_setzero_ps(), in357);
__m512 tmp1739 = in354;
__m512 tmp1746 = in357;
in352 = in352;
in355 = in355;
tmp1737 = tmp1737;
tmp1744 = tmp1744;
tmp1739 = tmp1739;
tmp1746 = tmp1746;
in352 = _mm512_fmadd_ps(tmp1738, _mm512_set1_ps(5.25e+00f), in352);
in355 = _mm512_fmadd_ps(tmp1745, _mm512_set1_ps(5.25e+00f), in355);
tmp1738 = _mm512_mul_ps(in354, _mm512_set1_ps(2.5e-01f));
tmp1745 = _mm512_mul_ps(in357, _mm512_set1_ps(2.5e-01f));
in354 = _mm512_mul_ps(in354, _mm512_set1_ps(4e+00f));
in357 = _mm512_mul_ps(in357, _mm512_set1_ps(4e+00f));
__m512 tmp1740 = _mm512_sub_ps(tmp1739, tmp1737);
__m512 tmp1747 = _mm512_sub_ps(tmp1746, tmp1744);
tmp1739 = _mm512_add_ps(tmp1737, tmp1739);
tmp1746 = _mm512_add_ps(tmp1744, tmp1746);
tmp1737 = _mm512_mul_ps(in353, _mm512_set1_ps(2.5e-01f));
tmp1744 = _mm512_mul_ps(in356, _mm512_set1_ps(2.5e-01f));
tmp1738 = tmp1738;
tmp1745 = tmp1745;
__m512 tmp1741 = in354;
__m512 tmp1748 = in357;
tmp1737 = tmp1737;
tmp1744 = tmp1744;
__m512 tmp1742 = _mm512_fmadd_ps(tmp1737, _mm512_set1_ps(2e+00f), tmp1738);
__m512 tmp1749 = _mm512_fmadd_ps(tmp1744, _mm512_set1_ps(2e+00f), tmp1745);
tmp1738 = _mm512_fnmadd_ps(tmp1737, _mm512_set1_ps(2e+00f), tmp1738);
tmp1745 = _mm512_fnmadd_ps(tmp1744, _mm512_set1_ps(2e+00f), tmp1745);
tmp1737 = in353;
tmp1744 = in356;
in353 = _mm512_sub_ps(_mm512_setzero_ps(), in353);
in356 = _mm512_sub_ps(_mm512_setzero_ps(), in356);
tmp1737 = tmp1737;
tmp1744 = tmp1744;
__m512 tmp1743 = in353;
__m512 tmp1750 = in356;
in354 = _mm512_fmadd_ps(tmp1737, _mm512_set1_ps(2e+00f), tmp1741);
in357 = _mm512_fmadd_ps(tmp1744, _mm512_set1_ps(2e+00f), tmp1748);
tmp1741 = _mm512_fnmadd_ps(tmp1737, _mm512_set1_ps(2e+00f), tmp1741);
tmp1748 = _mm512_fnmadd_ps(tmp1744, _mm512_set1_ps(2e+00f), tmp1748);
__m512 tmp1759 = _mm512_unpacklo_ps(in352, tmp1739);
__m512 tmp1760 = _mm512_unpackhi_ps(in352, tmp1739);
__m512 tmp1761 = _mm512_unpacklo_ps(tmp1740, tmp1742);
__m512 tmp1762 = _mm512_unpackhi_ps(tmp1740, tmp1742);
__m512 tmp1763 = _mm512_unpacklo_ps(tmp1738, in354);
__m512 tmp1764 = _mm512_unpackhi_ps(tmp1738, in354);
__m512 tmp1765 = _mm512_unpacklo_ps(tmp1741, tmp1743);
__m512 tmp1766 = _mm512_unpackhi_ps(tmp1741, tmp1743);
__m512 tmp1767 = _mm512_unpacklo_ps(in355, tmp1746);
__m512 tmp1768 = _mm512_unpackhi_ps(in355, tmp1746);
__m512 tmp1769 = _mm512_unpacklo_ps(tmp1747, tmp1749);
__m512 tmp1770 = _mm512_unpackhi_ps(tmp1747, tmp1749);
__m512 tmp1771 = _mm512_unpacklo_ps(tmp1745, in357);
__m512 tmp1772 = _mm512_unpackhi_ps(tmp1745, in357);
__m512 tmp1773 = _mm512_unpacklo_ps(tmp1748, tmp1750);
__m512 tmp1774 = _mm512_unpackhi_ps(tmp1748, tmp1750);
__m512 tmp1775 = _mm512_shuffle_ps(tmp1759, tmp1761, 68);
__m512 tmp1776 = _mm512_shuffle_ps(tmp1759, tmp1761, 238);
__m512 tmp1777 = _mm512_shuffle_ps(tmp1760, tmp1762, 68);
__m512 tmp1778 = _mm512_shuffle_ps(tmp1760, tmp1762, 238);
__m512 tmp1779 = _mm512_shuffle_ps(tmp1763, tmp1765, 68);
__m512 tmp1780 = _mm512_shuffle_ps(tmp1763, tmp1765, 238);
__m512 tmp1781 = _mm512_shuffle_ps(tmp1764, tmp1766, 68);
__m512 tmp1782 = _mm512_shuffle_ps(tmp1764, tmp1766, 238);
__m512 tmp1783 = _mm512_shuffle_ps(tmp1767, tmp1769, 68);
__m512 tmp1784 = _mm512_shuffle_ps(tmp1767, tmp1769, 238);
__m512 tmp1785 = _mm512_shuffle_ps(tmp1768, tmp1770, 68);
__m512 tmp1786 = _mm512_shuffle_ps(tmp1768, tmp1770, 238);
__m512 tmp1787 = _mm512_shuffle_ps(tmp1771, tmp1773, 68);
__m512 tmp1788 = _mm512_shuffle_ps(tmp1771, tmp1773, 238);
__m512 tmp1789 = _mm512_shuffle_ps(tmp1772, tmp1774, 68);
__m512 tmp1790 = _mm512_shuffle_ps(tmp1772, tmp1774, 238);
__m512 tmp1791 = _mm512_shuffle_f32x4(tmp1775, tmp1779, 136);
__m512 tmp1792 = _mm512_shuffle_f32x4(tmp1775, tmp1779, 221);
__m512 tmp1793 = _mm512_shuffle_f32x4(tmp1776, tmp1780, 136);
__m512 tmp1794 = _mm512_shuffle_f32x4(tmp1776, tmp1780, 221);
__m512 tmp1795 = _mm512_shuffle_f32x4(tmp1777, tmp1781, 136);
__m512 tmp1796 = _mm512_shuffle_f32x4(tmp1777, tmp1781, 221);
__m512 tmp1797 = _mm512_shuffle_f32x4(tmp1778, tmp1782, 136);
__m512 tmp1798 = _mm512_shuffle_f32x4(tmp1778, tmp1782, 221);
__m512 tmp1799 = _mm512_shuffle_f32x4(tmp1783, tmp1787, 136);
__m512 tmp1800 = _mm512_shuffle_f32x4(tmp1783, tmp1787, 221);
__m512 tmp1801 = _mm512_shuffle_f32x4(tmp1784, tmp1788, 136);
__m512 tmp1802 = _mm512_shuffle_f32x4(tmp1784, tmp1788, 221);
__m512 tmp1803 = _mm512_shuffle_f32x4(tmp1785, tmp1789, 136);
__m512 tmp1804 = _mm512_shuffle_f32x4(tmp1785, tmp1789, 221);
__m512 tmp1805 = _mm512_shuffle_f32x4(tmp1786, tmp1790, 136);
__m512 tmp1806 = _mm512_shuffle_f32x4(tmp1786, tmp1790, 221);
in352 = _mm512_shuffle_f32x4(tmp1791, tmp1799, 136);
in355 = _mm512_shuffle_f32x4(tmp1791, tmp1799, 221);
tmp1739 = _mm512_shuffle_f32x4(tmp1793, tmp1801, 136);
tmp1746 = _mm512_shuffle_f32x4(tmp1793, tmp1801, 221);
tmp1740 = _mm512_shuffle_f32x4(tmp1795, tmp1803, 136);
tmp1747 = _mm512_shuffle_f32x4(tmp1795, tmp1803, 221);
tmp1742 = _mm512_shuffle_f32x4(tmp1797, tmp1805, 136);
tmp1749 = _mm512_shuffle_f32x4(tmp1797, tmp1805, 221);
tmp1738 = _mm512_shuffle_f32x4(tmp1792, tmp1800, 136);
tmp1745 = _mm512_shuffle_f32x4(tmp1792, tmp1800, 221);
in354 = _mm512_shuffle_f32x4(tmp1794, tmp1802, 136);
in357 = _mm512_shuffle_f32x4(tmp1794, tmp1802, 221);
tmp1741 = _mm512_shuffle_f32x4(tmp1796, tmp1804, 136);
tmp1748 = _mm512_shuffle_f32x4(tmp1796, tmp1804, 221);
tmp1743 = _mm512_shuffle_f32x4(tmp1798, tmp1806, 136);
tmp1750 = _mm512_shuffle_f32x4(tmp1798, tmp1806, 221);
__m512 tmp1751 = _mm512_add_ps(tmp1739, in354);
__m512 tmp1755 = _mm512_add_ps(tmp1746, in357);
__m512 tmp1752 = _mm512_sub_ps(tmp1738, tmp1740);
__m512 tmp1756 = _mm512_sub_ps(tmp1745, tmp1747);
__m512 tmp1753 = _mm512_add_ps(tmp1740, tmp1741);
__m512 tmp1757 = _mm512_add_ps(tmp1747, tmp1748);
in352 = _mm512_sub_ps(in352, tmp1741);
in355 = _mm512_sub_ps(in355, tmp1748);
tmp1751 = _mm512_fmadd_ps(tmp1742, _mm512_set1_ps(-4.25e+00f), tmp1751);
tmp1755 = _mm512_fmadd_ps(tmp1749, _mm512_set1_ps(-4.25e+00f), tmp1755);
tmp1753 = _mm512_fmadd_ps(tmp1738, _mm512_set1_ps(-4.25e+00f), tmp1753);
tmp1757 = _mm512_fmadd_ps(tmp1745, _mm512_set1_ps(-4.25e+00f), tmp1757);
in352 = _mm512_fmadd_ps(tmp1752, _mm512_set1_ps(5.25e+00f), in352);
in355 = _mm512_fmadd_ps(tmp1756, _mm512_set1_ps(5.25e+00f), in355);
tmp1752 = _mm512_fmadd_ps(tmp1740, _mm512_set1_ps(2.5e-01f), tmp1741);
tmp1756 = _mm512_fmadd_ps(tmp1747, _mm512_set1_ps(2.5e-01f), tmp1748);
tmp1740 = _mm512_fmadd_ps(tmp1740, _mm512_set1_ps(4e+00f), tmp1741);
tmp1747 = _mm512_fmadd_ps(tmp1747, _mm512_set1_ps(4e+00f), tmp1748);
__m512 tmp1754 = _mm512_sub_ps(tmp1753, tmp1751);
__m512 tmp1758 = _mm512_sub_ps(tmp1757, tmp1755);
tmp1753 = _mm512_add_ps(tmp1751, tmp1753);
tmp1757 = _mm512_add_ps(tmp1755, tmp1757);
tmp1751 = _mm512_fmadd_ps(tmp1739, _mm512_set1_ps(2.5e-01f), in354);
tmp1755 = _mm512_fmadd_ps(tmp1746, _mm512_set1_ps(2.5e-01f), in357);
tmp1752 = _mm512_fmadd_ps(tmp1738, _mm512_set1_ps(-1.25e+00f), tmp1752);
tmp1756 = _mm512_fmadd_ps(tmp1745, _mm512_set1_ps(-1.25e+00f), tmp1756);
tmp1738 = _mm512_fmadd_ps(tmp1738, _mm512_set1_ps(-5e+00f), tmp1740);
tmp1745 = _mm512_fmadd_ps(tmp1745, _mm512_set1_ps(-5e+00f), tmp1747);
tmp1751 = _mm512_fmadd_ps(tmp1742, _mm512_set1_ps(-1.25e+00f), tmp1751);
tmp1755 = _mm512_fmadd_ps(tmp1749, _mm512_set1_ps(-1.25e+00f), tmp1755);
tmp1741 = _mm512_fmadd_ps(tmp1751, _mm512_set1_ps(2e+00f), tmp1752);
tmp1748 = _mm512_fmadd_ps(tmp1755, _mm512_set1_ps(2e+00f), tmp1756);
tmp1752 = _mm512_fnmadd_ps(tmp1751, _mm512_set1_ps(2e+00f), tmp1752);
tmp1756 = _mm512_fnmadd_ps(tmp1755, _mm512_set1_ps(2e+00f), tmp1756);
tmp1751 = _mm512_fmadd_ps(in354, _mm512_set1_ps(2.5e-01f), tmp1739);
tmp1755 = _mm512_fmadd_ps(in357, _mm512_set1_ps(2.5e-01f), tmp1746);
tmp1739 = _mm512_sub_ps(tmp1743, tmp1739);
tmp1746 = _mm512_sub_ps(tmp1750, tmp1746);
tmp1751 = _mm512_fmadd_ps(tmp1742, _mm512_set1_ps(-1.25e+00f), tmp1751);
tmp1755 = _mm512_fmadd_ps(tmp1749, _mm512_set1_ps(-1.25e+00f), tmp1755);
tmp1742 = _mm512_sub_ps(tmp1742, in354);
tmp1749 = _mm512_sub_ps(tmp1749, in357);
tmp1742 = _mm512_fmadd_ps(tmp1742, _mm512_set1_ps(5.25e+00f), tmp1739);
tmp1749 = _mm512_fmadd_ps(tmp1749, _mm512_set1_ps(5.25e+00f), tmp1746);
tmp1740 = _mm512_fmadd_ps(tmp1751, _mm512_set1_ps(2e+00f), tmp1738);
tmp1747 = _mm512_fmadd_ps(tmp1755, _mm512_set1_ps(2e+00f), tmp1745);
tmp1738 = _mm512_fnmadd_ps(tmp1751, _mm512_set1_ps(2e+00f), tmp1738);
tmp1745 = _mm512_fnmadd_ps(tmp1755, _mm512_set1_ps(2e+00f), tmp1745);
__m512 out391 = _mm512_shuffle_f32x4(in352, tmp1753, 68);
__m512 out399 = _mm512_shuffle_f32x4(in352, tmp1753, 238);
__m512 out392 = _mm512_shuffle_f32x4(tmp1754, tmp1741, 68);
__m512 out400 = _mm512_shuffle_f32x4(tmp1754, tmp1741, 238);
__m512 out393 = _mm512_shuffle_f32x4(tmp1752, tmp1740, 68);
__m512 out401 = _mm512_shuffle_f32x4(tmp1752, tmp1740, 238);
__m512 out394 = _mm512_shuffle_f32x4(tmp1738, tmp1742, 68);
__m512 out402 = _mm512_shuffle_f32x4(tmp1738, tmp1742, 238);
__m512 out395 = _mm512_shuffle_f32x4(in355, tmp1757, 68);
__m512 out403 = _mm512_shuffle_f32x4(in355, tmp1757, 238);
__m512 out396 = _mm512_shuffle_f32x4(tmp1758, tmp1748, 68);
__m512 out404 = _mm512_shuffle_f32x4(tmp1758, tmp1748, 238);
__m512 out397 = _mm512_shuffle_f32x4(tmp1756, tmp1747, 68);
__m512 out405 = _mm512_shuffle_f32x4(tmp1756, tmp1747, 238);
__m512 out398 = _mm512_shuffle_f32x4(tmp1745, tmp1749, 68);
__m512 out406 = _mm512_shuffle_f32x4(tmp1745, tmp1749, 238);
_mm512_storeu_ps(dfPtr4+512+102400*i17+1536*j11+1536*s12+768*k58, out391);
_mm512_storeu_ps(dfPtr4+640+102400*i17+1536*j11+1536*s12+768*k58, out399);
_mm512_storeu_ps(dfPtr4+576+102400*i17+1536*j11+1536*s12+768*k58, out395);
_mm512_storeu_ps(dfPtr4+704+102400*i17+1536*j11+1536*s12+768*k58, out403);
_mm512_storeu_ps(dfPtr4+26112+102400*i17+1536*j11+1536*s12+768*k58, out392);
_mm512_storeu_ps(dfPtr4+26240+102400*i17+1536*j11+1536*s12+768*k58, out400);
_mm512_storeu_ps(dfPtr4+26176+102400*i17+1536*j11+1536*s12+768*k58, out396);
_mm512_storeu_ps(dfPtr4+26304+102400*i17+1536*j11+1536*s12+768*k58, out404);
_mm512_storeu_ps(dfPtr4+51712+102400*i17+1536*j11+1536*s12+768*k58, out393);
_mm512_storeu_ps(dfPtr4+51840+102400*i17+1536*j11+1536*s12+768*k58, out401);
_mm512_storeu_ps(dfPtr4+51776+102400*i17+1536*j11+1536*s12+768*k58, out397);
_mm512_storeu_ps(dfPtr4+51904+102400*i17+1536*j11+1536*s12+768*k58, out405);
_mm512_storeu_ps(dfPtr4+77312+102400*i17+1536*j11+1536*s12+768*k58, out394);
_mm512_storeu_ps(dfPtr4+77440+102400*i17+1536*j11+1536*s12+768*k58, out402);
_mm512_storeu_ps(dfPtr4+77376+102400*i17+1536*j11+1536*s12+768*k58, out398);
_mm512_storeu_ps(dfPtr4+77504+102400*i17+1536*j11+1536*s12+768*k58, out406);
}
++j11;
rel9 = 1;
}
ptrdiff_t h28 = base9+0;
ptrdiff_t w31 = 36;
ptrdiff_t k59 = 0;
for (; k59 != 4; ++k59) {
__m512 dat1261 = _mm512_maskz_loadu_ps(16383, datPtr5+0+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1261 = _mm512_max_ps(_mm512_setzero_ps(), dat1261);
__m512 dat1262 = _mm512_maskz_loadu_ps(511, datPtr5+48+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1262 = _mm512_max_ps(_mm512_setzero_ps(), dat1262);
__m512i pm106 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in358 = _mm512_permutexvar_ps(pm106, dat1261);
__m512i pm107 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in361 = _mm512_permutexvar_ps(pm107, dat1262);
__m512 dat1263 = _mm512_maskz_loadu_ps(16383, datPtr5+224+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1263 = _mm512_max_ps(_mm512_setzero_ps(), dat1263);
__m512 dat1264 = _mm512_maskz_loadu_ps(511, datPtr5+272+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1264 = _mm512_max_ps(_mm512_setzero_ps(), dat1264);
__m512 in359 = _mm512_permutexvar_ps(pm106, dat1263);
__m512 in362 = _mm512_permutexvar_ps(pm107, dat1264);
__m512 dat1265 = _mm512_maskz_loadu_ps(16383, datPtr5+448+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1265 = _mm512_max_ps(_mm512_setzero_ps(), dat1265);
__m512 dat1266 = _mm512_maskz_loadu_ps(511, datPtr5+496+50432*i17+224*h28+4*w31+50432*s12+12608*k59);
dat1266 = _mm512_max_ps(_mm512_setzero_ps(), dat1266);
__m512 in360 = _mm512_permutexvar_ps(pm106, dat1265);
__m512 in363 = _mm512_permutexvar_ps(pm107, dat1266);
__m512 tmp1807 = in359;
__m512 tmp1814 = in362;
__m512 tmp1808 = _mm512_sub_ps(_mm512_setzero_ps(), in360);
__m512 tmp1815 = _mm512_sub_ps(_mm512_setzero_ps(), in363);
__m512 tmp1809 = in360;
__m512 tmp1816 = in363;
in358 = in358;
in361 = in361;
tmp1807 = tmp1807;
tmp1814 = tmp1814;
tmp1809 = tmp1809;
tmp1816 = tmp1816;
in358 = _mm512_fmadd_ps(tmp1808, _mm512_set1_ps(5.25e+00f), in358);
in361 = _mm512_fmadd_ps(tmp1815, _mm512_set1_ps(5.25e+00f), in361);
tmp1808 = _mm512_mul_ps(in360, _mm512_set1_ps(2.5e-01f));
tmp1815 = _mm512_mul_ps(in363, _mm512_set1_ps(2.5e-01f));
in360 = _mm512_mul_ps(in360, _mm512_set1_ps(4e+00f));
in363 = _mm512_mul_ps(in363, _mm512_set1_ps(4e+00f));
__m512 tmp1810 = _mm512_sub_ps(tmp1809, tmp1807);
__m512 tmp1817 = _mm512_sub_ps(tmp1816, tmp1814);
tmp1809 = _mm512_add_ps(tmp1807, tmp1809);
tmp1816 = _mm512_add_ps(tmp1814, tmp1816);
tmp1807 = _mm512_mul_ps(in359, _mm512_set1_ps(2.5e-01f));
tmp1814 = _mm512_mul_ps(in362, _mm512_set1_ps(2.5e-01f));
tmp1808 = tmp1808;
tmp1815 = tmp1815;
__m512 tmp1811 = in360;
__m512 tmp1818 = in363;
tmp1807 = tmp1807;
tmp1814 = tmp1814;
__m512 tmp1812 = _mm512_fmadd_ps(tmp1807, _mm512_set1_ps(2e+00f), tmp1808);
__m512 tmp1819 = _mm512_fmadd_ps(tmp1814, _mm512_set1_ps(2e+00f), tmp1815);
tmp1808 = _mm512_fnmadd_ps(tmp1807, _mm512_set1_ps(2e+00f), tmp1808);
tmp1815 = _mm512_fnmadd_ps(tmp1814, _mm512_set1_ps(2e+00f), tmp1815);
tmp1807 = in359;
tmp1814 = in362;
in359 = _mm512_sub_ps(_mm512_setzero_ps(), in359);
in362 = _mm512_sub_ps(_mm512_setzero_ps(), in362);
tmp1807 = tmp1807;
tmp1814 = tmp1814;
__m512 tmp1813 = in359;
__m512 tmp1820 = in362;
in360 = _mm512_fmadd_ps(tmp1807, _mm512_set1_ps(2e+00f), tmp1811);
in363 = _mm512_fmadd_ps(tmp1814, _mm512_set1_ps(2e+00f), tmp1818);
tmp1811 = _mm512_fnmadd_ps(tmp1807, _mm512_set1_ps(2e+00f), tmp1811);
tmp1818 = _mm512_fnmadd_ps(tmp1814, _mm512_set1_ps(2e+00f), tmp1818);
__m512 tmp1829 = _mm512_unpacklo_ps(in358, tmp1809);
__m512 tmp1830 = _mm512_unpackhi_ps(in358, tmp1809);
__m512 tmp1831 = _mm512_unpacklo_ps(tmp1810, tmp1812);
__m512 tmp1832 = _mm512_unpackhi_ps(tmp1810, tmp1812);
__m512 tmp1833 = _mm512_unpacklo_ps(tmp1808, in360);
__m512 tmp1834 = _mm512_unpackhi_ps(tmp1808, in360);
__m512 tmp1835 = _mm512_unpacklo_ps(tmp1811, tmp1813);
__m512 tmp1836 = _mm512_unpackhi_ps(tmp1811, tmp1813);
__m512 tmp1837 = _mm512_unpacklo_ps(in361, tmp1816);
__m512 tmp1838 = _mm512_unpackhi_ps(in361, tmp1816);
__m512 tmp1839 = _mm512_unpacklo_ps(tmp1817, tmp1819);
__m512 tmp1840 = _mm512_unpackhi_ps(tmp1817, tmp1819);
__m512 tmp1841 = _mm512_unpacklo_ps(tmp1815, in363);
__m512 tmp1842 = _mm512_unpackhi_ps(tmp1815, in363);
__m512 tmp1843 = _mm512_unpacklo_ps(tmp1818, tmp1820);
__m512 tmp1844 = _mm512_unpackhi_ps(tmp1818, tmp1820);
__m512 tmp1845 = _mm512_shuffle_ps(tmp1829, tmp1831, 68);
__m512 tmp1846 = _mm512_shuffle_ps(tmp1829, tmp1831, 238);
__m512 tmp1847 = _mm512_shuffle_ps(tmp1830, tmp1832, 68);
__m512 tmp1848 = _mm512_shuffle_ps(tmp1830, tmp1832, 238);
__m512 tmp1849 = _mm512_shuffle_ps(tmp1833, tmp1835, 68);
__m512 tmp1850 = _mm512_shuffle_ps(tmp1833, tmp1835, 238);
__m512 tmp1851 = _mm512_shuffle_ps(tmp1834, tmp1836, 68);
__m512 tmp1852 = _mm512_shuffle_ps(tmp1834, tmp1836, 238);
__m512 tmp1853 = _mm512_shuffle_ps(tmp1837, tmp1839, 68);
__m512 tmp1854 = _mm512_shuffle_ps(tmp1837, tmp1839, 238);
__m512 tmp1855 = _mm512_shuffle_ps(tmp1838, tmp1840, 68);
__m512 tmp1856 = _mm512_shuffle_ps(tmp1838, tmp1840, 238);
__m512 tmp1857 = _mm512_shuffle_ps(tmp1841, tmp1843, 68);
__m512 tmp1858 = _mm512_shuffle_ps(tmp1841, tmp1843, 238);
__m512 tmp1859 = _mm512_shuffle_ps(tmp1842, tmp1844, 68);
__m512 tmp1860 = _mm512_shuffle_ps(tmp1842, tmp1844, 238);
__m512 tmp1861 = _mm512_shuffle_f32x4(tmp1845, tmp1849, 136);
__m512 tmp1862 = _mm512_shuffle_f32x4(tmp1845, tmp1849, 221);
__m512 tmp1863 = _mm512_shuffle_f32x4(tmp1846, tmp1850, 136);
__m512 tmp1864 = _mm512_shuffle_f32x4(tmp1846, tmp1850, 221);
__m512 tmp1865 = _mm512_shuffle_f32x4(tmp1847, tmp1851, 136);
__m512 tmp1866 = _mm512_shuffle_f32x4(tmp1847, tmp1851, 221);
__m512 tmp1867 = _mm512_shuffle_f32x4(tmp1848, tmp1852, 136);
__m512 tmp1868 = _mm512_shuffle_f32x4(tmp1848, tmp1852, 221);
__m512 tmp1869 = _mm512_shuffle_f32x4(tmp1853, tmp1857, 136);
__m512 tmp1870 = _mm512_shuffle_f32x4(tmp1853, tmp1857, 221);
__m512 tmp1871 = _mm512_shuffle_f32x4(tmp1854, tmp1858, 136);
__m512 tmp1872 = _mm512_shuffle_f32x4(tmp1854, tmp1858, 221);
__m512 tmp1873 = _mm512_shuffle_f32x4(tmp1855, tmp1859, 136);
__m512 tmp1874 = _mm512_shuffle_f32x4(tmp1855, tmp1859, 221);
__m512 tmp1875 = _mm512_shuffle_f32x4(tmp1856, tmp1860, 136);
__m512 tmp1876 = _mm512_shuffle_f32x4(tmp1856, tmp1860, 221);
in358 = _mm512_shuffle_f32x4(tmp1861, tmp1869, 136);
in361 = _mm512_shuffle_f32x4(tmp1861, tmp1869, 221);
tmp1809 = _mm512_shuffle_f32x4(tmp1863, tmp1871, 136);
tmp1816 = _mm512_shuffle_f32x4(tmp1863, tmp1871, 221);
tmp1810 = _mm512_shuffle_f32x4(tmp1865, tmp1873, 136);
tmp1817 = _mm512_shuffle_f32x4(tmp1865, tmp1873, 221);
tmp1812 = _mm512_shuffle_f32x4(tmp1867, tmp1875, 136);
tmp1819 = _mm512_shuffle_f32x4(tmp1867, tmp1875, 221);
tmp1808 = _mm512_shuffle_f32x4(tmp1862, tmp1870, 136);
tmp1815 = _mm512_shuffle_f32x4(tmp1862, tmp1870, 221);
in360 = _mm512_shuffle_f32x4(tmp1864, tmp1872, 136);
in363 = _mm512_shuffle_f32x4(tmp1864, tmp1872, 221);
tmp1811 = _mm512_shuffle_f32x4(tmp1866, tmp1874, 136);
tmp1818 = _mm512_shuffle_f32x4(tmp1866, tmp1874, 221);
tmp1813 = _mm512_shuffle_f32x4(tmp1868, tmp1876, 136);
tmp1820 = _mm512_shuffle_f32x4(tmp1868, tmp1876, 221);
__m512 tmp1821 = _mm512_add_ps(tmp1809, in360);
__m512 tmp1825 = _mm512_add_ps(tmp1816, in363);
__m512 tmp1822 = _mm512_sub_ps(tmp1808, tmp1810);
__m512 tmp1826 = _mm512_sub_ps(tmp1815, tmp1817);
__m512 tmp1823 = _mm512_add_ps(tmp1810, tmp1811);
__m512 tmp1827 = _mm512_add_ps(tmp1817, tmp1818);
in358 = _mm512_sub_ps(in358, tmp1811);
in361 = _mm512_sub_ps(in361, tmp1818);
tmp1821 = _mm512_fmadd_ps(tmp1812, _mm512_set1_ps(-4.25e+00f), tmp1821);
tmp1825 = _mm512_fmadd_ps(tmp1819, _mm512_set1_ps(-4.25e+00f), tmp1825);
tmp1823 = _mm512_fmadd_ps(tmp1808, _mm512_set1_ps(-4.25e+00f), tmp1823);
tmp1827 = _mm512_fmadd_ps(tmp1815, _mm512_set1_ps(-4.25e+00f), tmp1827);
in358 = _mm512_fmadd_ps(tmp1822, _mm512_set1_ps(5.25e+00f), in358);
in361 = _mm512_fmadd_ps(tmp1826, _mm512_set1_ps(5.25e+00f), in361);
tmp1822 = _mm512_fmadd_ps(tmp1810, _mm512_set1_ps(2.5e-01f), tmp1811);
tmp1826 = _mm512_fmadd_ps(tmp1817, _mm512_set1_ps(2.5e-01f), tmp1818);
tmp1810 = _mm512_fmadd_ps(tmp1810, _mm512_set1_ps(4e+00f), tmp1811);
tmp1817 = _mm512_fmadd_ps(tmp1817, _mm512_set1_ps(4e+00f), tmp1818);
__m512 tmp1824 = _mm512_sub_ps(tmp1823, tmp1821);
__m512 tmp1828 = _mm512_sub_ps(tmp1827, tmp1825);
tmp1823 = _mm512_add_ps(tmp1821, tmp1823);
tmp1827 = _mm512_add_ps(tmp1825, tmp1827);
tmp1821 = _mm512_fmadd_ps(tmp1809, _mm512_set1_ps(2.5e-01f), in360);
tmp1825 = _mm512_fmadd_ps(tmp1816, _mm512_set1_ps(2.5e-01f), in363);
tmp1822 = _mm512_fmadd_ps(tmp1808, _mm512_set1_ps(-1.25e+00f), tmp1822);
tmp1826 = _mm512_fmadd_ps(tmp1815, _mm512_set1_ps(-1.25e+00f), tmp1826);
tmp1808 = _mm512_fmadd_ps(tmp1808, _mm512_set1_ps(-5e+00f), tmp1810);
tmp1815 = _mm512_fmadd_ps(tmp1815, _mm512_set1_ps(-5e+00f), tmp1817);
tmp1821 = _mm512_fmadd_ps(tmp1812, _mm512_set1_ps(-1.25e+00f), tmp1821);
tmp1825 = _mm512_fmadd_ps(tmp1819, _mm512_set1_ps(-1.25e+00f), tmp1825);
tmp1811 = _mm512_fmadd_ps(tmp1821, _mm512_set1_ps(2e+00f), tmp1822);
tmp1818 = _mm512_fmadd_ps(tmp1825, _mm512_set1_ps(2e+00f), tmp1826);
tmp1822 = _mm512_fnmadd_ps(tmp1821, _mm512_set1_ps(2e+00f), tmp1822);
tmp1826 = _mm512_fnmadd_ps(tmp1825, _mm512_set1_ps(2e+00f), tmp1826);
tmp1821 = _mm512_fmadd_ps(in360, _mm512_set1_ps(2.5e-01f), tmp1809);
tmp1825 = _mm512_fmadd_ps(in363, _mm512_set1_ps(2.5e-01f), tmp1816);
tmp1809 = _mm512_sub_ps(tmp1813, tmp1809);
tmp1816 = _mm512_sub_ps(tmp1820, tmp1816);
tmp1821 = _mm512_fmadd_ps(tmp1812, _mm512_set1_ps(-1.25e+00f), tmp1821);
tmp1825 = _mm512_fmadd_ps(tmp1819, _mm512_set1_ps(-1.25e+00f), tmp1825);
tmp1812 = _mm512_sub_ps(tmp1812, in360);
tmp1819 = _mm512_sub_ps(tmp1819, in363);
tmp1812 = _mm512_fmadd_ps(tmp1812, _mm512_set1_ps(5.25e+00f), tmp1809);
tmp1819 = _mm512_fmadd_ps(tmp1819, _mm512_set1_ps(5.25e+00f), tmp1816);
tmp1810 = _mm512_fmadd_ps(tmp1821, _mm512_set1_ps(2e+00f), tmp1808);
tmp1817 = _mm512_fmadd_ps(tmp1825, _mm512_set1_ps(2e+00f), tmp1815);
tmp1808 = _mm512_fnmadd_ps(tmp1821, _mm512_set1_ps(2e+00f), tmp1808);
tmp1815 = _mm512_fnmadd_ps(tmp1825, _mm512_set1_ps(2e+00f), tmp1815);
__m512 out407 = _mm512_shuffle_f32x4(in358, tmp1823, 68);
__m512 out415 = _mm512_shuffle_f32x4(in358, tmp1823, 238);
__m512 out408 = _mm512_shuffle_f32x4(tmp1824, tmp1811, 68);
__m512 out416 = _mm512_shuffle_f32x4(tmp1824, tmp1811, 238);
__m512 out409 = _mm512_shuffle_f32x4(tmp1822, tmp1810, 68);
__m512 out417 = _mm512_shuffle_f32x4(tmp1822, tmp1810, 238);
__m512 out410 = _mm512_shuffle_f32x4(tmp1808, tmp1812, 68);
__m512 out418 = _mm512_shuffle_f32x4(tmp1808, tmp1812, 238);
__m512 out411 = _mm512_shuffle_f32x4(in361, tmp1827, 68);
__m512 out419 = _mm512_shuffle_f32x4(in361, tmp1827, 238);
__m512 out412 = _mm512_shuffle_f32x4(tmp1828, tmp1818, 68);
__m512 out420 = _mm512_shuffle_f32x4(tmp1828, tmp1818, 238);
__m512 out413 = _mm512_shuffle_f32x4(tmp1826, tmp1817, 68);
__m512 out421 = _mm512_shuffle_f32x4(tmp1826, tmp1817, 238);
__m512 out414 = _mm512_shuffle_f32x4(tmp1815, tmp1819, 68);
__m512 out422 = _mm512_shuffle_f32x4(tmp1815, tmp1819, 238);
_mm512_storeu_ps(dfPtr4+0+102400*i17+1536*j11+1024*s12+256*k59, out407);
_mm512_storeu_ps(dfPtr4+128+102400*i17+1536*j11+1024*s12+256*k59, out415);
_mm512_storeu_ps(dfPtr4+64+102400*i17+1536*j11+1024*s12+256*k59, out411);
_mm512_storeu_ps(dfPtr4+192+102400*i17+1536*j11+1024*s12+256*k59, out419);
_mm512_storeu_ps(dfPtr4+25600+102400*i17+1536*j11+1024*s12+256*k59, out408);
_mm512_storeu_ps(dfPtr4+25728+102400*i17+1536*j11+1024*s12+256*k59, out416);
_mm512_storeu_ps(dfPtr4+25664+102400*i17+1536*j11+1024*s12+256*k59, out412);
_mm512_storeu_ps(dfPtr4+25792+102400*i17+1536*j11+1024*s12+256*k59, out420);
_mm512_storeu_ps(dfPtr4+51200+102400*i17+1536*j11+1024*s12+256*k59, out409);
_mm512_storeu_ps(dfPtr4+51328+102400*i17+1536*j11+1024*s12+256*k59, out417);
_mm512_storeu_ps(dfPtr4+51264+102400*i17+1536*j11+1024*s12+256*k59, out413);
_mm512_storeu_ps(dfPtr4+51392+102400*i17+1536*j11+1024*s12+256*k59, out421);
_mm512_storeu_ps(dfPtr4+76800+102400*i17+1536*j11+1024*s12+256*k59, out410);
_mm512_storeu_ps(dfPtr4+76928+102400*i17+1536*j11+1024*s12+256*k59, out418);
_mm512_storeu_ps(dfPtr4+76864+102400*i17+1536*j11+1024*s12+256*k59, out414);
_mm512_storeu_ps(dfPtr4+76992+102400*i17+1536*j11+1024*s12+256*k59, out422);
}
++j11;
}
}

static void ResNeXt50ThreeArrangeDats1(ResNeXt50ThreaderTeam1* team23, char** tensors19) {
    // Packages ResNeXt50ThreeArrangeDats1Callee1 together with the tensor
    // pointer table into a 4-entry-hull task and hands it to
    // ResNeXt50ThreaderDo1 on the supplied team.
    // NOTE(review): the hull entries look like per-axis extents of the
    // coordinates passed to the callee -- confirm against the threader.
    static const int extents[4] = {1, 1, 16, 1};
    ResNeXt50ThreaderTask1 work;
    work.callee1 = ResNeXt50ThreeArrangeDats1Callee1;
    work.any1 = tensors19;
    work.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        work.hull1[axis] = extents[axis];
    }
    ResNeXt50ThreaderDo1(team23, &work);
}

static void ResNeXt50ThreeProduceSums1Callee1(ResNeXt50ThreaderTask1* task24, int64_t* pt17) {
    // Multiply-accumulate kernel run for one task coordinate.
    //
    // task24->any1 points at an array whose element 0 is the tensor pointer
    // table. From that table this function reads:
    //   [0] floats at byte offset 0 (one per accumulator row, used as the
    //       initial value) and half-precision weights starting at byte 512,
    //   [1] the float vectors that are multiplied by the weights,
    //   [2] the destination for the accumulated sums.
    // pt17[3] selects a pair of consecutive outer indices (2*pt17[3] and
    // 2*pt17[3]+1). For each outer index and each of 4 parts, 16 tiles of
    // 4 rows x 6 vectors are produced, followed by one remainder tile of
    // 4 rows x 4 vectors. Every accumulator sums 4 fused multiply-adds.
    void** args = task24->any1;
    char** bufs = args[0];
    char*restrict biasBase = bufs[0];
    char*restrict weightBase = bufs[0]+512;
    char*restrict dataBase = bufs[1];
    char*restrict sumBase = bufs[2];
    const ptrdiff_t firstOuter = 2*(ptrdiff_t)pt17[3];
    for (ptrdiff_t outer = firstOuter; outer <= firstOuter+1; ++outer) {
        for (ptrdiff_t part = 0; part < 4; ++part) {
            char* wts = weightBase+2048*outer+512*part;
            char* dat = dataBase+102400*outer+25600*part;
            char* out = sumBase+102400*outer+25600*part;

            // Full tiles: 6 data vectors wide.
            for (ptrdiff_t tile = 0; tile < 16; ++tile) {
                char* tileIn = dat+1536*tile;
                char* tileOut = out+1536*tile;
                __m512 acc[4][6];
                for (int r = 0; r < 4; ++r) {
                    __m512 seed;
                    if (__builtin_expect(!part, 0)) {
                        // Only part 0 starts from the stored float, and only
                        // lane 9 (mask 512) receives it; other lanes are zero.
                        float start = *(float*)(biasBase+16*outer+4*r);
                        seed = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(start));
                    } else {
                        seed = _mm512_setzero_ps();
                    }
                    for (int c = 0; c < 6; ++c) {
                        acc[r][c] = seed;
                    }
                }
                for (ptrdiff_t step = 0; step < 4; ++step) {
                    // Each 64-byte load holds two rows of 16 half-precision
                    // weights; mask 65535 enables all 16 dword lanes.
                    __m512i wLo = _mm512_maskz_loadu_epi32(65535, wts+128*step);
                    __m512i wHi = _mm512_maskz_loadu_epi32(65535, wts+64+128*step);
                    __m512 w[4];
                    w[0] = _mm512_cvtph_ps(_mm512_castsi512_si256(wLo));
                    w[1] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wLo, 1));
                    w[2] = _mm512_cvtph_ps(_mm512_castsi512_si256(wHi));
                    w[3] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wHi, 1));
                    __m512 d[6];
                    for (int c = 0; c < 6; ++c) {
                        d[c] = _mm512_loadu_ps(tileIn+64*c+384*step);
                    }
                    for (int r = 0; r < 4; ++r) {
                        for (int c = 0; c < 6; ++c) {
                            acc[r][c] = _mm512_fmadd_ps(w[r], d[c], acc[r][c]);
                        }
                    }
                }
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 6; ++c) {
                        _mm512_storeu_ps(tileOut+64*(6*r+c), acc[r][c]);
                    }
                }
            }

            // Remainder tile: starts right after the 16 full tiles
            // (16*1536 = 24576 bytes) and is only 4 data vectors wide.
            {
                char* tailIn = dat+24576;
                char* tailOut = out+24576;
                __m512 acc[4][4];
                for (int r = 0; r < 4; ++r) {
                    __m512 seed;
                    if (__builtin_expect(!part, 0)) {
                        float start = *(float*)(biasBase+16*outer+4*r);
                        seed = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(start));
                    } else {
                        seed = _mm512_setzero_ps();
                    }
                    for (int c = 0; c < 4; ++c) {
                        acc[r][c] = seed;
                    }
                }
                for (ptrdiff_t step = 0; step < 4; ++step) {
                    __m512i wLo = _mm512_maskz_loadu_epi32(65535, wts+128*step);
                    __m512i wHi = _mm512_maskz_loadu_epi32(65535, wts+64+128*step);
                    __m512 w[4];
                    w[0] = _mm512_cvtph_ps(_mm512_castsi512_si256(wLo));
                    w[1] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wLo, 1));
                    w[2] = _mm512_cvtph_ps(_mm512_castsi512_si256(wHi));
                    w[3] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wHi, 1));
                    __m512 d[4];
                    for (int c = 0; c < 4; ++c) {
                        d[c] = _mm512_loadu_ps(tailIn+64*c+256*step);
                    }
                    for (int r = 0; r < 4; ++r) {
                        for (int c = 0; c < 4; ++c) {
                            acc[r][c] = _mm512_fmadd_ps(w[r], d[c], acc[r][c]);
                        }
                    }
                }
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 4; ++c) {
                        _mm512_storeu_ps(tailOut+64*(4*r+c), acc[r][c]);
                    }
                }
            }
        }
    }
}

static void ResNeXt50ThreeProduceSums1(ResNeXt50ThreaderTeam1* team24, char** tensors21) {
    // Wraps the tensor pointer table in a two-slot argument array (the second
    // slot is null), attaches it to a task for
    // ResNeXt50ThreeProduceSums1Callee1 with a 4-entry hull, and submits the
    // task to ResNeXt50ThreaderDo1 on the supplied team.
    // NOTE(review): the callee reads coordinate index 3, whose hull entry is
    // 16 here -- presumably it is invoked for 16 values on that axis; confirm
    // against the threader.
    static const int extents[4] = {1, 1, 1, 16};
    void* bundle[2];
    bundle[0] = tensors21;
    bundle[1] = 0;
    ResNeXt50ThreaderTask1 work;
    work.callee1 = ResNeXt50ThreeProduceSums1Callee1;
    work.any1 = bundle;
    work.nd1 = 4;
    for (int axis = 0; axis < 4; ++axis) {
        work.hull1[axis] = extents[axis];
    }
    ResNeXt50ThreaderDo1(team24, &work);
}

static void ResNeXt50ThreeConsumeSums1Callee1(ResNeXt50ThreaderTask1* task26, int64_t* pt18) {
char** tensors24 = task26->any1;
ptrdiff_t w33 = 0;
ptrdiff_t d5 = 0;
ptrdiff_t g10 = pt18[2];
char*restrict sfPtr5 = tensors24[0];
char*restrict datPtr6 = tensors24[1];
ptrdiff_t i19 = 2*g10;
ptrdiff_t ii7 = i19+1;
for (; i19 <= ii7; ++i19) {
ptrdiff_t j13 = 17*d5;
if (j13 < 2) {
ptrdiff_t rel10 = j13-0;
ptrdiff_t base10 = 0;
if (rel10 < 1) {
ptrdiff_t toH20 = base10+0;
ptrdiff_t toW20 = 0;
ptrdiff_t k61 = 1*w33;
for (; k61 != 1; ++k61) {
ptrdiff_t l13 = 0;
for (; l13 != 2; ++l13) {
__m512 sf1 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf2 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in364 = _mm512_shuffle_f32x4(sf1, sf2, 68);
__m512 in365 = _mm512_shuffle_f32x4(sf1, sf2, 238);
__m512 sf3 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf4 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in372 = _mm512_shuffle_f32x4(sf3, sf4, 68);
__m512 in373 = _mm512_shuffle_f32x4(sf3, sf4, 238);
__m512 sf5 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf6 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in366 = _mm512_shuffle_f32x4(sf5, sf6, 68);
__m512 in367 = _mm512_shuffle_f32x4(sf5, sf6, 238);
__m512 sf7 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf8 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in374 = _mm512_shuffle_f32x4(sf7, sf8, 68);
__m512 in375 = _mm512_shuffle_f32x4(sf7, sf8, 238);
__m512 sf9 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf10 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in368 = _mm512_shuffle_f32x4(sf9, sf10, 68);
__m512 in369 = _mm512_shuffle_f32x4(sf9, sf10, 238);
__m512 sf11 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf12 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in376 = _mm512_shuffle_f32x4(sf11, sf12, 68);
__m512 in377 = _mm512_shuffle_f32x4(sf11, sf12, 238);
__m512 sf13 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf14 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in370 = _mm512_shuffle_f32x4(sf13, sf14, 68);
__m512 in371 = _mm512_shuffle_f32x4(sf13, sf14, 238);
__m512 sf15 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf16 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in378 = _mm512_shuffle_f32x4(sf15, sf16, 68);
__m512 in379 = _mm512_shuffle_f32x4(sf15, sf16, 238);
__m512 tmp1893 = _mm512_add_ps(in365, in366);
__m512 tmp1913 = _mm512_add_ps(in373, in374);
__m512 tmp1892 = _mm512_add_ps(in367, in368);
__m512 tmp1912 = _mm512_add_ps(in375, in376);
__m512 tmp1898 = _mm512_sub_ps(in367, in368);
__m512 tmp1918 = _mm512_sub_ps(in375, in376);
__m512 tmp1897 = _mm512_sub_ps(in365, in366);
__m512 tmp1917 = _mm512_sub_ps(in373, in374);
__m512 tmp1894 = _mm512_add_ps(in369, in370);
__m512 tmp1914 = _mm512_add_ps(in377, in378);
__m512 tmp1899 = _mm512_sub_ps(in369, in370);
__m512 tmp1919 = _mm512_sub_ps(in377, in378);
__m512 tmp1896 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(2e+00f), tmp1897);
__m512 tmp1916 = _mm512_fmadd_ps(tmp1918, _mm512_set1_ps(2e+00f), tmp1917);
__m512 tmp1903 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(8e+00f), tmp1897);
__m512 tmp1923 = _mm512_fmadd_ps(tmp1918, _mm512_set1_ps(8e+00f), tmp1917);
__m512 tmp1891 = _mm512_add_ps(tmp1892, tmp1893);
__m512 tmp1911 = _mm512_add_ps(tmp1912, tmp1913);
__m512 tmp1895 = _mm512_fmadd_ps(tmp1899, _mm512_set1_ps(1.6e+01f), tmp1896);
__m512 tmp1915 = _mm512_fmadd_ps(tmp1919, _mm512_set1_ps(1.6e+01f), tmp1916);
__m512 tmp1902 = _mm512_fmadd_ps(tmp1899, _mm512_set1_ps(4e+00f), tmp1903);
__m512 tmp1922 = _mm512_fmadd_ps(tmp1919, _mm512_set1_ps(4e+00f), tmp1923);
__m512 tmp1908 = _mm512_add_ps(tmp1899, tmp1897);
__m512 tmp1928 = _mm512_add_ps(tmp1919, tmp1917);
__m512 tmp1901 = _mm512_fmadd_ps(tmp1892, _mm512_set1_ps(4e+00f), tmp1893);
__m512 tmp1921 = _mm512_fmadd_ps(tmp1912, _mm512_set1_ps(4e+00f), tmp1913);
__m512 tmp1905 = _mm512_fmadd_ps(tmp1892, _mm512_set1_ps(1.6e+01f), tmp1893);
__m512 tmp1925 = _mm512_fmadd_ps(tmp1912, _mm512_set1_ps(1.6e+01f), tmp1913);
__m512 tmp1890 = _mm512_add_ps(tmp1891, in364);
__m512 tmp1910 = _mm512_add_ps(tmp1911, in372);
__m512 tmp1907 = _mm512_add_ps(tmp1908, in371);
__m512 tmp1927 = _mm512_add_ps(tmp1928, in379);
__m512 tmp1889 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(3.2e+01f), tmp1890);
__m512 tmp1909 = _mm512_fmadd_ps(tmp1914, _mm512_set1_ps(3.2e+01f), tmp1910);
__m512 tmp1900 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(8e+00f), tmp1901);
__m512 tmp1920 = _mm512_fmadd_ps(tmp1914, _mm512_set1_ps(8e+00f), tmp1921);
__m512 tmp1906 = _mm512_fmadd_ps(tmp1898, _mm512_set1_ps(3.2e+01f), tmp1907);
__m512 tmp1926 = _mm512_fmadd_ps(tmp1918, _mm512_set1_ps(3.2e+01f), tmp1927);
__m512 tmp1904 = _mm512_fmadd_ps(tmp1894, _mm512_set1_ps(2e+00f), tmp1905);
__m512 tmp1924 = _mm512_fmadd_ps(tmp1914, _mm512_set1_ps(2e+00f), tmp1925);
__m512 tmp1877 = tmp1889;
__m512 tmp1883 = tmp1909;
__m512 tmp1878 = tmp1895;
__m512 tmp1884 = tmp1915;
__m512 tmp1879 = tmp1900;
__m512 tmp1885 = tmp1920;
__m512 tmp1880 = tmp1902;
__m512 tmp1886 = tmp1922;
__m512 tmp1881 = tmp1904;
__m512 tmp1887 = tmp1924;
__m512 tmp1882 = tmp1906;
__m512 tmp1888 = tmp1926;
__m512 tmp1973 = _mm512_unpacklo_ps(tmp1877, tmp1878);
__m512 tmp1974 = _mm512_unpackhi_ps(tmp1877, tmp1878);
__m512 tmp1975 = _mm512_unpacklo_ps(tmp1879, tmp1880);
__m512 tmp1976 = _mm512_unpackhi_ps(tmp1879, tmp1880);
__m512 tmp1977 = _mm512_unpacklo_ps(tmp1881, tmp1882);
__m512 tmp1978 = _mm512_unpackhi_ps(tmp1881, tmp1882);
__m512 tmp1979 = _mm512_unpacklo_ps(tmp1883, tmp1884);
__m512 tmp1980 = _mm512_unpackhi_ps(tmp1883, tmp1884);
__m512 tmp1981 = _mm512_unpacklo_ps(tmp1885, tmp1886);
__m512 tmp1982 = _mm512_unpackhi_ps(tmp1885, tmp1886);
__m512 tmp1983 = _mm512_unpacklo_ps(tmp1887, tmp1888);
__m512 tmp1984 = _mm512_unpackhi_ps(tmp1887, tmp1888);
__m512 tmp1985 = _mm512_shuffle_ps(tmp1973, tmp1975, 68);
__m512 tmp1986 = _mm512_shuffle_ps(tmp1973, tmp1975, 238);
__m512 tmp1987 = _mm512_shuffle_ps(tmp1974, tmp1976, 68);
__m512 tmp1988 = _mm512_shuffle_ps(tmp1974, tmp1976, 238);
__m512 tmp1989 = _mm512_shuffle_ps(tmp1977, tmp1979, 68);
__m512 tmp1990 = _mm512_shuffle_ps(tmp1977, tmp1979, 238);
__m512 tmp1991 = _mm512_shuffle_ps(tmp1978, tmp1980, 68);
__m512 tmp1992 = _mm512_shuffle_ps(tmp1978, tmp1980, 238);
__m512 tmp1993 = _mm512_shuffle_ps(tmp1981, tmp1983, 68);
__m512 tmp1994 = _mm512_shuffle_ps(tmp1981, tmp1983, 238);
__m512 tmp1995 = _mm512_shuffle_ps(tmp1982, tmp1984, 68);
__m512 tmp1996 = _mm512_shuffle_ps(tmp1982, tmp1984, 238);
__m512 tmp1997 = _mm512_shuffle_f32x4(tmp1985, tmp1989, 136);
__m512 tmp1998 = _mm512_shuffle_f32x4(tmp1985, tmp1989, 221);
__m512 tmp1999 = _mm512_shuffle_f32x4(tmp1986, tmp1990, 136);
__m512 tmp2000 = _mm512_shuffle_f32x4(tmp1986, tmp1990, 221);
__m512 tmp2001 = _mm512_shuffle_f32x4(tmp1987, tmp1991, 136);
__m512 tmp2002 = _mm512_shuffle_f32x4(tmp1987, tmp1991, 221);
__m512 tmp2003 = _mm512_shuffle_f32x4(tmp1988, tmp1992, 136);
__m512 tmp2004 = _mm512_shuffle_f32x4(tmp1988, tmp1992, 221);
__m512 tmp2005 = _mm512_shuffle_f32x4(tmp1993, tmp1993, 136);
__m512 tmp2006 = _mm512_shuffle_f32x4(tmp1993, tmp1993, 221);
__m512 tmp2007 = _mm512_shuffle_f32x4(tmp1994, tmp1994, 136);
__m512 tmp2008 = _mm512_shuffle_f32x4(tmp1994, tmp1994, 221);
__m512 tmp2009 = _mm512_shuffle_f32x4(tmp1995, tmp1995, 136);
__m512 tmp2010 = _mm512_shuffle_f32x4(tmp1995, tmp1995, 221);
__m512 tmp2011 = _mm512_shuffle_f32x4(tmp1996, tmp1996, 136);
__m512 tmp2012 = _mm512_shuffle_f32x4(tmp1996, tmp1996, 221);
tmp1877 = _mm512_shuffle_f32x4(tmp1997, tmp2005, 136);
tmp1885 = _mm512_shuffle_f32x4(tmp1997, tmp2005, 221);
tmp1878 = _mm512_shuffle_f32x4(tmp1999, tmp2007, 136);
tmp1886 = _mm512_shuffle_f32x4(tmp1999, tmp2007, 221);
tmp1879 = _mm512_shuffle_f32x4(tmp2001, tmp2009, 136);
tmp1887 = _mm512_shuffle_f32x4(tmp2001, tmp2009, 221);
tmp1880 = _mm512_shuffle_f32x4(tmp2003, tmp2011, 136);
tmp1888 = _mm512_shuffle_f32x4(tmp2003, tmp2011, 221);
tmp1881 = _mm512_shuffle_f32x4(tmp1998, tmp2006, 136);
__m512 tmp1929 = _mm512_shuffle_f32x4(tmp1998, tmp2006, 221);
tmp1882 = _mm512_shuffle_f32x4(tmp2000, tmp2008, 136);
__m512 tmp1930 = _mm512_shuffle_f32x4(tmp2000, tmp2008, 221);
tmp1883 = _mm512_shuffle_f32x4(tmp2002, tmp2010, 136);
__m512 tmp1931 = _mm512_shuffle_f32x4(tmp2002, tmp2010, 221);
tmp1884 = _mm512_shuffle_f32x4(tmp2004, tmp2012, 136);
__m512 tmp1932 = _mm512_shuffle_f32x4(tmp2004, tmp2012, 221);
__m512 tmp1937 = _mm512_add_ps(tmp1878, tmp1879);
__m512 tmp1957 = _mm512_add_ps(tmp1886, tmp1887);
__m512 tmp1936 = _mm512_add_ps(tmp1880, tmp1881);
__m512 tmp1956 = _mm512_add_ps(tmp1888, tmp1929);
__m512 tmp1942 = _mm512_sub_ps(tmp1880, tmp1881);
__m512 tmp1962 = _mm512_sub_ps(tmp1888, tmp1929);
__m512 tmp1941 = _mm512_sub_ps(tmp1878, tmp1879);
__m512 tmp1961 = _mm512_sub_ps(tmp1886, tmp1887);
__m512 tmp1938 = _mm512_add_ps(tmp1882, tmp1883);
__m512 tmp1958 = _mm512_add_ps(tmp1930, tmp1931);
__m512 tmp1943 = _mm512_sub_ps(tmp1882, tmp1883);
__m512 tmp1963 = _mm512_sub_ps(tmp1930, tmp1931);
__m512 tmp1940 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(2e+00f), tmp1941);
__m512 tmp1960 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(2e+00f), tmp1961);
__m512 tmp1947 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(8e+00f), tmp1941);
__m512 tmp1967 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(8e+00f), tmp1961);
__m512 tmp1935 = _mm512_add_ps(tmp1936, tmp1937);
__m512 tmp1955 = _mm512_add_ps(tmp1956, tmp1957);
__m512 tmp1939 = _mm512_fmadd_ps(tmp1943, _mm512_set1_ps(1.6e+01f), tmp1940);
__m512 tmp1959 = _mm512_fmadd_ps(tmp1963, _mm512_set1_ps(1.6e+01f), tmp1960);
__m512 tmp1946 = _mm512_fmadd_ps(tmp1943, _mm512_set1_ps(4e+00f), tmp1947);
__m512 tmp1966 = _mm512_fmadd_ps(tmp1963, _mm512_set1_ps(4e+00f), tmp1967);
__m512 tmp1952 = _mm512_add_ps(tmp1943, tmp1941);
__m512 tmp1972 = _mm512_add_ps(tmp1963, tmp1961);
__m512 tmp1945 = _mm512_fmadd_ps(tmp1936, _mm512_set1_ps(4e+00f), tmp1937);
__m512 tmp1965 = _mm512_fmadd_ps(tmp1956, _mm512_set1_ps(4e+00f), tmp1957);
__m512 tmp1949 = _mm512_fmadd_ps(tmp1936, _mm512_set1_ps(1.6e+01f), tmp1937);
__m512 tmp1969 = _mm512_fmadd_ps(tmp1956, _mm512_set1_ps(1.6e+01f), tmp1957);
__m512 tmp1934 = _mm512_add_ps(tmp1935, tmp1877);
__m512 tmp1954 = _mm512_add_ps(tmp1955, tmp1885);
__m512 tmp1951 = _mm512_add_ps(tmp1952, tmp1884);
__m512 tmp1971 = _mm512_add_ps(tmp1972, tmp1932);
__m512 tmp1933 = _mm512_fmadd_ps(tmp1938, _mm512_set1_ps(3.2e+01f), tmp1934);
__m512 tmp1953 = _mm512_fmadd_ps(tmp1958, _mm512_set1_ps(3.2e+01f), tmp1954);
__m512 tmp1944 = _mm512_fmadd_ps(tmp1938, _mm512_set1_ps(8e+00f), tmp1945);
__m512 tmp1964 = _mm512_fmadd_ps(tmp1958, _mm512_set1_ps(8e+00f), tmp1965);
__m512 tmp1950 = _mm512_fmadd_ps(tmp1942, _mm512_set1_ps(3.2e+01f), tmp1951);
__m512 tmp1970 = _mm512_fmadd_ps(tmp1962, _mm512_set1_ps(3.2e+01f), tmp1971);
__m512 tmp1948 = _mm512_fmadd_ps(tmp1938, _mm512_set1_ps(2e+00f), tmp1949);
__m512 tmp1968 = _mm512_fmadd_ps(tmp1958, _mm512_set1_ps(2e+00f), tmp1969);
__m512 out423 = tmp1933;
__m512 out429 = tmp1953;
__m512 out424 = tmp1939;
__m512 out430 = tmp1959;
__m512 out425 = tmp1944;
__m512 out431 = tmp1964;
__m512 out426 = tmp1946;
__m512 out432 = tmp1966;
__m512 out427 = tmp1948;
__m512 out433 = tmp1968;
__m512 out428 = tmp1950;
__m512 out434 = tmp1970;
out423 = _mm512_max_ps(_mm512_setzero_ps(), out423);
out429 = _mm512_max_ps(_mm512_setzero_ps(), out429);
out424 = _mm512_max_ps(_mm512_setzero_ps(), out424);
out430 = _mm512_max_ps(_mm512_setzero_ps(), out430);
out425 = _mm512_max_ps(_mm512_setzero_ps(), out425);
out431 = _mm512_max_ps(_mm512_setzero_ps(), out431);
out426 = _mm512_max_ps(_mm512_setzero_ps(), out426);
out432 = _mm512_max_ps(_mm512_setzero_ps(), out432);
out427 = _mm512_max_ps(_mm512_setzero_ps(), out427);
out433 = _mm512_max_ps(_mm512_setzero_ps(), out433);
out428 = _mm512_max_ps(_mm512_setzero_ps(), out428);
out434 = _mm512_max_ps(_mm512_setzero_ps(), out434);
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out423);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out429);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out424);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out430);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out425);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out431);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out426);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out432);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out427);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out433);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out428);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out434);
__m512 sf17 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf18 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in380 = _mm512_shuffle_f32x4(sf17, sf18, 68);
__m512 in381 = _mm512_shuffle_f32x4(sf17, sf18, 238);
__m512 sf19 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf20 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in388 = _mm512_shuffle_f32x4(sf19, sf20, 68);
__m512 in389 = _mm512_shuffle_f32x4(sf19, sf20, 238);
__m512 sf21 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf22 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in382 = _mm512_shuffle_f32x4(sf21, sf22, 68);
__m512 in383 = _mm512_shuffle_f32x4(sf21, sf22, 238);
__m512 sf23 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf24 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in390 = _mm512_shuffle_f32x4(sf23, sf24, 68);
__m512 in391 = _mm512_shuffle_f32x4(sf23, sf24, 238);
__m512 sf25 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf26 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in384 = _mm512_shuffle_f32x4(sf25, sf26, 68);
__m512 in385 = _mm512_shuffle_f32x4(sf25, sf26, 238);
__m512 sf27 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf28 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in392 = _mm512_shuffle_f32x4(sf27, sf28, 68);
__m512 in393 = _mm512_shuffle_f32x4(sf27, sf28, 238);
__m512 sf29 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf30 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in386 = _mm512_shuffle_f32x4(sf29, sf30, 68);
__m512 in387 = _mm512_shuffle_f32x4(sf29, sf30, 238);
__m512 sf31 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf32 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in394 = _mm512_shuffle_f32x4(sf31, sf32, 68);
__m512 in395 = _mm512_shuffle_f32x4(sf31, sf32, 238);
__m512 tmp2029 = _mm512_add_ps(in381, in382);
__m512 tmp2049 = _mm512_add_ps(in389, in390);
__m512 tmp2028 = _mm512_add_ps(in383, in384);
__m512 tmp2048 = _mm512_add_ps(in391, in392);
__m512 tmp2034 = _mm512_sub_ps(in383, in384);
__m512 tmp2054 = _mm512_sub_ps(in391, in392);
__m512 tmp2033 = _mm512_sub_ps(in381, in382);
__m512 tmp2053 = _mm512_sub_ps(in389, in390);
__m512 tmp2030 = _mm512_add_ps(in385, in386);
__m512 tmp2050 = _mm512_add_ps(in393, in394);
__m512 tmp2035 = _mm512_sub_ps(in385, in386);
__m512 tmp2055 = _mm512_sub_ps(in393, in394);
__m512 tmp2032 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(2e+00f), tmp2033);
__m512 tmp2052 = _mm512_fmadd_ps(tmp2054, _mm512_set1_ps(2e+00f), tmp2053);
__m512 tmp2039 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(8e+00f), tmp2033);
__m512 tmp2059 = _mm512_fmadd_ps(tmp2054, _mm512_set1_ps(8e+00f), tmp2053);
__m512 tmp2027 = _mm512_add_ps(tmp2028, tmp2029);
__m512 tmp2047 = _mm512_add_ps(tmp2048, tmp2049);
__m512 tmp2031 = _mm512_fmadd_ps(tmp2035, _mm512_set1_ps(1.6e+01f), tmp2032);
__m512 tmp2051 = _mm512_fmadd_ps(tmp2055, _mm512_set1_ps(1.6e+01f), tmp2052);
__m512 tmp2038 = _mm512_fmadd_ps(tmp2035, _mm512_set1_ps(4e+00f), tmp2039);
__m512 tmp2058 = _mm512_fmadd_ps(tmp2055, _mm512_set1_ps(4e+00f), tmp2059);
__m512 tmp2044 = _mm512_add_ps(tmp2035, tmp2033);
__m512 tmp2064 = _mm512_add_ps(tmp2055, tmp2053);
__m512 tmp2037 = _mm512_fmadd_ps(tmp2028, _mm512_set1_ps(4e+00f), tmp2029);
__m512 tmp2057 = _mm512_fmadd_ps(tmp2048, _mm512_set1_ps(4e+00f), tmp2049);
__m512 tmp2041 = _mm512_fmadd_ps(tmp2028, _mm512_set1_ps(1.6e+01f), tmp2029);
__m512 tmp2061 = _mm512_fmadd_ps(tmp2048, _mm512_set1_ps(1.6e+01f), tmp2049);
__m512 tmp2026 = _mm512_add_ps(tmp2027, in380);
__m512 tmp2046 = _mm512_add_ps(tmp2047, in388);
__m512 tmp2043 = _mm512_add_ps(tmp2044, in387);
__m512 tmp2063 = _mm512_add_ps(tmp2064, in395);
__m512 tmp2025 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(3.2e+01f), tmp2026);
__m512 tmp2045 = _mm512_fmadd_ps(tmp2050, _mm512_set1_ps(3.2e+01f), tmp2046);
__m512 tmp2036 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(8e+00f), tmp2037);
__m512 tmp2056 = _mm512_fmadd_ps(tmp2050, _mm512_set1_ps(8e+00f), tmp2057);
__m512 tmp2042 = _mm512_fmadd_ps(tmp2034, _mm512_set1_ps(3.2e+01f), tmp2043);
__m512 tmp2062 = _mm512_fmadd_ps(tmp2054, _mm512_set1_ps(3.2e+01f), tmp2063);
__m512 tmp2040 = _mm512_fmadd_ps(tmp2030, _mm512_set1_ps(2e+00f), tmp2041);
__m512 tmp2060 = _mm512_fmadd_ps(tmp2050, _mm512_set1_ps(2e+00f), tmp2061);
__m512 tmp2013 = tmp2025;
__m512 tmp2019 = tmp2045;
__m512 tmp2014 = tmp2031;
__m512 tmp2020 = tmp2051;
__m512 tmp2015 = tmp2036;
__m512 tmp2021 = tmp2056;
__m512 tmp2016 = tmp2038;
__m512 tmp2022 = tmp2058;
__m512 tmp2017 = tmp2040;
__m512 tmp2023 = tmp2060;
__m512 tmp2018 = tmp2042;
__m512 tmp2024 = tmp2062;
__m512 tmp2109 = _mm512_unpacklo_ps(tmp2013, tmp2014);
__m512 tmp2110 = _mm512_unpackhi_ps(tmp2013, tmp2014);
__m512 tmp2111 = _mm512_unpacklo_ps(tmp2015, tmp2016);
__m512 tmp2112 = _mm512_unpackhi_ps(tmp2015, tmp2016);
__m512 tmp2113 = _mm512_unpacklo_ps(tmp2017, tmp2018);
__m512 tmp2114 = _mm512_unpackhi_ps(tmp2017, tmp2018);
__m512 tmp2115 = _mm512_unpacklo_ps(tmp2019, tmp2020);
__m512 tmp2116 = _mm512_unpackhi_ps(tmp2019, tmp2020);
__m512 tmp2117 = _mm512_unpacklo_ps(tmp2021, tmp2022);
__m512 tmp2118 = _mm512_unpackhi_ps(tmp2021, tmp2022);
__m512 tmp2119 = _mm512_unpacklo_ps(tmp2023, tmp2024);
__m512 tmp2120 = _mm512_unpackhi_ps(tmp2023, tmp2024);
__m512 tmp2121 = _mm512_shuffle_ps(tmp2109, tmp2111, 68);
__m512 tmp2122 = _mm512_shuffle_ps(tmp2109, tmp2111, 238);
__m512 tmp2123 = _mm512_shuffle_ps(tmp2110, tmp2112, 68);
__m512 tmp2124 = _mm512_shuffle_ps(tmp2110, tmp2112, 238);
__m512 tmp2125 = _mm512_shuffle_ps(tmp2113, tmp2115, 68);
__m512 tmp2126 = _mm512_shuffle_ps(tmp2113, tmp2115, 238);
__m512 tmp2127 = _mm512_shuffle_ps(tmp2114, tmp2116, 68);
__m512 tmp2128 = _mm512_shuffle_ps(tmp2114, tmp2116, 238);
__m512 tmp2129 = _mm512_shuffle_ps(tmp2117, tmp2119, 68);
__m512 tmp2130 = _mm512_shuffle_ps(tmp2117, tmp2119, 238);
__m512 tmp2131 = _mm512_shuffle_ps(tmp2118, tmp2120, 68);
__m512 tmp2132 = _mm512_shuffle_ps(tmp2118, tmp2120, 238);
__m512 tmp2133 = _mm512_shuffle_f32x4(tmp2121, tmp2125, 136);
__m512 tmp2134 = _mm512_shuffle_f32x4(tmp2121, tmp2125, 221);
__m512 tmp2135 = _mm512_shuffle_f32x4(tmp2122, tmp2126, 136);
__m512 tmp2136 = _mm512_shuffle_f32x4(tmp2122, tmp2126, 221);
__m512 tmp2137 = _mm512_shuffle_f32x4(tmp2123, tmp2127, 136);
__m512 tmp2138 = _mm512_shuffle_f32x4(tmp2123, tmp2127, 221);
__m512 tmp2139 = _mm512_shuffle_f32x4(tmp2124, tmp2128, 136);
__m512 tmp2140 = _mm512_shuffle_f32x4(tmp2124, tmp2128, 221);
__m512 tmp2141 = _mm512_shuffle_f32x4(tmp2129, tmp2129, 136);
__m512 tmp2142 = _mm512_shuffle_f32x4(tmp2129, tmp2129, 221);
__m512 tmp2143 = _mm512_shuffle_f32x4(tmp2130, tmp2130, 136);
__m512 tmp2144 = _mm512_shuffle_f32x4(tmp2130, tmp2130, 221);
__m512 tmp2145 = _mm512_shuffle_f32x4(tmp2131, tmp2131, 136);
__m512 tmp2146 = _mm512_shuffle_f32x4(tmp2131, tmp2131, 221);
__m512 tmp2147 = _mm512_shuffle_f32x4(tmp2132, tmp2132, 136);
__m512 tmp2148 = _mm512_shuffle_f32x4(tmp2132, tmp2132, 221);
tmp2013 = _mm512_shuffle_f32x4(tmp2133, tmp2141, 136);
tmp2021 = _mm512_shuffle_f32x4(tmp2133, tmp2141, 221);
tmp2014 = _mm512_shuffle_f32x4(tmp2135, tmp2143, 136);
tmp2022 = _mm512_shuffle_f32x4(tmp2135, tmp2143, 221);
tmp2015 = _mm512_shuffle_f32x4(tmp2137, tmp2145, 136);
tmp2023 = _mm512_shuffle_f32x4(tmp2137, tmp2145, 221);
tmp2016 = _mm512_shuffle_f32x4(tmp2139, tmp2147, 136);
tmp2024 = _mm512_shuffle_f32x4(tmp2139, tmp2147, 221);
tmp2017 = _mm512_shuffle_f32x4(tmp2134, tmp2142, 136);
__m512 tmp2065 = _mm512_shuffle_f32x4(tmp2134, tmp2142, 221);
tmp2018 = _mm512_shuffle_f32x4(tmp2136, tmp2144, 136);
__m512 tmp2066 = _mm512_shuffle_f32x4(tmp2136, tmp2144, 221);
tmp2019 = _mm512_shuffle_f32x4(tmp2138, tmp2146, 136);
__m512 tmp2067 = _mm512_shuffle_f32x4(tmp2138, tmp2146, 221);
tmp2020 = _mm512_shuffle_f32x4(tmp2140, tmp2148, 136);
__m512 tmp2068 = _mm512_shuffle_f32x4(tmp2140, tmp2148, 221);
// Six-output linear combination of eight input vectors, evaluated for two
// independent input sets whose statements are interleaved line by line.
//   set A: x0..x7 = tmp2013..tmp2020
//   set B: x0..x7 = tmp2021..tmp2024, tmp2065..tmp2068
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   y0 = x0 + s12 + s34 + 32*s56
//   y1 = d12 +  2*d34 + 16*d56
//   y2 = s12 +  4*s34 +  8*s56
//   y3 = d12 +  8*d34 +  4*d56
//   y4 = s12 + 16*s34 +  2*s56
//   y5 = x7 + d12 + 32*d34 + d56
// Set A results land in out435..out440 (y0..y5), set B in out441..out446.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern resembles a Winograd
// output transform - confirm against the generator before relying on that.

// Pairwise sums and differences (s12, s34, d34, d12, s56, d56).
__m512 tmp2073 = _mm512_add_ps(tmp2014, tmp2015);
__m512 tmp2093 = _mm512_add_ps(tmp2022, tmp2023);
__m512 tmp2072 = _mm512_add_ps(tmp2016, tmp2017);
__m512 tmp2092 = _mm512_add_ps(tmp2024, tmp2065);
__m512 tmp2078 = _mm512_sub_ps(tmp2016, tmp2017);
__m512 tmp2098 = _mm512_sub_ps(tmp2024, tmp2065);
__m512 tmp2077 = _mm512_sub_ps(tmp2014, tmp2015);
__m512 tmp2097 = _mm512_sub_ps(tmp2022, tmp2023);
__m512 tmp2074 = _mm512_add_ps(tmp2018, tmp2019);
__m512 tmp2094 = _mm512_add_ps(tmp2066, tmp2067);
__m512 tmp2079 = _mm512_sub_ps(tmp2018, tmp2019);
__m512 tmp2099 = _mm512_sub_ps(tmp2066, tmp2067);
// Partial results, accumulated with fused multiply-adds (a*b+c).
__m512 tmp2076 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(2e+00f), tmp2077);
__m512 tmp2096 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(2e+00f), tmp2097);
__m512 tmp2083 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(8e+00f), tmp2077);
__m512 tmp2103 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(8e+00f), tmp2097);
__m512 tmp2071 = _mm512_add_ps(tmp2072, tmp2073);
__m512 tmp2091 = _mm512_add_ps(tmp2092, tmp2093);
// y1 complete.
__m512 tmp2075 = _mm512_fmadd_ps(tmp2079, _mm512_set1_ps(1.6e+01f), tmp2076);
__m512 tmp2095 = _mm512_fmadd_ps(tmp2099, _mm512_set1_ps(1.6e+01f), tmp2096);
// y3 complete.
__m512 tmp2082 = _mm512_fmadd_ps(tmp2079, _mm512_set1_ps(4e+00f), tmp2083);
__m512 tmp2102 = _mm512_fmadd_ps(tmp2099, _mm512_set1_ps(4e+00f), tmp2103);
__m512 tmp2088 = _mm512_add_ps(tmp2079, tmp2077);
__m512 tmp2108 = _mm512_add_ps(tmp2099, tmp2097);
__m512 tmp2081 = _mm512_fmadd_ps(tmp2072, _mm512_set1_ps(4e+00f), tmp2073);
__m512 tmp2101 = _mm512_fmadd_ps(tmp2092, _mm512_set1_ps(4e+00f), tmp2093);
__m512 tmp2085 = _mm512_fmadd_ps(tmp2072, _mm512_set1_ps(1.6e+01f), tmp2073);
__m512 tmp2105 = _mm512_fmadd_ps(tmp2092, _mm512_set1_ps(1.6e+01f), tmp2093);
// x0 enters only y0; x7 enters only y5.
__m512 tmp2070 = _mm512_add_ps(tmp2071, tmp2013);
__m512 tmp2090 = _mm512_add_ps(tmp2091, tmp2021);
__m512 tmp2087 = _mm512_add_ps(tmp2088, tmp2020);
__m512 tmp2107 = _mm512_add_ps(tmp2108, tmp2068);
// y0, y2, y5, y4 complete (in that order).
__m512 tmp2069 = _mm512_fmadd_ps(tmp2074, _mm512_set1_ps(3.2e+01f), tmp2070);
__m512 tmp2089 = _mm512_fmadd_ps(tmp2094, _mm512_set1_ps(3.2e+01f), tmp2090);
__m512 tmp2080 = _mm512_fmadd_ps(tmp2074, _mm512_set1_ps(8e+00f), tmp2081);
__m512 tmp2100 = _mm512_fmadd_ps(tmp2094, _mm512_set1_ps(8e+00f), tmp2101);
__m512 tmp2086 = _mm512_fmadd_ps(tmp2078, _mm512_set1_ps(3.2e+01f), tmp2087);
__m512 tmp2106 = _mm512_fmadd_ps(tmp2098, _mm512_set1_ps(3.2e+01f), tmp2107);
__m512 tmp2084 = _mm512_fmadd_ps(tmp2074, _mm512_set1_ps(2e+00f), tmp2085);
__m512 tmp2104 = _mm512_fmadd_ps(tmp2094, _mm512_set1_ps(2e+00f), tmp2105);
// Name the results: out435..out440 = y0..y5 of set A, out441..out446 = y0..y5 of set B.
__m512 out435 = tmp2069;
__m512 out441 = tmp2089;
__m512 out436 = tmp2075;
__m512 out442 = tmp2095;
__m512 out437 = tmp2080;
__m512 out443 = tmp2100;
__m512 out438 = tmp2082;
__m512 out444 = tmp2102;
__m512 out439 = tmp2084;
__m512 out445 = tmp2104;
__m512 out440 = tmp2086;
__m512 out446 = tmp2106;
// Clamp every result vector at zero (elementwise max with 0, i.e. ReLU).
out435 = _mm512_max_ps(_mm512_setzero_ps(), out435);
out441 = _mm512_max_ps(_mm512_setzero_ps(), out441);
out436 = _mm512_max_ps(_mm512_setzero_ps(), out436);
out442 = _mm512_max_ps(_mm512_setzero_ps(), out442);
out437 = _mm512_max_ps(_mm512_setzero_ps(), out437);
out443 = _mm512_max_ps(_mm512_setzero_ps(), out443);
out438 = _mm512_max_ps(_mm512_setzero_ps(), out438);
out444 = _mm512_max_ps(_mm512_setzero_ps(), out444);
out439 = _mm512_max_ps(_mm512_setzero_ps(), out439);
out445 = _mm512_max_ps(_mm512_setzero_ps(), out445);
out440 = _mm512_max_ps(_mm512_setzero_ps(), out440);
out446 = _mm512_max_ps(_mm512_setzero_ps(), out446);
// Masked unaligned stores to datPtr6. Mask 4095 (0x0FFF) writes lanes 0..11 only;
// lanes 12..15 are left untouched in memory.
// All twelve addresses share 50432*i19+224*toH20+4*toW20+50432*k61+25216*l13 and
// differ only in the leading constant:
//   out435..out440 -> 96, 320, ..., 1216      (step 224)
//   out441..out446 -> 12608, 12832, ..., 13728 (step 224)
// The step of 224 equals the multiplier applied to toH20.
_mm512_mask_storeu_ps(datPtr6+96+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out435);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out441);
_mm512_mask_storeu_ps(datPtr6+320+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out436);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out442);
_mm512_mask_storeu_ps(datPtr6+544+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out437);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out443);
_mm512_mask_storeu_ps(datPtr6+768+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out438);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out444);
_mm512_mask_storeu_ps(datPtr6+992+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out439);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out445);
_mm512_mask_storeu_ps(datPtr6+1216+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out440);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out446);
// Load sixteen vectors from sfPtr5 and regroup their 128-bit lanes into in396..in411.
// Every load shares the index term 102400*i19+1536*j13+1536*k61+768*l13; only the
// leading constant differs. The constants form four bases 25600 apart
// (512, 26112, 51712, 77312), and from each base loads are taken at +0, +64, +128, +192.
// The +0/+128 loads are paired, as are the +64/+192 loads. For a pair (a, b):
//   imm 68  -> { a.lane0, a.lane1, b.lane0, b.lane1 }  (low 256 bits of a, then of b)
//   imm 238 -> { a.lane2, a.lane3, b.lane2, b.lane3 }  (high 256 bits of a, then of b)
// The +0/+128 pairs produce in396..in403; the +64/+192 pairs produce in404..in411.

// Base 512.
__m512 sf33 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf34 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in396 = _mm512_shuffle_f32x4(sf33, sf34, 68);
__m512 in397 = _mm512_shuffle_f32x4(sf33, sf34, 238);
__m512 sf35 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf36 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in404 = _mm512_shuffle_f32x4(sf35, sf36, 68);
__m512 in405 = _mm512_shuffle_f32x4(sf35, sf36, 238);
// Base 26112.
__m512 sf37 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf38 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in398 = _mm512_shuffle_f32x4(sf37, sf38, 68);
__m512 in399 = _mm512_shuffle_f32x4(sf37, sf38, 238);
__m512 sf39 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf40 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in406 = _mm512_shuffle_f32x4(sf39, sf40, 68);
__m512 in407 = _mm512_shuffle_f32x4(sf39, sf40, 238);
// Base 51712.
__m512 sf41 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf42 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in400 = _mm512_shuffle_f32x4(sf41, sf42, 68);
__m512 in401 = _mm512_shuffle_f32x4(sf41, sf42, 238);
__m512 sf43 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf44 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in408 = _mm512_shuffle_f32x4(sf43, sf44, 68);
__m512 in409 = _mm512_shuffle_f32x4(sf43, sf44, 238);
// Base 77312.
__m512 sf45 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf46 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in402 = _mm512_shuffle_f32x4(sf45, sf46, 68);
__m512 in403 = _mm512_shuffle_f32x4(sf45, sf46, 238);
__m512 sf47 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k61+768*l13);
__m512 sf48 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k61+768*l13);
__m512 in410 = _mm512_shuffle_f32x4(sf47, sf48, 68);
__m512 in411 = _mm512_shuffle_f32x4(sf47, sf48, 238);
// First pass: six-output linear combination of eight input vectors, evaluated
// for two independent input sets whose statements are interleaved line by line.
//   set A: x0..x7 = in396..in403
//   set B: x0..x7 = in404..in411
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   y0 = x0 + s12 + s34 + 32*s56
//   y1 = d12 +  2*d34 + 16*d56
//   y2 = s12 +  4*s34 +  8*s56
//   y3 = d12 +  8*d34 +  4*d56
//   y4 = s12 + 16*s34 +  2*s56
//   y5 = x7 + d12 + 32*d34 + d56
// Set A results are copied to tmp2149..tmp2154 (y0..y5), set B to tmp2155..tmp2160.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern resembles a Winograd
// output transform - confirm against the generator before relying on that.

// Pairwise sums and differences (s12, s34, d34, d12, s56, d56).
__m512 tmp2165 = _mm512_add_ps(in397, in398);
__m512 tmp2185 = _mm512_add_ps(in405, in406);
__m512 tmp2164 = _mm512_add_ps(in399, in400);
__m512 tmp2184 = _mm512_add_ps(in407, in408);
__m512 tmp2170 = _mm512_sub_ps(in399, in400);
__m512 tmp2190 = _mm512_sub_ps(in407, in408);
__m512 tmp2169 = _mm512_sub_ps(in397, in398);
__m512 tmp2189 = _mm512_sub_ps(in405, in406);
__m512 tmp2166 = _mm512_add_ps(in401, in402);
__m512 tmp2186 = _mm512_add_ps(in409, in410);
__m512 tmp2171 = _mm512_sub_ps(in401, in402);
__m512 tmp2191 = _mm512_sub_ps(in409, in410);
// Partial results, accumulated with fused multiply-adds (a*b+c).
__m512 tmp2168 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(2e+00f), tmp2169);
__m512 tmp2188 = _mm512_fmadd_ps(tmp2190, _mm512_set1_ps(2e+00f), tmp2189);
__m512 tmp2175 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(8e+00f), tmp2169);
__m512 tmp2195 = _mm512_fmadd_ps(tmp2190, _mm512_set1_ps(8e+00f), tmp2189);
__m512 tmp2163 = _mm512_add_ps(tmp2164, tmp2165);
__m512 tmp2183 = _mm512_add_ps(tmp2184, tmp2185);
// y1 complete.
__m512 tmp2167 = _mm512_fmadd_ps(tmp2171, _mm512_set1_ps(1.6e+01f), tmp2168);
__m512 tmp2187 = _mm512_fmadd_ps(tmp2191, _mm512_set1_ps(1.6e+01f), tmp2188);
// y3 complete.
__m512 tmp2174 = _mm512_fmadd_ps(tmp2171, _mm512_set1_ps(4e+00f), tmp2175);
__m512 tmp2194 = _mm512_fmadd_ps(tmp2191, _mm512_set1_ps(4e+00f), tmp2195);
__m512 tmp2180 = _mm512_add_ps(tmp2171, tmp2169);
__m512 tmp2200 = _mm512_add_ps(tmp2191, tmp2189);
__m512 tmp2173 = _mm512_fmadd_ps(tmp2164, _mm512_set1_ps(4e+00f), tmp2165);
__m512 tmp2193 = _mm512_fmadd_ps(tmp2184, _mm512_set1_ps(4e+00f), tmp2185);
__m512 tmp2177 = _mm512_fmadd_ps(tmp2164, _mm512_set1_ps(1.6e+01f), tmp2165);
__m512 tmp2197 = _mm512_fmadd_ps(tmp2184, _mm512_set1_ps(1.6e+01f), tmp2185);
// x0 enters only y0; x7 enters only y5.
__m512 tmp2162 = _mm512_add_ps(tmp2163, in396);
__m512 tmp2182 = _mm512_add_ps(tmp2183, in404);
__m512 tmp2179 = _mm512_add_ps(tmp2180, in403);
__m512 tmp2199 = _mm512_add_ps(tmp2200, in411);
// y0, y2, y5, y4 complete (in that order).
__m512 tmp2161 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(3.2e+01f), tmp2162);
__m512 tmp2181 = _mm512_fmadd_ps(tmp2186, _mm512_set1_ps(3.2e+01f), tmp2182);
__m512 tmp2172 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(8e+00f), tmp2173);
__m512 tmp2192 = _mm512_fmadd_ps(tmp2186, _mm512_set1_ps(8e+00f), tmp2193);
__m512 tmp2178 = _mm512_fmadd_ps(tmp2170, _mm512_set1_ps(3.2e+01f), tmp2179);
__m512 tmp2198 = _mm512_fmadd_ps(tmp2190, _mm512_set1_ps(3.2e+01f), tmp2199);
__m512 tmp2176 = _mm512_fmadd_ps(tmp2166, _mm512_set1_ps(2e+00f), tmp2177);
__m512 tmp2196 = _mm512_fmadd_ps(tmp2186, _mm512_set1_ps(2e+00f), tmp2197);
// Gather the twelve results under consecutive names for the transpose that follows:
// tmp2149..tmp2154 = y0..y5 of set A, tmp2155..tmp2160 = y0..y5 of set B.
__m512 tmp2149 = tmp2161;
__m512 tmp2155 = tmp2181;
__m512 tmp2150 = tmp2167;
__m512 tmp2156 = tmp2187;
__m512 tmp2151 = tmp2172;
__m512 tmp2157 = tmp2192;
__m512 tmp2152 = tmp2174;
__m512 tmp2158 = tmp2194;
__m512 tmp2153 = tmp2176;
__m512 tmp2159 = tmp2196;
__m512 tmp2154 = tmp2178;
__m512 tmp2160 = tmp2198;
// Transpose: treat tmp2149..tmp2160 as a 12x16 matrix (one 16-float vector per
// row, rows 0..11 in that order) and produce its sixteen columns.
// On exit:
//   tmp2149..tmp2156                       = columns 0..7
//   tmp2157..tmp2160, tmp2201..tmp2204     = columns 8..15
// In every result, lanes 0..11 hold rows 0..11; lanes 12..15 repeat rows 8..11,
// because the rows-8..11 intermediates are shuffled against themselves.

// Step 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp2245 = _mm512_unpacklo_ps(tmp2149, tmp2150);
__m512 tmp2246 = _mm512_unpackhi_ps(tmp2149, tmp2150);
__m512 tmp2247 = _mm512_unpacklo_ps(tmp2151, tmp2152);
__m512 tmp2248 = _mm512_unpackhi_ps(tmp2151, tmp2152);
__m512 tmp2249 = _mm512_unpacklo_ps(tmp2153, tmp2154);
__m512 tmp2250 = _mm512_unpackhi_ps(tmp2153, tmp2154);
__m512 tmp2251 = _mm512_unpacklo_ps(tmp2155, tmp2156);
__m512 tmp2252 = _mm512_unpackhi_ps(tmp2155, tmp2156);
__m512 tmp2253 = _mm512_unpacklo_ps(tmp2157, tmp2158);
__m512 tmp2254 = _mm512_unpackhi_ps(tmp2157, tmp2158);
__m512 tmp2255 = _mm512_unpacklo_ps(tmp2159, tmp2160);
__m512 tmp2256 = _mm512_unpackhi_ps(tmp2159, tmp2160);
// Step 2: merge pairs of pairs so each 128-bit lane holds one column element
// from four consecutive rows (imm 68 keeps elements 0,1 of each operand, 238 keeps 2,3).
__m512 tmp2257 = _mm512_shuffle_ps(tmp2245, tmp2247, 68);
__m512 tmp2258 = _mm512_shuffle_ps(tmp2245, tmp2247, 238);
__m512 tmp2259 = _mm512_shuffle_ps(tmp2246, tmp2248, 68);
__m512 tmp2260 = _mm512_shuffle_ps(tmp2246, tmp2248, 238);
__m512 tmp2261 = _mm512_shuffle_ps(tmp2249, tmp2251, 68);
__m512 tmp2262 = _mm512_shuffle_ps(tmp2249, tmp2251, 238);
__m512 tmp2263 = _mm512_shuffle_ps(tmp2250, tmp2252, 68);
__m512 tmp2264 = _mm512_shuffle_ps(tmp2250, tmp2252, 238);
__m512 tmp2265 = _mm512_shuffle_ps(tmp2253, tmp2255, 68);
__m512 tmp2266 = _mm512_shuffle_ps(tmp2253, tmp2255, 238);
__m512 tmp2267 = _mm512_shuffle_ps(tmp2254, tmp2256, 68);
__m512 tmp2268 = _mm512_shuffle_ps(tmp2254, tmp2256, 238);
// Step 3: rearrange whole 128-bit lanes (imm 136 takes lanes 0 and 2 of each
// operand, imm 221 takes lanes 1 and 3). Rows 0..3 are combined with rows 4..7.
__m512 tmp2269 = _mm512_shuffle_f32x4(tmp2257, tmp2261, 136);
__m512 tmp2270 = _mm512_shuffle_f32x4(tmp2257, tmp2261, 221);
__m512 tmp2271 = _mm512_shuffle_f32x4(tmp2258, tmp2262, 136);
__m512 tmp2272 = _mm512_shuffle_f32x4(tmp2258, tmp2262, 221);
__m512 tmp2273 = _mm512_shuffle_f32x4(tmp2259, tmp2263, 136);
__m512 tmp2274 = _mm512_shuffle_f32x4(tmp2259, tmp2263, 221);
__m512 tmp2275 = _mm512_shuffle_f32x4(tmp2260, tmp2264, 136);
__m512 tmp2276 = _mm512_shuffle_f32x4(tmp2260, tmp2264, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself.
__m512 tmp2277 = _mm512_shuffle_f32x4(tmp2265, tmp2265, 136);
__m512 tmp2278 = _mm512_shuffle_f32x4(tmp2265, tmp2265, 221);
__m512 tmp2279 = _mm512_shuffle_f32x4(tmp2266, tmp2266, 136);
__m512 tmp2280 = _mm512_shuffle_f32x4(tmp2266, tmp2266, 221);
__m512 tmp2281 = _mm512_shuffle_f32x4(tmp2267, tmp2267, 136);
__m512 tmp2282 = _mm512_shuffle_f32x4(tmp2267, tmp2267, 221);
__m512 tmp2283 = _mm512_shuffle_f32x4(tmp2268, tmp2268, 136);
__m512 tmp2284 = _mm512_shuffle_f32x4(tmp2268, tmp2268, 221);
// Step 4: final lane selection. The row variables tmp2149..tmp2160 are reused for
// columns 0..11; columns 12..15 get fresh names tmp2201..tmp2204.
tmp2149 = _mm512_shuffle_f32x4(tmp2269, tmp2277, 136);
tmp2157 = _mm512_shuffle_f32x4(tmp2269, tmp2277, 221);
tmp2150 = _mm512_shuffle_f32x4(tmp2271, tmp2279, 136);
tmp2158 = _mm512_shuffle_f32x4(tmp2271, tmp2279, 221);
tmp2151 = _mm512_shuffle_f32x4(tmp2273, tmp2281, 136);
tmp2159 = _mm512_shuffle_f32x4(tmp2273, tmp2281, 221);
tmp2152 = _mm512_shuffle_f32x4(tmp2275, tmp2283, 136);
tmp2160 = _mm512_shuffle_f32x4(tmp2275, tmp2283, 221);
tmp2153 = _mm512_shuffle_f32x4(tmp2270, tmp2278, 136);
__m512 tmp2201 = _mm512_shuffle_f32x4(tmp2270, tmp2278, 221);
tmp2154 = _mm512_shuffle_f32x4(tmp2272, tmp2280, 136);
__m512 tmp2202 = _mm512_shuffle_f32x4(tmp2272, tmp2280, 221);
tmp2155 = _mm512_shuffle_f32x4(tmp2274, tmp2282, 136);
__m512 tmp2203 = _mm512_shuffle_f32x4(tmp2274, tmp2282, 221);
tmp2156 = _mm512_shuffle_f32x4(tmp2276, tmp2284, 136);
__m512 tmp2204 = _mm512_shuffle_f32x4(tmp2276, tmp2284, 221);
// Second pass: the same six-output linear combination of eight inputs, now
// applied to the transposed vectors. Two input sets are interleaved line by line.
//   set A: x0..x7 = tmp2149..tmp2156
//   set B: x0..x7 = tmp2157..tmp2160, tmp2201..tmp2204
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   y0 = x0 + s12 + s34 + 32*s56
//   y1 = d12 +  2*d34 + 16*d56
//   y2 = s12 +  4*s34 +  8*s56
//   y3 = d12 +  8*d34 +  4*d56
//   y4 = s12 + 16*s34 +  2*s56
//   y5 = x7 + d12 + 32*d34 + d56
// Set A results land in out447..out452 (y0..y5), set B in out453..out458.

// Pairwise sums and differences (s12, s34, d34, d12, s56, d56).
__m512 tmp2209 = _mm512_add_ps(tmp2150, tmp2151);
__m512 tmp2229 = _mm512_add_ps(tmp2158, tmp2159);
__m512 tmp2208 = _mm512_add_ps(tmp2152, tmp2153);
__m512 tmp2228 = _mm512_add_ps(tmp2160, tmp2201);
__m512 tmp2214 = _mm512_sub_ps(tmp2152, tmp2153);
__m512 tmp2234 = _mm512_sub_ps(tmp2160, tmp2201);
__m512 tmp2213 = _mm512_sub_ps(tmp2150, tmp2151);
__m512 tmp2233 = _mm512_sub_ps(tmp2158, tmp2159);
__m512 tmp2210 = _mm512_add_ps(tmp2154, tmp2155);
__m512 tmp2230 = _mm512_add_ps(tmp2202, tmp2203);
__m512 tmp2215 = _mm512_sub_ps(tmp2154, tmp2155);
__m512 tmp2235 = _mm512_sub_ps(tmp2202, tmp2203);
// Partial results, accumulated with fused multiply-adds (a*b+c).
__m512 tmp2212 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(2e+00f), tmp2213);
__m512 tmp2232 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(2e+00f), tmp2233);
__m512 tmp2219 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(8e+00f), tmp2213);
__m512 tmp2239 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(8e+00f), tmp2233);
__m512 tmp2207 = _mm512_add_ps(tmp2208, tmp2209);
__m512 tmp2227 = _mm512_add_ps(tmp2228, tmp2229);
// y1 complete.
__m512 tmp2211 = _mm512_fmadd_ps(tmp2215, _mm512_set1_ps(1.6e+01f), tmp2212);
__m512 tmp2231 = _mm512_fmadd_ps(tmp2235, _mm512_set1_ps(1.6e+01f), tmp2232);
// y3 complete.
__m512 tmp2218 = _mm512_fmadd_ps(tmp2215, _mm512_set1_ps(4e+00f), tmp2219);
__m512 tmp2238 = _mm512_fmadd_ps(tmp2235, _mm512_set1_ps(4e+00f), tmp2239);
__m512 tmp2224 = _mm512_add_ps(tmp2215, tmp2213);
__m512 tmp2244 = _mm512_add_ps(tmp2235, tmp2233);
__m512 tmp2217 = _mm512_fmadd_ps(tmp2208, _mm512_set1_ps(4e+00f), tmp2209);
__m512 tmp2237 = _mm512_fmadd_ps(tmp2228, _mm512_set1_ps(4e+00f), tmp2229);
__m512 tmp2221 = _mm512_fmadd_ps(tmp2208, _mm512_set1_ps(1.6e+01f), tmp2209);
__m512 tmp2241 = _mm512_fmadd_ps(tmp2228, _mm512_set1_ps(1.6e+01f), tmp2229);
// x0 enters only y0; x7 enters only y5.
__m512 tmp2206 = _mm512_add_ps(tmp2207, tmp2149);
__m512 tmp2226 = _mm512_add_ps(tmp2227, tmp2157);
__m512 tmp2223 = _mm512_add_ps(tmp2224, tmp2156);
__m512 tmp2243 = _mm512_add_ps(tmp2244, tmp2204);
// y0, y2, y5, y4 complete (in that order).
__m512 tmp2205 = _mm512_fmadd_ps(tmp2210, _mm512_set1_ps(3.2e+01f), tmp2206);
__m512 tmp2225 = _mm512_fmadd_ps(tmp2230, _mm512_set1_ps(3.2e+01f), tmp2226);
__m512 tmp2216 = _mm512_fmadd_ps(tmp2210, _mm512_set1_ps(8e+00f), tmp2217);
__m512 tmp2236 = _mm512_fmadd_ps(tmp2230, _mm512_set1_ps(8e+00f), tmp2237);
__m512 tmp2222 = _mm512_fmadd_ps(tmp2214, _mm512_set1_ps(3.2e+01f), tmp2223);
__m512 tmp2242 = _mm512_fmadd_ps(tmp2234, _mm512_set1_ps(3.2e+01f), tmp2243);
__m512 tmp2220 = _mm512_fmadd_ps(tmp2210, _mm512_set1_ps(2e+00f), tmp2221);
__m512 tmp2240 = _mm512_fmadd_ps(tmp2230, _mm512_set1_ps(2e+00f), tmp2241);
// Name the results: out447..out452 = y0..y5 of set A, out453..out458 = y0..y5 of set B.
__m512 out447 = tmp2205;
__m512 out453 = tmp2225;
__m512 out448 = tmp2211;
__m512 out454 = tmp2231;
__m512 out449 = tmp2216;
__m512 out455 = tmp2236;
__m512 out450 = tmp2218;
__m512 out456 = tmp2238;
__m512 out451 = tmp2220;
__m512 out457 = tmp2240;
__m512 out452 = tmp2222;
__m512 out458 = tmp2242;
// Clamp every result vector at zero (elementwise max with 0, i.e. ReLU).
out447 = _mm512_max_ps(_mm512_setzero_ps(), out447);
out453 = _mm512_max_ps(_mm512_setzero_ps(), out453);
out448 = _mm512_max_ps(_mm512_setzero_ps(), out448);
out454 = _mm512_max_ps(_mm512_setzero_ps(), out454);
out449 = _mm512_max_ps(_mm512_setzero_ps(), out449);
out455 = _mm512_max_ps(_mm512_setzero_ps(), out455);
out450 = _mm512_max_ps(_mm512_setzero_ps(), out450);
out456 = _mm512_max_ps(_mm512_setzero_ps(), out456);
out451 = _mm512_max_ps(_mm512_setzero_ps(), out451);
out457 = _mm512_max_ps(_mm512_setzero_ps(), out457);
out452 = _mm512_max_ps(_mm512_setzero_ps(), out452);
out458 = _mm512_max_ps(_mm512_setzero_ps(), out458);
// Masked unaligned stores to datPtr6. Mask 4095 (0x0FFF) writes lanes 0..11 only;
// lanes 12..15 are left untouched in memory.
// All twelve addresses share 50432*i19+224*toH20+4*toW20+50432*k61+25216*l13 and
// differ only in the leading constant:
//   out447..out452 -> 12656, 12880, ..., 13776 (step 224)
//   out453..out458 -> 12704, 12928, ..., 13824 (step 224, each 48 past its partner)
// The step of 224 equals the multiplier applied to toH20.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out447);
_mm512_mask_storeu_ps(datPtr6+12704+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out453);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out448);
_mm512_mask_storeu_ps(datPtr6+12928+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out454);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out449);
_mm512_mask_storeu_ps(datPtr6+13152+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out455);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out450);
_mm512_mask_storeu_ps(datPtr6+13376+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out456);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out451);
_mm512_mask_storeu_ps(datPtr6+13600+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out457);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out452);
_mm512_mask_storeu_ps(datPtr6+13824+50432*i19+224*toH20+4*toW20+50432*k61+25216*l13, 4095, out458);
}
}
++j13;
rel10 = 1;
}
// Index setup for the loop nest that follows.
// toH21 is base10 plus a zero offset and toW21 is fixed at 36; the store
// addresses in that nest scale them as 224*toH21 and 4*toW21.
// k62 starts at w33; the loop over k62 runs until it reaches 1.
ptrdiff_t toH21 = base10+0;
ptrdiff_t toW21 = 36;
ptrdiff_t k62 = 1*w33;
for (; k62 != 1; ++k62) {
ptrdiff_t l14 = 0;
for (; l14 != 2; ++l14) {
// Load sixteen vectors from sfPtr5 and regroup their 128-bit lanes into in412..in427.
// Every load shares the index term 102400*i19+1536*j13+1536*k62+768*l14; only the
// leading constant differs. The constants form four bases 25600 apart
// (0, 25600, 51200, 76800), and from each base loads are taken at +0, +64, +128, +192.
// The +0/+128 loads are paired, as are the +64/+192 loads. For a pair (a, b):
//   imm 68  -> { a.lane0, a.lane1, b.lane0, b.lane1 }  (low 256 bits of a, then of b)
//   imm 238 -> { a.lane2, a.lane3, b.lane2, b.lane3 }  (high 256 bits of a, then of b)
// The +0/+128 pairs produce in412..in419; the +64/+192 pairs produce in420..in427.

// Base 0.
__m512 sf49 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf50 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in412 = _mm512_shuffle_f32x4(sf49, sf50, 68);
__m512 in413 = _mm512_shuffle_f32x4(sf49, sf50, 238);
__m512 sf51 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf52 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in420 = _mm512_shuffle_f32x4(sf51, sf52, 68);
__m512 in421 = _mm512_shuffle_f32x4(sf51, sf52, 238);
// Base 25600.
__m512 sf53 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf54 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in414 = _mm512_shuffle_f32x4(sf53, sf54, 68);
__m512 in415 = _mm512_shuffle_f32x4(sf53, sf54, 238);
__m512 sf55 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf56 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in422 = _mm512_shuffle_f32x4(sf55, sf56, 68);
__m512 in423 = _mm512_shuffle_f32x4(sf55, sf56, 238);
// Base 51200.
__m512 sf57 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf58 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in416 = _mm512_shuffle_f32x4(sf57, sf58, 68);
__m512 in417 = _mm512_shuffle_f32x4(sf57, sf58, 238);
__m512 sf59 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf60 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in424 = _mm512_shuffle_f32x4(sf59, sf60, 68);
__m512 in425 = _mm512_shuffle_f32x4(sf59, sf60, 238);
// Base 76800.
__m512 sf61 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf62 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in418 = _mm512_shuffle_f32x4(sf61, sf62, 68);
__m512 in419 = _mm512_shuffle_f32x4(sf61, sf62, 238);
__m512 sf63 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf64 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in426 = _mm512_shuffle_f32x4(sf63, sf64, 68);
__m512 in427 = _mm512_shuffle_f32x4(sf63, sf64, 238);
// First pass: six-output linear combination of eight input vectors, evaluated
// for two independent input sets whose statements are interleaved line by line.
//   set A: x0..x7 = in412..in419
//   set B: x0..x7 = in420..in427
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   y0 = x0 + s12 + s34 + 32*s56
//   y1 = d12 +  2*d34 + 16*d56
//   y2 = s12 +  4*s34 +  8*s56
//   y3 = d12 +  8*d34 +  4*d56
//   y4 = s12 + 16*s34 +  2*s56
//   y5 = x7 + d12 + 32*d34 + d56
// Set A results are copied to tmp2285..tmp2290 (y0..y5), set B to tmp2291..tmp2296.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern resembles a Winograd
// output transform - confirm against the generator before relying on that.

// Pairwise sums and differences (s12, s34, d34, d12, s56, d56).
__m512 tmp2301 = _mm512_add_ps(in413, in414);
__m512 tmp2321 = _mm512_add_ps(in421, in422);
__m512 tmp2300 = _mm512_add_ps(in415, in416);
__m512 tmp2320 = _mm512_add_ps(in423, in424);
__m512 tmp2306 = _mm512_sub_ps(in415, in416);
__m512 tmp2326 = _mm512_sub_ps(in423, in424);
__m512 tmp2305 = _mm512_sub_ps(in413, in414);
__m512 tmp2325 = _mm512_sub_ps(in421, in422);
__m512 tmp2302 = _mm512_add_ps(in417, in418);
__m512 tmp2322 = _mm512_add_ps(in425, in426);
__m512 tmp2307 = _mm512_sub_ps(in417, in418);
__m512 tmp2327 = _mm512_sub_ps(in425, in426);
// Partial results, accumulated with fused multiply-adds (a*b+c).
__m512 tmp2304 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(2e+00f), tmp2305);
__m512 tmp2324 = _mm512_fmadd_ps(tmp2326, _mm512_set1_ps(2e+00f), tmp2325);
__m512 tmp2311 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(8e+00f), tmp2305);
__m512 tmp2331 = _mm512_fmadd_ps(tmp2326, _mm512_set1_ps(8e+00f), tmp2325);
__m512 tmp2299 = _mm512_add_ps(tmp2300, tmp2301);
__m512 tmp2319 = _mm512_add_ps(tmp2320, tmp2321);
// y1 complete.
__m512 tmp2303 = _mm512_fmadd_ps(tmp2307, _mm512_set1_ps(1.6e+01f), tmp2304);
__m512 tmp2323 = _mm512_fmadd_ps(tmp2327, _mm512_set1_ps(1.6e+01f), tmp2324);
// y3 complete.
__m512 tmp2310 = _mm512_fmadd_ps(tmp2307, _mm512_set1_ps(4e+00f), tmp2311);
__m512 tmp2330 = _mm512_fmadd_ps(tmp2327, _mm512_set1_ps(4e+00f), tmp2331);
__m512 tmp2316 = _mm512_add_ps(tmp2307, tmp2305);
__m512 tmp2336 = _mm512_add_ps(tmp2327, tmp2325);
__m512 tmp2309 = _mm512_fmadd_ps(tmp2300, _mm512_set1_ps(4e+00f), tmp2301);
__m512 tmp2329 = _mm512_fmadd_ps(tmp2320, _mm512_set1_ps(4e+00f), tmp2321);
__m512 tmp2313 = _mm512_fmadd_ps(tmp2300, _mm512_set1_ps(1.6e+01f), tmp2301);
__m512 tmp2333 = _mm512_fmadd_ps(tmp2320, _mm512_set1_ps(1.6e+01f), tmp2321);
// x0 enters only y0; x7 enters only y5.
__m512 tmp2298 = _mm512_add_ps(tmp2299, in412);
__m512 tmp2318 = _mm512_add_ps(tmp2319, in420);
__m512 tmp2315 = _mm512_add_ps(tmp2316, in419);
__m512 tmp2335 = _mm512_add_ps(tmp2336, in427);
// y0, y2, y5, y4 complete (in that order).
__m512 tmp2297 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(3.2e+01f), tmp2298);
__m512 tmp2317 = _mm512_fmadd_ps(tmp2322, _mm512_set1_ps(3.2e+01f), tmp2318);
__m512 tmp2308 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(8e+00f), tmp2309);
__m512 tmp2328 = _mm512_fmadd_ps(tmp2322, _mm512_set1_ps(8e+00f), tmp2329);
__m512 tmp2314 = _mm512_fmadd_ps(tmp2306, _mm512_set1_ps(3.2e+01f), tmp2315);
__m512 tmp2334 = _mm512_fmadd_ps(tmp2326, _mm512_set1_ps(3.2e+01f), tmp2335);
__m512 tmp2312 = _mm512_fmadd_ps(tmp2302, _mm512_set1_ps(2e+00f), tmp2313);
__m512 tmp2332 = _mm512_fmadd_ps(tmp2322, _mm512_set1_ps(2e+00f), tmp2333);
// Gather the twelve results under consecutive names for the transpose that follows:
// tmp2285..tmp2290 = y0..y5 of set A, tmp2291..tmp2296 = y0..y5 of set B.
__m512 tmp2285 = tmp2297;
__m512 tmp2291 = tmp2317;
__m512 tmp2286 = tmp2303;
__m512 tmp2292 = tmp2323;
__m512 tmp2287 = tmp2308;
__m512 tmp2293 = tmp2328;
__m512 tmp2288 = tmp2310;
__m512 tmp2294 = tmp2330;
__m512 tmp2289 = tmp2312;
__m512 tmp2295 = tmp2332;
__m512 tmp2290 = tmp2314;
__m512 tmp2296 = tmp2334;
// Transpose: treat tmp2285..tmp2296 as a 12x16 matrix (one 16-float vector per
// row, rows 0..11 in that order) and produce its sixteen columns.
// On exit:
//   tmp2285..tmp2292                       = columns 0..7
//   tmp2293..tmp2296, tmp2337..tmp2340     = columns 8..15
// In every result, lanes 0..11 hold rows 0..11; lanes 12..15 repeat rows 8..11,
// because the rows-8..11 intermediates are shuffled against themselves.

// Step 1: interleave adjacent row pairs inside each 128-bit lane.
__m512 tmp2381 = _mm512_unpacklo_ps(tmp2285, tmp2286);
__m512 tmp2382 = _mm512_unpackhi_ps(tmp2285, tmp2286);
__m512 tmp2383 = _mm512_unpacklo_ps(tmp2287, tmp2288);
__m512 tmp2384 = _mm512_unpackhi_ps(tmp2287, tmp2288);
__m512 tmp2385 = _mm512_unpacklo_ps(tmp2289, tmp2290);
__m512 tmp2386 = _mm512_unpackhi_ps(tmp2289, tmp2290);
__m512 tmp2387 = _mm512_unpacklo_ps(tmp2291, tmp2292);
__m512 tmp2388 = _mm512_unpackhi_ps(tmp2291, tmp2292);
__m512 tmp2389 = _mm512_unpacklo_ps(tmp2293, tmp2294);
__m512 tmp2390 = _mm512_unpackhi_ps(tmp2293, tmp2294);
__m512 tmp2391 = _mm512_unpacklo_ps(tmp2295, tmp2296);
__m512 tmp2392 = _mm512_unpackhi_ps(tmp2295, tmp2296);
// Step 2: merge pairs of pairs so each 128-bit lane holds one column element
// from four consecutive rows (imm 68 keeps elements 0,1 of each operand, 238 keeps 2,3).
__m512 tmp2393 = _mm512_shuffle_ps(tmp2381, tmp2383, 68);
__m512 tmp2394 = _mm512_shuffle_ps(tmp2381, tmp2383, 238);
__m512 tmp2395 = _mm512_shuffle_ps(tmp2382, tmp2384, 68);
__m512 tmp2396 = _mm512_shuffle_ps(tmp2382, tmp2384, 238);
__m512 tmp2397 = _mm512_shuffle_ps(tmp2385, tmp2387, 68);
__m512 tmp2398 = _mm512_shuffle_ps(tmp2385, tmp2387, 238);
__m512 tmp2399 = _mm512_shuffle_ps(tmp2386, tmp2388, 68);
__m512 tmp2400 = _mm512_shuffle_ps(tmp2386, tmp2388, 238);
__m512 tmp2401 = _mm512_shuffle_ps(tmp2389, tmp2391, 68);
__m512 tmp2402 = _mm512_shuffle_ps(tmp2389, tmp2391, 238);
__m512 tmp2403 = _mm512_shuffle_ps(tmp2390, tmp2392, 68);
__m512 tmp2404 = _mm512_shuffle_ps(tmp2390, tmp2392, 238);
// Step 3: rearrange whole 128-bit lanes (imm 136 takes lanes 0 and 2 of each
// operand, imm 221 takes lanes 1 and 3). Rows 0..3 are combined with rows 4..7.
__m512 tmp2405 = _mm512_shuffle_f32x4(tmp2393, tmp2397, 136);
__m512 tmp2406 = _mm512_shuffle_f32x4(tmp2393, tmp2397, 221);
__m512 tmp2407 = _mm512_shuffle_f32x4(tmp2394, tmp2398, 136);
__m512 tmp2408 = _mm512_shuffle_f32x4(tmp2394, tmp2398, 221);
__m512 tmp2409 = _mm512_shuffle_f32x4(tmp2395, tmp2399, 136);
__m512 tmp2410 = _mm512_shuffle_f32x4(tmp2395, tmp2399, 221);
__m512 tmp2411 = _mm512_shuffle_f32x4(tmp2396, tmp2400, 136);
__m512 tmp2412 = _mm512_shuffle_f32x4(tmp2396, tmp2400, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself.
__m512 tmp2413 = _mm512_shuffle_f32x4(tmp2401, tmp2401, 136);
__m512 tmp2414 = _mm512_shuffle_f32x4(tmp2401, tmp2401, 221);
__m512 tmp2415 = _mm512_shuffle_f32x4(tmp2402, tmp2402, 136);
__m512 tmp2416 = _mm512_shuffle_f32x4(tmp2402, tmp2402, 221);
__m512 tmp2417 = _mm512_shuffle_f32x4(tmp2403, tmp2403, 136);
__m512 tmp2418 = _mm512_shuffle_f32x4(tmp2403, tmp2403, 221);
__m512 tmp2419 = _mm512_shuffle_f32x4(tmp2404, tmp2404, 136);
__m512 tmp2420 = _mm512_shuffle_f32x4(tmp2404, tmp2404, 221);
// Step 4: final lane selection. The row variables tmp2285..tmp2296 are reused for
// columns 0..11; columns 12..15 get fresh names tmp2337..tmp2340.
tmp2285 = _mm512_shuffle_f32x4(tmp2405, tmp2413, 136);
tmp2293 = _mm512_shuffle_f32x4(tmp2405, tmp2413, 221);
tmp2286 = _mm512_shuffle_f32x4(tmp2407, tmp2415, 136);
tmp2294 = _mm512_shuffle_f32x4(tmp2407, tmp2415, 221);
tmp2287 = _mm512_shuffle_f32x4(tmp2409, tmp2417, 136);
tmp2295 = _mm512_shuffle_f32x4(tmp2409, tmp2417, 221);
tmp2288 = _mm512_shuffle_f32x4(tmp2411, tmp2419, 136);
tmp2296 = _mm512_shuffle_f32x4(tmp2411, tmp2419, 221);
tmp2289 = _mm512_shuffle_f32x4(tmp2406, tmp2414, 136);
__m512 tmp2337 = _mm512_shuffle_f32x4(tmp2406, tmp2414, 221);
tmp2290 = _mm512_shuffle_f32x4(tmp2408, tmp2416, 136);
__m512 tmp2338 = _mm512_shuffle_f32x4(tmp2408, tmp2416, 221);
tmp2291 = _mm512_shuffle_f32x4(tmp2410, tmp2418, 136);
__m512 tmp2339 = _mm512_shuffle_f32x4(tmp2410, tmp2418, 221);
tmp2292 = _mm512_shuffle_f32x4(tmp2412, tmp2420, 136);
__m512 tmp2340 = _mm512_shuffle_f32x4(tmp2412, tmp2420, 221);
// Second pass: the same six-output linear combination of eight inputs, now
// applied to the transposed vectors. Two input sets are interleaved line by line.
//   set A: x0..x7 = tmp2285..tmp2292
//   set B: x0..x7 = tmp2293..tmp2296, tmp2337..tmp2340
// With s12=x1+x2, d12=x1-x2, s34=x3+x4, d34=x3-x4, s56=x5+x6, d56=x5-x6:
//   y0 = x0 + s12 + s34 + 32*s56
//   y1 = d12 +  2*d34 + 16*d56
//   y2 = s12 +  4*s34 +  8*s56
//   y3 = d12 +  8*d34 +  4*d56
//   y4 = s12 + 16*s34 +  2*s56
//   y5 = x7 + d12 + 32*d34 + d56
// Set A results land in out459..out464 (y0..y5), set B in out465..out470.

// Pairwise sums and differences (s12, s34, d34, d12, s56, d56).
__m512 tmp2345 = _mm512_add_ps(tmp2286, tmp2287);
__m512 tmp2365 = _mm512_add_ps(tmp2294, tmp2295);
__m512 tmp2344 = _mm512_add_ps(tmp2288, tmp2289);
__m512 tmp2364 = _mm512_add_ps(tmp2296, tmp2337);
__m512 tmp2350 = _mm512_sub_ps(tmp2288, tmp2289);
__m512 tmp2370 = _mm512_sub_ps(tmp2296, tmp2337);
__m512 tmp2349 = _mm512_sub_ps(tmp2286, tmp2287);
__m512 tmp2369 = _mm512_sub_ps(tmp2294, tmp2295);
__m512 tmp2346 = _mm512_add_ps(tmp2290, tmp2291);
__m512 tmp2366 = _mm512_add_ps(tmp2338, tmp2339);
__m512 tmp2351 = _mm512_sub_ps(tmp2290, tmp2291);
__m512 tmp2371 = _mm512_sub_ps(tmp2338, tmp2339);
// Partial results, accumulated with fused multiply-adds (a*b+c).
__m512 tmp2348 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(2e+00f), tmp2349);
__m512 tmp2368 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(2e+00f), tmp2369);
__m512 tmp2355 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(8e+00f), tmp2349);
__m512 tmp2375 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(8e+00f), tmp2369);
__m512 tmp2343 = _mm512_add_ps(tmp2344, tmp2345);
__m512 tmp2363 = _mm512_add_ps(tmp2364, tmp2365);
// y1 complete.
__m512 tmp2347 = _mm512_fmadd_ps(tmp2351, _mm512_set1_ps(1.6e+01f), tmp2348);
__m512 tmp2367 = _mm512_fmadd_ps(tmp2371, _mm512_set1_ps(1.6e+01f), tmp2368);
// y3 complete.
__m512 tmp2354 = _mm512_fmadd_ps(tmp2351, _mm512_set1_ps(4e+00f), tmp2355);
__m512 tmp2374 = _mm512_fmadd_ps(tmp2371, _mm512_set1_ps(4e+00f), tmp2375);
__m512 tmp2360 = _mm512_add_ps(tmp2351, tmp2349);
__m512 tmp2380 = _mm512_add_ps(tmp2371, tmp2369);
__m512 tmp2353 = _mm512_fmadd_ps(tmp2344, _mm512_set1_ps(4e+00f), tmp2345);
__m512 tmp2373 = _mm512_fmadd_ps(tmp2364, _mm512_set1_ps(4e+00f), tmp2365);
__m512 tmp2357 = _mm512_fmadd_ps(tmp2344, _mm512_set1_ps(1.6e+01f), tmp2345);
__m512 tmp2377 = _mm512_fmadd_ps(tmp2364, _mm512_set1_ps(1.6e+01f), tmp2365);
// x0 enters only y0; x7 enters only y5.
__m512 tmp2342 = _mm512_add_ps(tmp2343, tmp2285);
__m512 tmp2362 = _mm512_add_ps(tmp2363, tmp2293);
__m512 tmp2359 = _mm512_add_ps(tmp2360, tmp2292);
__m512 tmp2379 = _mm512_add_ps(tmp2380, tmp2340);
// y0, y2, y5, y4 complete (in that order).
__m512 tmp2341 = _mm512_fmadd_ps(tmp2346, _mm512_set1_ps(3.2e+01f), tmp2342);
__m512 tmp2361 = _mm512_fmadd_ps(tmp2366, _mm512_set1_ps(3.2e+01f), tmp2362);
__m512 tmp2352 = _mm512_fmadd_ps(tmp2346, _mm512_set1_ps(8e+00f), tmp2353);
__m512 tmp2372 = _mm512_fmadd_ps(tmp2366, _mm512_set1_ps(8e+00f), tmp2373);
__m512 tmp2358 = _mm512_fmadd_ps(tmp2350, _mm512_set1_ps(3.2e+01f), tmp2359);
__m512 tmp2378 = _mm512_fmadd_ps(tmp2370, _mm512_set1_ps(3.2e+01f), tmp2379);
__m512 tmp2356 = _mm512_fmadd_ps(tmp2346, _mm512_set1_ps(2e+00f), tmp2357);
__m512 tmp2376 = _mm512_fmadd_ps(tmp2366, _mm512_set1_ps(2e+00f), tmp2377);
// Name the results: out459..out464 = y0..y5 of set A, out465..out470 = y0..y5 of set B.
__m512 out459 = tmp2341;
__m512 out465 = tmp2361;
__m512 out460 = tmp2347;
__m512 out466 = tmp2367;
__m512 out461 = tmp2352;
__m512 out467 = tmp2372;
__m512 out462 = tmp2354;
__m512 out468 = tmp2374;
__m512 out463 = tmp2356;
__m512 out469 = tmp2376;
__m512 out464 = tmp2358;
__m512 out470 = tmp2378;
// Clamp every result vector at zero (elementwise max with 0, i.e. ReLU).
out459 = _mm512_max_ps(_mm512_setzero_ps(), out459);
out465 = _mm512_max_ps(_mm512_setzero_ps(), out465);
out460 = _mm512_max_ps(_mm512_setzero_ps(), out460);
out466 = _mm512_max_ps(_mm512_setzero_ps(), out466);
out461 = _mm512_max_ps(_mm512_setzero_ps(), out461);
out467 = _mm512_max_ps(_mm512_setzero_ps(), out467);
out462 = _mm512_max_ps(_mm512_setzero_ps(), out462);
out468 = _mm512_max_ps(_mm512_setzero_ps(), out468);
out463 = _mm512_max_ps(_mm512_setzero_ps(), out463);
out469 = _mm512_max_ps(_mm512_setzero_ps(), out469);
out464 = _mm512_max_ps(_mm512_setzero_ps(), out464);
out470 = _mm512_max_ps(_mm512_setzero_ps(), out470);
// Masked unaligned stores to datPtr6. The two result sets use different masks:
//   out459..out464: mask 4095 (0x0FFF) writes lanes 0..11, constants 0, 224, ..., 1120
//   out465..out470: mask 255  (0x00FF) writes lanes 0..7,  constants 48, 272, ..., 1168
// Lanes outside the mask are left untouched in memory.
// All addresses share 50432*i19+224*toH21+4*toW21+50432*k62+25216*l14; the step of
// 224 between successive constants equals the multiplier applied to toH21.
// NOTE(review): the narrower 8-lane mask on the second set presumably trims the
// write at the edge of a row - confirm against the tensor width.
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out459);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out465);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out460);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out466);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out461);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out467);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out462);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out468);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out463);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out469);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out464);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out470);
// Load sixteen vectors from sfPtr5 and regroup their 128-bit lanes into in428..in443.
// Every load shares the index term 102400*i19+1536*j13+1536*k62+768*l14; only the
// leading constant differs. The constants form four bases 25600 apart
// (256, 25856, 51456, 77056), and from each base loads are taken at +0, +64, +128, +192.
// The +0/+128 loads are paired, as are the +64/+192 loads. For a pair (a, b):
//   imm 68  -> { a.lane0, a.lane1, b.lane0, b.lane1 }  (low 256 bits of a, then of b)
//   imm 238 -> { a.lane2, a.lane3, b.lane2, b.lane3 }  (high 256 bits of a, then of b)
// The +0/+128 pairs produce in428..in435; the +64/+192 pairs produce in436..in443.

// Base 256.
__m512 sf65 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf66 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in428 = _mm512_shuffle_f32x4(sf65, sf66, 68);
__m512 in429 = _mm512_shuffle_f32x4(sf65, sf66, 238);
__m512 sf67 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf68 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in436 = _mm512_shuffle_f32x4(sf67, sf68, 68);
__m512 in437 = _mm512_shuffle_f32x4(sf67, sf68, 238);
// Base 25856.
__m512 sf69 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf70 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in430 = _mm512_shuffle_f32x4(sf69, sf70, 68);
__m512 in431 = _mm512_shuffle_f32x4(sf69, sf70, 238);
__m512 sf71 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf72 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in438 = _mm512_shuffle_f32x4(sf71, sf72, 68);
__m512 in439 = _mm512_shuffle_f32x4(sf71, sf72, 238);
// Base 51456.
__m512 sf73 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf74 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in432 = _mm512_shuffle_f32x4(sf73, sf74, 68);
__m512 in433 = _mm512_shuffle_f32x4(sf73, sf74, 238);
__m512 sf75 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf76 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in440 = _mm512_shuffle_f32x4(sf75, sf76, 68);
__m512 in441 = _mm512_shuffle_f32x4(sf75, sf76, 238);
// Base 77056.
__m512 sf77 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf78 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in434 = _mm512_shuffle_f32x4(sf77, sf78, 68);
__m512 in435 = _mm512_shuffle_f32x4(sf77, sf78, 238);
__m512 sf79 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf80 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in442 = _mm512_shuffle_f32x4(sf79, sf80, 68);
__m512 in443 = _mm512_shuffle_f32x4(sf79, sf80, 238);
__m512 tmp2437 = _mm512_add_ps(in429, in430);
__m512 tmp2457 = _mm512_add_ps(in437, in438);
__m512 tmp2436 = _mm512_add_ps(in431, in432);
__m512 tmp2456 = _mm512_add_ps(in439, in440);
__m512 tmp2442 = _mm512_sub_ps(in431, in432);
__m512 tmp2462 = _mm512_sub_ps(in439, in440);
__m512 tmp2441 = _mm512_sub_ps(in429, in430);
__m512 tmp2461 = _mm512_sub_ps(in437, in438);
__m512 tmp2438 = _mm512_add_ps(in433, in434);
__m512 tmp2458 = _mm512_add_ps(in441, in442);
__m512 tmp2443 = _mm512_sub_ps(in433, in434);
__m512 tmp2463 = _mm512_sub_ps(in441, in442);
__m512 tmp2440 = _mm512_fmadd_ps(tmp2442, _mm512_set1_ps(2e+00f), tmp2441);
__m512 tmp2460 = _mm512_fmadd_ps(tmp2462, _mm512_set1_ps(2e+00f), tmp2461);
__m512 tmp2447 = _mm512_fmadd_ps(tmp2442, _mm512_set1_ps(8e+00f), tmp2441);
__m512 tmp2467 = _mm512_fmadd_ps(tmp2462, _mm512_set1_ps(8e+00f), tmp2461);
__m512 tmp2435 = _mm512_add_ps(tmp2436, tmp2437);
__m512 tmp2455 = _mm512_add_ps(tmp2456, tmp2457);
__m512 tmp2439 = _mm512_fmadd_ps(tmp2443, _mm512_set1_ps(1.6e+01f), tmp2440);
__m512 tmp2459 = _mm512_fmadd_ps(tmp2463, _mm512_set1_ps(1.6e+01f), tmp2460);
__m512 tmp2446 = _mm512_fmadd_ps(tmp2443, _mm512_set1_ps(4e+00f), tmp2447);
__m512 tmp2466 = _mm512_fmadd_ps(tmp2463, _mm512_set1_ps(4e+00f), tmp2467);
__m512 tmp2452 = _mm512_add_ps(tmp2443, tmp2441);
__m512 tmp2472 = _mm512_add_ps(tmp2463, tmp2461);
__m512 tmp2445 = _mm512_fmadd_ps(tmp2436, _mm512_set1_ps(4e+00f), tmp2437);
__m512 tmp2465 = _mm512_fmadd_ps(tmp2456, _mm512_set1_ps(4e+00f), tmp2457);
__m512 tmp2449 = _mm512_fmadd_ps(tmp2436, _mm512_set1_ps(1.6e+01f), tmp2437);
__m512 tmp2469 = _mm512_fmadd_ps(tmp2456, _mm512_set1_ps(1.6e+01f), tmp2457);
__m512 tmp2434 = _mm512_add_ps(tmp2435, in428);
__m512 tmp2454 = _mm512_add_ps(tmp2455, in436);
__m512 tmp2451 = _mm512_add_ps(tmp2452, in435);
__m512 tmp2471 = _mm512_add_ps(tmp2472, in443);
__m512 tmp2433 = _mm512_fmadd_ps(tmp2438, _mm512_set1_ps(3.2e+01f), tmp2434);
__m512 tmp2453 = _mm512_fmadd_ps(tmp2458, _mm512_set1_ps(3.2e+01f), tmp2454);
__m512 tmp2444 = _mm512_fmadd_ps(tmp2438, _mm512_set1_ps(8e+00f), tmp2445);
__m512 tmp2464 = _mm512_fmadd_ps(tmp2458, _mm512_set1_ps(8e+00f), tmp2465);
__m512 tmp2450 = _mm512_fmadd_ps(tmp2442, _mm512_set1_ps(3.2e+01f), tmp2451);
__m512 tmp2470 = _mm512_fmadd_ps(tmp2462, _mm512_set1_ps(3.2e+01f), tmp2471);
__m512 tmp2448 = _mm512_fmadd_ps(tmp2438, _mm512_set1_ps(2e+00f), tmp2449);
__m512 tmp2468 = _mm512_fmadd_ps(tmp2458, _mm512_set1_ps(2e+00f), tmp2469);
__m512 tmp2421 = tmp2433;
__m512 tmp2427 = tmp2453;
__m512 tmp2422 = tmp2439;
__m512 tmp2428 = tmp2459;
__m512 tmp2423 = tmp2444;
__m512 tmp2429 = tmp2464;
__m512 tmp2424 = tmp2446;
__m512 tmp2430 = tmp2466;
__m512 tmp2425 = tmp2448;
__m512 tmp2431 = tmp2468;
__m512 tmp2426 = tmp2450;
__m512 tmp2432 = tmp2470;
__m512 tmp2517 = _mm512_unpacklo_ps(tmp2421, tmp2422);
__m512 tmp2518 = _mm512_unpackhi_ps(tmp2421, tmp2422);
__m512 tmp2519 = _mm512_unpacklo_ps(tmp2423, tmp2424);
__m512 tmp2520 = _mm512_unpackhi_ps(tmp2423, tmp2424);
__m512 tmp2521 = _mm512_unpacklo_ps(tmp2425, tmp2426);
__m512 tmp2522 = _mm512_unpackhi_ps(tmp2425, tmp2426);
__m512 tmp2523 = _mm512_unpacklo_ps(tmp2427, tmp2428);
__m512 tmp2524 = _mm512_unpackhi_ps(tmp2427, tmp2428);
__m512 tmp2525 = _mm512_unpacklo_ps(tmp2429, tmp2430);
__m512 tmp2526 = _mm512_unpackhi_ps(tmp2429, tmp2430);
__m512 tmp2527 = _mm512_unpacklo_ps(tmp2431, tmp2432);
__m512 tmp2528 = _mm512_unpackhi_ps(tmp2431, tmp2432);
__m512 tmp2529 = _mm512_shuffle_ps(tmp2517, tmp2519, 68);
__m512 tmp2530 = _mm512_shuffle_ps(tmp2517, tmp2519, 238);
__m512 tmp2531 = _mm512_shuffle_ps(tmp2518, tmp2520, 68);
__m512 tmp2532 = _mm512_shuffle_ps(tmp2518, tmp2520, 238);
__m512 tmp2533 = _mm512_shuffle_ps(tmp2521, tmp2523, 68);
__m512 tmp2534 = _mm512_shuffle_ps(tmp2521, tmp2523, 238);
__m512 tmp2535 = _mm512_shuffle_ps(tmp2522, tmp2524, 68);
__m512 tmp2536 = _mm512_shuffle_ps(tmp2522, tmp2524, 238);
__m512 tmp2537 = _mm512_shuffle_ps(tmp2525, tmp2527, 68);
__m512 tmp2538 = _mm512_shuffle_ps(tmp2525, tmp2527, 238);
__m512 tmp2539 = _mm512_shuffle_ps(tmp2526, tmp2528, 68);
__m512 tmp2540 = _mm512_shuffle_ps(tmp2526, tmp2528, 238);
__m512 tmp2541 = _mm512_shuffle_f32x4(tmp2529, tmp2533, 136);
__m512 tmp2542 = _mm512_shuffle_f32x4(tmp2529, tmp2533, 221);
__m512 tmp2543 = _mm512_shuffle_f32x4(tmp2530, tmp2534, 136);
__m512 tmp2544 = _mm512_shuffle_f32x4(tmp2530, tmp2534, 221);
__m512 tmp2545 = _mm512_shuffle_f32x4(tmp2531, tmp2535, 136);
__m512 tmp2546 = _mm512_shuffle_f32x4(tmp2531, tmp2535, 221);
__m512 tmp2547 = _mm512_shuffle_f32x4(tmp2532, tmp2536, 136);
__m512 tmp2548 = _mm512_shuffle_f32x4(tmp2532, tmp2536, 221);
__m512 tmp2549 = _mm512_shuffle_f32x4(tmp2537, tmp2537, 136);
__m512 tmp2550 = _mm512_shuffle_f32x4(tmp2537, tmp2537, 221);
__m512 tmp2551 = _mm512_shuffle_f32x4(tmp2538, tmp2538, 136);
__m512 tmp2552 = _mm512_shuffle_f32x4(tmp2538, tmp2538, 221);
__m512 tmp2553 = _mm512_shuffle_f32x4(tmp2539, tmp2539, 136);
__m512 tmp2554 = _mm512_shuffle_f32x4(tmp2539, tmp2539, 221);
__m512 tmp2555 = _mm512_shuffle_f32x4(tmp2540, tmp2540, 136);
__m512 tmp2556 = _mm512_shuffle_f32x4(tmp2540, tmp2540, 221);
tmp2421 = _mm512_shuffle_f32x4(tmp2541, tmp2549, 136);
tmp2429 = _mm512_shuffle_f32x4(tmp2541, tmp2549, 221);
tmp2422 = _mm512_shuffle_f32x4(tmp2543, tmp2551, 136);
tmp2430 = _mm512_shuffle_f32x4(tmp2543, tmp2551, 221);
tmp2423 = _mm512_shuffle_f32x4(tmp2545, tmp2553, 136);
tmp2431 = _mm512_shuffle_f32x4(tmp2545, tmp2553, 221);
tmp2424 = _mm512_shuffle_f32x4(tmp2547, tmp2555, 136);
tmp2432 = _mm512_shuffle_f32x4(tmp2547, tmp2555, 221);
tmp2425 = _mm512_shuffle_f32x4(tmp2542, tmp2550, 136);
__m512 tmp2473 = _mm512_shuffle_f32x4(tmp2542, tmp2550, 221);
tmp2426 = _mm512_shuffle_f32x4(tmp2544, tmp2552, 136);
__m512 tmp2474 = _mm512_shuffle_f32x4(tmp2544, tmp2552, 221);
tmp2427 = _mm512_shuffle_f32x4(tmp2546, tmp2554, 136);
__m512 tmp2475 = _mm512_shuffle_f32x4(tmp2546, tmp2554, 221);
tmp2428 = _mm512_shuffle_f32x4(tmp2548, tmp2556, 136);
__m512 tmp2476 = _mm512_shuffle_f32x4(tmp2548, tmp2556, 221);
__m512 tmp2481 = _mm512_add_ps(tmp2422, tmp2423);
__m512 tmp2501 = _mm512_add_ps(tmp2430, tmp2431);
__m512 tmp2480 = _mm512_add_ps(tmp2424, tmp2425);
__m512 tmp2500 = _mm512_add_ps(tmp2432, tmp2473);
__m512 tmp2486 = _mm512_sub_ps(tmp2424, tmp2425);
__m512 tmp2506 = _mm512_sub_ps(tmp2432, tmp2473);
__m512 tmp2485 = _mm512_sub_ps(tmp2422, tmp2423);
__m512 tmp2505 = _mm512_sub_ps(tmp2430, tmp2431);
__m512 tmp2482 = _mm512_add_ps(tmp2426, tmp2427);
__m512 tmp2502 = _mm512_add_ps(tmp2474, tmp2475);
__m512 tmp2487 = _mm512_sub_ps(tmp2426, tmp2427);
__m512 tmp2507 = _mm512_sub_ps(tmp2474, tmp2475);
__m512 tmp2484 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(2e+00f), tmp2485);
__m512 tmp2504 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(2e+00f), tmp2505);
__m512 tmp2491 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(8e+00f), tmp2485);
__m512 tmp2511 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(8e+00f), tmp2505);
__m512 tmp2479 = _mm512_add_ps(tmp2480, tmp2481);
__m512 tmp2499 = _mm512_add_ps(tmp2500, tmp2501);
__m512 tmp2483 = _mm512_fmadd_ps(tmp2487, _mm512_set1_ps(1.6e+01f), tmp2484);
__m512 tmp2503 = _mm512_fmadd_ps(tmp2507, _mm512_set1_ps(1.6e+01f), tmp2504);
__m512 tmp2490 = _mm512_fmadd_ps(tmp2487, _mm512_set1_ps(4e+00f), tmp2491);
__m512 tmp2510 = _mm512_fmadd_ps(tmp2507, _mm512_set1_ps(4e+00f), tmp2511);
__m512 tmp2496 = _mm512_add_ps(tmp2487, tmp2485);
__m512 tmp2516 = _mm512_add_ps(tmp2507, tmp2505);
__m512 tmp2489 = _mm512_fmadd_ps(tmp2480, _mm512_set1_ps(4e+00f), tmp2481);
__m512 tmp2509 = _mm512_fmadd_ps(tmp2500, _mm512_set1_ps(4e+00f), tmp2501);
__m512 tmp2493 = _mm512_fmadd_ps(tmp2480, _mm512_set1_ps(1.6e+01f), tmp2481);
__m512 tmp2513 = _mm512_fmadd_ps(tmp2500, _mm512_set1_ps(1.6e+01f), tmp2501);
__m512 tmp2478 = _mm512_add_ps(tmp2479, tmp2421);
__m512 tmp2498 = _mm512_add_ps(tmp2499, tmp2429);
__m512 tmp2495 = _mm512_add_ps(tmp2496, tmp2428);
__m512 tmp2515 = _mm512_add_ps(tmp2516, tmp2476);
__m512 tmp2477 = _mm512_fmadd_ps(tmp2482, _mm512_set1_ps(3.2e+01f), tmp2478);
__m512 tmp2497 = _mm512_fmadd_ps(tmp2502, _mm512_set1_ps(3.2e+01f), tmp2498);
__m512 tmp2488 = _mm512_fmadd_ps(tmp2482, _mm512_set1_ps(8e+00f), tmp2489);
__m512 tmp2508 = _mm512_fmadd_ps(tmp2502, _mm512_set1_ps(8e+00f), tmp2509);
__m512 tmp2494 = _mm512_fmadd_ps(tmp2486, _mm512_set1_ps(3.2e+01f), tmp2495);
__m512 tmp2514 = _mm512_fmadd_ps(tmp2506, _mm512_set1_ps(3.2e+01f), tmp2515);
__m512 tmp2492 = _mm512_fmadd_ps(tmp2482, _mm512_set1_ps(2e+00f), tmp2493);
__m512 tmp2512 = _mm512_fmadd_ps(tmp2502, _mm512_set1_ps(2e+00f), tmp2513);
__m512 out471 = tmp2477;
__m512 out477 = tmp2497;
__m512 out472 = tmp2483;
__m512 out478 = tmp2503;
__m512 out473 = tmp2488;
__m512 out479 = tmp2508;
__m512 out474 = tmp2490;
__m512 out480 = tmp2510;
__m512 out475 = tmp2492;
__m512 out481 = tmp2512;
__m512 out476 = tmp2494;
__m512 out482 = tmp2514;
out471 = _mm512_max_ps(_mm512_setzero_ps(), out471);
out477 = _mm512_max_ps(_mm512_setzero_ps(), out477);
out472 = _mm512_max_ps(_mm512_setzero_ps(), out472);
out478 = _mm512_max_ps(_mm512_setzero_ps(), out478);
out473 = _mm512_max_ps(_mm512_setzero_ps(), out473);
out479 = _mm512_max_ps(_mm512_setzero_ps(), out479);
out474 = _mm512_max_ps(_mm512_setzero_ps(), out474);
out480 = _mm512_max_ps(_mm512_setzero_ps(), out480);
out475 = _mm512_max_ps(_mm512_setzero_ps(), out475);
out481 = _mm512_max_ps(_mm512_setzero_ps(), out481);
out476 = _mm512_max_ps(_mm512_setzero_ps(), out476);
out482 = _mm512_max_ps(_mm512_setzero_ps(), out482);
_mm512_mask_storeu_ps(datPtr6+1200+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out471);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out477);
_mm512_mask_storeu_ps(datPtr6+1424+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out472);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out478);
_mm512_mask_storeu_ps(datPtr6+1648+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out473);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out479);
_mm512_mask_storeu_ps(datPtr6+1872+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out474);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out480);
_mm512_mask_storeu_ps(datPtr6+2096+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out475);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out481);
_mm512_mask_storeu_ps(datPtr6+2320+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out476);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out482);
__m512 sf81 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf82 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in444 = _mm512_shuffle_f32x4(sf81, sf82, 68);
__m512 in445 = _mm512_shuffle_f32x4(sf81, sf82, 238);
__m512 sf83 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf84 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in452 = _mm512_shuffle_f32x4(sf83, sf84, 68);
__m512 in453 = _mm512_shuffle_f32x4(sf83, sf84, 238);
__m512 sf85 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf86 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in446 = _mm512_shuffle_f32x4(sf85, sf86, 68);
__m512 in447 = _mm512_shuffle_f32x4(sf85, sf86, 238);
__m512 sf87 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf88 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in454 = _mm512_shuffle_f32x4(sf87, sf88, 68);
__m512 in455 = _mm512_shuffle_f32x4(sf87, sf88, 238);
__m512 sf89 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf90 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in448 = _mm512_shuffle_f32x4(sf89, sf90, 68);
__m512 in449 = _mm512_shuffle_f32x4(sf89, sf90, 238);
__m512 sf91 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf92 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in456 = _mm512_shuffle_f32x4(sf91, sf92, 68);
__m512 in457 = _mm512_shuffle_f32x4(sf91, sf92, 238);
__m512 sf93 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf94 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in450 = _mm512_shuffle_f32x4(sf93, sf94, 68);
__m512 in451 = _mm512_shuffle_f32x4(sf93, sf94, 238);
__m512 sf95 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k62+768*l14);
__m512 sf96 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k62+768*l14);
__m512 in458 = _mm512_shuffle_f32x4(sf95, sf96, 68);
__m512 in459 = _mm512_shuffle_f32x4(sf95, sf96, 238);
__m512 tmp2573 = _mm512_add_ps(in445, in446);
__m512 tmp2593 = _mm512_add_ps(in453, in454);
__m512 tmp2572 = _mm512_add_ps(in447, in448);
__m512 tmp2592 = _mm512_add_ps(in455, in456);
__m512 tmp2578 = _mm512_sub_ps(in447, in448);
__m512 tmp2598 = _mm512_sub_ps(in455, in456);
__m512 tmp2577 = _mm512_sub_ps(in445, in446);
__m512 tmp2597 = _mm512_sub_ps(in453, in454);
__m512 tmp2574 = _mm512_add_ps(in449, in450);
__m512 tmp2594 = _mm512_add_ps(in457, in458);
__m512 tmp2579 = _mm512_sub_ps(in449, in450);
__m512 tmp2599 = _mm512_sub_ps(in457, in458);
__m512 tmp2576 = _mm512_fmadd_ps(tmp2578, _mm512_set1_ps(2e+00f), tmp2577);
__m512 tmp2596 = _mm512_fmadd_ps(tmp2598, _mm512_set1_ps(2e+00f), tmp2597);
__m512 tmp2583 = _mm512_fmadd_ps(tmp2578, _mm512_set1_ps(8e+00f), tmp2577);
__m512 tmp2603 = _mm512_fmadd_ps(tmp2598, _mm512_set1_ps(8e+00f), tmp2597);
__m512 tmp2571 = _mm512_add_ps(tmp2572, tmp2573);
__m512 tmp2591 = _mm512_add_ps(tmp2592, tmp2593);
__m512 tmp2575 = _mm512_fmadd_ps(tmp2579, _mm512_set1_ps(1.6e+01f), tmp2576);
__m512 tmp2595 = _mm512_fmadd_ps(tmp2599, _mm512_set1_ps(1.6e+01f), tmp2596);
__m512 tmp2582 = _mm512_fmadd_ps(tmp2579, _mm512_set1_ps(4e+00f), tmp2583);
__m512 tmp2602 = _mm512_fmadd_ps(tmp2599, _mm512_set1_ps(4e+00f), tmp2603);
__m512 tmp2588 = _mm512_add_ps(tmp2579, tmp2577);
__m512 tmp2608 = _mm512_add_ps(tmp2599, tmp2597);
__m512 tmp2581 = _mm512_fmadd_ps(tmp2572, _mm512_set1_ps(4e+00f), tmp2573);
__m512 tmp2601 = _mm512_fmadd_ps(tmp2592, _mm512_set1_ps(4e+00f), tmp2593);
__m512 tmp2585 = _mm512_fmadd_ps(tmp2572, _mm512_set1_ps(1.6e+01f), tmp2573);
__m512 tmp2605 = _mm512_fmadd_ps(tmp2592, _mm512_set1_ps(1.6e+01f), tmp2593);
__m512 tmp2570 = _mm512_add_ps(tmp2571, in444);
__m512 tmp2590 = _mm512_add_ps(tmp2591, in452);
__m512 tmp2587 = _mm512_add_ps(tmp2588, in451);
__m512 tmp2607 = _mm512_add_ps(tmp2608, in459);
__m512 tmp2569 = _mm512_fmadd_ps(tmp2574, _mm512_set1_ps(3.2e+01f), tmp2570);
__m512 tmp2589 = _mm512_fmadd_ps(tmp2594, _mm512_set1_ps(3.2e+01f), tmp2590);
__m512 tmp2580 = _mm512_fmadd_ps(tmp2574, _mm512_set1_ps(8e+00f), tmp2581);
__m512 tmp2600 = _mm512_fmadd_ps(tmp2594, _mm512_set1_ps(8e+00f), tmp2601);
__m512 tmp2586 = _mm512_fmadd_ps(tmp2578, _mm512_set1_ps(3.2e+01f), tmp2587);
__m512 tmp2606 = _mm512_fmadd_ps(tmp2598, _mm512_set1_ps(3.2e+01f), tmp2607);
__m512 tmp2584 = _mm512_fmadd_ps(tmp2574, _mm512_set1_ps(2e+00f), tmp2585);
__m512 tmp2604 = _mm512_fmadd_ps(tmp2594, _mm512_set1_ps(2e+00f), tmp2605);
__m512 tmp2557 = tmp2569;
__m512 tmp2563 = tmp2589;
__m512 tmp2558 = tmp2575;
__m512 tmp2564 = tmp2595;
__m512 tmp2559 = tmp2580;
__m512 tmp2565 = tmp2600;
__m512 tmp2560 = tmp2582;
__m512 tmp2566 = tmp2602;
__m512 tmp2561 = tmp2584;
__m512 tmp2567 = tmp2604;
__m512 tmp2562 = tmp2586;
__m512 tmp2568 = tmp2606;
__m512 tmp2653 = _mm512_unpacklo_ps(tmp2557, tmp2558);
__m512 tmp2654 = _mm512_unpackhi_ps(tmp2557, tmp2558);
__m512 tmp2655 = _mm512_unpacklo_ps(tmp2559, tmp2560);
__m512 tmp2656 = _mm512_unpackhi_ps(tmp2559, tmp2560);
__m512 tmp2657 = _mm512_unpacklo_ps(tmp2561, tmp2562);
__m512 tmp2658 = _mm512_unpackhi_ps(tmp2561, tmp2562);
__m512 tmp2659 = _mm512_unpacklo_ps(tmp2563, tmp2564);
__m512 tmp2660 = _mm512_unpackhi_ps(tmp2563, tmp2564);
__m512 tmp2661 = _mm512_unpacklo_ps(tmp2565, tmp2566);
__m512 tmp2662 = _mm512_unpackhi_ps(tmp2565, tmp2566);
__m512 tmp2663 = _mm512_unpacklo_ps(tmp2567, tmp2568);
__m512 tmp2664 = _mm512_unpackhi_ps(tmp2567, tmp2568);
__m512 tmp2665 = _mm512_shuffle_ps(tmp2653, tmp2655, 68);
__m512 tmp2666 = _mm512_shuffle_ps(tmp2653, tmp2655, 238);
__m512 tmp2667 = _mm512_shuffle_ps(tmp2654, tmp2656, 68);
__m512 tmp2668 = _mm512_shuffle_ps(tmp2654, tmp2656, 238);
__m512 tmp2669 = _mm512_shuffle_ps(tmp2657, tmp2659, 68);
__m512 tmp2670 = _mm512_shuffle_ps(tmp2657, tmp2659, 238);
__m512 tmp2671 = _mm512_shuffle_ps(tmp2658, tmp2660, 68);
__m512 tmp2672 = _mm512_shuffle_ps(tmp2658, tmp2660, 238);
__m512 tmp2673 = _mm512_shuffle_ps(tmp2661, tmp2663, 68);
__m512 tmp2674 = _mm512_shuffle_ps(tmp2661, tmp2663, 238);
__m512 tmp2675 = _mm512_shuffle_ps(tmp2662, tmp2664, 68);
__m512 tmp2676 = _mm512_shuffle_ps(tmp2662, tmp2664, 238);
__m512 tmp2677 = _mm512_shuffle_f32x4(tmp2665, tmp2669, 136);
__m512 tmp2678 = _mm512_shuffle_f32x4(tmp2665, tmp2669, 221);
__m512 tmp2679 = _mm512_shuffle_f32x4(tmp2666, tmp2670, 136);
__m512 tmp2680 = _mm512_shuffle_f32x4(tmp2666, tmp2670, 221);
__m512 tmp2681 = _mm512_shuffle_f32x4(tmp2667, tmp2671, 136);
__m512 tmp2682 = _mm512_shuffle_f32x4(tmp2667, tmp2671, 221);
__m512 tmp2683 = _mm512_shuffle_f32x4(tmp2668, tmp2672, 136);
__m512 tmp2684 = _mm512_shuffle_f32x4(tmp2668, tmp2672, 221);
__m512 tmp2685 = _mm512_shuffle_f32x4(tmp2673, tmp2673, 136);
__m512 tmp2686 = _mm512_shuffle_f32x4(tmp2673, tmp2673, 221);
__m512 tmp2687 = _mm512_shuffle_f32x4(tmp2674, tmp2674, 136);
__m512 tmp2688 = _mm512_shuffle_f32x4(tmp2674, tmp2674, 221);
__m512 tmp2689 = _mm512_shuffle_f32x4(tmp2675, tmp2675, 136);
__m512 tmp2690 = _mm512_shuffle_f32x4(tmp2675, tmp2675, 221);
__m512 tmp2691 = _mm512_shuffle_f32x4(tmp2676, tmp2676, 136);
__m512 tmp2692 = _mm512_shuffle_f32x4(tmp2676, tmp2676, 221);
tmp2557 = _mm512_shuffle_f32x4(tmp2677, tmp2685, 136);
tmp2565 = _mm512_shuffle_f32x4(tmp2677, tmp2685, 221);
tmp2558 = _mm512_shuffle_f32x4(tmp2679, tmp2687, 136);
tmp2566 = _mm512_shuffle_f32x4(tmp2679, tmp2687, 221);
tmp2559 = _mm512_shuffle_f32x4(tmp2681, tmp2689, 136);
tmp2567 = _mm512_shuffle_f32x4(tmp2681, tmp2689, 221);
tmp2560 = _mm512_shuffle_f32x4(tmp2683, tmp2691, 136);
tmp2568 = _mm512_shuffle_f32x4(tmp2683, tmp2691, 221);
tmp2561 = _mm512_shuffle_f32x4(tmp2678, tmp2686, 136);
__m512 tmp2609 = _mm512_shuffle_f32x4(tmp2678, tmp2686, 221);
tmp2562 = _mm512_shuffle_f32x4(tmp2680, tmp2688, 136);
__m512 tmp2610 = _mm512_shuffle_f32x4(tmp2680, tmp2688, 221);
tmp2563 = _mm512_shuffle_f32x4(tmp2682, tmp2690, 136);
__m512 tmp2611 = _mm512_shuffle_f32x4(tmp2682, tmp2690, 221);
tmp2564 = _mm512_shuffle_f32x4(tmp2684, tmp2692, 136);
__m512 tmp2612 = _mm512_shuffle_f32x4(tmp2684, tmp2692, 221);
__m512 tmp2617 = _mm512_add_ps(tmp2558, tmp2559);
__m512 tmp2637 = _mm512_add_ps(tmp2566, tmp2567);
__m512 tmp2616 = _mm512_add_ps(tmp2560, tmp2561);
__m512 tmp2636 = _mm512_add_ps(tmp2568, tmp2609);
__m512 tmp2622 = _mm512_sub_ps(tmp2560, tmp2561);
__m512 tmp2642 = _mm512_sub_ps(tmp2568, tmp2609);
__m512 tmp2621 = _mm512_sub_ps(tmp2558, tmp2559);
__m512 tmp2641 = _mm512_sub_ps(tmp2566, tmp2567);
__m512 tmp2618 = _mm512_add_ps(tmp2562, tmp2563);
__m512 tmp2638 = _mm512_add_ps(tmp2610, tmp2611);
__m512 tmp2623 = _mm512_sub_ps(tmp2562, tmp2563);
__m512 tmp2643 = _mm512_sub_ps(tmp2610, tmp2611);
__m512 tmp2620 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(2e+00f), tmp2621);
__m512 tmp2640 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(2e+00f), tmp2641);
__m512 tmp2627 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(8e+00f), tmp2621);
__m512 tmp2647 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(8e+00f), tmp2641);
__m512 tmp2615 = _mm512_add_ps(tmp2616, tmp2617);
__m512 tmp2635 = _mm512_add_ps(tmp2636, tmp2637);
__m512 tmp2619 = _mm512_fmadd_ps(tmp2623, _mm512_set1_ps(1.6e+01f), tmp2620);
__m512 tmp2639 = _mm512_fmadd_ps(tmp2643, _mm512_set1_ps(1.6e+01f), tmp2640);
__m512 tmp2626 = _mm512_fmadd_ps(tmp2623, _mm512_set1_ps(4e+00f), tmp2627);
__m512 tmp2646 = _mm512_fmadd_ps(tmp2643, _mm512_set1_ps(4e+00f), tmp2647);
__m512 tmp2632 = _mm512_add_ps(tmp2623, tmp2621);
__m512 tmp2652 = _mm512_add_ps(tmp2643, tmp2641);
__m512 tmp2625 = _mm512_fmadd_ps(tmp2616, _mm512_set1_ps(4e+00f), tmp2617);
__m512 tmp2645 = _mm512_fmadd_ps(tmp2636, _mm512_set1_ps(4e+00f), tmp2637);
__m512 tmp2629 = _mm512_fmadd_ps(tmp2616, _mm512_set1_ps(1.6e+01f), tmp2617);
__m512 tmp2649 = _mm512_fmadd_ps(tmp2636, _mm512_set1_ps(1.6e+01f), tmp2637);
__m512 tmp2614 = _mm512_add_ps(tmp2615, tmp2557);
__m512 tmp2634 = _mm512_add_ps(tmp2635, tmp2565);
__m512 tmp2631 = _mm512_add_ps(tmp2632, tmp2564);
__m512 tmp2651 = _mm512_add_ps(tmp2652, tmp2612);
__m512 tmp2613 = _mm512_fmadd_ps(tmp2618, _mm512_set1_ps(3.2e+01f), tmp2614);
__m512 tmp2633 = _mm512_fmadd_ps(tmp2638, _mm512_set1_ps(3.2e+01f), tmp2634);
__m512 tmp2624 = _mm512_fmadd_ps(tmp2618, _mm512_set1_ps(8e+00f), tmp2625);
__m512 tmp2644 = _mm512_fmadd_ps(tmp2638, _mm512_set1_ps(8e+00f), tmp2645);
__m512 tmp2630 = _mm512_fmadd_ps(tmp2622, _mm512_set1_ps(3.2e+01f), tmp2631);
__m512 tmp2650 = _mm512_fmadd_ps(tmp2642, _mm512_set1_ps(3.2e+01f), tmp2651);
__m512 tmp2628 = _mm512_fmadd_ps(tmp2618, _mm512_set1_ps(2e+00f), tmp2629);
__m512 tmp2648 = _mm512_fmadd_ps(tmp2638, _mm512_set1_ps(2e+00f), tmp2649);
__m512 out483 = tmp2613;
__m512 out489 = tmp2633;
__m512 out484 = tmp2619;
__m512 out490 = tmp2639;
__m512 out485 = tmp2624;
__m512 out491 = tmp2644;
__m512 out486 = tmp2626;
__m512 out492 = tmp2646;
__m512 out487 = tmp2628;
__m512 out493 = tmp2648;
__m512 out488 = tmp2630;
__m512 out494 = tmp2650;
out483 = _mm512_max_ps(_mm512_setzero_ps(), out483);
out489 = _mm512_max_ps(_mm512_setzero_ps(), out489);
out484 = _mm512_max_ps(_mm512_setzero_ps(), out484);
out490 = _mm512_max_ps(_mm512_setzero_ps(), out490);
out485 = _mm512_max_ps(_mm512_setzero_ps(), out485);
out491 = _mm512_max_ps(_mm512_setzero_ps(), out491);
out486 = _mm512_max_ps(_mm512_setzero_ps(), out486);
out492 = _mm512_max_ps(_mm512_setzero_ps(), out492);
out487 = _mm512_max_ps(_mm512_setzero_ps(), out487);
out493 = _mm512_max_ps(_mm512_setzero_ps(), out493);
out488 = _mm512_max_ps(_mm512_setzero_ps(), out488);
out494 = _mm512_max_ps(_mm512_setzero_ps(), out494);
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out483);
_mm512_mask_storeu_ps(datPtr6+13808+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out489);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out484);
_mm512_mask_storeu_ps(datPtr6+14032+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out490);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out485);
_mm512_mask_storeu_ps(datPtr6+14256+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out491);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out486);
_mm512_mask_storeu_ps(datPtr6+14480+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out492);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out487);
_mm512_mask_storeu_ps(datPtr6+14704+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out493);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 255, out488);
_mm512_mask_storeu_ps(datPtr6+14928+50432*i19+224*toH21+4*toW21+50432*k62+25216*l14, 4095, out494);
}
}
++j13;
j13 = 2;
}
if (j13 < 15) {
ptrdiff_t rel11 = (size_t)(j13-2)%5;
ptrdiff_t base11 = 6+(size_t)(j13-2)/5*18;
for (; ; rel11 = 0, base11 += 18) {
if (rel11 < 2) {
if (rel11 < 1) {
ptrdiff_t toH22 = base11+0;
ptrdiff_t toW22 = 12;
ptrdiff_t k63 = 1*w33;
for (; k63 != 1; ++k63) {
ptrdiff_t l15 = 0;
for (; l15 != 2; ++l15) {
__m512 sf97 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf98 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in460 = _mm512_shuffle_f32x4(sf97, sf98, 68);
__m512 in461 = _mm512_shuffle_f32x4(sf97, sf98, 238);
__m512 sf99 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf100 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in468 = _mm512_shuffle_f32x4(sf99, sf100, 68);
__m512 in469 = _mm512_shuffle_f32x4(sf99, sf100, 238);
__m512 sf101 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf102 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in462 = _mm512_shuffle_f32x4(sf101, sf102, 68);
__m512 in463 = _mm512_shuffle_f32x4(sf101, sf102, 238);
__m512 sf103 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf104 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in470 = _mm512_shuffle_f32x4(sf103, sf104, 68);
__m512 in471 = _mm512_shuffle_f32x4(sf103, sf104, 238);
__m512 sf105 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf106 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in464 = _mm512_shuffle_f32x4(sf105, sf106, 68);
__m512 in465 = _mm512_shuffle_f32x4(sf105, sf106, 238);
__m512 sf107 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf108 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in472 = _mm512_shuffle_f32x4(sf107, sf108, 68);
__m512 in473 = _mm512_shuffle_f32x4(sf107, sf108, 238);
__m512 sf109 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf110 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in466 = _mm512_shuffle_f32x4(sf109, sf110, 68);
__m512 in467 = _mm512_shuffle_f32x4(sf109, sf110, 238);
__m512 sf111 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf112 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in474 = _mm512_shuffle_f32x4(sf111, sf112, 68);
__m512 in475 = _mm512_shuffle_f32x4(sf111, sf112, 238);
__m512 tmp2709 = _mm512_add_ps(in461, in462);
__m512 tmp2729 = _mm512_add_ps(in469, in470);
__m512 tmp2708 = _mm512_add_ps(in463, in464);
__m512 tmp2728 = _mm512_add_ps(in471, in472);
__m512 tmp2714 = _mm512_sub_ps(in463, in464);
__m512 tmp2734 = _mm512_sub_ps(in471, in472);
__m512 tmp2713 = _mm512_sub_ps(in461, in462);
__m512 tmp2733 = _mm512_sub_ps(in469, in470);
__m512 tmp2710 = _mm512_add_ps(in465, in466);
__m512 tmp2730 = _mm512_add_ps(in473, in474);
__m512 tmp2715 = _mm512_sub_ps(in465, in466);
__m512 tmp2735 = _mm512_sub_ps(in473, in474);
__m512 tmp2712 = _mm512_fmadd_ps(tmp2714, _mm512_set1_ps(2e+00f), tmp2713);
__m512 tmp2732 = _mm512_fmadd_ps(tmp2734, _mm512_set1_ps(2e+00f), tmp2733);
__m512 tmp2719 = _mm512_fmadd_ps(tmp2714, _mm512_set1_ps(8e+00f), tmp2713);
__m512 tmp2739 = _mm512_fmadd_ps(tmp2734, _mm512_set1_ps(8e+00f), tmp2733);
__m512 tmp2707 = _mm512_add_ps(tmp2708, tmp2709);
__m512 tmp2727 = _mm512_add_ps(tmp2728, tmp2729);
__m512 tmp2711 = _mm512_fmadd_ps(tmp2715, _mm512_set1_ps(1.6e+01f), tmp2712);
__m512 tmp2731 = _mm512_fmadd_ps(tmp2735, _mm512_set1_ps(1.6e+01f), tmp2732);
__m512 tmp2718 = _mm512_fmadd_ps(tmp2715, _mm512_set1_ps(4e+00f), tmp2719);
__m512 tmp2738 = _mm512_fmadd_ps(tmp2735, _mm512_set1_ps(4e+00f), tmp2739);
__m512 tmp2724 = _mm512_add_ps(tmp2715, tmp2713);
__m512 tmp2744 = _mm512_add_ps(tmp2735, tmp2733);
__m512 tmp2717 = _mm512_fmadd_ps(tmp2708, _mm512_set1_ps(4e+00f), tmp2709);
__m512 tmp2737 = _mm512_fmadd_ps(tmp2728, _mm512_set1_ps(4e+00f), tmp2729);
__m512 tmp2721 = _mm512_fmadd_ps(tmp2708, _mm512_set1_ps(1.6e+01f), tmp2709);
__m512 tmp2741 = _mm512_fmadd_ps(tmp2728, _mm512_set1_ps(1.6e+01f), tmp2729);
__m512 tmp2706 = _mm512_add_ps(tmp2707, in460);
__m512 tmp2726 = _mm512_add_ps(tmp2727, in468);
__m512 tmp2723 = _mm512_add_ps(tmp2724, in467);
__m512 tmp2743 = _mm512_add_ps(tmp2744, in475);
__m512 tmp2705 = _mm512_fmadd_ps(tmp2710, _mm512_set1_ps(3.2e+01f), tmp2706);
__m512 tmp2725 = _mm512_fmadd_ps(tmp2730, _mm512_set1_ps(3.2e+01f), tmp2726);
__m512 tmp2716 = _mm512_fmadd_ps(tmp2710, _mm512_set1_ps(8e+00f), tmp2717);
__m512 tmp2736 = _mm512_fmadd_ps(tmp2730, _mm512_set1_ps(8e+00f), tmp2737);
__m512 tmp2722 = _mm512_fmadd_ps(tmp2714, _mm512_set1_ps(3.2e+01f), tmp2723);
__m512 tmp2742 = _mm512_fmadd_ps(tmp2734, _mm512_set1_ps(3.2e+01f), tmp2743);
__m512 tmp2720 = _mm512_fmadd_ps(tmp2710, _mm512_set1_ps(2e+00f), tmp2721);
__m512 tmp2740 = _mm512_fmadd_ps(tmp2730, _mm512_set1_ps(2e+00f), tmp2741);
__m512 tmp2693 = tmp2705;
__m512 tmp2699 = tmp2725;
__m512 tmp2694 = tmp2711;
__m512 tmp2700 = tmp2731;
__m512 tmp2695 = tmp2716;
__m512 tmp2701 = tmp2736;
__m512 tmp2696 = tmp2718;
__m512 tmp2702 = tmp2738;
__m512 tmp2697 = tmp2720;
__m512 tmp2703 = tmp2740;
__m512 tmp2698 = tmp2722;
__m512 tmp2704 = tmp2742;
__m512 tmp2789 = _mm512_unpacklo_ps(tmp2693, tmp2694);
__m512 tmp2790 = _mm512_unpackhi_ps(tmp2693, tmp2694);
__m512 tmp2791 = _mm512_unpacklo_ps(tmp2695, tmp2696);
__m512 tmp2792 = _mm512_unpackhi_ps(tmp2695, tmp2696);
__m512 tmp2793 = _mm512_unpacklo_ps(tmp2697, tmp2698);
__m512 tmp2794 = _mm512_unpackhi_ps(tmp2697, tmp2698);
__m512 tmp2795 = _mm512_unpacklo_ps(tmp2699, tmp2700);
__m512 tmp2796 = _mm512_unpackhi_ps(tmp2699, tmp2700);
__m512 tmp2797 = _mm512_unpacklo_ps(tmp2701, tmp2702);
__m512 tmp2798 = _mm512_unpackhi_ps(tmp2701, tmp2702);
__m512 tmp2799 = _mm512_unpacklo_ps(tmp2703, tmp2704);
__m512 tmp2800 = _mm512_unpackhi_ps(tmp2703, tmp2704);
__m512 tmp2801 = _mm512_shuffle_ps(tmp2789, tmp2791, 68);
__m512 tmp2802 = _mm512_shuffle_ps(tmp2789, tmp2791, 238);
__m512 tmp2803 = _mm512_shuffle_ps(tmp2790, tmp2792, 68);
__m512 tmp2804 = _mm512_shuffle_ps(tmp2790, tmp2792, 238);
__m512 tmp2805 = _mm512_shuffle_ps(tmp2793, tmp2795, 68);
__m512 tmp2806 = _mm512_shuffle_ps(tmp2793, tmp2795, 238);
__m512 tmp2807 = _mm512_shuffle_ps(tmp2794, tmp2796, 68);
__m512 tmp2808 = _mm512_shuffle_ps(tmp2794, tmp2796, 238);
__m512 tmp2809 = _mm512_shuffle_ps(tmp2797, tmp2799, 68);
__m512 tmp2810 = _mm512_shuffle_ps(tmp2797, tmp2799, 238);
__m512 tmp2811 = _mm512_shuffle_ps(tmp2798, tmp2800, 68);
__m512 tmp2812 = _mm512_shuffle_ps(tmp2798, tmp2800, 238);
__m512 tmp2813 = _mm512_shuffle_f32x4(tmp2801, tmp2805, 136);
__m512 tmp2814 = _mm512_shuffle_f32x4(tmp2801, tmp2805, 221);
__m512 tmp2815 = _mm512_shuffle_f32x4(tmp2802, tmp2806, 136);
__m512 tmp2816 = _mm512_shuffle_f32x4(tmp2802, tmp2806, 221);
__m512 tmp2817 = _mm512_shuffle_f32x4(tmp2803, tmp2807, 136);
__m512 tmp2818 = _mm512_shuffle_f32x4(tmp2803, tmp2807, 221);
__m512 tmp2819 = _mm512_shuffle_f32x4(tmp2804, tmp2808, 136);
__m512 tmp2820 = _mm512_shuffle_f32x4(tmp2804, tmp2808, 221);
__m512 tmp2821 = _mm512_shuffle_f32x4(tmp2809, tmp2809, 136);
__m512 tmp2822 = _mm512_shuffle_f32x4(tmp2809, tmp2809, 221);
__m512 tmp2823 = _mm512_shuffle_f32x4(tmp2810, tmp2810, 136);
__m512 tmp2824 = _mm512_shuffle_f32x4(tmp2810, tmp2810, 221);
__m512 tmp2825 = _mm512_shuffle_f32x4(tmp2811, tmp2811, 136);
__m512 tmp2826 = _mm512_shuffle_f32x4(tmp2811, tmp2811, 221);
__m512 tmp2827 = _mm512_shuffle_f32x4(tmp2812, tmp2812, 136);
__m512 tmp2828 = _mm512_shuffle_f32x4(tmp2812, tmp2812, 221);
tmp2693 = _mm512_shuffle_f32x4(tmp2813, tmp2821, 136);
tmp2701 = _mm512_shuffle_f32x4(tmp2813, tmp2821, 221);
tmp2694 = _mm512_shuffle_f32x4(tmp2815, tmp2823, 136);
tmp2702 = _mm512_shuffle_f32x4(tmp2815, tmp2823, 221);
tmp2695 = _mm512_shuffle_f32x4(tmp2817, tmp2825, 136);
tmp2703 = _mm512_shuffle_f32x4(tmp2817, tmp2825, 221);
tmp2696 = _mm512_shuffle_f32x4(tmp2819, tmp2827, 136);
tmp2704 = _mm512_shuffle_f32x4(tmp2819, tmp2827, 221);
tmp2697 = _mm512_shuffle_f32x4(tmp2814, tmp2822, 136);
__m512 tmp2745 = _mm512_shuffle_f32x4(tmp2814, tmp2822, 221);
tmp2698 = _mm512_shuffle_f32x4(tmp2816, tmp2824, 136);
__m512 tmp2746 = _mm512_shuffle_f32x4(tmp2816, tmp2824, 221);
tmp2699 = _mm512_shuffle_f32x4(tmp2818, tmp2826, 136);
__m512 tmp2747 = _mm512_shuffle_f32x4(tmp2818, tmp2826, 221);
tmp2700 = _mm512_shuffle_f32x4(tmp2820, tmp2828, 136);
__m512 tmp2748 = _mm512_shuffle_f32x4(tmp2820, tmp2828, 221);
__m512 tmp2753 = _mm512_add_ps(tmp2694, tmp2695);
__m512 tmp2773 = _mm512_add_ps(tmp2702, tmp2703);
__m512 tmp2752 = _mm512_add_ps(tmp2696, tmp2697);
__m512 tmp2772 = _mm512_add_ps(tmp2704, tmp2745);
__m512 tmp2758 = _mm512_sub_ps(tmp2696, tmp2697);
__m512 tmp2778 = _mm512_sub_ps(tmp2704, tmp2745);
__m512 tmp2757 = _mm512_sub_ps(tmp2694, tmp2695);
__m512 tmp2777 = _mm512_sub_ps(tmp2702, tmp2703);
__m512 tmp2754 = _mm512_add_ps(tmp2698, tmp2699);
__m512 tmp2774 = _mm512_add_ps(tmp2746, tmp2747);
__m512 tmp2759 = _mm512_sub_ps(tmp2698, tmp2699);
__m512 tmp2779 = _mm512_sub_ps(tmp2746, tmp2747);
__m512 tmp2756 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(2e+00f), tmp2757);
__m512 tmp2776 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(2e+00f), tmp2777);
__m512 tmp2763 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(8e+00f), tmp2757);
__m512 tmp2783 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(8e+00f), tmp2777);
__m512 tmp2751 = _mm512_add_ps(tmp2752, tmp2753);
__m512 tmp2771 = _mm512_add_ps(tmp2772, tmp2773);
__m512 tmp2755 = _mm512_fmadd_ps(tmp2759, _mm512_set1_ps(1.6e+01f), tmp2756);
__m512 tmp2775 = _mm512_fmadd_ps(tmp2779, _mm512_set1_ps(1.6e+01f), tmp2776);
__m512 tmp2762 = _mm512_fmadd_ps(tmp2759, _mm512_set1_ps(4e+00f), tmp2763);
__m512 tmp2782 = _mm512_fmadd_ps(tmp2779, _mm512_set1_ps(4e+00f), tmp2783);
__m512 tmp2768 = _mm512_add_ps(tmp2759, tmp2757);
__m512 tmp2788 = _mm512_add_ps(tmp2779, tmp2777);
__m512 tmp2761 = _mm512_fmadd_ps(tmp2752, _mm512_set1_ps(4e+00f), tmp2753);
__m512 tmp2781 = _mm512_fmadd_ps(tmp2772, _mm512_set1_ps(4e+00f), tmp2773);
__m512 tmp2765 = _mm512_fmadd_ps(tmp2752, _mm512_set1_ps(1.6e+01f), tmp2753);
__m512 tmp2785 = _mm512_fmadd_ps(tmp2772, _mm512_set1_ps(1.6e+01f), tmp2773);
__m512 tmp2750 = _mm512_add_ps(tmp2751, tmp2693);
__m512 tmp2770 = _mm512_add_ps(tmp2771, tmp2701);
__m512 tmp2767 = _mm512_add_ps(tmp2768, tmp2700);
__m512 tmp2787 = _mm512_add_ps(tmp2788, tmp2748);
__m512 tmp2749 = _mm512_fmadd_ps(tmp2754, _mm512_set1_ps(3.2e+01f), tmp2750);
__m512 tmp2769 = _mm512_fmadd_ps(tmp2774, _mm512_set1_ps(3.2e+01f), tmp2770);
__m512 tmp2760 = _mm512_fmadd_ps(tmp2754, _mm512_set1_ps(8e+00f), tmp2761);
__m512 tmp2780 = _mm512_fmadd_ps(tmp2774, _mm512_set1_ps(8e+00f), tmp2781);
__m512 tmp2766 = _mm512_fmadd_ps(tmp2758, _mm512_set1_ps(3.2e+01f), tmp2767);
__m512 tmp2786 = _mm512_fmadd_ps(tmp2778, _mm512_set1_ps(3.2e+01f), tmp2787);
__m512 tmp2764 = _mm512_fmadd_ps(tmp2754, _mm512_set1_ps(2e+00f), tmp2765);
__m512 tmp2784 = _mm512_fmadd_ps(tmp2774, _mm512_set1_ps(2e+00f), tmp2785);
__m512 out495 = tmp2749;
__m512 out501 = tmp2769;
__m512 out496 = tmp2755;
__m512 out502 = tmp2775;
__m512 out497 = tmp2760;
__m512 out503 = tmp2780;
__m512 out498 = tmp2762;
__m512 out504 = tmp2782;
__m512 out499 = tmp2764;
__m512 out505 = tmp2784;
__m512 out500 = tmp2766;
__m512 out506 = tmp2786;
out495 = _mm512_max_ps(_mm512_setzero_ps(), out495);
out501 = _mm512_max_ps(_mm512_setzero_ps(), out501);
out496 = _mm512_max_ps(_mm512_setzero_ps(), out496);
out502 = _mm512_max_ps(_mm512_setzero_ps(), out502);
out497 = _mm512_max_ps(_mm512_setzero_ps(), out497);
out503 = _mm512_max_ps(_mm512_setzero_ps(), out503);
out498 = _mm512_max_ps(_mm512_setzero_ps(), out498);
out504 = _mm512_max_ps(_mm512_setzero_ps(), out504);
out499 = _mm512_max_ps(_mm512_setzero_ps(), out499);
out505 = _mm512_max_ps(_mm512_setzero_ps(), out505);
out500 = _mm512_max_ps(_mm512_setzero_ps(), out500);
out506 = _mm512_max_ps(_mm512_setzero_ps(), out506);
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out495);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out501);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out496);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out502);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out497);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out503);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out498);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out504);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out499);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out505);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out500);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out506);
// ---- Tile sf113..sf128 -> out507..out518 ----
// Load sixteen 16-float vectors from sfPtr5: four per region, with the four regions
// 25600 offset units apart (256.., 25856.., 51456.., 77056..). sfPtr5's element type
// is declared outside this span, so the unit of these offsets is not visible here.
// Each pair (base, base+128) is recombined with _mm512_shuffle_f32x4:
//   imm 68  -> 128-bit lanes 0,1 of the first operand, then lanes 0,1 of the second
//   imm 238 -> 128-bit lanes 2,3 of the first operand, then lanes 2,3 of the second
// The pairs at base / base+128 produce in476..in483; the pairs at base+64 / base+192
// produce in484..in491.
__m512 sf113 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf114 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in476 = _mm512_shuffle_f32x4(sf113, sf114, 68);
__m512 in477 = _mm512_shuffle_f32x4(sf113, sf114, 238);
__m512 sf115 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf116 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in484 = _mm512_shuffle_f32x4(sf115, sf116, 68);
__m512 in485 = _mm512_shuffle_f32x4(sf115, sf116, 238);
__m512 sf117 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf118 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in478 = _mm512_shuffle_f32x4(sf117, sf118, 68);
__m512 in479 = _mm512_shuffle_f32x4(sf117, sf118, 238);
__m512 sf119 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf120 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in486 = _mm512_shuffle_f32x4(sf119, sf120, 68);
__m512 in487 = _mm512_shuffle_f32x4(sf119, sf120, 238);
__m512 sf121 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf122 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in480 = _mm512_shuffle_f32x4(sf121, sf122, 68);
__m512 in481 = _mm512_shuffle_f32x4(sf121, sf122, 238);
__m512 sf123 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf124 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in488 = _mm512_shuffle_f32x4(sf123, sf124, 68);
__m512 in489 = _mm512_shuffle_f32x4(sf123, sf124, 238);
__m512 sf125 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf126 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in482 = _mm512_shuffle_f32x4(sf125, sf126, 68);
__m512 in483 = _mm512_shuffle_f32x4(sf125, sf126, 238);
__m512 sf127 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf128 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in490 = _mm512_shuffle_f32x4(sf127, sf128, 68);
__m512 in491 = _mm512_shuffle_f32x4(sf127, sf128, 238);
// First pass: an 8-input -> 6-output linear combination, evaluated for two input
// sets with the statements interleaved (x0..x7 = in476..in483, and x0..x7 = in484..in491):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients look like the Winograd F(6,3) output transform;
// confirm against the generator before relying on that name.
__m512 tmp2845 = _mm512_add_ps(in477, in478);
__m512 tmp2865 = _mm512_add_ps(in485, in486);
__m512 tmp2844 = _mm512_add_ps(in479, in480);
__m512 tmp2864 = _mm512_add_ps(in487, in488);
__m512 tmp2850 = _mm512_sub_ps(in479, in480);
__m512 tmp2870 = _mm512_sub_ps(in487, in488);
__m512 tmp2849 = _mm512_sub_ps(in477, in478);
__m512 tmp2869 = _mm512_sub_ps(in485, in486);
__m512 tmp2846 = _mm512_add_ps(in481, in482);
__m512 tmp2866 = _mm512_add_ps(in489, in490);
__m512 tmp2851 = _mm512_sub_ps(in481, in482);
__m512 tmp2871 = _mm512_sub_ps(in489, in490);
__m512 tmp2848 = _mm512_fmadd_ps(tmp2850, _mm512_set1_ps(2e+00f), tmp2849);
__m512 tmp2868 = _mm512_fmadd_ps(tmp2870, _mm512_set1_ps(2e+00f), tmp2869);
__m512 tmp2855 = _mm512_fmadd_ps(tmp2850, _mm512_set1_ps(8e+00f), tmp2849);
__m512 tmp2875 = _mm512_fmadd_ps(tmp2870, _mm512_set1_ps(8e+00f), tmp2869);
__m512 tmp2843 = _mm512_add_ps(tmp2844, tmp2845);
__m512 tmp2863 = _mm512_add_ps(tmp2864, tmp2865);
__m512 tmp2847 = _mm512_fmadd_ps(tmp2851, _mm512_set1_ps(1.6e+01f), tmp2848);
__m512 tmp2867 = _mm512_fmadd_ps(tmp2871, _mm512_set1_ps(1.6e+01f), tmp2868);
__m512 tmp2854 = _mm512_fmadd_ps(tmp2851, _mm512_set1_ps(4e+00f), tmp2855);
__m512 tmp2874 = _mm512_fmadd_ps(tmp2871, _mm512_set1_ps(4e+00f), tmp2875);
__m512 tmp2860 = _mm512_add_ps(tmp2851, tmp2849);
__m512 tmp2880 = _mm512_add_ps(tmp2871, tmp2869);
__m512 tmp2853 = _mm512_fmadd_ps(tmp2844, _mm512_set1_ps(4e+00f), tmp2845);
__m512 tmp2873 = _mm512_fmadd_ps(tmp2864, _mm512_set1_ps(4e+00f), tmp2865);
__m512 tmp2857 = _mm512_fmadd_ps(tmp2844, _mm512_set1_ps(1.6e+01f), tmp2845);
__m512 tmp2877 = _mm512_fmadd_ps(tmp2864, _mm512_set1_ps(1.6e+01f), tmp2865);
__m512 tmp2842 = _mm512_add_ps(tmp2843, in476);
__m512 tmp2862 = _mm512_add_ps(tmp2863, in484);
__m512 tmp2859 = _mm512_add_ps(tmp2860, in483);
__m512 tmp2879 = _mm512_add_ps(tmp2880, in491);
__m512 tmp2841 = _mm512_fmadd_ps(tmp2846, _mm512_set1_ps(3.2e+01f), tmp2842);
__m512 tmp2861 = _mm512_fmadd_ps(tmp2866, _mm512_set1_ps(3.2e+01f), tmp2862);
__m512 tmp2852 = _mm512_fmadd_ps(tmp2846, _mm512_set1_ps(8e+00f), tmp2853);
__m512 tmp2872 = _mm512_fmadd_ps(tmp2866, _mm512_set1_ps(8e+00f), tmp2873);
__m512 tmp2858 = _mm512_fmadd_ps(tmp2850, _mm512_set1_ps(3.2e+01f), tmp2859);
__m512 tmp2878 = _mm512_fmadd_ps(tmp2870, _mm512_set1_ps(3.2e+01f), tmp2879);
__m512 tmp2856 = _mm512_fmadd_ps(tmp2846, _mm512_set1_ps(2e+00f), tmp2857);
__m512 tmp2876 = _mm512_fmadd_ps(tmp2866, _mm512_set1_ps(2e+00f), tmp2877);
// Collect the twelve first-pass results as rows 0..11 of a 12x16 tile:
// tmp2829..tmp2834 = y0..y5 of the first set, tmp2835..tmp2840 = y0..y5 of the second set.
__m512 tmp2829 = tmp2841;
__m512 tmp2835 = tmp2861;
__m512 tmp2830 = tmp2847;
__m512 tmp2836 = tmp2867;
__m512 tmp2831 = tmp2852;
__m512 tmp2837 = tmp2872;
__m512 tmp2832 = tmp2854;
__m512 tmp2838 = tmp2874;
__m512 tmp2833 = tmp2856;
__m512 tmp2839 = tmp2876;
__m512 tmp2834 = tmp2858;
__m512 tmp2840 = tmp2878;
// Transpose the 12x16 tile into sixteen column vectors.
// Step 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp2925 = _mm512_unpacklo_ps(tmp2829, tmp2830);
__m512 tmp2926 = _mm512_unpackhi_ps(tmp2829, tmp2830);
__m512 tmp2927 = _mm512_unpacklo_ps(tmp2831, tmp2832);
__m512 tmp2928 = _mm512_unpackhi_ps(tmp2831, tmp2832);
__m512 tmp2929 = _mm512_unpacklo_ps(tmp2833, tmp2834);
__m512 tmp2930 = _mm512_unpackhi_ps(tmp2833, tmp2834);
__m512 tmp2931 = _mm512_unpacklo_ps(tmp2835, tmp2836);
__m512 tmp2932 = _mm512_unpackhi_ps(tmp2835, tmp2836);
__m512 tmp2933 = _mm512_unpacklo_ps(tmp2837, tmp2838);
__m512 tmp2934 = _mm512_unpackhi_ps(tmp2837, tmp2838);
__m512 tmp2935 = _mm512_unpacklo_ps(tmp2839, tmp2840);
__m512 tmp2936 = _mm512_unpackhi_ps(tmp2839, tmp2840);
// Step 2: _mm512_shuffle_ps with imm 68 / 238 joins the low / high element pairs of
// two interleaved vectors, so every 128-bit lane now holds one column of four rows.
__m512 tmp2937 = _mm512_shuffle_ps(tmp2925, tmp2927, 68);
__m512 tmp2938 = _mm512_shuffle_ps(tmp2925, tmp2927, 238);
__m512 tmp2939 = _mm512_shuffle_ps(tmp2926, tmp2928, 68);
__m512 tmp2940 = _mm512_shuffle_ps(tmp2926, tmp2928, 238);
__m512 tmp2941 = _mm512_shuffle_ps(tmp2929, tmp2931, 68);
__m512 tmp2942 = _mm512_shuffle_ps(tmp2929, tmp2931, 238);
__m512 tmp2943 = _mm512_shuffle_ps(tmp2930, tmp2932, 68);
__m512 tmp2944 = _mm512_shuffle_ps(tmp2930, tmp2932, 238);
__m512 tmp2945 = _mm512_shuffle_ps(tmp2933, tmp2935, 68);
__m512 tmp2946 = _mm512_shuffle_ps(tmp2933, tmp2935, 238);
__m512 tmp2947 = _mm512_shuffle_ps(tmp2934, tmp2936, 68);
__m512 tmp2948 = _mm512_shuffle_ps(tmp2934, tmp2936, 238);
// Step 3: _mm512_shuffle_f32x4 with imm 136 picks 128-bit lanes 0,2 of each operand,
// imm 221 picks lanes 1,3. Rows 8..11 have no partner group, so they are shuffled
// against themselves, which duplicates them into the upper half.
__m512 tmp2949 = _mm512_shuffle_f32x4(tmp2937, tmp2941, 136);
__m512 tmp2950 = _mm512_shuffle_f32x4(tmp2937, tmp2941, 221);
__m512 tmp2951 = _mm512_shuffle_f32x4(tmp2938, tmp2942, 136);
__m512 tmp2952 = _mm512_shuffle_f32x4(tmp2938, tmp2942, 221);
__m512 tmp2953 = _mm512_shuffle_f32x4(tmp2939, tmp2943, 136);
__m512 tmp2954 = _mm512_shuffle_f32x4(tmp2939, tmp2943, 221);
__m512 tmp2955 = _mm512_shuffle_f32x4(tmp2940, tmp2944, 136);
__m512 tmp2956 = _mm512_shuffle_f32x4(tmp2940, tmp2944, 221);
__m512 tmp2957 = _mm512_shuffle_f32x4(tmp2945, tmp2945, 136);
__m512 tmp2958 = _mm512_shuffle_f32x4(tmp2945, tmp2945, 221);
__m512 tmp2959 = _mm512_shuffle_f32x4(tmp2946, tmp2946, 136);
__m512 tmp2960 = _mm512_shuffle_f32x4(tmp2946, tmp2946, 221);
__m512 tmp2961 = _mm512_shuffle_f32x4(tmp2947, tmp2947, 136);
__m512 tmp2962 = _mm512_shuffle_f32x4(tmp2947, tmp2947, 221);
__m512 tmp2963 = _mm512_shuffle_f32x4(tmp2948, tmp2948, 136);
__m512 tmp2964 = _mm512_shuffle_f32x4(tmp2948, tmp2948, 221);
// Step 4: final lane merge. Resulting column vectors (floats 0..11 = rows 0..11,
// floats 12..15 = a repeat of rows 8..11):
//   columns 0..7  -> tmp2829..tmp2836
//   columns 8..11 -> tmp2837..tmp2840
//   columns 12..15 -> tmp2881..tmp2884
tmp2829 = _mm512_shuffle_f32x4(tmp2949, tmp2957, 136);
tmp2837 = _mm512_shuffle_f32x4(tmp2949, tmp2957, 221);
tmp2830 = _mm512_shuffle_f32x4(tmp2951, tmp2959, 136);
tmp2838 = _mm512_shuffle_f32x4(tmp2951, tmp2959, 221);
tmp2831 = _mm512_shuffle_f32x4(tmp2953, tmp2961, 136);
tmp2839 = _mm512_shuffle_f32x4(tmp2953, tmp2961, 221);
tmp2832 = _mm512_shuffle_f32x4(tmp2955, tmp2963, 136);
tmp2840 = _mm512_shuffle_f32x4(tmp2955, tmp2963, 221);
tmp2833 = _mm512_shuffle_f32x4(tmp2950, tmp2958, 136);
__m512 tmp2881 = _mm512_shuffle_f32x4(tmp2950, tmp2958, 221);
tmp2834 = _mm512_shuffle_f32x4(tmp2952, tmp2960, 136);
__m512 tmp2882 = _mm512_shuffle_f32x4(tmp2952, tmp2960, 221);
tmp2835 = _mm512_shuffle_f32x4(tmp2954, tmp2962, 136);
__m512 tmp2883 = _mm512_shuffle_f32x4(tmp2954, tmp2962, 221);
tmp2836 = _mm512_shuffle_f32x4(tmp2956, tmp2964, 136);
__m512 tmp2884 = _mm512_shuffle_f32x4(tmp2956, tmp2964, 221);
// Second pass: the same 8 -> 6 combination as the first pass, now across columns.
// Columns 0..7 feed out507..out512; columns 8..15 feed out513..out518.
__m512 tmp2889 = _mm512_add_ps(tmp2830, tmp2831);
__m512 tmp2909 = _mm512_add_ps(tmp2838, tmp2839);
__m512 tmp2888 = _mm512_add_ps(tmp2832, tmp2833);
__m512 tmp2908 = _mm512_add_ps(tmp2840, tmp2881);
__m512 tmp2894 = _mm512_sub_ps(tmp2832, tmp2833);
__m512 tmp2914 = _mm512_sub_ps(tmp2840, tmp2881);
__m512 tmp2893 = _mm512_sub_ps(tmp2830, tmp2831);
__m512 tmp2913 = _mm512_sub_ps(tmp2838, tmp2839);
__m512 tmp2890 = _mm512_add_ps(tmp2834, tmp2835);
__m512 tmp2910 = _mm512_add_ps(tmp2882, tmp2883);
__m512 tmp2895 = _mm512_sub_ps(tmp2834, tmp2835);
__m512 tmp2915 = _mm512_sub_ps(tmp2882, tmp2883);
__m512 tmp2892 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(2e+00f), tmp2893);
__m512 tmp2912 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(2e+00f), tmp2913);
__m512 tmp2899 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(8e+00f), tmp2893);
__m512 tmp2919 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(8e+00f), tmp2913);
__m512 tmp2887 = _mm512_add_ps(tmp2888, tmp2889);
__m512 tmp2907 = _mm512_add_ps(tmp2908, tmp2909);
__m512 tmp2891 = _mm512_fmadd_ps(tmp2895, _mm512_set1_ps(1.6e+01f), tmp2892);
__m512 tmp2911 = _mm512_fmadd_ps(tmp2915, _mm512_set1_ps(1.6e+01f), tmp2912);
__m512 tmp2898 = _mm512_fmadd_ps(tmp2895, _mm512_set1_ps(4e+00f), tmp2899);
__m512 tmp2918 = _mm512_fmadd_ps(tmp2915, _mm512_set1_ps(4e+00f), tmp2919);
__m512 tmp2904 = _mm512_add_ps(tmp2895, tmp2893);
__m512 tmp2924 = _mm512_add_ps(tmp2915, tmp2913);
__m512 tmp2897 = _mm512_fmadd_ps(tmp2888, _mm512_set1_ps(4e+00f), tmp2889);
__m512 tmp2917 = _mm512_fmadd_ps(tmp2908, _mm512_set1_ps(4e+00f), tmp2909);
__m512 tmp2901 = _mm512_fmadd_ps(tmp2888, _mm512_set1_ps(1.6e+01f), tmp2889);
__m512 tmp2921 = _mm512_fmadd_ps(tmp2908, _mm512_set1_ps(1.6e+01f), tmp2909);
__m512 tmp2886 = _mm512_add_ps(tmp2887, tmp2829);
__m512 tmp2906 = _mm512_add_ps(tmp2907, tmp2837);
__m512 tmp2903 = _mm512_add_ps(tmp2904, tmp2836);
__m512 tmp2923 = _mm512_add_ps(tmp2924, tmp2884);
__m512 tmp2885 = _mm512_fmadd_ps(tmp2890, _mm512_set1_ps(3.2e+01f), tmp2886);
__m512 tmp2905 = _mm512_fmadd_ps(tmp2910, _mm512_set1_ps(3.2e+01f), tmp2906);
__m512 tmp2896 = _mm512_fmadd_ps(tmp2890, _mm512_set1_ps(8e+00f), tmp2897);
__m512 tmp2916 = _mm512_fmadd_ps(tmp2910, _mm512_set1_ps(8e+00f), tmp2917);
__m512 tmp2902 = _mm512_fmadd_ps(tmp2894, _mm512_set1_ps(3.2e+01f), tmp2903);
__m512 tmp2922 = _mm512_fmadd_ps(tmp2914, _mm512_set1_ps(3.2e+01f), tmp2923);
__m512 tmp2900 = _mm512_fmadd_ps(tmp2890, _mm512_set1_ps(2e+00f), tmp2901);
__m512 tmp2920 = _mm512_fmadd_ps(tmp2910, _mm512_set1_ps(2e+00f), tmp2921);
// Name the six second-pass results of each column group as output rows.
__m512 out507 = tmp2885;
__m512 out513 = tmp2905;
__m512 out508 = tmp2891;
__m512 out514 = tmp2911;
__m512 out509 = tmp2896;
__m512 out515 = tmp2916;
__m512 out510 = tmp2898;
__m512 out516 = tmp2918;
__m512 out511 = tmp2900;
__m512 out517 = tmp2920;
__m512 out512 = tmp2902;
__m512 out518 = tmp2922;
// ReLU: clamp every element to be >= 0.
out507 = _mm512_max_ps(_mm512_setzero_ps(), out507);
out513 = _mm512_max_ps(_mm512_setzero_ps(), out513);
out508 = _mm512_max_ps(_mm512_setzero_ps(), out508);
out514 = _mm512_max_ps(_mm512_setzero_ps(), out514);
out509 = _mm512_max_ps(_mm512_setzero_ps(), out509);
out515 = _mm512_max_ps(_mm512_setzero_ps(), out515);
out510 = _mm512_max_ps(_mm512_setzero_ps(), out510);
out516 = _mm512_max_ps(_mm512_setzero_ps(), out516);
out511 = _mm512_max_ps(_mm512_setzero_ps(), out511);
out517 = _mm512_max_ps(_mm512_setzero_ps(), out517);
out512 = _mm512_max_ps(_mm512_setzero_ps(), out512);
out518 = _mm512_max_ps(_mm512_setzero_ps(), out518);
// Masked stores: mask 4095 (0x0FFF) writes only the low 12 floats of each vector;
// the upper four lanes hold the duplicates left over from the transpose.
// out507..out512 go to base offset 96 and out513..out518 to base offset 12608,
// with successive rows 224 offset units apart in both groups.
_mm512_mask_storeu_ps(datPtr6+96+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out507);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out513);
_mm512_mask_storeu_ps(datPtr6+320+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out508);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out514);
_mm512_mask_storeu_ps(datPtr6+544+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out509);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out515);
_mm512_mask_storeu_ps(datPtr6+768+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out510);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out516);
_mm512_mask_storeu_ps(datPtr6+992+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out511);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out517);
_mm512_mask_storeu_ps(datPtr6+1216+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out512);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out518);
// ---- Tile sf129..sf144 -> out519..out530 ----
// Load sixteen 16-float vectors from sfPtr5: four per region, with the four regions
// 25600 offset units apart (512.., 26112.., 51712.., 77312..). sfPtr5's element type
// is declared outside this span, so the unit of these offsets is not visible here.
// _mm512_shuffle_f32x4 imm 68 joins 128-bit lanes 0,1 of both operands and imm 238
// joins lanes 2,3 of both operands. Pairs at base / base+128 give in492..in499;
// pairs at base+64 / base+192 give in500..in507.
__m512 sf129 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf130 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in492 = _mm512_shuffle_f32x4(sf129, sf130, 68);
__m512 in493 = _mm512_shuffle_f32x4(sf129, sf130, 238);
__m512 sf131 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf132 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in500 = _mm512_shuffle_f32x4(sf131, sf132, 68);
__m512 in501 = _mm512_shuffle_f32x4(sf131, sf132, 238);
__m512 sf133 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf134 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in494 = _mm512_shuffle_f32x4(sf133, sf134, 68);
__m512 in495 = _mm512_shuffle_f32x4(sf133, sf134, 238);
__m512 sf135 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf136 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in502 = _mm512_shuffle_f32x4(sf135, sf136, 68);
__m512 in503 = _mm512_shuffle_f32x4(sf135, sf136, 238);
__m512 sf137 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf138 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in496 = _mm512_shuffle_f32x4(sf137, sf138, 68);
__m512 in497 = _mm512_shuffle_f32x4(sf137, sf138, 238);
__m512 sf139 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf140 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in504 = _mm512_shuffle_f32x4(sf139, sf140, 68);
__m512 in505 = _mm512_shuffle_f32x4(sf139, sf140, 238);
__m512 sf141 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf142 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in498 = _mm512_shuffle_f32x4(sf141, sf142, 68);
__m512 in499 = _mm512_shuffle_f32x4(sf141, sf142, 238);
__m512 sf143 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k63+768*l15);
__m512 sf144 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k63+768*l15);
__m512 in506 = _mm512_shuffle_f32x4(sf143, sf144, 68);
__m512 in507 = _mm512_shuffle_f32x4(sf143, sf144, 238);
// First pass: an 8-input -> 6-output linear combination, evaluated for two input
// sets with the statements interleaved (x0..x7 = in492..in499, and x0..x7 = in500..in507):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients look like the Winograd F(6,3) output transform;
// confirm against the generator before relying on that name.
__m512 tmp2981 = _mm512_add_ps(in493, in494);
__m512 tmp3001 = _mm512_add_ps(in501, in502);
__m512 tmp2980 = _mm512_add_ps(in495, in496);
__m512 tmp3000 = _mm512_add_ps(in503, in504);
__m512 tmp2986 = _mm512_sub_ps(in495, in496);
__m512 tmp3006 = _mm512_sub_ps(in503, in504);
__m512 tmp2985 = _mm512_sub_ps(in493, in494);
__m512 tmp3005 = _mm512_sub_ps(in501, in502);
__m512 tmp2982 = _mm512_add_ps(in497, in498);
__m512 tmp3002 = _mm512_add_ps(in505, in506);
__m512 tmp2987 = _mm512_sub_ps(in497, in498);
__m512 tmp3007 = _mm512_sub_ps(in505, in506);
__m512 tmp2984 = _mm512_fmadd_ps(tmp2986, _mm512_set1_ps(2e+00f), tmp2985);
__m512 tmp3004 = _mm512_fmadd_ps(tmp3006, _mm512_set1_ps(2e+00f), tmp3005);
__m512 tmp2991 = _mm512_fmadd_ps(tmp2986, _mm512_set1_ps(8e+00f), tmp2985);
__m512 tmp3011 = _mm512_fmadd_ps(tmp3006, _mm512_set1_ps(8e+00f), tmp3005);
__m512 tmp2979 = _mm512_add_ps(tmp2980, tmp2981);
__m512 tmp2999 = _mm512_add_ps(tmp3000, tmp3001);
__m512 tmp2983 = _mm512_fmadd_ps(tmp2987, _mm512_set1_ps(1.6e+01f), tmp2984);
__m512 tmp3003 = _mm512_fmadd_ps(tmp3007, _mm512_set1_ps(1.6e+01f), tmp3004);
__m512 tmp2990 = _mm512_fmadd_ps(tmp2987, _mm512_set1_ps(4e+00f), tmp2991);
__m512 tmp3010 = _mm512_fmadd_ps(tmp3007, _mm512_set1_ps(4e+00f), tmp3011);
__m512 tmp2996 = _mm512_add_ps(tmp2987, tmp2985);
__m512 tmp3016 = _mm512_add_ps(tmp3007, tmp3005);
__m512 tmp2989 = _mm512_fmadd_ps(tmp2980, _mm512_set1_ps(4e+00f), tmp2981);
__m512 tmp3009 = _mm512_fmadd_ps(tmp3000, _mm512_set1_ps(4e+00f), tmp3001);
__m512 tmp2993 = _mm512_fmadd_ps(tmp2980, _mm512_set1_ps(1.6e+01f), tmp2981);
__m512 tmp3013 = _mm512_fmadd_ps(tmp3000, _mm512_set1_ps(1.6e+01f), tmp3001);
__m512 tmp2978 = _mm512_add_ps(tmp2979, in492);
__m512 tmp2998 = _mm512_add_ps(tmp2999, in500);
__m512 tmp2995 = _mm512_add_ps(tmp2996, in499);
__m512 tmp3015 = _mm512_add_ps(tmp3016, in507);
__m512 tmp2977 = _mm512_fmadd_ps(tmp2982, _mm512_set1_ps(3.2e+01f), tmp2978);
__m512 tmp2997 = _mm512_fmadd_ps(tmp3002, _mm512_set1_ps(3.2e+01f), tmp2998);
__m512 tmp2988 = _mm512_fmadd_ps(tmp2982, _mm512_set1_ps(8e+00f), tmp2989);
__m512 tmp3008 = _mm512_fmadd_ps(tmp3002, _mm512_set1_ps(8e+00f), tmp3009);
__m512 tmp2994 = _mm512_fmadd_ps(tmp2986, _mm512_set1_ps(3.2e+01f), tmp2995);
__m512 tmp3014 = _mm512_fmadd_ps(tmp3006, _mm512_set1_ps(3.2e+01f), tmp3015);
__m512 tmp2992 = _mm512_fmadd_ps(tmp2982, _mm512_set1_ps(2e+00f), tmp2993);
__m512 tmp3012 = _mm512_fmadd_ps(tmp3002, _mm512_set1_ps(2e+00f), tmp3013);
// Collect the twelve first-pass results as rows 0..11 of a 12x16 tile:
// tmp2965..tmp2970 = y0..y5 of the first set, tmp2971..tmp2976 = y0..y5 of the second set.
__m512 tmp2965 = tmp2977;
__m512 tmp2971 = tmp2997;
__m512 tmp2966 = tmp2983;
__m512 tmp2972 = tmp3003;
__m512 tmp2967 = tmp2988;
__m512 tmp2973 = tmp3008;
__m512 tmp2968 = tmp2990;
__m512 tmp2974 = tmp3010;
__m512 tmp2969 = tmp2992;
__m512 tmp2975 = tmp3012;
__m512 tmp2970 = tmp2994;
__m512 tmp2976 = tmp3014;
// Transpose the 12x16 tile into sixteen column vectors.
// Step 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp3061 = _mm512_unpacklo_ps(tmp2965, tmp2966);
__m512 tmp3062 = _mm512_unpackhi_ps(tmp2965, tmp2966);
__m512 tmp3063 = _mm512_unpacklo_ps(tmp2967, tmp2968);
__m512 tmp3064 = _mm512_unpackhi_ps(tmp2967, tmp2968);
__m512 tmp3065 = _mm512_unpacklo_ps(tmp2969, tmp2970);
__m512 tmp3066 = _mm512_unpackhi_ps(tmp2969, tmp2970);
__m512 tmp3067 = _mm512_unpacklo_ps(tmp2971, tmp2972);
__m512 tmp3068 = _mm512_unpackhi_ps(tmp2971, tmp2972);
__m512 tmp3069 = _mm512_unpacklo_ps(tmp2973, tmp2974);
__m512 tmp3070 = _mm512_unpackhi_ps(tmp2973, tmp2974);
__m512 tmp3071 = _mm512_unpacklo_ps(tmp2975, tmp2976);
__m512 tmp3072 = _mm512_unpackhi_ps(tmp2975, tmp2976);
// Step 2: _mm512_shuffle_ps with imm 68 / 238 joins the low / high element pairs of
// two interleaved vectors, so every 128-bit lane now holds one column of four rows.
__m512 tmp3073 = _mm512_shuffle_ps(tmp3061, tmp3063, 68);
__m512 tmp3074 = _mm512_shuffle_ps(tmp3061, tmp3063, 238);
__m512 tmp3075 = _mm512_shuffle_ps(tmp3062, tmp3064, 68);
__m512 tmp3076 = _mm512_shuffle_ps(tmp3062, tmp3064, 238);
__m512 tmp3077 = _mm512_shuffle_ps(tmp3065, tmp3067, 68);
__m512 tmp3078 = _mm512_shuffle_ps(tmp3065, tmp3067, 238);
__m512 tmp3079 = _mm512_shuffle_ps(tmp3066, tmp3068, 68);
__m512 tmp3080 = _mm512_shuffle_ps(tmp3066, tmp3068, 238);
__m512 tmp3081 = _mm512_shuffle_ps(tmp3069, tmp3071, 68);
__m512 tmp3082 = _mm512_shuffle_ps(tmp3069, tmp3071, 238);
__m512 tmp3083 = _mm512_shuffle_ps(tmp3070, tmp3072, 68);
__m512 tmp3084 = _mm512_shuffle_ps(tmp3070, tmp3072, 238);
// Step 3: _mm512_shuffle_f32x4 with imm 136 picks 128-bit lanes 0,2 of each operand,
// imm 221 picks lanes 1,3. Rows 8..11 have no partner group, so they are shuffled
// against themselves, which duplicates them into the upper half.
__m512 tmp3085 = _mm512_shuffle_f32x4(tmp3073, tmp3077, 136);
__m512 tmp3086 = _mm512_shuffle_f32x4(tmp3073, tmp3077, 221);
__m512 tmp3087 = _mm512_shuffle_f32x4(tmp3074, tmp3078, 136);
__m512 tmp3088 = _mm512_shuffle_f32x4(tmp3074, tmp3078, 221);
__m512 tmp3089 = _mm512_shuffle_f32x4(tmp3075, tmp3079, 136);
__m512 tmp3090 = _mm512_shuffle_f32x4(tmp3075, tmp3079, 221);
__m512 tmp3091 = _mm512_shuffle_f32x4(tmp3076, tmp3080, 136);
__m512 tmp3092 = _mm512_shuffle_f32x4(tmp3076, tmp3080, 221);
__m512 tmp3093 = _mm512_shuffle_f32x4(tmp3081, tmp3081, 136);
__m512 tmp3094 = _mm512_shuffle_f32x4(tmp3081, tmp3081, 221);
__m512 tmp3095 = _mm512_shuffle_f32x4(tmp3082, tmp3082, 136);
__m512 tmp3096 = _mm512_shuffle_f32x4(tmp3082, tmp3082, 221);
__m512 tmp3097 = _mm512_shuffle_f32x4(tmp3083, tmp3083, 136);
__m512 tmp3098 = _mm512_shuffle_f32x4(tmp3083, tmp3083, 221);
__m512 tmp3099 = _mm512_shuffle_f32x4(tmp3084, tmp3084, 136);
__m512 tmp3100 = _mm512_shuffle_f32x4(tmp3084, tmp3084, 221);
// Step 4: final lane merge. Resulting column vectors (floats 0..11 = rows 0..11,
// floats 12..15 = a repeat of rows 8..11):
//   columns 0..7  -> tmp2965..tmp2972
//   columns 8..11 -> tmp2973..tmp2976
//   columns 12..15 -> tmp3017..tmp3020
tmp2965 = _mm512_shuffle_f32x4(tmp3085, tmp3093, 136);
tmp2973 = _mm512_shuffle_f32x4(tmp3085, tmp3093, 221);
tmp2966 = _mm512_shuffle_f32x4(tmp3087, tmp3095, 136);
tmp2974 = _mm512_shuffle_f32x4(tmp3087, tmp3095, 221);
tmp2967 = _mm512_shuffle_f32x4(tmp3089, tmp3097, 136);
tmp2975 = _mm512_shuffle_f32x4(tmp3089, tmp3097, 221);
tmp2968 = _mm512_shuffle_f32x4(tmp3091, tmp3099, 136);
tmp2976 = _mm512_shuffle_f32x4(tmp3091, tmp3099, 221);
tmp2969 = _mm512_shuffle_f32x4(tmp3086, tmp3094, 136);
__m512 tmp3017 = _mm512_shuffle_f32x4(tmp3086, tmp3094, 221);
tmp2970 = _mm512_shuffle_f32x4(tmp3088, tmp3096, 136);
__m512 tmp3018 = _mm512_shuffle_f32x4(tmp3088, tmp3096, 221);
tmp2971 = _mm512_shuffle_f32x4(tmp3090, tmp3098, 136);
__m512 tmp3019 = _mm512_shuffle_f32x4(tmp3090, tmp3098, 221);
tmp2972 = _mm512_shuffle_f32x4(tmp3092, tmp3100, 136);
__m512 tmp3020 = _mm512_shuffle_f32x4(tmp3092, tmp3100, 221);
// Second pass: the same 8 -> 6 combination as the first pass, now across columns.
// Columns 0..7 feed out519..out524; columns 8..15 feed out525..out530.
__m512 tmp3025 = _mm512_add_ps(tmp2966, tmp2967);
__m512 tmp3045 = _mm512_add_ps(tmp2974, tmp2975);
__m512 tmp3024 = _mm512_add_ps(tmp2968, tmp2969);
__m512 tmp3044 = _mm512_add_ps(tmp2976, tmp3017);
__m512 tmp3030 = _mm512_sub_ps(tmp2968, tmp2969);
__m512 tmp3050 = _mm512_sub_ps(tmp2976, tmp3017);
__m512 tmp3029 = _mm512_sub_ps(tmp2966, tmp2967);
__m512 tmp3049 = _mm512_sub_ps(tmp2974, tmp2975);
__m512 tmp3026 = _mm512_add_ps(tmp2970, tmp2971);
__m512 tmp3046 = _mm512_add_ps(tmp3018, tmp3019);
__m512 tmp3031 = _mm512_sub_ps(tmp2970, tmp2971);
__m512 tmp3051 = _mm512_sub_ps(tmp3018, tmp3019);
__m512 tmp3028 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(2e+00f), tmp3029);
__m512 tmp3048 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(2e+00f), tmp3049);
__m512 tmp3035 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(8e+00f), tmp3029);
__m512 tmp3055 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(8e+00f), tmp3049);
__m512 tmp3023 = _mm512_add_ps(tmp3024, tmp3025);
__m512 tmp3043 = _mm512_add_ps(tmp3044, tmp3045);
__m512 tmp3027 = _mm512_fmadd_ps(tmp3031, _mm512_set1_ps(1.6e+01f), tmp3028);
__m512 tmp3047 = _mm512_fmadd_ps(tmp3051, _mm512_set1_ps(1.6e+01f), tmp3048);
__m512 tmp3034 = _mm512_fmadd_ps(tmp3031, _mm512_set1_ps(4e+00f), tmp3035);
__m512 tmp3054 = _mm512_fmadd_ps(tmp3051, _mm512_set1_ps(4e+00f), tmp3055);
__m512 tmp3040 = _mm512_add_ps(tmp3031, tmp3029);
__m512 tmp3060 = _mm512_add_ps(tmp3051, tmp3049);
__m512 tmp3033 = _mm512_fmadd_ps(tmp3024, _mm512_set1_ps(4e+00f), tmp3025);
__m512 tmp3053 = _mm512_fmadd_ps(tmp3044, _mm512_set1_ps(4e+00f), tmp3045);
__m512 tmp3037 = _mm512_fmadd_ps(tmp3024, _mm512_set1_ps(1.6e+01f), tmp3025);
__m512 tmp3057 = _mm512_fmadd_ps(tmp3044, _mm512_set1_ps(1.6e+01f), tmp3045);
__m512 tmp3022 = _mm512_add_ps(tmp3023, tmp2965);
__m512 tmp3042 = _mm512_add_ps(tmp3043, tmp2973);
__m512 tmp3039 = _mm512_add_ps(tmp3040, tmp2972);
__m512 tmp3059 = _mm512_add_ps(tmp3060, tmp3020);
__m512 tmp3021 = _mm512_fmadd_ps(tmp3026, _mm512_set1_ps(3.2e+01f), tmp3022);
__m512 tmp3041 = _mm512_fmadd_ps(tmp3046, _mm512_set1_ps(3.2e+01f), tmp3042);
__m512 tmp3032 = _mm512_fmadd_ps(tmp3026, _mm512_set1_ps(8e+00f), tmp3033);
__m512 tmp3052 = _mm512_fmadd_ps(tmp3046, _mm512_set1_ps(8e+00f), tmp3053);
__m512 tmp3038 = _mm512_fmadd_ps(tmp3030, _mm512_set1_ps(3.2e+01f), tmp3039);
__m512 tmp3058 = _mm512_fmadd_ps(tmp3050, _mm512_set1_ps(3.2e+01f), tmp3059);
__m512 tmp3036 = _mm512_fmadd_ps(tmp3026, _mm512_set1_ps(2e+00f), tmp3037);
__m512 tmp3056 = _mm512_fmadd_ps(tmp3046, _mm512_set1_ps(2e+00f), tmp3057);
// Name the six second-pass results of each column group as output rows.
__m512 out519 = tmp3021;
__m512 out525 = tmp3041;
__m512 out520 = tmp3027;
__m512 out526 = tmp3047;
__m512 out521 = tmp3032;
__m512 out527 = tmp3052;
__m512 out522 = tmp3034;
__m512 out528 = tmp3054;
__m512 out523 = tmp3036;
__m512 out529 = tmp3056;
__m512 out524 = tmp3038;
__m512 out530 = tmp3058;
// ReLU: clamp every element to be >= 0.
out519 = _mm512_max_ps(_mm512_setzero_ps(), out519);
out525 = _mm512_max_ps(_mm512_setzero_ps(), out525);
out520 = _mm512_max_ps(_mm512_setzero_ps(), out520);
out526 = _mm512_max_ps(_mm512_setzero_ps(), out526);
out521 = _mm512_max_ps(_mm512_setzero_ps(), out521);
out527 = _mm512_max_ps(_mm512_setzero_ps(), out527);
out522 = _mm512_max_ps(_mm512_setzero_ps(), out522);
out528 = _mm512_max_ps(_mm512_setzero_ps(), out528);
out523 = _mm512_max_ps(_mm512_setzero_ps(), out523);
out529 = _mm512_max_ps(_mm512_setzero_ps(), out529);
out524 = _mm512_max_ps(_mm512_setzero_ps(), out524);
out530 = _mm512_max_ps(_mm512_setzero_ps(), out530);
// Masked stores: mask 4095 (0x0FFF) writes only the low 12 floats of each vector;
// the upper four lanes hold the duplicates left over from the transpose.
// out519..out524 go to base offset 12656 and out525..out530 to base offset 12704,
// with successive rows 224 offset units apart in both groups.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out519);
_mm512_mask_storeu_ps(datPtr6+12704+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out525);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out520);
_mm512_mask_storeu_ps(datPtr6+12928+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out526);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out521);
_mm512_mask_storeu_ps(datPtr6+13152+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out527);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out522);
_mm512_mask_storeu_ps(datPtr6+13376+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out528);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out523);
_mm512_mask_storeu_ps(datPtr6+13600+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out529);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out524);
_mm512_mask_storeu_ps(datPtr6+13824+50432*i19+224*toH22+4*toW22+50432*k63+25216*l15, 4095, out530);
}
}
++j13;
rel11 = 1;
}
ptrdiff_t toH23 = base11+0;
ptrdiff_t toW23 = 48;
ptrdiff_t k64 = 1*w33;
for (; k64 != 1; ++k64) {
ptrdiff_t l16 = 0;
for (; l16 != 2; ++l16) {
// First tile step of the (k64, l16) loop body: sf145..sf160 -> out531..out542.
// Sixteen 64-byte loads from sfPtr5 are regrouped into two sets of eight
// vectors (in508..in515 and in516..in523). Each set is reduced 8 -> 6, pushed
// through a permutation network, reduced 8 -> 6 again, clamped at zero and
// written to datPtr6 with lane-masked stores.
// NOTE(review): the 1/2/4/8/16/32 weights match a Winograd F(6x6, 3x3) output
// transform with a fused ReLU -- confirm against the generator.
// NOTE(review): sfPtr5/datPtr6 are declared outside this excerpt; the offsets
// are presumably byte offsets -- confirm.
//
// Load pairs of vectors and split them by 256-bit halves: imm 68 keeps the
// low halves of both operands, imm 238 keeps the high halves.
__m512 sf145 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf146 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in508 = _mm512_shuffle_f32x4(sf145, sf146, 68);
__m512 in509 = _mm512_shuffle_f32x4(sf145, sf146, 238);
__m512 sf147 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf148 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in516 = _mm512_shuffle_f32x4(sf147, sf148, 68);
__m512 in517 = _mm512_shuffle_f32x4(sf147, sf148, 238);
// The remaining three load groups repeat the pattern at +25600, +51200, +76800.
__m512 sf149 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf150 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in510 = _mm512_shuffle_f32x4(sf149, sf150, 68);
__m512 in511 = _mm512_shuffle_f32x4(sf149, sf150, 238);
__m512 sf151 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf152 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in518 = _mm512_shuffle_f32x4(sf151, sf152, 68);
__m512 in519 = _mm512_shuffle_f32x4(sf151, sf152, 238);
__m512 sf153 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf154 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in512 = _mm512_shuffle_f32x4(sf153, sf154, 68);
__m512 in513 = _mm512_shuffle_f32x4(sf153, sf154, 238);
__m512 sf155 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf156 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in520 = _mm512_shuffle_f32x4(sf155, sf156, 68);
__m512 in521 = _mm512_shuffle_f32x4(sf155, sf156, 238);
__m512 sf157 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf158 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in514 = _mm512_shuffle_f32x4(sf157, sf158, 68);
__m512 in515 = _mm512_shuffle_f32x4(sf157, sf158, 238);
__m512 sf159 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf160 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in522 = _mm512_shuffle_f32x4(sf159, sf160, 68);
__m512 in523 = _mm512_shuffle_f32x4(sf159, sf160, 238);
// First 8 -> 6 pass, applied to both sets in lockstep (statements alternate
// between the sets). With x0..x7 = in508..in515 (resp. in516..in523):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
__m512 tmp3117 = _mm512_add_ps(in509, in510);
__m512 tmp3137 = _mm512_add_ps(in517, in518);
__m512 tmp3116 = _mm512_add_ps(in511, in512);
__m512 tmp3136 = _mm512_add_ps(in519, in520);
__m512 tmp3122 = _mm512_sub_ps(in511, in512);
__m512 tmp3142 = _mm512_sub_ps(in519, in520);
__m512 tmp3121 = _mm512_sub_ps(in509, in510);
__m512 tmp3141 = _mm512_sub_ps(in517, in518);
__m512 tmp3118 = _mm512_add_ps(in513, in514);
__m512 tmp3138 = _mm512_add_ps(in521, in522);
__m512 tmp3123 = _mm512_sub_ps(in513, in514);
__m512 tmp3143 = _mm512_sub_ps(in521, in522);
__m512 tmp3120 = _mm512_fmadd_ps(tmp3122, _mm512_set1_ps(2e+00f), tmp3121);
__m512 tmp3140 = _mm512_fmadd_ps(tmp3142, _mm512_set1_ps(2e+00f), tmp3141);
__m512 tmp3127 = _mm512_fmadd_ps(tmp3122, _mm512_set1_ps(8e+00f), tmp3121);
__m512 tmp3147 = _mm512_fmadd_ps(tmp3142, _mm512_set1_ps(8e+00f), tmp3141);
__m512 tmp3115 = _mm512_add_ps(tmp3116, tmp3117);
__m512 tmp3135 = _mm512_add_ps(tmp3136, tmp3137);
__m512 tmp3119 = _mm512_fmadd_ps(tmp3123, _mm512_set1_ps(1.6e+01f), tmp3120);
__m512 tmp3139 = _mm512_fmadd_ps(tmp3143, _mm512_set1_ps(1.6e+01f), tmp3140);
__m512 tmp3126 = _mm512_fmadd_ps(tmp3123, _mm512_set1_ps(4e+00f), tmp3127);
__m512 tmp3146 = _mm512_fmadd_ps(tmp3143, _mm512_set1_ps(4e+00f), tmp3147);
__m512 tmp3132 = _mm512_add_ps(tmp3123, tmp3121);
__m512 tmp3152 = _mm512_add_ps(tmp3143, tmp3141);
__m512 tmp3125 = _mm512_fmadd_ps(tmp3116, _mm512_set1_ps(4e+00f), tmp3117);
__m512 tmp3145 = _mm512_fmadd_ps(tmp3136, _mm512_set1_ps(4e+00f), tmp3137);
__m512 tmp3129 = _mm512_fmadd_ps(tmp3116, _mm512_set1_ps(1.6e+01f), tmp3117);
__m512 tmp3149 = _mm512_fmadd_ps(tmp3136, _mm512_set1_ps(1.6e+01f), tmp3137);
__m512 tmp3114 = _mm512_add_ps(tmp3115, in508);
__m512 tmp3134 = _mm512_add_ps(tmp3135, in516);
__m512 tmp3131 = _mm512_add_ps(tmp3132, in515);
__m512 tmp3151 = _mm512_add_ps(tmp3152, in523);
__m512 tmp3113 = _mm512_fmadd_ps(tmp3118, _mm512_set1_ps(3.2e+01f), tmp3114);
__m512 tmp3133 = _mm512_fmadd_ps(tmp3138, _mm512_set1_ps(3.2e+01f), tmp3134);
__m512 tmp3124 = _mm512_fmadd_ps(tmp3118, _mm512_set1_ps(8e+00f), tmp3125);
__m512 tmp3144 = _mm512_fmadd_ps(tmp3138, _mm512_set1_ps(8e+00f), tmp3145);
__m512 tmp3130 = _mm512_fmadd_ps(tmp3122, _mm512_set1_ps(3.2e+01f), tmp3131);
__m512 tmp3150 = _mm512_fmadd_ps(tmp3142, _mm512_set1_ps(3.2e+01f), tmp3151);
__m512 tmp3128 = _mm512_fmadd_ps(tmp3118, _mm512_set1_ps(2e+00f), tmp3129);
__m512 tmp3148 = _mm512_fmadd_ps(tmp3138, _mm512_set1_ps(2e+00f), tmp3149);
// Stage the twelve first-pass results: tmp3101..tmp3106 hold y0..y5 of the
// first set, tmp3107..tmp3112 hold y0..y5 of the second set.
__m512 tmp3101 = tmp3113;
__m512 tmp3107 = tmp3133;
__m512 tmp3102 = tmp3119;
__m512 tmp3108 = tmp3139;
__m512 tmp3103 = tmp3124;
__m512 tmp3109 = tmp3144;
__m512 tmp3104 = tmp3126;
__m512 tmp3110 = tmp3146;
__m512 tmp3105 = tmp3128;
__m512 tmp3111 = tmp3148;
__m512 tmp3106 = tmp3130;
__m512 tmp3112 = tmp3150;
// Permutation network (unpack -> shuffle_ps -> shuffle_f32x4) that turns the
// twelve staged vectors into sixteen: tmp3101..tmp3112 are overwritten and
// tmp3153..tmp3156 are newly declared.
// NOTE(review): this looks like a tile transpose so that the second pass runs
// along the other axis -- confirm.
__m512 tmp3197 = _mm512_unpacklo_ps(tmp3101, tmp3102);
__m512 tmp3198 = _mm512_unpackhi_ps(tmp3101, tmp3102);
__m512 tmp3199 = _mm512_unpacklo_ps(tmp3103, tmp3104);
__m512 tmp3200 = _mm512_unpackhi_ps(tmp3103, tmp3104);
__m512 tmp3201 = _mm512_unpacklo_ps(tmp3105, tmp3106);
__m512 tmp3202 = _mm512_unpackhi_ps(tmp3105, tmp3106);
__m512 tmp3203 = _mm512_unpacklo_ps(tmp3107, tmp3108);
__m512 tmp3204 = _mm512_unpackhi_ps(tmp3107, tmp3108);
__m512 tmp3205 = _mm512_unpacklo_ps(tmp3109, tmp3110);
__m512 tmp3206 = _mm512_unpackhi_ps(tmp3109, tmp3110);
__m512 tmp3207 = _mm512_unpacklo_ps(tmp3111, tmp3112);
__m512 tmp3208 = _mm512_unpackhi_ps(tmp3111, tmp3112);
__m512 tmp3209 = _mm512_shuffle_ps(tmp3197, tmp3199, 68);
__m512 tmp3210 = _mm512_shuffle_ps(tmp3197, tmp3199, 238);
__m512 tmp3211 = _mm512_shuffle_ps(tmp3198, tmp3200, 68);
__m512 tmp3212 = _mm512_shuffle_ps(tmp3198, tmp3200, 238);
__m512 tmp3213 = _mm512_shuffle_ps(tmp3201, tmp3203, 68);
__m512 tmp3214 = _mm512_shuffle_ps(tmp3201, tmp3203, 238);
__m512 tmp3215 = _mm512_shuffle_ps(tmp3202, tmp3204, 68);
__m512 tmp3216 = _mm512_shuffle_ps(tmp3202, tmp3204, 238);
__m512 tmp3217 = _mm512_shuffle_ps(tmp3205, tmp3207, 68);
__m512 tmp3218 = _mm512_shuffle_ps(tmp3205, tmp3207, 238);
__m512 tmp3219 = _mm512_shuffle_ps(tmp3206, tmp3208, 68);
__m512 tmp3220 = _mm512_shuffle_ps(tmp3206, tmp3208, 238);
// imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221 gathers
// lanes 1 and 3.
__m512 tmp3221 = _mm512_shuffle_f32x4(tmp3209, tmp3213, 136);
__m512 tmp3222 = _mm512_shuffle_f32x4(tmp3209, tmp3213, 221);
__m512 tmp3223 = _mm512_shuffle_f32x4(tmp3210, tmp3214, 136);
__m512 tmp3224 = _mm512_shuffle_f32x4(tmp3210, tmp3214, 221);
__m512 tmp3225 = _mm512_shuffle_f32x4(tmp3211, tmp3215, 136);
__m512 tmp3226 = _mm512_shuffle_f32x4(tmp3211, tmp3215, 221);
__m512 tmp3227 = _mm512_shuffle_f32x4(tmp3212, tmp3216, 136);
__m512 tmp3228 = _mm512_shuffle_f32x4(tmp3212, tmp3216, 221);
// The next eight shuffles pass the same register as both operands,
// presumably because only twelve source vectors feed the network.
__m512 tmp3229 = _mm512_shuffle_f32x4(tmp3217, tmp3217, 136);
__m512 tmp3230 = _mm512_shuffle_f32x4(tmp3217, tmp3217, 221);
__m512 tmp3231 = _mm512_shuffle_f32x4(tmp3218, tmp3218, 136);
__m512 tmp3232 = _mm512_shuffle_f32x4(tmp3218, tmp3218, 221);
__m512 tmp3233 = _mm512_shuffle_f32x4(tmp3219, tmp3219, 136);
__m512 tmp3234 = _mm512_shuffle_f32x4(tmp3219, tmp3219, 221);
__m512 tmp3235 = _mm512_shuffle_f32x4(tmp3220, tmp3220, 136);
__m512 tmp3236 = _mm512_shuffle_f32x4(tmp3220, tmp3220, 221);
tmp3101 = _mm512_shuffle_f32x4(tmp3221, tmp3229, 136);
tmp3109 = _mm512_shuffle_f32x4(tmp3221, tmp3229, 221);
tmp3102 = _mm512_shuffle_f32x4(tmp3223, tmp3231, 136);
tmp3110 = _mm512_shuffle_f32x4(tmp3223, tmp3231, 221);
tmp3103 = _mm512_shuffle_f32x4(tmp3225, tmp3233, 136);
tmp3111 = _mm512_shuffle_f32x4(tmp3225, tmp3233, 221);
tmp3104 = _mm512_shuffle_f32x4(tmp3227, tmp3235, 136);
tmp3112 = _mm512_shuffle_f32x4(tmp3227, tmp3235, 221);
tmp3105 = _mm512_shuffle_f32x4(tmp3222, tmp3230, 136);
__m512 tmp3153 = _mm512_shuffle_f32x4(tmp3222, tmp3230, 221);
tmp3106 = _mm512_shuffle_f32x4(tmp3224, tmp3232, 136);
__m512 tmp3154 = _mm512_shuffle_f32x4(tmp3224, tmp3232, 221);
tmp3107 = _mm512_shuffle_f32x4(tmp3226, tmp3234, 136);
__m512 tmp3155 = _mm512_shuffle_f32x4(tmp3226, tmp3234, 221);
tmp3108 = _mm512_shuffle_f32x4(tmp3228, tmp3236, 136);
__m512 tmp3156 = _mm512_shuffle_f32x4(tmp3228, tmp3236, 221);
// Second 8 -> 6 pass, same weights as the first. Inputs x0..x7 are
// tmp3101..tmp3108 for the first set and tmp3109..tmp3112, tmp3153..tmp3156
// for the second set.
__m512 tmp3161 = _mm512_add_ps(tmp3102, tmp3103);
__m512 tmp3181 = _mm512_add_ps(tmp3110, tmp3111);
__m512 tmp3160 = _mm512_add_ps(tmp3104, tmp3105);
__m512 tmp3180 = _mm512_add_ps(tmp3112, tmp3153);
__m512 tmp3166 = _mm512_sub_ps(tmp3104, tmp3105);
__m512 tmp3186 = _mm512_sub_ps(tmp3112, tmp3153);
__m512 tmp3165 = _mm512_sub_ps(tmp3102, tmp3103);
__m512 tmp3185 = _mm512_sub_ps(tmp3110, tmp3111);
__m512 tmp3162 = _mm512_add_ps(tmp3106, tmp3107);
__m512 tmp3182 = _mm512_add_ps(tmp3154, tmp3155);
__m512 tmp3167 = _mm512_sub_ps(tmp3106, tmp3107);
__m512 tmp3187 = _mm512_sub_ps(tmp3154, tmp3155);
__m512 tmp3164 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(2e+00f), tmp3165);
__m512 tmp3184 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(2e+00f), tmp3185);
__m512 tmp3171 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(8e+00f), tmp3165);
__m512 tmp3191 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(8e+00f), tmp3185);
__m512 tmp3159 = _mm512_add_ps(tmp3160, tmp3161);
__m512 tmp3179 = _mm512_add_ps(tmp3180, tmp3181);
__m512 tmp3163 = _mm512_fmadd_ps(tmp3167, _mm512_set1_ps(1.6e+01f), tmp3164);
__m512 tmp3183 = _mm512_fmadd_ps(tmp3187, _mm512_set1_ps(1.6e+01f), tmp3184);
__m512 tmp3170 = _mm512_fmadd_ps(tmp3167, _mm512_set1_ps(4e+00f), tmp3171);
__m512 tmp3190 = _mm512_fmadd_ps(tmp3187, _mm512_set1_ps(4e+00f), tmp3191);
__m512 tmp3176 = _mm512_add_ps(tmp3167, tmp3165);
__m512 tmp3196 = _mm512_add_ps(tmp3187, tmp3185);
__m512 tmp3169 = _mm512_fmadd_ps(tmp3160, _mm512_set1_ps(4e+00f), tmp3161);
__m512 tmp3189 = _mm512_fmadd_ps(tmp3180, _mm512_set1_ps(4e+00f), tmp3181);
__m512 tmp3173 = _mm512_fmadd_ps(tmp3160, _mm512_set1_ps(1.6e+01f), tmp3161);
__m512 tmp3193 = _mm512_fmadd_ps(tmp3180, _mm512_set1_ps(1.6e+01f), tmp3181);
__m512 tmp3158 = _mm512_add_ps(tmp3159, tmp3101);
__m512 tmp3178 = _mm512_add_ps(tmp3179, tmp3109);
__m512 tmp3175 = _mm512_add_ps(tmp3176, tmp3108);
__m512 tmp3195 = _mm512_add_ps(tmp3196, tmp3156);
__m512 tmp3157 = _mm512_fmadd_ps(tmp3162, _mm512_set1_ps(3.2e+01f), tmp3158);
__m512 tmp3177 = _mm512_fmadd_ps(tmp3182, _mm512_set1_ps(3.2e+01f), tmp3178);
__m512 tmp3168 = _mm512_fmadd_ps(tmp3162, _mm512_set1_ps(8e+00f), tmp3169);
__m512 tmp3188 = _mm512_fmadd_ps(tmp3182, _mm512_set1_ps(8e+00f), tmp3189);
__m512 tmp3174 = _mm512_fmadd_ps(tmp3166, _mm512_set1_ps(3.2e+01f), tmp3175);
__m512 tmp3194 = _mm512_fmadd_ps(tmp3186, _mm512_set1_ps(3.2e+01f), tmp3195);
__m512 tmp3172 = _mm512_fmadd_ps(tmp3162, _mm512_set1_ps(2e+00f), tmp3173);
__m512 tmp3192 = _mm512_fmadd_ps(tmp3182, _mm512_set1_ps(2e+00f), tmp3193);
// Name the results: out531..out536 come from the first set, out537..out542
// from the second.
__m512 out531 = tmp3157;
__m512 out537 = tmp3177;
__m512 out532 = tmp3163;
__m512 out538 = tmp3183;
__m512 out533 = tmp3168;
__m512 out539 = tmp3188;
__m512 out534 = tmp3170;
__m512 out540 = tmp3190;
__m512 out535 = tmp3172;
__m512 out541 = tmp3192;
__m512 out536 = tmp3174;
__m512 out542 = tmp3194;
// Clamp every lane at zero (max with 0.0f), i.e. a ReLU on the outputs.
out531 = _mm512_max_ps(_mm512_setzero_ps(), out531);
out537 = _mm512_max_ps(_mm512_setzero_ps(), out537);
out532 = _mm512_max_ps(_mm512_setzero_ps(), out532);
out538 = _mm512_max_ps(_mm512_setzero_ps(), out538);
out533 = _mm512_max_ps(_mm512_setzero_ps(), out533);
out539 = _mm512_max_ps(_mm512_setzero_ps(), out539);
out534 = _mm512_max_ps(_mm512_setzero_ps(), out534);
out540 = _mm512_max_ps(_mm512_setzero_ps(), out540);
out535 = _mm512_max_ps(_mm512_setzero_ps(), out535);
out541 = _mm512_max_ps(_mm512_setzero_ps(), out541);
out536 = _mm512_max_ps(_mm512_setzero_ps(), out536);
out542 = _mm512_max_ps(_mm512_setzero_ps(), out542);
// Lane-masked stores to datPtr6: mask 255 writes lanes 0..7 (out531..out536,
// constants 0..1120), mask 4095 writes lanes 0..11 (out537..out542, constants
// 1152..2272). Within each group consecutive outputs are 224 apart.
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out531);
_mm512_mask_storeu_ps(datPtr6+1152+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out537);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out532);
_mm512_mask_storeu_ps(datPtr6+1376+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out538);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out533);
_mm512_mask_storeu_ps(datPtr6+1600+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out539);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out534);
_mm512_mask_storeu_ps(datPtr6+1824+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out540);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out535);
_mm512_mask_storeu_ps(datPtr6+2048+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out541);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out536);
_mm512_mask_storeu_ps(datPtr6+2272+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out542);
// Second tile step of the (k64, l16) loop body: sf161..sf176 -> out543..out554.
// Same pipeline as the preceding step, reading sfPtr5 256 further on
// (constants 256..448 plus multiples of 25600): load and split into two sets
// of eight vectors, reduce 8 -> 6, permute, reduce 8 -> 6 again, clamp at
// zero, masked store to datPtr6.
// NOTE(review): the 1/2/4/8/16/32 weights match a Winograd F(6x6, 3x3) output
// transform with a fused ReLU -- confirm against the generator.
// NOTE(review): sfPtr5/datPtr6 are declared outside this excerpt; the offsets
// are presumably byte offsets -- confirm.
//
// Load pairs of vectors and split them by 256-bit halves: imm 68 keeps the
// low halves of both operands, imm 238 keeps the high halves.
__m512 sf161 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf162 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in524 = _mm512_shuffle_f32x4(sf161, sf162, 68);
__m512 in525 = _mm512_shuffle_f32x4(sf161, sf162, 238);
__m512 sf163 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf164 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in532 = _mm512_shuffle_f32x4(sf163, sf164, 68);
__m512 in533 = _mm512_shuffle_f32x4(sf163, sf164, 238);
__m512 sf165 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf166 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in526 = _mm512_shuffle_f32x4(sf165, sf166, 68);
__m512 in527 = _mm512_shuffle_f32x4(sf165, sf166, 238);
__m512 sf167 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf168 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in534 = _mm512_shuffle_f32x4(sf167, sf168, 68);
__m512 in535 = _mm512_shuffle_f32x4(sf167, sf168, 238);
__m512 sf169 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf170 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in528 = _mm512_shuffle_f32x4(sf169, sf170, 68);
__m512 in529 = _mm512_shuffle_f32x4(sf169, sf170, 238);
__m512 sf171 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf172 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in536 = _mm512_shuffle_f32x4(sf171, sf172, 68);
__m512 in537 = _mm512_shuffle_f32x4(sf171, sf172, 238);
__m512 sf173 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf174 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in530 = _mm512_shuffle_f32x4(sf173, sf174, 68);
__m512 in531 = _mm512_shuffle_f32x4(sf173, sf174, 238);
__m512 sf175 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf176 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in538 = _mm512_shuffle_f32x4(sf175, sf176, 68);
__m512 in539 = _mm512_shuffle_f32x4(sf175, sf176, 238);
// First 8 -> 6 pass, applied to both sets in lockstep (statements alternate
// between the sets). With x0..x7 = in524..in531 (resp. in532..in539):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
__m512 tmp3253 = _mm512_add_ps(in525, in526);
__m512 tmp3273 = _mm512_add_ps(in533, in534);
__m512 tmp3252 = _mm512_add_ps(in527, in528);
__m512 tmp3272 = _mm512_add_ps(in535, in536);
__m512 tmp3258 = _mm512_sub_ps(in527, in528);
__m512 tmp3278 = _mm512_sub_ps(in535, in536);
__m512 tmp3257 = _mm512_sub_ps(in525, in526);
__m512 tmp3277 = _mm512_sub_ps(in533, in534);
__m512 tmp3254 = _mm512_add_ps(in529, in530);
__m512 tmp3274 = _mm512_add_ps(in537, in538);
__m512 tmp3259 = _mm512_sub_ps(in529, in530);
__m512 tmp3279 = _mm512_sub_ps(in537, in538);
__m512 tmp3256 = _mm512_fmadd_ps(tmp3258, _mm512_set1_ps(2e+00f), tmp3257);
__m512 tmp3276 = _mm512_fmadd_ps(tmp3278, _mm512_set1_ps(2e+00f), tmp3277);
__m512 tmp3263 = _mm512_fmadd_ps(tmp3258, _mm512_set1_ps(8e+00f), tmp3257);
__m512 tmp3283 = _mm512_fmadd_ps(tmp3278, _mm512_set1_ps(8e+00f), tmp3277);
__m512 tmp3251 = _mm512_add_ps(tmp3252, tmp3253);
__m512 tmp3271 = _mm512_add_ps(tmp3272, tmp3273);
__m512 tmp3255 = _mm512_fmadd_ps(tmp3259, _mm512_set1_ps(1.6e+01f), tmp3256);
__m512 tmp3275 = _mm512_fmadd_ps(tmp3279, _mm512_set1_ps(1.6e+01f), tmp3276);
__m512 tmp3262 = _mm512_fmadd_ps(tmp3259, _mm512_set1_ps(4e+00f), tmp3263);
__m512 tmp3282 = _mm512_fmadd_ps(tmp3279, _mm512_set1_ps(4e+00f), tmp3283);
__m512 tmp3268 = _mm512_add_ps(tmp3259, tmp3257);
__m512 tmp3288 = _mm512_add_ps(tmp3279, tmp3277);
__m512 tmp3261 = _mm512_fmadd_ps(tmp3252, _mm512_set1_ps(4e+00f), tmp3253);
__m512 tmp3281 = _mm512_fmadd_ps(tmp3272, _mm512_set1_ps(4e+00f), tmp3273);
__m512 tmp3265 = _mm512_fmadd_ps(tmp3252, _mm512_set1_ps(1.6e+01f), tmp3253);
__m512 tmp3285 = _mm512_fmadd_ps(tmp3272, _mm512_set1_ps(1.6e+01f), tmp3273);
__m512 tmp3250 = _mm512_add_ps(tmp3251, in524);
__m512 tmp3270 = _mm512_add_ps(tmp3271, in532);
__m512 tmp3267 = _mm512_add_ps(tmp3268, in531);
__m512 tmp3287 = _mm512_add_ps(tmp3288, in539);
__m512 tmp3249 = _mm512_fmadd_ps(tmp3254, _mm512_set1_ps(3.2e+01f), tmp3250);
__m512 tmp3269 = _mm512_fmadd_ps(tmp3274, _mm512_set1_ps(3.2e+01f), tmp3270);
__m512 tmp3260 = _mm512_fmadd_ps(tmp3254, _mm512_set1_ps(8e+00f), tmp3261);
__m512 tmp3280 = _mm512_fmadd_ps(tmp3274, _mm512_set1_ps(8e+00f), tmp3281);
__m512 tmp3266 = _mm512_fmadd_ps(tmp3258, _mm512_set1_ps(3.2e+01f), tmp3267);
__m512 tmp3286 = _mm512_fmadd_ps(tmp3278, _mm512_set1_ps(3.2e+01f), tmp3287);
__m512 tmp3264 = _mm512_fmadd_ps(tmp3254, _mm512_set1_ps(2e+00f), tmp3265);
__m512 tmp3284 = _mm512_fmadd_ps(tmp3274, _mm512_set1_ps(2e+00f), tmp3285);
// Stage the twelve first-pass results: tmp3237..tmp3242 hold y0..y5 of the
// first set, tmp3243..tmp3248 hold y0..y5 of the second set.
__m512 tmp3237 = tmp3249;
__m512 tmp3243 = tmp3269;
__m512 tmp3238 = tmp3255;
__m512 tmp3244 = tmp3275;
__m512 tmp3239 = tmp3260;
__m512 tmp3245 = tmp3280;
__m512 tmp3240 = tmp3262;
__m512 tmp3246 = tmp3282;
__m512 tmp3241 = tmp3264;
__m512 tmp3247 = tmp3284;
__m512 tmp3242 = tmp3266;
__m512 tmp3248 = tmp3286;
// Permutation network (unpack -> shuffle_ps -> shuffle_f32x4) that turns the
// twelve staged vectors into sixteen: tmp3237..tmp3248 are overwritten and
// tmp3289..tmp3292 are newly declared.
// NOTE(review): this looks like a tile transpose so that the second pass runs
// along the other axis -- confirm.
__m512 tmp3333 = _mm512_unpacklo_ps(tmp3237, tmp3238);
__m512 tmp3334 = _mm512_unpackhi_ps(tmp3237, tmp3238);
__m512 tmp3335 = _mm512_unpacklo_ps(tmp3239, tmp3240);
__m512 tmp3336 = _mm512_unpackhi_ps(tmp3239, tmp3240);
__m512 tmp3337 = _mm512_unpacklo_ps(tmp3241, tmp3242);
__m512 tmp3338 = _mm512_unpackhi_ps(tmp3241, tmp3242);
__m512 tmp3339 = _mm512_unpacklo_ps(tmp3243, tmp3244);
__m512 tmp3340 = _mm512_unpackhi_ps(tmp3243, tmp3244);
__m512 tmp3341 = _mm512_unpacklo_ps(tmp3245, tmp3246);
__m512 tmp3342 = _mm512_unpackhi_ps(tmp3245, tmp3246);
__m512 tmp3343 = _mm512_unpacklo_ps(tmp3247, tmp3248);
__m512 tmp3344 = _mm512_unpackhi_ps(tmp3247, tmp3248);
__m512 tmp3345 = _mm512_shuffle_ps(tmp3333, tmp3335, 68);
__m512 tmp3346 = _mm512_shuffle_ps(tmp3333, tmp3335, 238);
__m512 tmp3347 = _mm512_shuffle_ps(tmp3334, tmp3336, 68);
__m512 tmp3348 = _mm512_shuffle_ps(tmp3334, tmp3336, 238);
__m512 tmp3349 = _mm512_shuffle_ps(tmp3337, tmp3339, 68);
__m512 tmp3350 = _mm512_shuffle_ps(tmp3337, tmp3339, 238);
__m512 tmp3351 = _mm512_shuffle_ps(tmp3338, tmp3340, 68);
__m512 tmp3352 = _mm512_shuffle_ps(tmp3338, tmp3340, 238);
__m512 tmp3353 = _mm512_shuffle_ps(tmp3341, tmp3343, 68);
__m512 tmp3354 = _mm512_shuffle_ps(tmp3341, tmp3343, 238);
__m512 tmp3355 = _mm512_shuffle_ps(tmp3342, tmp3344, 68);
__m512 tmp3356 = _mm512_shuffle_ps(tmp3342, tmp3344, 238);
// imm 136 gathers 128-bit lanes 0 and 2 of each operand, imm 221 gathers
// lanes 1 and 3.
__m512 tmp3357 = _mm512_shuffle_f32x4(tmp3345, tmp3349, 136);
__m512 tmp3358 = _mm512_shuffle_f32x4(tmp3345, tmp3349, 221);
__m512 tmp3359 = _mm512_shuffle_f32x4(tmp3346, tmp3350, 136);
__m512 tmp3360 = _mm512_shuffle_f32x4(tmp3346, tmp3350, 221);
__m512 tmp3361 = _mm512_shuffle_f32x4(tmp3347, tmp3351, 136);
__m512 tmp3362 = _mm512_shuffle_f32x4(tmp3347, tmp3351, 221);
__m512 tmp3363 = _mm512_shuffle_f32x4(tmp3348, tmp3352, 136);
__m512 tmp3364 = _mm512_shuffle_f32x4(tmp3348, tmp3352, 221);
// The next eight shuffles pass the same register as both operands,
// presumably because only twelve source vectors feed the network.
__m512 tmp3365 = _mm512_shuffle_f32x4(tmp3353, tmp3353, 136);
__m512 tmp3366 = _mm512_shuffle_f32x4(tmp3353, tmp3353, 221);
__m512 tmp3367 = _mm512_shuffle_f32x4(tmp3354, tmp3354, 136);
__m512 tmp3368 = _mm512_shuffle_f32x4(tmp3354, tmp3354, 221);
__m512 tmp3369 = _mm512_shuffle_f32x4(tmp3355, tmp3355, 136);
__m512 tmp3370 = _mm512_shuffle_f32x4(tmp3355, tmp3355, 221);
__m512 tmp3371 = _mm512_shuffle_f32x4(tmp3356, tmp3356, 136);
__m512 tmp3372 = _mm512_shuffle_f32x4(tmp3356, tmp3356, 221);
tmp3237 = _mm512_shuffle_f32x4(tmp3357, tmp3365, 136);
tmp3245 = _mm512_shuffle_f32x4(tmp3357, tmp3365, 221);
tmp3238 = _mm512_shuffle_f32x4(tmp3359, tmp3367, 136);
tmp3246 = _mm512_shuffle_f32x4(tmp3359, tmp3367, 221);
tmp3239 = _mm512_shuffle_f32x4(tmp3361, tmp3369, 136);
tmp3247 = _mm512_shuffle_f32x4(tmp3361, tmp3369, 221);
tmp3240 = _mm512_shuffle_f32x4(tmp3363, tmp3371, 136);
tmp3248 = _mm512_shuffle_f32x4(tmp3363, tmp3371, 221);
tmp3241 = _mm512_shuffle_f32x4(tmp3358, tmp3366, 136);
__m512 tmp3289 = _mm512_shuffle_f32x4(tmp3358, tmp3366, 221);
tmp3242 = _mm512_shuffle_f32x4(tmp3360, tmp3368, 136);
__m512 tmp3290 = _mm512_shuffle_f32x4(tmp3360, tmp3368, 221);
tmp3243 = _mm512_shuffle_f32x4(tmp3362, tmp3370, 136);
__m512 tmp3291 = _mm512_shuffle_f32x4(tmp3362, tmp3370, 221);
tmp3244 = _mm512_shuffle_f32x4(tmp3364, tmp3372, 136);
__m512 tmp3292 = _mm512_shuffle_f32x4(tmp3364, tmp3372, 221);
// Second 8 -> 6 pass, same weights as the first. Inputs x0..x7 are
// tmp3237..tmp3244 for the first set and tmp3245..tmp3248, tmp3289..tmp3292
// for the second set.
__m512 tmp3297 = _mm512_add_ps(tmp3238, tmp3239);
__m512 tmp3317 = _mm512_add_ps(tmp3246, tmp3247);
__m512 tmp3296 = _mm512_add_ps(tmp3240, tmp3241);
__m512 tmp3316 = _mm512_add_ps(tmp3248, tmp3289);
__m512 tmp3302 = _mm512_sub_ps(tmp3240, tmp3241);
__m512 tmp3322 = _mm512_sub_ps(tmp3248, tmp3289);
__m512 tmp3301 = _mm512_sub_ps(tmp3238, tmp3239);
__m512 tmp3321 = _mm512_sub_ps(tmp3246, tmp3247);
__m512 tmp3298 = _mm512_add_ps(tmp3242, tmp3243);
__m512 tmp3318 = _mm512_add_ps(tmp3290, tmp3291);
__m512 tmp3303 = _mm512_sub_ps(tmp3242, tmp3243);
__m512 tmp3323 = _mm512_sub_ps(tmp3290, tmp3291);
__m512 tmp3300 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(2e+00f), tmp3301);
__m512 tmp3320 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(2e+00f), tmp3321);
__m512 tmp3307 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(8e+00f), tmp3301);
__m512 tmp3327 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(8e+00f), tmp3321);
__m512 tmp3295 = _mm512_add_ps(tmp3296, tmp3297);
__m512 tmp3315 = _mm512_add_ps(tmp3316, tmp3317);
__m512 tmp3299 = _mm512_fmadd_ps(tmp3303, _mm512_set1_ps(1.6e+01f), tmp3300);
__m512 tmp3319 = _mm512_fmadd_ps(tmp3323, _mm512_set1_ps(1.6e+01f), tmp3320);
__m512 tmp3306 = _mm512_fmadd_ps(tmp3303, _mm512_set1_ps(4e+00f), tmp3307);
__m512 tmp3326 = _mm512_fmadd_ps(tmp3323, _mm512_set1_ps(4e+00f), tmp3327);
__m512 tmp3312 = _mm512_add_ps(tmp3303, tmp3301);
__m512 tmp3332 = _mm512_add_ps(tmp3323, tmp3321);
__m512 tmp3305 = _mm512_fmadd_ps(tmp3296, _mm512_set1_ps(4e+00f), tmp3297);
__m512 tmp3325 = _mm512_fmadd_ps(tmp3316, _mm512_set1_ps(4e+00f), tmp3317);
__m512 tmp3309 = _mm512_fmadd_ps(tmp3296, _mm512_set1_ps(1.6e+01f), tmp3297);
__m512 tmp3329 = _mm512_fmadd_ps(tmp3316, _mm512_set1_ps(1.6e+01f), tmp3317);
__m512 tmp3294 = _mm512_add_ps(tmp3295, tmp3237);
__m512 tmp3314 = _mm512_add_ps(tmp3315, tmp3245);
__m512 tmp3311 = _mm512_add_ps(tmp3312, tmp3244);
__m512 tmp3331 = _mm512_add_ps(tmp3332, tmp3292);
__m512 tmp3293 = _mm512_fmadd_ps(tmp3298, _mm512_set1_ps(3.2e+01f), tmp3294);
__m512 tmp3313 = _mm512_fmadd_ps(tmp3318, _mm512_set1_ps(3.2e+01f), tmp3314);
__m512 tmp3304 = _mm512_fmadd_ps(tmp3298, _mm512_set1_ps(8e+00f), tmp3305);
__m512 tmp3324 = _mm512_fmadd_ps(tmp3318, _mm512_set1_ps(8e+00f), tmp3325);
__m512 tmp3310 = _mm512_fmadd_ps(tmp3302, _mm512_set1_ps(3.2e+01f), tmp3311);
__m512 tmp3330 = _mm512_fmadd_ps(tmp3322, _mm512_set1_ps(3.2e+01f), tmp3331);
__m512 tmp3308 = _mm512_fmadd_ps(tmp3298, _mm512_set1_ps(2e+00f), tmp3309);
__m512 tmp3328 = _mm512_fmadd_ps(tmp3318, _mm512_set1_ps(2e+00f), tmp3329);
// Name the results: out543..out548 come from the first set, out549..out554
// from the second.
__m512 out543 = tmp3293;
__m512 out549 = tmp3313;
__m512 out544 = tmp3299;
__m512 out550 = tmp3319;
__m512 out545 = tmp3304;
__m512 out551 = tmp3324;
__m512 out546 = tmp3306;
__m512 out552 = tmp3326;
__m512 out547 = tmp3308;
__m512 out553 = tmp3328;
__m512 out548 = tmp3310;
__m512 out554 = tmp3330;
// Clamp every lane at zero (max with 0.0f), i.e. a ReLU on the outputs.
out543 = _mm512_max_ps(_mm512_setzero_ps(), out543);
out549 = _mm512_max_ps(_mm512_setzero_ps(), out549);
out544 = _mm512_max_ps(_mm512_setzero_ps(), out544);
out550 = _mm512_max_ps(_mm512_setzero_ps(), out550);
out545 = _mm512_max_ps(_mm512_setzero_ps(), out545);
out551 = _mm512_max_ps(_mm512_setzero_ps(), out551);
out546 = _mm512_max_ps(_mm512_setzero_ps(), out546);
out552 = _mm512_max_ps(_mm512_setzero_ps(), out552);
out547 = _mm512_max_ps(_mm512_setzero_ps(), out547);
out553 = _mm512_max_ps(_mm512_setzero_ps(), out553);
out548 = _mm512_max_ps(_mm512_setzero_ps(), out548);
out554 = _mm512_max_ps(_mm512_setzero_ps(), out554);
// Lane-masked stores to datPtr6: mask 4095 writes lanes 0..11 (out543..out548,
// constants 1200..2320), mask 255 writes lanes 0..7 (out549..out554, constants
// 12608..13728). Within each group consecutive outputs are 224 apart.
_mm512_mask_storeu_ps(datPtr6+1200+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out543);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out549);
_mm512_mask_storeu_ps(datPtr6+1424+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out544);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out550);
_mm512_mask_storeu_ps(datPtr6+1648+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out545);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out551);
_mm512_mask_storeu_ps(datPtr6+1872+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out546);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out552);
_mm512_mask_storeu_ps(datPtr6+2096+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out547);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out553);
_mm512_mask_storeu_ps(datPtr6+2320+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out548);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 255, out554);
__m512 sf177 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf178 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in540 = _mm512_shuffle_f32x4(sf177, sf178, 68);
__m512 in541 = _mm512_shuffle_f32x4(sf177, sf178, 238);
__m512 sf179 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf180 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in548 = _mm512_shuffle_f32x4(sf179, sf180, 68);
__m512 in549 = _mm512_shuffle_f32x4(sf179, sf180, 238);
__m512 sf181 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf182 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in542 = _mm512_shuffle_f32x4(sf181, sf182, 68);
__m512 in543 = _mm512_shuffle_f32x4(sf181, sf182, 238);
__m512 sf183 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf184 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in550 = _mm512_shuffle_f32x4(sf183, sf184, 68);
__m512 in551 = _mm512_shuffle_f32x4(sf183, sf184, 238);
__m512 sf185 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf186 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in544 = _mm512_shuffle_f32x4(sf185, sf186, 68);
__m512 in545 = _mm512_shuffle_f32x4(sf185, sf186, 238);
__m512 sf187 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf188 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in552 = _mm512_shuffle_f32x4(sf187, sf188, 68);
__m512 in553 = _mm512_shuffle_f32x4(sf187, sf188, 238);
__m512 sf189 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf190 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in546 = _mm512_shuffle_f32x4(sf189, sf190, 68);
__m512 in547 = _mm512_shuffle_f32x4(sf189, sf190, 238);
__m512 sf191 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k64+768*l16);
__m512 sf192 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k64+768*l16);
__m512 in554 = _mm512_shuffle_f32x4(sf191, sf192, 68);
__m512 in555 = _mm512_shuffle_f32x4(sf191, sf192, 238);
__m512 tmp3389 = _mm512_add_ps(in541, in542);
__m512 tmp3409 = _mm512_add_ps(in549, in550);
__m512 tmp3388 = _mm512_add_ps(in543, in544);
__m512 tmp3408 = _mm512_add_ps(in551, in552);
__m512 tmp3394 = _mm512_sub_ps(in543, in544);
__m512 tmp3414 = _mm512_sub_ps(in551, in552);
__m512 tmp3393 = _mm512_sub_ps(in541, in542);
__m512 tmp3413 = _mm512_sub_ps(in549, in550);
__m512 tmp3390 = _mm512_add_ps(in545, in546);
__m512 tmp3410 = _mm512_add_ps(in553, in554);
__m512 tmp3395 = _mm512_sub_ps(in545, in546);
__m512 tmp3415 = _mm512_sub_ps(in553, in554);
__m512 tmp3392 = _mm512_fmadd_ps(tmp3394, _mm512_set1_ps(2e+00f), tmp3393);
__m512 tmp3412 = _mm512_fmadd_ps(tmp3414, _mm512_set1_ps(2e+00f), tmp3413);
__m512 tmp3399 = _mm512_fmadd_ps(tmp3394, _mm512_set1_ps(8e+00f), tmp3393);
__m512 tmp3419 = _mm512_fmadd_ps(tmp3414, _mm512_set1_ps(8e+00f), tmp3413);
__m512 tmp3387 = _mm512_add_ps(tmp3388, tmp3389);
__m512 tmp3407 = _mm512_add_ps(tmp3408, tmp3409);
__m512 tmp3391 = _mm512_fmadd_ps(tmp3395, _mm512_set1_ps(1.6e+01f), tmp3392);
__m512 tmp3411 = _mm512_fmadd_ps(tmp3415, _mm512_set1_ps(1.6e+01f), tmp3412);
__m512 tmp3398 = _mm512_fmadd_ps(tmp3395, _mm512_set1_ps(4e+00f), tmp3399);
__m512 tmp3418 = _mm512_fmadd_ps(tmp3415, _mm512_set1_ps(4e+00f), tmp3419);
__m512 tmp3404 = _mm512_add_ps(tmp3395, tmp3393);
__m512 tmp3424 = _mm512_add_ps(tmp3415, tmp3413);
__m512 tmp3397 = _mm512_fmadd_ps(tmp3388, _mm512_set1_ps(4e+00f), tmp3389);
__m512 tmp3417 = _mm512_fmadd_ps(tmp3408, _mm512_set1_ps(4e+00f), tmp3409);
__m512 tmp3401 = _mm512_fmadd_ps(tmp3388, _mm512_set1_ps(1.6e+01f), tmp3389);
__m512 tmp3421 = _mm512_fmadd_ps(tmp3408, _mm512_set1_ps(1.6e+01f), tmp3409);
__m512 tmp3386 = _mm512_add_ps(tmp3387, in540);
__m512 tmp3406 = _mm512_add_ps(tmp3407, in548);
__m512 tmp3403 = _mm512_add_ps(tmp3404, in547);
__m512 tmp3423 = _mm512_add_ps(tmp3424, in555);
__m512 tmp3385 = _mm512_fmadd_ps(tmp3390, _mm512_set1_ps(3.2e+01f), tmp3386);
__m512 tmp3405 = _mm512_fmadd_ps(tmp3410, _mm512_set1_ps(3.2e+01f), tmp3406);
__m512 tmp3396 = _mm512_fmadd_ps(tmp3390, _mm512_set1_ps(8e+00f), tmp3397);
__m512 tmp3416 = _mm512_fmadd_ps(tmp3410, _mm512_set1_ps(8e+00f), tmp3417);
__m512 tmp3402 = _mm512_fmadd_ps(tmp3394, _mm512_set1_ps(3.2e+01f), tmp3403);
__m512 tmp3422 = _mm512_fmadd_ps(tmp3414, _mm512_set1_ps(3.2e+01f), tmp3423);
__m512 tmp3400 = _mm512_fmadd_ps(tmp3390, _mm512_set1_ps(2e+00f), tmp3401);
__m512 tmp3420 = _mm512_fmadd_ps(tmp3410, _mm512_set1_ps(2e+00f), tmp3421);
__m512 tmp3373 = tmp3385;
__m512 tmp3379 = tmp3405;
__m512 tmp3374 = tmp3391;
__m512 tmp3380 = tmp3411;
__m512 tmp3375 = tmp3396;
__m512 tmp3381 = tmp3416;
__m512 tmp3376 = tmp3398;
__m512 tmp3382 = tmp3418;
__m512 tmp3377 = tmp3400;
__m512 tmp3383 = tmp3420;
__m512 tmp3378 = tmp3402;
__m512 tmp3384 = tmp3422;
// Rearrangement stage for the twelve vectors tmp3373..tmp3384 (declared
// before this point; they are reassigned below without a type).
// Level 1: interleave 32-bit elements of neighbouring vectors within each
// 128-bit lane.
__m512 tmp3469 = _mm512_unpacklo_ps(tmp3373, tmp3374);
__m512 tmp3470 = _mm512_unpackhi_ps(tmp3373, tmp3374);
__m512 tmp3471 = _mm512_unpacklo_ps(tmp3375, tmp3376);
__m512 tmp3472 = _mm512_unpackhi_ps(tmp3375, tmp3376);
__m512 tmp3473 = _mm512_unpacklo_ps(tmp3377, tmp3378);
__m512 tmp3474 = _mm512_unpackhi_ps(tmp3377, tmp3378);
__m512 tmp3475 = _mm512_unpacklo_ps(tmp3379, tmp3380);
__m512 tmp3476 = _mm512_unpackhi_ps(tmp3379, tmp3380);
__m512 tmp3477 = _mm512_unpacklo_ps(tmp3381, tmp3382);
__m512 tmp3478 = _mm512_unpackhi_ps(tmp3381, tmp3382);
__m512 tmp3479 = _mm512_unpacklo_ps(tmp3383, tmp3384);
__m512 tmp3480 = _mm512_unpackhi_ps(tmp3383, tmp3384);
// Level 2: per 128-bit lane, imm 68 picks elements {a0,a1,b0,b1} and
// imm 238 picks {a2,a3,b2,b3}.
__m512 tmp3481 = _mm512_shuffle_ps(tmp3469, tmp3471, 68);
__m512 tmp3482 = _mm512_shuffle_ps(tmp3469, tmp3471, 238);
__m512 tmp3483 = _mm512_shuffle_ps(tmp3470, tmp3472, 68);
__m512 tmp3484 = _mm512_shuffle_ps(tmp3470, tmp3472, 238);
__m512 tmp3485 = _mm512_shuffle_ps(tmp3473, tmp3475, 68);
__m512 tmp3486 = _mm512_shuffle_ps(tmp3473, tmp3475, 238);
__m512 tmp3487 = _mm512_shuffle_ps(tmp3474, tmp3476, 68);
__m512 tmp3488 = _mm512_shuffle_ps(tmp3474, tmp3476, 238);
__m512 tmp3489 = _mm512_shuffle_ps(tmp3477, tmp3479, 68);
__m512 tmp3490 = _mm512_shuffle_ps(tmp3477, tmp3479, 238);
__m512 tmp3491 = _mm512_shuffle_ps(tmp3478, tmp3480, 68);
__m512 tmp3492 = _mm512_shuffle_ps(tmp3478, tmp3480, 238);
// Level 3: move whole 128-bit lanes. imm 136 selects lanes {a0,a2,b0,b2},
// imm 221 selects lanes {a1,a3,b1,b3}. The last eight shuffles pass the same
// register as both operands.
__m512 tmp3493 = _mm512_shuffle_f32x4(tmp3481, tmp3485, 136);
__m512 tmp3494 = _mm512_shuffle_f32x4(tmp3481, tmp3485, 221);
__m512 tmp3495 = _mm512_shuffle_f32x4(tmp3482, tmp3486, 136);
__m512 tmp3496 = _mm512_shuffle_f32x4(tmp3482, tmp3486, 221);
__m512 tmp3497 = _mm512_shuffle_f32x4(tmp3483, tmp3487, 136);
__m512 tmp3498 = _mm512_shuffle_f32x4(tmp3483, tmp3487, 221);
__m512 tmp3499 = _mm512_shuffle_f32x4(tmp3484, tmp3488, 136);
__m512 tmp3500 = _mm512_shuffle_f32x4(tmp3484, tmp3488, 221);
__m512 tmp3501 = _mm512_shuffle_f32x4(tmp3489, tmp3489, 136);
__m512 tmp3502 = _mm512_shuffle_f32x4(tmp3489, tmp3489, 221);
__m512 tmp3503 = _mm512_shuffle_f32x4(tmp3490, tmp3490, 136);
__m512 tmp3504 = _mm512_shuffle_f32x4(tmp3490, tmp3490, 221);
__m512 tmp3505 = _mm512_shuffle_f32x4(tmp3491, tmp3491, 136);
__m512 tmp3506 = _mm512_shuffle_f32x4(tmp3491, tmp3491, 221);
__m512 tmp3507 = _mm512_shuffle_f32x4(tmp3492, tmp3492, 136);
__m512 tmp3508 = _mm512_shuffle_f32x4(tmp3492, tmp3492, 221);
// Level 4: final lane shuffle. The twelve existing names are overwritten and
// four new vectors (tmp3425..tmp3428) are introduced, giving sixteen results.
// NOTE(review): the network presumably transposes the data so the second pass
// runs along the other axis - confirm against the generator.
tmp3373 = _mm512_shuffle_f32x4(tmp3493, tmp3501, 136);
tmp3381 = _mm512_shuffle_f32x4(tmp3493, tmp3501, 221);
tmp3374 = _mm512_shuffle_f32x4(tmp3495, tmp3503, 136);
tmp3382 = _mm512_shuffle_f32x4(tmp3495, tmp3503, 221);
tmp3375 = _mm512_shuffle_f32x4(tmp3497, tmp3505, 136);
tmp3383 = _mm512_shuffle_f32x4(tmp3497, tmp3505, 221);
tmp3376 = _mm512_shuffle_f32x4(tmp3499, tmp3507, 136);
tmp3384 = _mm512_shuffle_f32x4(tmp3499, tmp3507, 221);
tmp3377 = _mm512_shuffle_f32x4(tmp3494, tmp3502, 136);
__m512 tmp3425 = _mm512_shuffle_f32x4(tmp3494, tmp3502, 221);
tmp3378 = _mm512_shuffle_f32x4(tmp3496, tmp3504, 136);
__m512 tmp3426 = _mm512_shuffle_f32x4(tmp3496, tmp3504, 221);
tmp3379 = _mm512_shuffle_f32x4(tmp3498, tmp3506, 136);
__m512 tmp3427 = _mm512_shuffle_f32x4(tmp3498, tmp3506, 221);
tmp3380 = _mm512_shuffle_f32x4(tmp3500, tmp3508, 136);
__m512 tmp3428 = _mm512_shuffle_f32x4(tmp3500, tmp3508, 221);
// Second arithmetic pass, two interleaved chains of eight inputs x0..x7:
//   chain A: tmp3373..tmp3380
//   chain B: tmp3381..tmp3384, tmp3425..tmp3428
// Each chain produces six results:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match a Winograd F(6,3) output transform -
// confirm against the generator.
__m512 tmp3433 = _mm512_add_ps(tmp3374, tmp3375);
__m512 tmp3453 = _mm512_add_ps(tmp3382, tmp3383);
__m512 tmp3432 = _mm512_add_ps(tmp3376, tmp3377);
__m512 tmp3452 = _mm512_add_ps(tmp3384, tmp3425);
__m512 tmp3438 = _mm512_sub_ps(tmp3376, tmp3377);
__m512 tmp3458 = _mm512_sub_ps(tmp3384, tmp3425);
__m512 tmp3437 = _mm512_sub_ps(tmp3374, tmp3375);
__m512 tmp3457 = _mm512_sub_ps(tmp3382, tmp3383);
__m512 tmp3434 = _mm512_add_ps(tmp3378, tmp3379);
__m512 tmp3454 = _mm512_add_ps(tmp3426, tmp3427);
__m512 tmp3439 = _mm512_sub_ps(tmp3378, tmp3379);
__m512 tmp3459 = _mm512_sub_ps(tmp3426, tmp3427);
__m512 tmp3436 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(2e+00f), tmp3437);
__m512 tmp3456 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(2e+00f), tmp3457);
__m512 tmp3443 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(8e+00f), tmp3437);
__m512 tmp3463 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(8e+00f), tmp3457);
__m512 tmp3431 = _mm512_add_ps(tmp3432, tmp3433);
__m512 tmp3451 = _mm512_add_ps(tmp3452, tmp3453);
__m512 tmp3435 = _mm512_fmadd_ps(tmp3439, _mm512_set1_ps(1.6e+01f), tmp3436);
__m512 tmp3455 = _mm512_fmadd_ps(tmp3459, _mm512_set1_ps(1.6e+01f), tmp3456);
__m512 tmp3442 = _mm512_fmadd_ps(tmp3439, _mm512_set1_ps(4e+00f), tmp3443);
__m512 tmp3462 = _mm512_fmadd_ps(tmp3459, _mm512_set1_ps(4e+00f), tmp3463);
__m512 tmp3448 = _mm512_add_ps(tmp3439, tmp3437);
__m512 tmp3468 = _mm512_add_ps(tmp3459, tmp3457);
__m512 tmp3441 = _mm512_fmadd_ps(tmp3432, _mm512_set1_ps(4e+00f), tmp3433);
__m512 tmp3461 = _mm512_fmadd_ps(tmp3452, _mm512_set1_ps(4e+00f), tmp3453);
__m512 tmp3445 = _mm512_fmadd_ps(tmp3432, _mm512_set1_ps(1.6e+01f), tmp3433);
__m512 tmp3465 = _mm512_fmadd_ps(tmp3452, _mm512_set1_ps(1.6e+01f), tmp3453);
__m512 tmp3430 = _mm512_add_ps(tmp3431, tmp3373);
__m512 tmp3450 = _mm512_add_ps(tmp3451, tmp3381);
__m512 tmp3447 = _mm512_add_ps(tmp3448, tmp3380);
__m512 tmp3467 = _mm512_add_ps(tmp3468, tmp3428);
__m512 tmp3429 = _mm512_fmadd_ps(tmp3434, _mm512_set1_ps(3.2e+01f), tmp3430);
__m512 tmp3449 = _mm512_fmadd_ps(tmp3454, _mm512_set1_ps(3.2e+01f), tmp3450);
__m512 tmp3440 = _mm512_fmadd_ps(tmp3434, _mm512_set1_ps(8e+00f), tmp3441);
__m512 tmp3460 = _mm512_fmadd_ps(tmp3454, _mm512_set1_ps(8e+00f), tmp3461);
__m512 tmp3446 = _mm512_fmadd_ps(tmp3438, _mm512_set1_ps(3.2e+01f), tmp3447);
__m512 tmp3466 = _mm512_fmadd_ps(tmp3458, _mm512_set1_ps(3.2e+01f), tmp3467);
__m512 tmp3444 = _mm512_fmadd_ps(tmp3434, _mm512_set1_ps(2e+00f), tmp3445);
__m512 tmp3464 = _mm512_fmadd_ps(tmp3454, _mm512_set1_ps(2e+00f), tmp3465);
// Name the results: out555..out560 are y0..y5 of chain A, out561..out566 are
// y0..y5 of chain B.
__m512 out555 = tmp3429;
__m512 out561 = tmp3449;
__m512 out556 = tmp3435;
__m512 out562 = tmp3455;
__m512 out557 = tmp3440;
__m512 out563 = tmp3460;
__m512 out558 = tmp3442;
__m512 out564 = tmp3462;
__m512 out559 = tmp3444;
__m512 out565 = tmp3464;
__m512 out560 = tmp3446;
__m512 out566 = tmp3466;
// Elementwise max against zero (ReLU-style clamp) on every result.
out555 = _mm512_max_ps(_mm512_setzero_ps(), out555);
out561 = _mm512_max_ps(_mm512_setzero_ps(), out561);
out556 = _mm512_max_ps(_mm512_setzero_ps(), out556);
out562 = _mm512_max_ps(_mm512_setzero_ps(), out562);
out557 = _mm512_max_ps(_mm512_setzero_ps(), out557);
out563 = _mm512_max_ps(_mm512_setzero_ps(), out563);
out558 = _mm512_max_ps(_mm512_setzero_ps(), out558);
out564 = _mm512_max_ps(_mm512_setzero_ps(), out564);
out559 = _mm512_max_ps(_mm512_setzero_ps(), out559);
out565 = _mm512_max_ps(_mm512_setzero_ps(), out565);
out560 = _mm512_max_ps(_mm512_setzero_ps(), out560);
out566 = _mm512_max_ps(_mm512_setzero_ps(), out566);
// Masked stores into datPtr6: mask 4095 (0x0FFF) writes only elements 0..11
// of each vector. Chain A starts at constant offset 13760 and chain B at
// 13808; successive results are 224 apart, the same factor applied to toH23.
_mm512_mask_storeu_ps(datPtr6+13760+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out555);
_mm512_mask_storeu_ps(datPtr6+13808+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out561);
_mm512_mask_storeu_ps(datPtr6+13984+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out556);
_mm512_mask_storeu_ps(datPtr6+14032+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out562);
_mm512_mask_storeu_ps(datPtr6+14208+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out557);
_mm512_mask_storeu_ps(datPtr6+14256+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out563);
_mm512_mask_storeu_ps(datPtr6+14432+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out558);
_mm512_mask_storeu_ps(datPtr6+14480+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out564);
_mm512_mask_storeu_ps(datPtr6+14656+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out559);
_mm512_mask_storeu_ps(datPtr6+14704+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out565);
_mm512_mask_storeu_ps(datPtr6+14880+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out560);
_mm512_mask_storeu_ps(datPtr6+14928+50432*i19+224*toH23+4*toW23+50432*k64+25216*l16, 4095, out566);
}
}
++j13;
rel11 = 2;
}
// Case taken when rel11 < 3: output position is row base11+6, column 24.
if (rel11 < 3) {
ptrdiff_t toH24 = base11+6;
ptrdiff_t toW24 = 24;
// k65 starts at 1*w33 and the loop runs until k65 == 1.
// NOTE(review): w33 is set outside this view; the loop only terminates
// normally if it starts at a value <= 1 - confirm against the caller.
ptrdiff_t k65 = 1*w33;
for (; k65 != 1; ++k65) {
// Two iterations of l17 per k65 (l17 = 0, 1).
ptrdiff_t l17 = 0;
for (; l17 != 2; ++l17) {
// ---- Tile stored at constant offsets +0 / +48 ----
// Load pairs of vectors from sfPtr5 and regroup them: shuffle_f32x4 with
// imm 68 yields {a.low256, b.low256}; imm 238 yields {a.high256, b.high256}.
// Chain A (in556..in563) uses base offsets 0, 25600, 51200, 76800;
// chain B (in564..in571) uses the same bases plus 64. The second vector of
// each pair lies 128 beyond the first.
__m512 sf193 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf194 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in556 = _mm512_shuffle_f32x4(sf193, sf194, 68);
__m512 in557 = _mm512_shuffle_f32x4(sf193, sf194, 238);
__m512 sf195 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf196 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in564 = _mm512_shuffle_f32x4(sf195, sf196, 68);
__m512 in565 = _mm512_shuffle_f32x4(sf195, sf196, 238);
__m512 sf197 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf198 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in558 = _mm512_shuffle_f32x4(sf197, sf198, 68);
__m512 in559 = _mm512_shuffle_f32x4(sf197, sf198, 238);
__m512 sf199 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf200 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in566 = _mm512_shuffle_f32x4(sf199, sf200, 68);
__m512 in567 = _mm512_shuffle_f32x4(sf199, sf200, 238);
__m512 sf201 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf202 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in560 = _mm512_shuffle_f32x4(sf201, sf202, 68);
__m512 in561 = _mm512_shuffle_f32x4(sf201, sf202, 238);
__m512 sf203 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf204 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in568 = _mm512_shuffle_f32x4(sf203, sf204, 68);
__m512 in569 = _mm512_shuffle_f32x4(sf203, sf204, 238);
__m512 sf205 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf206 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in562 = _mm512_shuffle_f32x4(sf205, sf206, 68);
__m512 in563 = _mm512_shuffle_f32x4(sf205, sf206, 238);
__m512 sf207 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf208 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in570 = _mm512_shuffle_f32x4(sf207, sf208, 68);
__m512 in571 = _mm512_shuffle_f32x4(sf207, sf208, 238);
// First arithmetic pass. For each chain with inputs x0..x7
// (A: in556..in563, B: in564..in571) compute six results:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match a Winograd F(6,3) output transform -
// confirm against the generator.
__m512 tmp3525 = _mm512_add_ps(in557, in558);
__m512 tmp3545 = _mm512_add_ps(in565, in566);
__m512 tmp3524 = _mm512_add_ps(in559, in560);
__m512 tmp3544 = _mm512_add_ps(in567, in568);
__m512 tmp3530 = _mm512_sub_ps(in559, in560);
__m512 tmp3550 = _mm512_sub_ps(in567, in568);
__m512 tmp3529 = _mm512_sub_ps(in557, in558);
__m512 tmp3549 = _mm512_sub_ps(in565, in566);
__m512 tmp3526 = _mm512_add_ps(in561, in562);
__m512 tmp3546 = _mm512_add_ps(in569, in570);
__m512 tmp3531 = _mm512_sub_ps(in561, in562);
__m512 tmp3551 = _mm512_sub_ps(in569, in570);
__m512 tmp3528 = _mm512_fmadd_ps(tmp3530, _mm512_set1_ps(2e+00f), tmp3529);
__m512 tmp3548 = _mm512_fmadd_ps(tmp3550, _mm512_set1_ps(2e+00f), tmp3549);
__m512 tmp3535 = _mm512_fmadd_ps(tmp3530, _mm512_set1_ps(8e+00f), tmp3529);
__m512 tmp3555 = _mm512_fmadd_ps(tmp3550, _mm512_set1_ps(8e+00f), tmp3549);
__m512 tmp3523 = _mm512_add_ps(tmp3524, tmp3525);
__m512 tmp3543 = _mm512_add_ps(tmp3544, tmp3545);
__m512 tmp3527 = _mm512_fmadd_ps(tmp3531, _mm512_set1_ps(1.6e+01f), tmp3528);
__m512 tmp3547 = _mm512_fmadd_ps(tmp3551, _mm512_set1_ps(1.6e+01f), tmp3548);
__m512 tmp3534 = _mm512_fmadd_ps(tmp3531, _mm512_set1_ps(4e+00f), tmp3535);
__m512 tmp3554 = _mm512_fmadd_ps(tmp3551, _mm512_set1_ps(4e+00f), tmp3555);
__m512 tmp3540 = _mm512_add_ps(tmp3531, tmp3529);
__m512 tmp3560 = _mm512_add_ps(tmp3551, tmp3549);
__m512 tmp3533 = _mm512_fmadd_ps(tmp3524, _mm512_set1_ps(4e+00f), tmp3525);
__m512 tmp3553 = _mm512_fmadd_ps(tmp3544, _mm512_set1_ps(4e+00f), tmp3545);
__m512 tmp3537 = _mm512_fmadd_ps(tmp3524, _mm512_set1_ps(1.6e+01f), tmp3525);
__m512 tmp3557 = _mm512_fmadd_ps(tmp3544, _mm512_set1_ps(1.6e+01f), tmp3545);
__m512 tmp3522 = _mm512_add_ps(tmp3523, in556);
__m512 tmp3542 = _mm512_add_ps(tmp3543, in564);
__m512 tmp3539 = _mm512_add_ps(tmp3540, in563);
__m512 tmp3559 = _mm512_add_ps(tmp3560, in571);
__m512 tmp3521 = _mm512_fmadd_ps(tmp3526, _mm512_set1_ps(3.2e+01f), tmp3522);
__m512 tmp3541 = _mm512_fmadd_ps(tmp3546, _mm512_set1_ps(3.2e+01f), tmp3542);
__m512 tmp3532 = _mm512_fmadd_ps(tmp3526, _mm512_set1_ps(8e+00f), tmp3533);
__m512 tmp3552 = _mm512_fmadd_ps(tmp3546, _mm512_set1_ps(8e+00f), tmp3553);
__m512 tmp3538 = _mm512_fmadd_ps(tmp3530, _mm512_set1_ps(3.2e+01f), tmp3539);
__m512 tmp3558 = _mm512_fmadd_ps(tmp3550, _mm512_set1_ps(3.2e+01f), tmp3559);
__m512 tmp3536 = _mm512_fmadd_ps(tmp3526, _mm512_set1_ps(2e+00f), tmp3537);
__m512 tmp3556 = _mm512_fmadd_ps(tmp3546, _mm512_set1_ps(2e+00f), tmp3557);
// Collect the first-pass results: tmp3509..tmp3514 are y0..y5 of chain A,
// tmp3515..tmp3520 are y0..y5 of chain B.
__m512 tmp3509 = tmp3521;
__m512 tmp3515 = tmp3541;
__m512 tmp3510 = tmp3527;
__m512 tmp3516 = tmp3547;
__m512 tmp3511 = tmp3532;
__m512 tmp3517 = tmp3552;
__m512 tmp3512 = tmp3534;
__m512 tmp3518 = tmp3554;
__m512 tmp3513 = tmp3536;
__m512 tmp3519 = tmp3556;
__m512 tmp3514 = tmp3538;
__m512 tmp3520 = tmp3558;
// Rearrangement stage over the twelve first-pass vectors.
// Level 1: interleave 32-bit elements of neighbouring vectors within each
// 128-bit lane.
__m512 tmp3605 = _mm512_unpacklo_ps(tmp3509, tmp3510);
__m512 tmp3606 = _mm512_unpackhi_ps(tmp3509, tmp3510);
__m512 tmp3607 = _mm512_unpacklo_ps(tmp3511, tmp3512);
__m512 tmp3608 = _mm512_unpackhi_ps(tmp3511, tmp3512);
__m512 tmp3609 = _mm512_unpacklo_ps(tmp3513, tmp3514);
__m512 tmp3610 = _mm512_unpackhi_ps(tmp3513, tmp3514);
__m512 tmp3611 = _mm512_unpacklo_ps(tmp3515, tmp3516);
__m512 tmp3612 = _mm512_unpackhi_ps(tmp3515, tmp3516);
__m512 tmp3613 = _mm512_unpacklo_ps(tmp3517, tmp3518);
__m512 tmp3614 = _mm512_unpackhi_ps(tmp3517, tmp3518);
__m512 tmp3615 = _mm512_unpacklo_ps(tmp3519, tmp3520);
__m512 tmp3616 = _mm512_unpackhi_ps(tmp3519, tmp3520);
// Level 2: per 128-bit lane, imm 68 picks elements {a0,a1,b0,b1} and
// imm 238 picks {a2,a3,b2,b3}.
__m512 tmp3617 = _mm512_shuffle_ps(tmp3605, tmp3607, 68);
__m512 tmp3618 = _mm512_shuffle_ps(tmp3605, tmp3607, 238);
__m512 tmp3619 = _mm512_shuffle_ps(tmp3606, tmp3608, 68);
__m512 tmp3620 = _mm512_shuffle_ps(tmp3606, tmp3608, 238);
__m512 tmp3621 = _mm512_shuffle_ps(tmp3609, tmp3611, 68);
__m512 tmp3622 = _mm512_shuffle_ps(tmp3609, tmp3611, 238);
__m512 tmp3623 = _mm512_shuffle_ps(tmp3610, tmp3612, 68);
__m512 tmp3624 = _mm512_shuffle_ps(tmp3610, tmp3612, 238);
__m512 tmp3625 = _mm512_shuffle_ps(tmp3613, tmp3615, 68);
__m512 tmp3626 = _mm512_shuffle_ps(tmp3613, tmp3615, 238);
__m512 tmp3627 = _mm512_shuffle_ps(tmp3614, tmp3616, 68);
__m512 tmp3628 = _mm512_shuffle_ps(tmp3614, tmp3616, 238);
// Level 3: move whole 128-bit lanes. imm 136 selects lanes {a0,a2,b0,b2},
// imm 221 selects lanes {a1,a3,b1,b3}. The last eight shuffles pass the same
// register as both operands.
__m512 tmp3629 = _mm512_shuffle_f32x4(tmp3617, tmp3621, 136);
__m512 tmp3630 = _mm512_shuffle_f32x4(tmp3617, tmp3621, 221);
__m512 tmp3631 = _mm512_shuffle_f32x4(tmp3618, tmp3622, 136);
__m512 tmp3632 = _mm512_shuffle_f32x4(tmp3618, tmp3622, 221);
__m512 tmp3633 = _mm512_shuffle_f32x4(tmp3619, tmp3623, 136);
__m512 tmp3634 = _mm512_shuffle_f32x4(tmp3619, tmp3623, 221);
__m512 tmp3635 = _mm512_shuffle_f32x4(tmp3620, tmp3624, 136);
__m512 tmp3636 = _mm512_shuffle_f32x4(tmp3620, tmp3624, 221);
__m512 tmp3637 = _mm512_shuffle_f32x4(tmp3625, tmp3625, 136);
__m512 tmp3638 = _mm512_shuffle_f32x4(tmp3625, tmp3625, 221);
__m512 tmp3639 = _mm512_shuffle_f32x4(tmp3626, tmp3626, 136);
__m512 tmp3640 = _mm512_shuffle_f32x4(tmp3626, tmp3626, 221);
__m512 tmp3641 = _mm512_shuffle_f32x4(tmp3627, tmp3627, 136);
__m512 tmp3642 = _mm512_shuffle_f32x4(tmp3627, tmp3627, 221);
__m512 tmp3643 = _mm512_shuffle_f32x4(tmp3628, tmp3628, 136);
__m512 tmp3644 = _mm512_shuffle_f32x4(tmp3628, tmp3628, 221);
// Level 4: final lane shuffle. The twelve tmp3509..tmp3520 names are
// overwritten and four new vectors (tmp3561..tmp3564) are introduced.
// NOTE(review): the network presumably transposes the data so the second pass
// runs along the other axis - confirm against the generator.
tmp3509 = _mm512_shuffle_f32x4(tmp3629, tmp3637, 136);
tmp3517 = _mm512_shuffle_f32x4(tmp3629, tmp3637, 221);
tmp3510 = _mm512_shuffle_f32x4(tmp3631, tmp3639, 136);
tmp3518 = _mm512_shuffle_f32x4(tmp3631, tmp3639, 221);
tmp3511 = _mm512_shuffle_f32x4(tmp3633, tmp3641, 136);
tmp3519 = _mm512_shuffle_f32x4(tmp3633, tmp3641, 221);
tmp3512 = _mm512_shuffle_f32x4(tmp3635, tmp3643, 136);
tmp3520 = _mm512_shuffle_f32x4(tmp3635, tmp3643, 221);
tmp3513 = _mm512_shuffle_f32x4(tmp3630, tmp3638, 136);
__m512 tmp3561 = _mm512_shuffle_f32x4(tmp3630, tmp3638, 221);
tmp3514 = _mm512_shuffle_f32x4(tmp3632, tmp3640, 136);
__m512 tmp3562 = _mm512_shuffle_f32x4(tmp3632, tmp3640, 221);
tmp3515 = _mm512_shuffle_f32x4(tmp3634, tmp3642, 136);
__m512 tmp3563 = _mm512_shuffle_f32x4(tmp3634, tmp3642, 221);
tmp3516 = _mm512_shuffle_f32x4(tmp3636, tmp3644, 136);
__m512 tmp3564 = _mm512_shuffle_f32x4(tmp3636, tmp3644, 221);
// Second arithmetic pass: same six formulas as the first pass, applied to
//   chain A: x0..x7 = tmp3509..tmp3516
//   chain B: x0..x7 = tmp3517..tmp3520, tmp3561..tmp3564
__m512 tmp3569 = _mm512_add_ps(tmp3510, tmp3511);
__m512 tmp3589 = _mm512_add_ps(tmp3518, tmp3519);
__m512 tmp3568 = _mm512_add_ps(tmp3512, tmp3513);
__m512 tmp3588 = _mm512_add_ps(tmp3520, tmp3561);
__m512 tmp3574 = _mm512_sub_ps(tmp3512, tmp3513);
__m512 tmp3594 = _mm512_sub_ps(tmp3520, tmp3561);
__m512 tmp3573 = _mm512_sub_ps(tmp3510, tmp3511);
__m512 tmp3593 = _mm512_sub_ps(tmp3518, tmp3519);
__m512 tmp3570 = _mm512_add_ps(tmp3514, tmp3515);
__m512 tmp3590 = _mm512_add_ps(tmp3562, tmp3563);
__m512 tmp3575 = _mm512_sub_ps(tmp3514, tmp3515);
__m512 tmp3595 = _mm512_sub_ps(tmp3562, tmp3563);
__m512 tmp3572 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(2e+00f), tmp3573);
__m512 tmp3592 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(2e+00f), tmp3593);
__m512 tmp3579 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(8e+00f), tmp3573);
__m512 tmp3599 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(8e+00f), tmp3593);
__m512 tmp3567 = _mm512_add_ps(tmp3568, tmp3569);
__m512 tmp3587 = _mm512_add_ps(tmp3588, tmp3589);
__m512 tmp3571 = _mm512_fmadd_ps(tmp3575, _mm512_set1_ps(1.6e+01f), tmp3572);
__m512 tmp3591 = _mm512_fmadd_ps(tmp3595, _mm512_set1_ps(1.6e+01f), tmp3592);
__m512 tmp3578 = _mm512_fmadd_ps(tmp3575, _mm512_set1_ps(4e+00f), tmp3579);
__m512 tmp3598 = _mm512_fmadd_ps(tmp3595, _mm512_set1_ps(4e+00f), tmp3599);
__m512 tmp3584 = _mm512_add_ps(tmp3575, tmp3573);
__m512 tmp3604 = _mm512_add_ps(tmp3595, tmp3593);
__m512 tmp3577 = _mm512_fmadd_ps(tmp3568, _mm512_set1_ps(4e+00f), tmp3569);
__m512 tmp3597 = _mm512_fmadd_ps(tmp3588, _mm512_set1_ps(4e+00f), tmp3589);
__m512 tmp3581 = _mm512_fmadd_ps(tmp3568, _mm512_set1_ps(1.6e+01f), tmp3569);
__m512 tmp3601 = _mm512_fmadd_ps(tmp3588, _mm512_set1_ps(1.6e+01f), tmp3589);
__m512 tmp3566 = _mm512_add_ps(tmp3567, tmp3509);
__m512 tmp3586 = _mm512_add_ps(tmp3587, tmp3517);
__m512 tmp3583 = _mm512_add_ps(tmp3584, tmp3516);
__m512 tmp3603 = _mm512_add_ps(tmp3604, tmp3564);
__m512 tmp3565 = _mm512_fmadd_ps(tmp3570, _mm512_set1_ps(3.2e+01f), tmp3566);
__m512 tmp3585 = _mm512_fmadd_ps(tmp3590, _mm512_set1_ps(3.2e+01f), tmp3586);
__m512 tmp3576 = _mm512_fmadd_ps(tmp3570, _mm512_set1_ps(8e+00f), tmp3577);
__m512 tmp3596 = _mm512_fmadd_ps(tmp3590, _mm512_set1_ps(8e+00f), tmp3597);
__m512 tmp3582 = _mm512_fmadd_ps(tmp3574, _mm512_set1_ps(3.2e+01f), tmp3583);
__m512 tmp3602 = _mm512_fmadd_ps(tmp3594, _mm512_set1_ps(3.2e+01f), tmp3603);
__m512 tmp3580 = _mm512_fmadd_ps(tmp3570, _mm512_set1_ps(2e+00f), tmp3581);
__m512 tmp3600 = _mm512_fmadd_ps(tmp3590, _mm512_set1_ps(2e+00f), tmp3601);
// Name the results: out567..out572 are y0..y5 of chain A, out573..out578 are
// y0..y5 of chain B.
__m512 out567 = tmp3565;
__m512 out573 = tmp3585;
__m512 out568 = tmp3571;
__m512 out574 = tmp3591;
__m512 out569 = tmp3576;
__m512 out575 = tmp3596;
__m512 out570 = tmp3578;
__m512 out576 = tmp3598;
__m512 out571 = tmp3580;
__m512 out577 = tmp3600;
__m512 out572 = tmp3582;
__m512 out578 = tmp3602;
// Elementwise max against zero (ReLU-style clamp) on every result.
out567 = _mm512_max_ps(_mm512_setzero_ps(), out567);
out573 = _mm512_max_ps(_mm512_setzero_ps(), out573);
out568 = _mm512_max_ps(_mm512_setzero_ps(), out568);
out574 = _mm512_max_ps(_mm512_setzero_ps(), out574);
out569 = _mm512_max_ps(_mm512_setzero_ps(), out569);
out575 = _mm512_max_ps(_mm512_setzero_ps(), out575);
out570 = _mm512_max_ps(_mm512_setzero_ps(), out570);
out576 = _mm512_max_ps(_mm512_setzero_ps(), out576);
out571 = _mm512_max_ps(_mm512_setzero_ps(), out571);
out577 = _mm512_max_ps(_mm512_setzero_ps(), out577);
out572 = _mm512_max_ps(_mm512_setzero_ps(), out572);
out578 = _mm512_max_ps(_mm512_setzero_ps(), out578);
// Masked stores into datPtr6: mask 4095 (0x0FFF) writes only elements 0..11
// of each vector. Chain A starts at constant offset 0 and chain B at 48;
// successive results are 224 apart, the same factor applied to toH24.
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out567);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out573);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out568);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out574);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out569);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out575);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out570);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out576);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out571);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out577);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out572);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out578);
// ---- Tile stored at constant offsets +96 (8 elements) / +12608 (12 elements) ----
// Load pairs of vectors from sfPtr5 and regroup them: shuffle_f32x4 with
// imm 68 yields {a.low256, b.low256}; imm 238 yields {a.high256, b.high256}.
// Chain A (in572..in579) uses base offsets 256, 25856, 51456, 77056;
// chain B (in580..in587) uses the same bases plus 64. The second vector of
// each pair lies 128 beyond the first.
__m512 sf209 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf210 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in572 = _mm512_shuffle_f32x4(sf209, sf210, 68);
__m512 in573 = _mm512_shuffle_f32x4(sf209, sf210, 238);
__m512 sf211 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf212 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in580 = _mm512_shuffle_f32x4(sf211, sf212, 68);
__m512 in581 = _mm512_shuffle_f32x4(sf211, sf212, 238);
__m512 sf213 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf214 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in574 = _mm512_shuffle_f32x4(sf213, sf214, 68);
__m512 in575 = _mm512_shuffle_f32x4(sf213, sf214, 238);
__m512 sf215 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf216 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in582 = _mm512_shuffle_f32x4(sf215, sf216, 68);
__m512 in583 = _mm512_shuffle_f32x4(sf215, sf216, 238);
__m512 sf217 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf218 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in576 = _mm512_shuffle_f32x4(sf217, sf218, 68);
__m512 in577 = _mm512_shuffle_f32x4(sf217, sf218, 238);
__m512 sf219 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf220 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in584 = _mm512_shuffle_f32x4(sf219, sf220, 68);
__m512 in585 = _mm512_shuffle_f32x4(sf219, sf220, 238);
__m512 sf221 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf222 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in578 = _mm512_shuffle_f32x4(sf221, sf222, 68);
__m512 in579 = _mm512_shuffle_f32x4(sf221, sf222, 238);
__m512 sf223 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf224 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in586 = _mm512_shuffle_f32x4(sf223, sf224, 68);
__m512 in587 = _mm512_shuffle_f32x4(sf223, sf224, 238);
// First arithmetic pass. For each chain with inputs x0..x7
// (A: in572..in579, B: in580..in587) compute six results:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match a Winograd F(6,3) output transform -
// confirm against the generator.
__m512 tmp3661 = _mm512_add_ps(in573, in574);
__m512 tmp3681 = _mm512_add_ps(in581, in582);
__m512 tmp3660 = _mm512_add_ps(in575, in576);
__m512 tmp3680 = _mm512_add_ps(in583, in584);
__m512 tmp3666 = _mm512_sub_ps(in575, in576);
__m512 tmp3686 = _mm512_sub_ps(in583, in584);
__m512 tmp3665 = _mm512_sub_ps(in573, in574);
__m512 tmp3685 = _mm512_sub_ps(in581, in582);
__m512 tmp3662 = _mm512_add_ps(in577, in578);
__m512 tmp3682 = _mm512_add_ps(in585, in586);
__m512 tmp3667 = _mm512_sub_ps(in577, in578);
__m512 tmp3687 = _mm512_sub_ps(in585, in586);
__m512 tmp3664 = _mm512_fmadd_ps(tmp3666, _mm512_set1_ps(2e+00f), tmp3665);
__m512 tmp3684 = _mm512_fmadd_ps(tmp3686, _mm512_set1_ps(2e+00f), tmp3685);
__m512 tmp3671 = _mm512_fmadd_ps(tmp3666, _mm512_set1_ps(8e+00f), tmp3665);
__m512 tmp3691 = _mm512_fmadd_ps(tmp3686, _mm512_set1_ps(8e+00f), tmp3685);
__m512 tmp3659 = _mm512_add_ps(tmp3660, tmp3661);
__m512 tmp3679 = _mm512_add_ps(tmp3680, tmp3681);
__m512 tmp3663 = _mm512_fmadd_ps(tmp3667, _mm512_set1_ps(1.6e+01f), tmp3664);
__m512 tmp3683 = _mm512_fmadd_ps(tmp3687, _mm512_set1_ps(1.6e+01f), tmp3684);
__m512 tmp3670 = _mm512_fmadd_ps(tmp3667, _mm512_set1_ps(4e+00f), tmp3671);
__m512 tmp3690 = _mm512_fmadd_ps(tmp3687, _mm512_set1_ps(4e+00f), tmp3691);
__m512 tmp3676 = _mm512_add_ps(tmp3667, tmp3665);
__m512 tmp3696 = _mm512_add_ps(tmp3687, tmp3685);
__m512 tmp3669 = _mm512_fmadd_ps(tmp3660, _mm512_set1_ps(4e+00f), tmp3661);
__m512 tmp3689 = _mm512_fmadd_ps(tmp3680, _mm512_set1_ps(4e+00f), tmp3681);
__m512 tmp3673 = _mm512_fmadd_ps(tmp3660, _mm512_set1_ps(1.6e+01f), tmp3661);
__m512 tmp3693 = _mm512_fmadd_ps(tmp3680, _mm512_set1_ps(1.6e+01f), tmp3681);
__m512 tmp3658 = _mm512_add_ps(tmp3659, in572);
__m512 tmp3678 = _mm512_add_ps(tmp3679, in580);
__m512 tmp3675 = _mm512_add_ps(tmp3676, in579);
__m512 tmp3695 = _mm512_add_ps(tmp3696, in587);
__m512 tmp3657 = _mm512_fmadd_ps(tmp3662, _mm512_set1_ps(3.2e+01f), tmp3658);
__m512 tmp3677 = _mm512_fmadd_ps(tmp3682, _mm512_set1_ps(3.2e+01f), tmp3678);
__m512 tmp3668 = _mm512_fmadd_ps(tmp3662, _mm512_set1_ps(8e+00f), tmp3669);
__m512 tmp3688 = _mm512_fmadd_ps(tmp3682, _mm512_set1_ps(8e+00f), tmp3689);
__m512 tmp3674 = _mm512_fmadd_ps(tmp3666, _mm512_set1_ps(3.2e+01f), tmp3675);
__m512 tmp3694 = _mm512_fmadd_ps(tmp3686, _mm512_set1_ps(3.2e+01f), tmp3695);
__m512 tmp3672 = _mm512_fmadd_ps(tmp3662, _mm512_set1_ps(2e+00f), tmp3673);
__m512 tmp3692 = _mm512_fmadd_ps(tmp3682, _mm512_set1_ps(2e+00f), tmp3693);
// Collect the first-pass results: tmp3645..tmp3650 are y0..y5 of chain A,
// tmp3651..tmp3656 are y0..y5 of chain B.
__m512 tmp3645 = tmp3657;
__m512 tmp3651 = tmp3677;
__m512 tmp3646 = tmp3663;
__m512 tmp3652 = tmp3683;
__m512 tmp3647 = tmp3668;
__m512 tmp3653 = tmp3688;
__m512 tmp3648 = tmp3670;
__m512 tmp3654 = tmp3690;
__m512 tmp3649 = tmp3672;
__m512 tmp3655 = tmp3692;
__m512 tmp3650 = tmp3674;
__m512 tmp3656 = tmp3694;
// Rearrangement stage over the twelve first-pass vectors.
// Level 1: interleave 32-bit elements of neighbouring vectors within each
// 128-bit lane.
__m512 tmp3741 = _mm512_unpacklo_ps(tmp3645, tmp3646);
__m512 tmp3742 = _mm512_unpackhi_ps(tmp3645, tmp3646);
__m512 tmp3743 = _mm512_unpacklo_ps(tmp3647, tmp3648);
__m512 tmp3744 = _mm512_unpackhi_ps(tmp3647, tmp3648);
__m512 tmp3745 = _mm512_unpacklo_ps(tmp3649, tmp3650);
__m512 tmp3746 = _mm512_unpackhi_ps(tmp3649, tmp3650);
__m512 tmp3747 = _mm512_unpacklo_ps(tmp3651, tmp3652);
__m512 tmp3748 = _mm512_unpackhi_ps(tmp3651, tmp3652);
__m512 tmp3749 = _mm512_unpacklo_ps(tmp3653, tmp3654);
__m512 tmp3750 = _mm512_unpackhi_ps(tmp3653, tmp3654);
__m512 tmp3751 = _mm512_unpacklo_ps(tmp3655, tmp3656);
__m512 tmp3752 = _mm512_unpackhi_ps(tmp3655, tmp3656);
// Level 2: per 128-bit lane, imm 68 picks elements {a0,a1,b0,b1} and
// imm 238 picks {a2,a3,b2,b3}.
__m512 tmp3753 = _mm512_shuffle_ps(tmp3741, tmp3743, 68);
__m512 tmp3754 = _mm512_shuffle_ps(tmp3741, tmp3743, 238);
__m512 tmp3755 = _mm512_shuffle_ps(tmp3742, tmp3744, 68);
__m512 tmp3756 = _mm512_shuffle_ps(tmp3742, tmp3744, 238);
__m512 tmp3757 = _mm512_shuffle_ps(tmp3745, tmp3747, 68);
__m512 tmp3758 = _mm512_shuffle_ps(tmp3745, tmp3747, 238);
__m512 tmp3759 = _mm512_shuffle_ps(tmp3746, tmp3748, 68);
__m512 tmp3760 = _mm512_shuffle_ps(tmp3746, tmp3748, 238);
__m512 tmp3761 = _mm512_shuffle_ps(tmp3749, tmp3751, 68);
__m512 tmp3762 = _mm512_shuffle_ps(tmp3749, tmp3751, 238);
__m512 tmp3763 = _mm512_shuffle_ps(tmp3750, tmp3752, 68);
__m512 tmp3764 = _mm512_shuffle_ps(tmp3750, tmp3752, 238);
// Level 3: move whole 128-bit lanes. imm 136 selects lanes {a0,a2,b0,b2},
// imm 221 selects lanes {a1,a3,b1,b3}. The last eight shuffles pass the same
// register as both operands.
__m512 tmp3765 = _mm512_shuffle_f32x4(tmp3753, tmp3757, 136);
__m512 tmp3766 = _mm512_shuffle_f32x4(tmp3753, tmp3757, 221);
__m512 tmp3767 = _mm512_shuffle_f32x4(tmp3754, tmp3758, 136);
__m512 tmp3768 = _mm512_shuffle_f32x4(tmp3754, tmp3758, 221);
__m512 tmp3769 = _mm512_shuffle_f32x4(tmp3755, tmp3759, 136);
__m512 tmp3770 = _mm512_shuffle_f32x4(tmp3755, tmp3759, 221);
__m512 tmp3771 = _mm512_shuffle_f32x4(tmp3756, tmp3760, 136);
__m512 tmp3772 = _mm512_shuffle_f32x4(tmp3756, tmp3760, 221);
__m512 tmp3773 = _mm512_shuffle_f32x4(tmp3761, tmp3761, 136);
__m512 tmp3774 = _mm512_shuffle_f32x4(tmp3761, tmp3761, 221);
__m512 tmp3775 = _mm512_shuffle_f32x4(tmp3762, tmp3762, 136);
__m512 tmp3776 = _mm512_shuffle_f32x4(tmp3762, tmp3762, 221);
__m512 tmp3777 = _mm512_shuffle_f32x4(tmp3763, tmp3763, 136);
__m512 tmp3778 = _mm512_shuffle_f32x4(tmp3763, tmp3763, 221);
__m512 tmp3779 = _mm512_shuffle_f32x4(tmp3764, tmp3764, 136);
__m512 tmp3780 = _mm512_shuffle_f32x4(tmp3764, tmp3764, 221);
// Level 4: final lane shuffle. The twelve tmp3645..tmp3656 names are
// overwritten and four new vectors (tmp3697..tmp3700) are introduced.
// NOTE(review): the network presumably transposes the data so the second pass
// runs along the other axis - confirm against the generator.
tmp3645 = _mm512_shuffle_f32x4(tmp3765, tmp3773, 136);
tmp3653 = _mm512_shuffle_f32x4(tmp3765, tmp3773, 221);
tmp3646 = _mm512_shuffle_f32x4(tmp3767, tmp3775, 136);
tmp3654 = _mm512_shuffle_f32x4(tmp3767, tmp3775, 221);
tmp3647 = _mm512_shuffle_f32x4(tmp3769, tmp3777, 136);
tmp3655 = _mm512_shuffle_f32x4(tmp3769, tmp3777, 221);
tmp3648 = _mm512_shuffle_f32x4(tmp3771, tmp3779, 136);
tmp3656 = _mm512_shuffle_f32x4(tmp3771, tmp3779, 221);
tmp3649 = _mm512_shuffle_f32x4(tmp3766, tmp3774, 136);
__m512 tmp3697 = _mm512_shuffle_f32x4(tmp3766, tmp3774, 221);
tmp3650 = _mm512_shuffle_f32x4(tmp3768, tmp3776, 136);
__m512 tmp3698 = _mm512_shuffle_f32x4(tmp3768, tmp3776, 221);
tmp3651 = _mm512_shuffle_f32x4(tmp3770, tmp3778, 136);
__m512 tmp3699 = _mm512_shuffle_f32x4(tmp3770, tmp3778, 221);
tmp3652 = _mm512_shuffle_f32x4(tmp3772, tmp3780, 136);
__m512 tmp3700 = _mm512_shuffle_f32x4(tmp3772, tmp3780, 221);
// Second arithmetic pass: same six formulas as the first pass, applied to
//   chain A: x0..x7 = tmp3645..tmp3652
//   chain B: x0..x7 = tmp3653..tmp3656, tmp3697..tmp3700
__m512 tmp3705 = _mm512_add_ps(tmp3646, tmp3647);
__m512 tmp3725 = _mm512_add_ps(tmp3654, tmp3655);
__m512 tmp3704 = _mm512_add_ps(tmp3648, tmp3649);
__m512 tmp3724 = _mm512_add_ps(tmp3656, tmp3697);
__m512 tmp3710 = _mm512_sub_ps(tmp3648, tmp3649);
__m512 tmp3730 = _mm512_sub_ps(tmp3656, tmp3697);
__m512 tmp3709 = _mm512_sub_ps(tmp3646, tmp3647);
__m512 tmp3729 = _mm512_sub_ps(tmp3654, tmp3655);
__m512 tmp3706 = _mm512_add_ps(tmp3650, tmp3651);
__m512 tmp3726 = _mm512_add_ps(tmp3698, tmp3699);
__m512 tmp3711 = _mm512_sub_ps(tmp3650, tmp3651);
__m512 tmp3731 = _mm512_sub_ps(tmp3698, tmp3699);
__m512 tmp3708 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(2e+00f), tmp3709);
__m512 tmp3728 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(2e+00f), tmp3729);
__m512 tmp3715 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(8e+00f), tmp3709);
__m512 tmp3735 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(8e+00f), tmp3729);
__m512 tmp3703 = _mm512_add_ps(tmp3704, tmp3705);
__m512 tmp3723 = _mm512_add_ps(tmp3724, tmp3725);
__m512 tmp3707 = _mm512_fmadd_ps(tmp3711, _mm512_set1_ps(1.6e+01f), tmp3708);
__m512 tmp3727 = _mm512_fmadd_ps(tmp3731, _mm512_set1_ps(1.6e+01f), tmp3728);
__m512 tmp3714 = _mm512_fmadd_ps(tmp3711, _mm512_set1_ps(4e+00f), tmp3715);
__m512 tmp3734 = _mm512_fmadd_ps(tmp3731, _mm512_set1_ps(4e+00f), tmp3735);
__m512 tmp3720 = _mm512_add_ps(tmp3711, tmp3709);
__m512 tmp3740 = _mm512_add_ps(tmp3731, tmp3729);
__m512 tmp3713 = _mm512_fmadd_ps(tmp3704, _mm512_set1_ps(4e+00f), tmp3705);
__m512 tmp3733 = _mm512_fmadd_ps(tmp3724, _mm512_set1_ps(4e+00f), tmp3725);
__m512 tmp3717 = _mm512_fmadd_ps(tmp3704, _mm512_set1_ps(1.6e+01f), tmp3705);
__m512 tmp3737 = _mm512_fmadd_ps(tmp3724, _mm512_set1_ps(1.6e+01f), tmp3725);
__m512 tmp3702 = _mm512_add_ps(tmp3703, tmp3645);
__m512 tmp3722 = _mm512_add_ps(tmp3723, tmp3653);
__m512 tmp3719 = _mm512_add_ps(tmp3720, tmp3652);
__m512 tmp3739 = _mm512_add_ps(tmp3740, tmp3700);
__m512 tmp3701 = _mm512_fmadd_ps(tmp3706, _mm512_set1_ps(3.2e+01f), tmp3702);
__m512 tmp3721 = _mm512_fmadd_ps(tmp3726, _mm512_set1_ps(3.2e+01f), tmp3722);
__m512 tmp3712 = _mm512_fmadd_ps(tmp3706, _mm512_set1_ps(8e+00f), tmp3713);
__m512 tmp3732 = _mm512_fmadd_ps(tmp3726, _mm512_set1_ps(8e+00f), tmp3733);
__m512 tmp3718 = _mm512_fmadd_ps(tmp3710, _mm512_set1_ps(3.2e+01f), tmp3719);
__m512 tmp3738 = _mm512_fmadd_ps(tmp3730, _mm512_set1_ps(3.2e+01f), tmp3739);
__m512 tmp3716 = _mm512_fmadd_ps(tmp3706, _mm512_set1_ps(2e+00f), tmp3717);
__m512 tmp3736 = _mm512_fmadd_ps(tmp3726, _mm512_set1_ps(2e+00f), tmp3737);
// Name the results: out579..out584 are y0..y5 of chain A, out585..out590 are
// y0..y5 of chain B.
__m512 out579 = tmp3701;
__m512 out585 = tmp3721;
__m512 out580 = tmp3707;
__m512 out586 = tmp3727;
__m512 out581 = tmp3712;
__m512 out587 = tmp3732;
__m512 out582 = tmp3714;
__m512 out588 = tmp3734;
__m512 out583 = tmp3716;
__m512 out589 = tmp3736;
__m512 out584 = tmp3718;
__m512 out590 = tmp3738;
// Elementwise max against zero (ReLU-style clamp) on every result.
out579 = _mm512_max_ps(_mm512_setzero_ps(), out579);
out585 = _mm512_max_ps(_mm512_setzero_ps(), out585);
out580 = _mm512_max_ps(_mm512_setzero_ps(), out580);
out586 = _mm512_max_ps(_mm512_setzero_ps(), out586);
out581 = _mm512_max_ps(_mm512_setzero_ps(), out581);
out587 = _mm512_max_ps(_mm512_setzero_ps(), out587);
out582 = _mm512_max_ps(_mm512_setzero_ps(), out582);
out588 = _mm512_max_ps(_mm512_setzero_ps(), out588);
out583 = _mm512_max_ps(_mm512_setzero_ps(), out583);
out589 = _mm512_max_ps(_mm512_setzero_ps(), out589);
out584 = _mm512_max_ps(_mm512_setzero_ps(), out584);
out590 = _mm512_max_ps(_mm512_setzero_ps(), out590);
// Masked stores into datPtr6. Chain A uses mask 255 (0x00FF), writing only
// elements 0..7, starting at constant offset 96. Chain B uses mask 4095
// (0x0FFF), writing elements 0..11, starting at constant offset 12608.
// Successive results in each chain are 224 apart.
// NOTE(review): 12608 is one quarter of the 50432 factor on k65, so chain B
// presumably lands in the next channel plane - confirm against the layout.
_mm512_mask_storeu_ps(datPtr6+96+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out579);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out585);
_mm512_mask_storeu_ps(datPtr6+320+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out580);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out586);
_mm512_mask_storeu_ps(datPtr6+544+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out581);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out587);
_mm512_mask_storeu_ps(datPtr6+768+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out582);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out588);
_mm512_mask_storeu_ps(datPtr6+992+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out583);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out589);
_mm512_mask_storeu_ps(datPtr6+1216+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out584);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out590);
// Load sixteen 512-bit vectors from sfPtr5 (unaligned loads) and split each
// pair of loads into two recombined vectors:
//   shuffle_f32x4(a, b, 68)  = low 256 bits of a followed by low 256 bits of b
//   shuffle_f32x4(a, b, 238) = high 256 bits of a followed by high 256 bits of b
// The four groups below use base constants 512, 26112, 51712 and 77312
// (25600 apart); within a group the loads are at +0, +64, +128 and +192.
// NOTE(review): offsets are presumably bytes (64 = one 512-bit vector);
// the type of sfPtr5 is declared outside this excerpt - confirm.
__m512 sf225 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf226 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in588 = _mm512_shuffle_f32x4(sf225, sf226, 68);
__m512 in589 = _mm512_shuffle_f32x4(sf225, sf226, 238);
__m512 sf227 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf228 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in596 = _mm512_shuffle_f32x4(sf227, sf228, 68);
__m512 in597 = _mm512_shuffle_f32x4(sf227, sf228, 238);
// Second group (base 26112).
__m512 sf229 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf230 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in590 = _mm512_shuffle_f32x4(sf229, sf230, 68);
__m512 in591 = _mm512_shuffle_f32x4(sf229, sf230, 238);
__m512 sf231 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf232 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in598 = _mm512_shuffle_f32x4(sf231, sf232, 68);
__m512 in599 = _mm512_shuffle_f32x4(sf231, sf232, 238);
// Third group (base 51712).
__m512 sf233 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf234 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in592 = _mm512_shuffle_f32x4(sf233, sf234, 68);
__m512 in593 = _mm512_shuffle_f32x4(sf233, sf234, 238);
__m512 sf235 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf236 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in600 = _mm512_shuffle_f32x4(sf235, sf236, 68);
__m512 in601 = _mm512_shuffle_f32x4(sf235, sf236, 238);
// Fourth group (base 77312).
__m512 sf237 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf238 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in594 = _mm512_shuffle_f32x4(sf237, sf238, 68);
__m512 in595 = _mm512_shuffle_f32x4(sf237, sf238, 238);
__m512 sf239 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k65+768*l17);
__m512 sf240 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k65+768*l17);
__m512 in602 = _mm512_shuffle_f32x4(sf239, sf240, 68);
__m512 in603 = _mm512_shuffle_f32x4(sf239, sf240, 238);
// First pass: reduce eight input vectors to six, done for two independent
// sets that are interleaved line by line.
//   set A: x0..x7 = in588..in595  ->  tmp3793, tmp3799, tmp3804, tmp3806, tmp3808, tmp3810
//   set B: x0..x7 = in596..in603  ->  tmp3813, tmp3819, tmp3824, tmp3826, tmp3828, tmp3830
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6 the six
// results are (up to floating-point rounding of the fused multiply-adds):
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform - confirm against the generator.
// Pairwise sums and differences.
__m512 tmp3797 = _mm512_add_ps(in589, in590);
__m512 tmp3817 = _mm512_add_ps(in597, in598);
__m512 tmp3796 = _mm512_add_ps(in591, in592);
__m512 tmp3816 = _mm512_add_ps(in599, in600);
__m512 tmp3802 = _mm512_sub_ps(in591, in592);
__m512 tmp3822 = _mm512_sub_ps(in599, in600);
__m512 tmp3801 = _mm512_sub_ps(in589, in590);
__m512 tmp3821 = _mm512_sub_ps(in597, in598);
__m512 tmp3798 = _mm512_add_ps(in593, in594);
__m512 tmp3818 = _mm512_add_ps(in601, in602);
__m512 tmp3803 = _mm512_sub_ps(in593, in594);
__m512 tmp3823 = _mm512_sub_ps(in601, in602);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp3800 = _mm512_fmadd_ps(tmp3802, _mm512_set1_ps(2e+00f), tmp3801);
__m512 tmp3820 = _mm512_fmadd_ps(tmp3822, _mm512_set1_ps(2e+00f), tmp3821);
__m512 tmp3807 = _mm512_fmadd_ps(tmp3802, _mm512_set1_ps(8e+00f), tmp3801);
__m512 tmp3827 = _mm512_fmadd_ps(tmp3822, _mm512_set1_ps(8e+00f), tmp3821);
__m512 tmp3795 = _mm512_add_ps(tmp3796, tmp3797);
__m512 tmp3815 = _mm512_add_ps(tmp3816, tmp3817);
__m512 tmp3799 = _mm512_fmadd_ps(tmp3803, _mm512_set1_ps(1.6e+01f), tmp3800);
__m512 tmp3819 = _mm512_fmadd_ps(tmp3823, _mm512_set1_ps(1.6e+01f), tmp3820);
__m512 tmp3806 = _mm512_fmadd_ps(tmp3803, _mm512_set1_ps(4e+00f), tmp3807);
__m512 tmp3826 = _mm512_fmadd_ps(tmp3823, _mm512_set1_ps(4e+00f), tmp3827);
__m512 tmp3812 = _mm512_add_ps(tmp3803, tmp3801);
__m512 tmp3832 = _mm512_add_ps(tmp3823, tmp3821);
__m512 tmp3805 = _mm512_fmadd_ps(tmp3796, _mm512_set1_ps(4e+00f), tmp3797);
__m512 tmp3825 = _mm512_fmadd_ps(tmp3816, _mm512_set1_ps(4e+00f), tmp3817);
__m512 tmp3809 = _mm512_fmadd_ps(tmp3796, _mm512_set1_ps(1.6e+01f), tmp3797);
__m512 tmp3829 = _mm512_fmadd_ps(tmp3816, _mm512_set1_ps(1.6e+01f), tmp3817);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp3794 = _mm512_add_ps(tmp3795, in588);
__m512 tmp3814 = _mm512_add_ps(tmp3815, in596);
__m512 tmp3811 = _mm512_add_ps(tmp3812, in595);
__m512 tmp3831 = _mm512_add_ps(tmp3832, in603);
__m512 tmp3793 = _mm512_fmadd_ps(tmp3798, _mm512_set1_ps(3.2e+01f), tmp3794);
__m512 tmp3813 = _mm512_fmadd_ps(tmp3818, _mm512_set1_ps(3.2e+01f), tmp3814);
__m512 tmp3804 = _mm512_fmadd_ps(tmp3798, _mm512_set1_ps(8e+00f), tmp3805);
__m512 tmp3824 = _mm512_fmadd_ps(tmp3818, _mm512_set1_ps(8e+00f), tmp3825);
__m512 tmp3810 = _mm512_fmadd_ps(tmp3802, _mm512_set1_ps(3.2e+01f), tmp3811);
__m512 tmp3830 = _mm512_fmadd_ps(tmp3822, _mm512_set1_ps(3.2e+01f), tmp3831);
__m512 tmp3808 = _mm512_fmadd_ps(tmp3798, _mm512_set1_ps(2e+00f), tmp3809);
__m512 tmp3828 = _mm512_fmadd_ps(tmp3818, _mm512_set1_ps(2e+00f), tmp3829);
// Plain copies: set A results become tmp3781..tmp3786, set B results
// become tmp3787..tmp3792.
__m512 tmp3781 = tmp3793;
__m512 tmp3787 = tmp3813;
__m512 tmp3782 = tmp3799;
__m512 tmp3788 = tmp3819;
__m512 tmp3783 = tmp3804;
__m512 tmp3789 = tmp3824;
__m512 tmp3784 = tmp3806;
__m512 tmp3790 = tmp3826;
__m512 tmp3785 = tmp3808;
__m512 tmp3791 = tmp3828;
__m512 tmp3786 = tmp3810;
__m512 tmp3792 = tmp3830;
// Pure data movement, no arithmetic: the twelve vectors tmp3781..tmp3792 are
// rearranged with unpack / shuffle_ps / shuffle_f32x4 into sixteen vectors
// (tmp3781..tmp3792 are overwritten and tmp3833..tmp3836 are new).
// NOTE(review): this looks like a transpose of the 12-vector tile so the next
// pass can combine along the other axis - confirm against the generator.
// Step 1: interleave 32-bit elements of neighbouring vector pairs.
__m512 tmp3877 = _mm512_unpacklo_ps(tmp3781, tmp3782);
__m512 tmp3878 = _mm512_unpackhi_ps(tmp3781, tmp3782);
__m512 tmp3879 = _mm512_unpacklo_ps(tmp3783, tmp3784);
__m512 tmp3880 = _mm512_unpackhi_ps(tmp3783, tmp3784);
__m512 tmp3881 = _mm512_unpacklo_ps(tmp3785, tmp3786);
__m512 tmp3882 = _mm512_unpackhi_ps(tmp3785, tmp3786);
__m512 tmp3883 = _mm512_unpacklo_ps(tmp3787, tmp3788);
__m512 tmp3884 = _mm512_unpackhi_ps(tmp3787, tmp3788);
__m512 tmp3885 = _mm512_unpacklo_ps(tmp3789, tmp3790);
__m512 tmp3886 = _mm512_unpackhi_ps(tmp3789, tmp3790);
__m512 tmp3887 = _mm512_unpacklo_ps(tmp3791, tmp3792);
__m512 tmp3888 = _mm512_unpackhi_ps(tmp3791, tmp3792);
// Step 2: within each 128-bit lane, take the low (68) or high (238) 64 bits
// of both operands.
__m512 tmp3889 = _mm512_shuffle_ps(tmp3877, tmp3879, 68);
__m512 tmp3890 = _mm512_shuffle_ps(tmp3877, tmp3879, 238);
__m512 tmp3891 = _mm512_shuffle_ps(tmp3878, tmp3880, 68);
__m512 tmp3892 = _mm512_shuffle_ps(tmp3878, tmp3880, 238);
__m512 tmp3893 = _mm512_shuffle_ps(tmp3881, tmp3883, 68);
__m512 tmp3894 = _mm512_shuffle_ps(tmp3881, tmp3883, 238);
__m512 tmp3895 = _mm512_shuffle_ps(tmp3882, tmp3884, 68);
__m512 tmp3896 = _mm512_shuffle_ps(tmp3882, tmp3884, 238);
__m512 tmp3897 = _mm512_shuffle_ps(tmp3885, tmp3887, 68);
__m512 tmp3898 = _mm512_shuffle_ps(tmp3885, tmp3887, 238);
__m512 tmp3899 = _mm512_shuffle_ps(tmp3886, tmp3888, 68);
__m512 tmp3900 = _mm512_shuffle_ps(tmp3886, tmp3888, 238);
// Step 3: select whole 128-bit lanes; 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3 of each operand.
__m512 tmp3901 = _mm512_shuffle_f32x4(tmp3889, tmp3893, 136);
__m512 tmp3902 = _mm512_shuffle_f32x4(tmp3889, tmp3893, 221);
__m512 tmp3903 = _mm512_shuffle_f32x4(tmp3890, tmp3894, 136);
__m512 tmp3904 = _mm512_shuffle_f32x4(tmp3890, tmp3894, 221);
__m512 tmp3905 = _mm512_shuffle_f32x4(tmp3891, tmp3895, 136);
__m512 tmp3906 = _mm512_shuffle_f32x4(tmp3891, tmp3895, 221);
__m512 tmp3907 = _mm512_shuffle_f32x4(tmp3892, tmp3896, 136);
__m512 tmp3908 = _mm512_shuffle_f32x4(tmp3892, tmp3896, 221);
// The last four inputs have no partner vector, so each is shuffled with itself.
__m512 tmp3909 = _mm512_shuffle_f32x4(tmp3897, tmp3897, 136);
__m512 tmp3910 = _mm512_shuffle_f32x4(tmp3897, tmp3897, 221);
__m512 tmp3911 = _mm512_shuffle_f32x4(tmp3898, tmp3898, 136);
__m512 tmp3912 = _mm512_shuffle_f32x4(tmp3898, tmp3898, 221);
__m512 tmp3913 = _mm512_shuffle_f32x4(tmp3899, tmp3899, 136);
__m512 tmp3914 = _mm512_shuffle_f32x4(tmp3899, tmp3899, 221);
__m512 tmp3915 = _mm512_shuffle_f32x4(tmp3900, tmp3900, 136);
__m512 tmp3916 = _mm512_shuffle_f32x4(tmp3900, tmp3900, 221);
// Step 4: final 128-bit lane selection, writing the sixteen result vectors.
tmp3781 = _mm512_shuffle_f32x4(tmp3901, tmp3909, 136);
tmp3789 = _mm512_shuffle_f32x4(tmp3901, tmp3909, 221);
tmp3782 = _mm512_shuffle_f32x4(tmp3903, tmp3911, 136);
tmp3790 = _mm512_shuffle_f32x4(tmp3903, tmp3911, 221);
tmp3783 = _mm512_shuffle_f32x4(tmp3905, tmp3913, 136);
tmp3791 = _mm512_shuffle_f32x4(tmp3905, tmp3913, 221);
tmp3784 = _mm512_shuffle_f32x4(tmp3907, tmp3915, 136);
tmp3792 = _mm512_shuffle_f32x4(tmp3907, tmp3915, 221);
tmp3785 = _mm512_shuffle_f32x4(tmp3902, tmp3910, 136);
__m512 tmp3833 = _mm512_shuffle_f32x4(tmp3902, tmp3910, 221);
tmp3786 = _mm512_shuffle_f32x4(tmp3904, tmp3912, 136);
__m512 tmp3834 = _mm512_shuffle_f32x4(tmp3904, tmp3912, 221);
tmp3787 = _mm512_shuffle_f32x4(tmp3906, tmp3914, 136);
__m512 tmp3835 = _mm512_shuffle_f32x4(tmp3906, tmp3914, 221);
tmp3788 = _mm512_shuffle_f32x4(tmp3908, tmp3916, 136);
__m512 tmp3836 = _mm512_shuffle_f32x4(tmp3908, tmp3916, 221);
// Second pass: the same eight-to-six combination, again for two interleaved sets.
//   set A: x0..x7 = tmp3781..tmp3788                              -> out591..out596
//   set B: x0..x7 = tmp3789..tmp3792, tmp3833..tmp3836            -> out597..out602
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6
// (up to floating-point rounding of the fused multiply-adds):
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// Pairwise sums and differences.
__m512 tmp3841 = _mm512_add_ps(tmp3782, tmp3783);
__m512 tmp3861 = _mm512_add_ps(tmp3790, tmp3791);
__m512 tmp3840 = _mm512_add_ps(tmp3784, tmp3785);
__m512 tmp3860 = _mm512_add_ps(tmp3792, tmp3833);
__m512 tmp3846 = _mm512_sub_ps(tmp3784, tmp3785);
__m512 tmp3866 = _mm512_sub_ps(tmp3792, tmp3833);
__m512 tmp3845 = _mm512_sub_ps(tmp3782, tmp3783);
__m512 tmp3865 = _mm512_sub_ps(tmp3790, tmp3791);
__m512 tmp3842 = _mm512_add_ps(tmp3786, tmp3787);
__m512 tmp3862 = _mm512_add_ps(tmp3834, tmp3835);
__m512 tmp3847 = _mm512_sub_ps(tmp3786, tmp3787);
__m512 tmp3867 = _mm512_sub_ps(tmp3834, tmp3835);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp3844 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(2e+00f), tmp3845);
__m512 tmp3864 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(2e+00f), tmp3865);
__m512 tmp3851 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(8e+00f), tmp3845);
__m512 tmp3871 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(8e+00f), tmp3865);
__m512 tmp3839 = _mm512_add_ps(tmp3840, tmp3841);
__m512 tmp3859 = _mm512_add_ps(tmp3860, tmp3861);
__m512 tmp3843 = _mm512_fmadd_ps(tmp3847, _mm512_set1_ps(1.6e+01f), tmp3844);
__m512 tmp3863 = _mm512_fmadd_ps(tmp3867, _mm512_set1_ps(1.6e+01f), tmp3864);
__m512 tmp3850 = _mm512_fmadd_ps(tmp3847, _mm512_set1_ps(4e+00f), tmp3851);
__m512 tmp3870 = _mm512_fmadd_ps(tmp3867, _mm512_set1_ps(4e+00f), tmp3871);
__m512 tmp3856 = _mm512_add_ps(tmp3847, tmp3845);
__m512 tmp3876 = _mm512_add_ps(tmp3867, tmp3865);
__m512 tmp3849 = _mm512_fmadd_ps(tmp3840, _mm512_set1_ps(4e+00f), tmp3841);
__m512 tmp3869 = _mm512_fmadd_ps(tmp3860, _mm512_set1_ps(4e+00f), tmp3861);
__m512 tmp3853 = _mm512_fmadd_ps(tmp3840, _mm512_set1_ps(1.6e+01f), tmp3841);
__m512 tmp3873 = _mm512_fmadd_ps(tmp3860, _mm512_set1_ps(1.6e+01f), tmp3861);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp3838 = _mm512_add_ps(tmp3839, tmp3781);
__m512 tmp3858 = _mm512_add_ps(tmp3859, tmp3789);
__m512 tmp3855 = _mm512_add_ps(tmp3856, tmp3788);
__m512 tmp3875 = _mm512_add_ps(tmp3876, tmp3836);
__m512 tmp3837 = _mm512_fmadd_ps(tmp3842, _mm512_set1_ps(3.2e+01f), tmp3838);
__m512 tmp3857 = _mm512_fmadd_ps(tmp3862, _mm512_set1_ps(3.2e+01f), tmp3858);
__m512 tmp3848 = _mm512_fmadd_ps(tmp3842, _mm512_set1_ps(8e+00f), tmp3849);
__m512 tmp3868 = _mm512_fmadd_ps(tmp3862, _mm512_set1_ps(8e+00f), tmp3869);
__m512 tmp3854 = _mm512_fmadd_ps(tmp3846, _mm512_set1_ps(3.2e+01f), tmp3855);
__m512 tmp3874 = _mm512_fmadd_ps(tmp3866, _mm512_set1_ps(3.2e+01f), tmp3875);
__m512 tmp3852 = _mm512_fmadd_ps(tmp3842, _mm512_set1_ps(2e+00f), tmp3853);
__m512 tmp3872 = _mm512_fmadd_ps(tmp3862, _mm512_set1_ps(2e+00f), tmp3873);
// Name the twelve results out591..out602 (plain copies).
__m512 out591 = tmp3837;
__m512 out597 = tmp3857;
__m512 out592 = tmp3843;
__m512 out598 = tmp3863;
__m512 out593 = tmp3848;
__m512 out599 = tmp3868;
__m512 out594 = tmp3850;
__m512 out600 = tmp3870;
__m512 out595 = tmp3852;
__m512 out601 = tmp3872;
__m512 out596 = tmp3854;
__m512 out602 = tmp3874;
// Clamp each of the twelve result vectors at zero, lane by lane: x = max(0, x).
out591 = _mm512_max_ps(_mm512_setzero_ps(), out591);
out597 = _mm512_max_ps(_mm512_setzero_ps(), out597);
out592 = _mm512_max_ps(_mm512_setzero_ps(), out592);
out598 = _mm512_max_ps(_mm512_setzero_ps(), out598);
out593 = _mm512_max_ps(_mm512_setzero_ps(), out593);
out599 = _mm512_max_ps(_mm512_setzero_ps(), out599);
out594 = _mm512_max_ps(_mm512_setzero_ps(), out594);
out600 = _mm512_max_ps(_mm512_setzero_ps(), out600);
out595 = _mm512_max_ps(_mm512_setzero_ps(), out595);
out601 = _mm512_max_ps(_mm512_setzero_ps(), out601);
out596 = _mm512_max_ps(_mm512_setzero_ps(), out596);
out602 = _mm512_max_ps(_mm512_setzero_ps(), out602);
// Masked stores relative to datPtr6. Mask 4095 writes only float lanes 0..11
// (out591..out596); mask 255 writes only float lanes 0..7 (out597..out602).
// Each pair is 48 apart, and successive pairs step by 224, starting at 12656.
// NOTE(review): offsets are presumably in bytes (48 bytes = 12 floats, matching
// the 12-lane mask); the type of datPtr6 is declared outside this excerpt - confirm.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out591);
_mm512_mask_storeu_ps(datPtr6+12704+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out597);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out592);
_mm512_mask_storeu_ps(datPtr6+12928+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out598);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out593);
_mm512_mask_storeu_ps(datPtr6+13152+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out599);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out594);
_mm512_mask_storeu_ps(datPtr6+13376+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out600);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out595);
_mm512_mask_storeu_ps(datPtr6+13600+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out601);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 4095, out596);
_mm512_mask_storeu_ps(datPtr6+13824+50432*i19+224*toH24+4*toW24+50432*k65+25216*l17, 255, out602);
}
}
// Advance j13; once it reaches 15, break out of the innermost enclosing
// loop/switch (its header is outside this excerpt).
++j13;
if (j13 >= 15) break;
// Otherwise record 3 in rel11 before continuing.
// NOTE(review): the meaning of rel11 is not visible here - confirm at its declaration.
rel11 = 3;
}
// Branch taken when rel11 is below 4. Output coordinates for this branch are
// toH25 = base11 + 12 and toW25 = 0.
if (rel11 < 4) {
ptrdiff_t toH25 = base11+12;
ptrdiff_t toW25 = 0;
// k66 starts at w33 and the loop stops as soon as k66 equals 1.
// NOTE(review): presumably w33 is 0 or 1 here, giving one or zero
// iterations - confirm against the code that sets w33.
ptrdiff_t k66 = 1*w33;
for (; k66 != 1; ++k66) {
// Two iterations of l18 (0 and 1).
ptrdiff_t l18 = 0;
for (; l18 != 2; ++l18) {
// Load sixteen 512-bit vectors from sfPtr5 (unaligned loads) and split each
// pair of loads into two recombined vectors:
//   shuffle_f32x4(a, b, 68)  = low 256 bits of a followed by low 256 bits of b
//   shuffle_f32x4(a, b, 238) = high 256 bits of a followed by high 256 bits of b
// The four groups below use base constants 0, 25600, 51200 and 76800;
// within a group the loads are at +0, +64, +128 and +192.
// NOTE(review): offsets are presumably bytes (64 = one 512-bit vector);
// the type of sfPtr5 is declared outside this excerpt - confirm.
__m512 sf241 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf242 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in604 = _mm512_shuffle_f32x4(sf241, sf242, 68);
__m512 in605 = _mm512_shuffle_f32x4(sf241, sf242, 238);
__m512 sf243 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf244 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in612 = _mm512_shuffle_f32x4(sf243, sf244, 68);
__m512 in613 = _mm512_shuffle_f32x4(sf243, sf244, 238);
// Second group (base 25600).
__m512 sf245 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf246 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in606 = _mm512_shuffle_f32x4(sf245, sf246, 68);
__m512 in607 = _mm512_shuffle_f32x4(sf245, sf246, 238);
__m512 sf247 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf248 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in614 = _mm512_shuffle_f32x4(sf247, sf248, 68);
__m512 in615 = _mm512_shuffle_f32x4(sf247, sf248, 238);
// Third group (base 51200).
__m512 sf249 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf250 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in608 = _mm512_shuffle_f32x4(sf249, sf250, 68);
__m512 in609 = _mm512_shuffle_f32x4(sf249, sf250, 238);
__m512 sf251 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf252 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in616 = _mm512_shuffle_f32x4(sf251, sf252, 68);
__m512 in617 = _mm512_shuffle_f32x4(sf251, sf252, 238);
// Fourth group (base 76800).
__m512 sf253 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf254 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in610 = _mm512_shuffle_f32x4(sf253, sf254, 68);
__m512 in611 = _mm512_shuffle_f32x4(sf253, sf254, 238);
__m512 sf255 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf256 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in618 = _mm512_shuffle_f32x4(sf255, sf256, 68);
__m512 in619 = _mm512_shuffle_f32x4(sf255, sf256, 238);
// First pass: reduce eight input vectors to six, done for two independent
// sets that are interleaved line by line.
//   set A: x0..x7 = in604..in611  ->  tmp3929, tmp3935, tmp3940, tmp3942, tmp3944, tmp3946
//   set B: x0..x7 = in612..in619  ->  tmp3949, tmp3955, tmp3960, tmp3962, tmp3964, tmp3966
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6 the six
// results are (up to floating-point rounding of the fused multiply-adds):
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform - confirm against the generator.
// Pairwise sums and differences.
__m512 tmp3933 = _mm512_add_ps(in605, in606);
__m512 tmp3953 = _mm512_add_ps(in613, in614);
__m512 tmp3932 = _mm512_add_ps(in607, in608);
__m512 tmp3952 = _mm512_add_ps(in615, in616);
__m512 tmp3938 = _mm512_sub_ps(in607, in608);
__m512 tmp3958 = _mm512_sub_ps(in615, in616);
__m512 tmp3937 = _mm512_sub_ps(in605, in606);
__m512 tmp3957 = _mm512_sub_ps(in613, in614);
__m512 tmp3934 = _mm512_add_ps(in609, in610);
__m512 tmp3954 = _mm512_add_ps(in617, in618);
__m512 tmp3939 = _mm512_sub_ps(in609, in610);
__m512 tmp3959 = _mm512_sub_ps(in617, in618);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp3936 = _mm512_fmadd_ps(tmp3938, _mm512_set1_ps(2e+00f), tmp3937);
__m512 tmp3956 = _mm512_fmadd_ps(tmp3958, _mm512_set1_ps(2e+00f), tmp3957);
__m512 tmp3943 = _mm512_fmadd_ps(tmp3938, _mm512_set1_ps(8e+00f), tmp3937);
__m512 tmp3963 = _mm512_fmadd_ps(tmp3958, _mm512_set1_ps(8e+00f), tmp3957);
__m512 tmp3931 = _mm512_add_ps(tmp3932, tmp3933);
__m512 tmp3951 = _mm512_add_ps(tmp3952, tmp3953);
__m512 tmp3935 = _mm512_fmadd_ps(tmp3939, _mm512_set1_ps(1.6e+01f), tmp3936);
__m512 tmp3955 = _mm512_fmadd_ps(tmp3959, _mm512_set1_ps(1.6e+01f), tmp3956);
__m512 tmp3942 = _mm512_fmadd_ps(tmp3939, _mm512_set1_ps(4e+00f), tmp3943);
__m512 tmp3962 = _mm512_fmadd_ps(tmp3959, _mm512_set1_ps(4e+00f), tmp3963);
__m512 tmp3948 = _mm512_add_ps(tmp3939, tmp3937);
__m512 tmp3968 = _mm512_add_ps(tmp3959, tmp3957);
__m512 tmp3941 = _mm512_fmadd_ps(tmp3932, _mm512_set1_ps(4e+00f), tmp3933);
__m512 tmp3961 = _mm512_fmadd_ps(tmp3952, _mm512_set1_ps(4e+00f), tmp3953);
__m512 tmp3945 = _mm512_fmadd_ps(tmp3932, _mm512_set1_ps(1.6e+01f), tmp3933);
__m512 tmp3965 = _mm512_fmadd_ps(tmp3952, _mm512_set1_ps(1.6e+01f), tmp3953);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp3930 = _mm512_add_ps(tmp3931, in604);
__m512 tmp3950 = _mm512_add_ps(tmp3951, in612);
__m512 tmp3947 = _mm512_add_ps(tmp3948, in611);
__m512 tmp3967 = _mm512_add_ps(tmp3968, in619);
__m512 tmp3929 = _mm512_fmadd_ps(tmp3934, _mm512_set1_ps(3.2e+01f), tmp3930);
__m512 tmp3949 = _mm512_fmadd_ps(tmp3954, _mm512_set1_ps(3.2e+01f), tmp3950);
__m512 tmp3940 = _mm512_fmadd_ps(tmp3934, _mm512_set1_ps(8e+00f), tmp3941);
__m512 tmp3960 = _mm512_fmadd_ps(tmp3954, _mm512_set1_ps(8e+00f), tmp3961);
__m512 tmp3946 = _mm512_fmadd_ps(tmp3938, _mm512_set1_ps(3.2e+01f), tmp3947);
__m512 tmp3966 = _mm512_fmadd_ps(tmp3958, _mm512_set1_ps(3.2e+01f), tmp3967);
__m512 tmp3944 = _mm512_fmadd_ps(tmp3934, _mm512_set1_ps(2e+00f), tmp3945);
__m512 tmp3964 = _mm512_fmadd_ps(tmp3954, _mm512_set1_ps(2e+00f), tmp3965);
// Plain copies: set A results become tmp3917..tmp3922, set B results
// become tmp3923..tmp3928.
__m512 tmp3917 = tmp3929;
__m512 tmp3923 = tmp3949;
__m512 tmp3918 = tmp3935;
__m512 tmp3924 = tmp3955;
__m512 tmp3919 = tmp3940;
__m512 tmp3925 = tmp3960;
__m512 tmp3920 = tmp3942;
__m512 tmp3926 = tmp3962;
__m512 tmp3921 = tmp3944;
__m512 tmp3927 = tmp3964;
__m512 tmp3922 = tmp3946;
__m512 tmp3928 = tmp3966;
// Pure data movement, no arithmetic: the twelve vectors tmp3917..tmp3928 are
// rearranged with unpack / shuffle_ps / shuffle_f32x4 into sixteen vectors
// (tmp3917..tmp3928 are overwritten and tmp3969..tmp3972 are new).
// NOTE(review): this looks like a transpose of the 12-vector tile so the next
// pass can combine along the other axis - confirm against the generator.
// Step 1: interleave 32-bit elements of neighbouring vector pairs.
__m512 tmp4013 = _mm512_unpacklo_ps(tmp3917, tmp3918);
__m512 tmp4014 = _mm512_unpackhi_ps(tmp3917, tmp3918);
__m512 tmp4015 = _mm512_unpacklo_ps(tmp3919, tmp3920);
__m512 tmp4016 = _mm512_unpackhi_ps(tmp3919, tmp3920);
__m512 tmp4017 = _mm512_unpacklo_ps(tmp3921, tmp3922);
__m512 tmp4018 = _mm512_unpackhi_ps(tmp3921, tmp3922);
__m512 tmp4019 = _mm512_unpacklo_ps(tmp3923, tmp3924);
__m512 tmp4020 = _mm512_unpackhi_ps(tmp3923, tmp3924);
__m512 tmp4021 = _mm512_unpacklo_ps(tmp3925, tmp3926);
__m512 tmp4022 = _mm512_unpackhi_ps(tmp3925, tmp3926);
__m512 tmp4023 = _mm512_unpacklo_ps(tmp3927, tmp3928);
__m512 tmp4024 = _mm512_unpackhi_ps(tmp3927, tmp3928);
// Step 2: within each 128-bit lane, take the low (68) or high (238) 64 bits
// of both operands.
__m512 tmp4025 = _mm512_shuffle_ps(tmp4013, tmp4015, 68);
__m512 tmp4026 = _mm512_shuffle_ps(tmp4013, tmp4015, 238);
__m512 tmp4027 = _mm512_shuffle_ps(tmp4014, tmp4016, 68);
__m512 tmp4028 = _mm512_shuffle_ps(tmp4014, tmp4016, 238);
__m512 tmp4029 = _mm512_shuffle_ps(tmp4017, tmp4019, 68);
__m512 tmp4030 = _mm512_shuffle_ps(tmp4017, tmp4019, 238);
__m512 tmp4031 = _mm512_shuffle_ps(tmp4018, tmp4020, 68);
__m512 tmp4032 = _mm512_shuffle_ps(tmp4018, tmp4020, 238);
__m512 tmp4033 = _mm512_shuffle_ps(tmp4021, tmp4023, 68);
__m512 tmp4034 = _mm512_shuffle_ps(tmp4021, tmp4023, 238);
__m512 tmp4035 = _mm512_shuffle_ps(tmp4022, tmp4024, 68);
__m512 tmp4036 = _mm512_shuffle_ps(tmp4022, tmp4024, 238);
// Step 3: select whole 128-bit lanes; 136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3 of each operand.
__m512 tmp4037 = _mm512_shuffle_f32x4(tmp4025, tmp4029, 136);
__m512 tmp4038 = _mm512_shuffle_f32x4(tmp4025, tmp4029, 221);
__m512 tmp4039 = _mm512_shuffle_f32x4(tmp4026, tmp4030, 136);
__m512 tmp4040 = _mm512_shuffle_f32x4(tmp4026, tmp4030, 221);
__m512 tmp4041 = _mm512_shuffle_f32x4(tmp4027, tmp4031, 136);
__m512 tmp4042 = _mm512_shuffle_f32x4(tmp4027, tmp4031, 221);
__m512 tmp4043 = _mm512_shuffle_f32x4(tmp4028, tmp4032, 136);
__m512 tmp4044 = _mm512_shuffle_f32x4(tmp4028, tmp4032, 221);
// The last four inputs have no partner vector, so each is shuffled with itself.
__m512 tmp4045 = _mm512_shuffle_f32x4(tmp4033, tmp4033, 136);
__m512 tmp4046 = _mm512_shuffle_f32x4(tmp4033, tmp4033, 221);
__m512 tmp4047 = _mm512_shuffle_f32x4(tmp4034, tmp4034, 136);
__m512 tmp4048 = _mm512_shuffle_f32x4(tmp4034, tmp4034, 221);
__m512 tmp4049 = _mm512_shuffle_f32x4(tmp4035, tmp4035, 136);
__m512 tmp4050 = _mm512_shuffle_f32x4(tmp4035, tmp4035, 221);
__m512 tmp4051 = _mm512_shuffle_f32x4(tmp4036, tmp4036, 136);
__m512 tmp4052 = _mm512_shuffle_f32x4(tmp4036, tmp4036, 221);
// Step 4: final 128-bit lane selection, writing the sixteen result vectors.
tmp3917 = _mm512_shuffle_f32x4(tmp4037, tmp4045, 136);
tmp3925 = _mm512_shuffle_f32x4(tmp4037, tmp4045, 221);
tmp3918 = _mm512_shuffle_f32x4(tmp4039, tmp4047, 136);
tmp3926 = _mm512_shuffle_f32x4(tmp4039, tmp4047, 221);
tmp3919 = _mm512_shuffle_f32x4(tmp4041, tmp4049, 136);
tmp3927 = _mm512_shuffle_f32x4(tmp4041, tmp4049, 221);
tmp3920 = _mm512_shuffle_f32x4(tmp4043, tmp4051, 136);
tmp3928 = _mm512_shuffle_f32x4(tmp4043, tmp4051, 221);
tmp3921 = _mm512_shuffle_f32x4(tmp4038, tmp4046, 136);
__m512 tmp3969 = _mm512_shuffle_f32x4(tmp4038, tmp4046, 221);
tmp3922 = _mm512_shuffle_f32x4(tmp4040, tmp4048, 136);
__m512 tmp3970 = _mm512_shuffle_f32x4(tmp4040, tmp4048, 221);
tmp3923 = _mm512_shuffle_f32x4(tmp4042, tmp4050, 136);
__m512 tmp3971 = _mm512_shuffle_f32x4(tmp4042, tmp4050, 221);
tmp3924 = _mm512_shuffle_f32x4(tmp4044, tmp4052, 136);
__m512 tmp3972 = _mm512_shuffle_f32x4(tmp4044, tmp4052, 221);
// Second pass: the same eight-to-six combination, again for two interleaved sets.
//   set A: x0..x7 = tmp3917..tmp3924                              -> out603..out608
//   set B: x0..x7 = tmp3925..tmp3928, tmp3969..tmp3972            -> out609..out614
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6
// (up to floating-point rounding of the fused multiply-adds):
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// Pairwise sums and differences.
__m512 tmp3977 = _mm512_add_ps(tmp3918, tmp3919);
__m512 tmp3997 = _mm512_add_ps(tmp3926, tmp3927);
__m512 tmp3976 = _mm512_add_ps(tmp3920, tmp3921);
__m512 tmp3996 = _mm512_add_ps(tmp3928, tmp3969);
__m512 tmp3982 = _mm512_sub_ps(tmp3920, tmp3921);
__m512 tmp4002 = _mm512_sub_ps(tmp3928, tmp3969);
__m512 tmp3981 = _mm512_sub_ps(tmp3918, tmp3919);
__m512 tmp4001 = _mm512_sub_ps(tmp3926, tmp3927);
__m512 tmp3978 = _mm512_add_ps(tmp3922, tmp3923);
__m512 tmp3998 = _mm512_add_ps(tmp3970, tmp3971);
__m512 tmp3983 = _mm512_sub_ps(tmp3922, tmp3923);
__m512 tmp4003 = _mm512_sub_ps(tmp3970, tmp3971);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp3980 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(2e+00f), tmp3981);
__m512 tmp4000 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(2e+00f), tmp4001);
__m512 tmp3987 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(8e+00f), tmp3981);
__m512 tmp4007 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(8e+00f), tmp4001);
__m512 tmp3975 = _mm512_add_ps(tmp3976, tmp3977);
__m512 tmp3995 = _mm512_add_ps(tmp3996, tmp3997);
__m512 tmp3979 = _mm512_fmadd_ps(tmp3983, _mm512_set1_ps(1.6e+01f), tmp3980);
__m512 tmp3999 = _mm512_fmadd_ps(tmp4003, _mm512_set1_ps(1.6e+01f), tmp4000);
__m512 tmp3986 = _mm512_fmadd_ps(tmp3983, _mm512_set1_ps(4e+00f), tmp3987);
__m512 tmp4006 = _mm512_fmadd_ps(tmp4003, _mm512_set1_ps(4e+00f), tmp4007);
__m512 tmp3992 = _mm512_add_ps(tmp3983, tmp3981);
__m512 tmp4012 = _mm512_add_ps(tmp4003, tmp4001);
__m512 tmp3985 = _mm512_fmadd_ps(tmp3976, _mm512_set1_ps(4e+00f), tmp3977);
__m512 tmp4005 = _mm512_fmadd_ps(tmp3996, _mm512_set1_ps(4e+00f), tmp3997);
__m512 tmp3989 = _mm512_fmadd_ps(tmp3976, _mm512_set1_ps(1.6e+01f), tmp3977);
__m512 tmp4009 = _mm512_fmadd_ps(tmp3996, _mm512_set1_ps(1.6e+01f), tmp3997);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp3974 = _mm512_add_ps(tmp3975, tmp3917);
__m512 tmp3994 = _mm512_add_ps(tmp3995, tmp3925);
__m512 tmp3991 = _mm512_add_ps(tmp3992, tmp3924);
__m512 tmp4011 = _mm512_add_ps(tmp4012, tmp3972);
__m512 tmp3973 = _mm512_fmadd_ps(tmp3978, _mm512_set1_ps(3.2e+01f), tmp3974);
__m512 tmp3993 = _mm512_fmadd_ps(tmp3998, _mm512_set1_ps(3.2e+01f), tmp3994);
__m512 tmp3984 = _mm512_fmadd_ps(tmp3978, _mm512_set1_ps(8e+00f), tmp3985);
__m512 tmp4004 = _mm512_fmadd_ps(tmp3998, _mm512_set1_ps(8e+00f), tmp4005);
__m512 tmp3990 = _mm512_fmadd_ps(tmp3982, _mm512_set1_ps(3.2e+01f), tmp3991);
__m512 tmp4010 = _mm512_fmadd_ps(tmp4002, _mm512_set1_ps(3.2e+01f), tmp4011);
__m512 tmp3988 = _mm512_fmadd_ps(tmp3978, _mm512_set1_ps(2e+00f), tmp3989);
__m512 tmp4008 = _mm512_fmadd_ps(tmp3998, _mm512_set1_ps(2e+00f), tmp4009);
// Name the twelve results out603..out614 (plain copies).
__m512 out603 = tmp3973;
__m512 out609 = tmp3993;
__m512 out604 = tmp3979;
__m512 out610 = tmp3999;
__m512 out605 = tmp3984;
__m512 out611 = tmp4004;
__m512 out606 = tmp3986;
__m512 out612 = tmp4006;
__m512 out607 = tmp3988;
__m512 out613 = tmp4008;
__m512 out608 = tmp3990;
__m512 out614 = tmp4010;
// Clamp each of the twelve result vectors at zero, lane by lane: x = max(0, x).
out603 = _mm512_max_ps(_mm512_setzero_ps(), out603);
out609 = _mm512_max_ps(_mm512_setzero_ps(), out609);
out604 = _mm512_max_ps(_mm512_setzero_ps(), out604);
out610 = _mm512_max_ps(_mm512_setzero_ps(), out610);
out605 = _mm512_max_ps(_mm512_setzero_ps(), out605);
out611 = _mm512_max_ps(_mm512_setzero_ps(), out611);
out606 = _mm512_max_ps(_mm512_setzero_ps(), out606);
out612 = _mm512_max_ps(_mm512_setzero_ps(), out612);
out607 = _mm512_max_ps(_mm512_setzero_ps(), out607);
out613 = _mm512_max_ps(_mm512_setzero_ps(), out613);
out608 = _mm512_max_ps(_mm512_setzero_ps(), out608);
out614 = _mm512_max_ps(_mm512_setzero_ps(), out614);
// Masked stores relative to datPtr6; every store uses mask 4095, so only
// float lanes 0..11 of each vector are written.
// Each pair is 48 apart, and successive pairs step by 224, starting at 0.
// NOTE(review): offsets are presumably in bytes (48 bytes = 12 floats, matching
// the 12-lane mask); the type of datPtr6 is declared outside this excerpt - confirm.
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out603);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out609);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out604);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out610);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out605);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out611);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out606);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out612);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out607);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out613);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out608);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out614);
// Load sixteen 512-bit vectors from sfPtr5 (unaligned loads) and split each
// pair of loads into two recombined vectors:
//   shuffle_f32x4(a, b, 68)  = low 256 bits of a followed by low 256 bits of b
//   shuffle_f32x4(a, b, 238) = high 256 bits of a followed by high 256 bits of b
// The four groups below use base constants 256, 25856, 51456 and 77056
// (25600 apart); within a group the loads are at +0, +64, +128 and +192.
// NOTE(review): offsets are presumably bytes (64 = one 512-bit vector);
// the type of sfPtr5 is declared outside this excerpt - confirm.
__m512 sf257 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf258 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in620 = _mm512_shuffle_f32x4(sf257, sf258, 68);
__m512 in621 = _mm512_shuffle_f32x4(sf257, sf258, 238);
__m512 sf259 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf260 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in628 = _mm512_shuffle_f32x4(sf259, sf260, 68);
__m512 in629 = _mm512_shuffle_f32x4(sf259, sf260, 238);
// Second group (base 25856).
__m512 sf261 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf262 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in622 = _mm512_shuffle_f32x4(sf261, sf262, 68);
__m512 in623 = _mm512_shuffle_f32x4(sf261, sf262, 238);
__m512 sf263 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf264 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in630 = _mm512_shuffle_f32x4(sf263, sf264, 68);
__m512 in631 = _mm512_shuffle_f32x4(sf263, sf264, 238);
// Third group (base 51456).
__m512 sf265 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf266 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in624 = _mm512_shuffle_f32x4(sf265, sf266, 68);
__m512 in625 = _mm512_shuffle_f32x4(sf265, sf266, 238);
__m512 sf267 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf268 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in632 = _mm512_shuffle_f32x4(sf267, sf268, 68);
__m512 in633 = _mm512_shuffle_f32x4(sf267, sf268, 238);
// Fourth group (base 77056).
__m512 sf269 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf270 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in626 = _mm512_shuffle_f32x4(sf269, sf270, 68);
__m512 in627 = _mm512_shuffle_f32x4(sf269, sf270, 238);
__m512 sf271 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf272 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in634 = _mm512_shuffle_f32x4(sf271, sf272, 68);
__m512 in635 = _mm512_shuffle_f32x4(sf271, sf272, 238);
// First pass: reduce eight input vectors to six, done for two independent
// sets that are interleaved line by line.
//   set A: x0..x7 = in620..in627  ->  tmp4065, tmp4071, tmp4076, tmp4078, tmp4080, tmp4082
//   set B: x0..x7 = in628..in635  ->  tmp4085, tmp4091, tmp4096, tmp4098, tmp4100, tmp4102
// With s1=x1+x2, s2=x3+x4, s3=x5+x6, d1=x1-x2, d2=x3-x4, d3=x5-x6 the six
// results are (up to floating-point rounding of the fused multiply-adds):
//   y0 = x0 + s1 +    s2 + 32*s3
//   y1 =      d1 +  2*d2 + 16*d3
//   y2 =      s1 +  4*s2 +  8*s3
//   y3 =      d1 +  8*d2 +  4*d3
//   y4 =      s1 + 16*s2 +  2*s3
//   y5 =      d1 + 32*d2 +    d3 + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform - confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4069 = _mm512_add_ps(in621, in622);
__m512 tmp4089 = _mm512_add_ps(in629, in630);
__m512 tmp4068 = _mm512_add_ps(in623, in624);
__m512 tmp4088 = _mm512_add_ps(in631, in632);
__m512 tmp4074 = _mm512_sub_ps(in623, in624);
__m512 tmp4094 = _mm512_sub_ps(in631, in632);
__m512 tmp4073 = _mm512_sub_ps(in621, in622);
__m512 tmp4093 = _mm512_sub_ps(in629, in630);
__m512 tmp4070 = _mm512_add_ps(in625, in626);
__m512 tmp4090 = _mm512_add_ps(in633, in634);
__m512 tmp4075 = _mm512_sub_ps(in625, in626);
__m512 tmp4095 = _mm512_sub_ps(in633, in634);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp4072 = _mm512_fmadd_ps(tmp4074, _mm512_set1_ps(2e+00f), tmp4073);
__m512 tmp4092 = _mm512_fmadd_ps(tmp4094, _mm512_set1_ps(2e+00f), tmp4093);
__m512 tmp4079 = _mm512_fmadd_ps(tmp4074, _mm512_set1_ps(8e+00f), tmp4073);
__m512 tmp4099 = _mm512_fmadd_ps(tmp4094, _mm512_set1_ps(8e+00f), tmp4093);
__m512 tmp4067 = _mm512_add_ps(tmp4068, tmp4069);
__m512 tmp4087 = _mm512_add_ps(tmp4088, tmp4089);
__m512 tmp4071 = _mm512_fmadd_ps(tmp4075, _mm512_set1_ps(1.6e+01f), tmp4072);
__m512 tmp4091 = _mm512_fmadd_ps(tmp4095, _mm512_set1_ps(1.6e+01f), tmp4092);
__m512 tmp4078 = _mm512_fmadd_ps(tmp4075, _mm512_set1_ps(4e+00f), tmp4079);
__m512 tmp4098 = _mm512_fmadd_ps(tmp4095, _mm512_set1_ps(4e+00f), tmp4099);
__m512 tmp4084 = _mm512_add_ps(tmp4075, tmp4073);
__m512 tmp4104 = _mm512_add_ps(tmp4095, tmp4093);
__m512 tmp4077 = _mm512_fmadd_ps(tmp4068, _mm512_set1_ps(4e+00f), tmp4069);
__m512 tmp4097 = _mm512_fmadd_ps(tmp4088, _mm512_set1_ps(4e+00f), tmp4089);
__m512 tmp4081 = _mm512_fmadd_ps(tmp4068, _mm512_set1_ps(1.6e+01f), tmp4069);
__m512 tmp4101 = _mm512_fmadd_ps(tmp4088, _mm512_set1_ps(1.6e+01f), tmp4089);
// x0 enters only y0 and x7 enters only y5.
__m512 tmp4066 = _mm512_add_ps(tmp4067, in620);
__m512 tmp4086 = _mm512_add_ps(tmp4087, in628);
__m512 tmp4083 = _mm512_add_ps(tmp4084, in627);
__m512 tmp4103 = _mm512_add_ps(tmp4104, in635);
__m512 tmp4065 = _mm512_fmadd_ps(tmp4070, _mm512_set1_ps(3.2e+01f), tmp4066);
__m512 tmp4085 = _mm512_fmadd_ps(tmp4090, _mm512_set1_ps(3.2e+01f), tmp4086);
__m512 tmp4076 = _mm512_fmadd_ps(tmp4070, _mm512_set1_ps(8e+00f), tmp4077);
__m512 tmp4096 = _mm512_fmadd_ps(tmp4090, _mm512_set1_ps(8e+00f), tmp4097);
__m512 tmp4082 = _mm512_fmadd_ps(tmp4074, _mm512_set1_ps(3.2e+01f), tmp4083);
__m512 tmp4102 = _mm512_fmadd_ps(tmp4094, _mm512_set1_ps(3.2e+01f), tmp4103);
__m512 tmp4080 = _mm512_fmadd_ps(tmp4070, _mm512_set1_ps(2e+00f), tmp4081);
__m512 tmp4100 = _mm512_fmadd_ps(tmp4090, _mm512_set1_ps(2e+00f), tmp4101);
// Plain copies: set A results become tmp4053..tmp4058, set B results
// become tmp4059..tmp4064.
__m512 tmp4053 = tmp4065;
__m512 tmp4059 = tmp4085;
__m512 tmp4054 = tmp4071;
__m512 tmp4060 = tmp4091;
__m512 tmp4055 = tmp4076;
__m512 tmp4061 = tmp4096;
__m512 tmp4056 = tmp4078;
__m512 tmp4062 = tmp4098;
__m512 tmp4057 = tmp4080;
__m512 tmp4063 = tmp4100;
__m512 tmp4058 = tmp4082;
__m512 tmp4064 = tmp4102;
__m512 tmp4149 = _mm512_unpacklo_ps(tmp4053, tmp4054);
__m512 tmp4150 = _mm512_unpackhi_ps(tmp4053, tmp4054);
__m512 tmp4151 = _mm512_unpacklo_ps(tmp4055, tmp4056);
__m512 tmp4152 = _mm512_unpackhi_ps(tmp4055, tmp4056);
__m512 tmp4153 = _mm512_unpacklo_ps(tmp4057, tmp4058);
__m512 tmp4154 = _mm512_unpackhi_ps(tmp4057, tmp4058);
__m512 tmp4155 = _mm512_unpacklo_ps(tmp4059, tmp4060);
__m512 tmp4156 = _mm512_unpackhi_ps(tmp4059, tmp4060);
__m512 tmp4157 = _mm512_unpacklo_ps(tmp4061, tmp4062);
__m512 tmp4158 = _mm512_unpackhi_ps(tmp4061, tmp4062);
__m512 tmp4159 = _mm512_unpacklo_ps(tmp4063, tmp4064);
__m512 tmp4160 = _mm512_unpackhi_ps(tmp4063, tmp4064);
__m512 tmp4161 = _mm512_shuffle_ps(tmp4149, tmp4151, 68);
__m512 tmp4162 = _mm512_shuffle_ps(tmp4149, tmp4151, 238);
__m512 tmp4163 = _mm512_shuffle_ps(tmp4150, tmp4152, 68);
__m512 tmp4164 = _mm512_shuffle_ps(tmp4150, tmp4152, 238);
__m512 tmp4165 = _mm512_shuffle_ps(tmp4153, tmp4155, 68);
__m512 tmp4166 = _mm512_shuffle_ps(tmp4153, tmp4155, 238);
__m512 tmp4167 = _mm512_shuffle_ps(tmp4154, tmp4156, 68);
__m512 tmp4168 = _mm512_shuffle_ps(tmp4154, tmp4156, 238);
__m512 tmp4169 = _mm512_shuffle_ps(tmp4157, tmp4159, 68);
__m512 tmp4170 = _mm512_shuffle_ps(tmp4157, tmp4159, 238);
__m512 tmp4171 = _mm512_shuffle_ps(tmp4158, tmp4160, 68);
__m512 tmp4172 = _mm512_shuffle_ps(tmp4158, tmp4160, 238);
__m512 tmp4173 = _mm512_shuffle_f32x4(tmp4161, tmp4165, 136);
__m512 tmp4174 = _mm512_shuffle_f32x4(tmp4161, tmp4165, 221);
__m512 tmp4175 = _mm512_shuffle_f32x4(tmp4162, tmp4166, 136);
__m512 tmp4176 = _mm512_shuffle_f32x4(tmp4162, tmp4166, 221);
__m512 tmp4177 = _mm512_shuffle_f32x4(tmp4163, tmp4167, 136);
__m512 tmp4178 = _mm512_shuffle_f32x4(tmp4163, tmp4167, 221);
__m512 tmp4179 = _mm512_shuffle_f32x4(tmp4164, tmp4168, 136);
__m512 tmp4180 = _mm512_shuffle_f32x4(tmp4164, tmp4168, 221);
__m512 tmp4181 = _mm512_shuffle_f32x4(tmp4169, tmp4169, 136);
__m512 tmp4182 = _mm512_shuffle_f32x4(tmp4169, tmp4169, 221);
__m512 tmp4183 = _mm512_shuffle_f32x4(tmp4170, tmp4170, 136);
__m512 tmp4184 = _mm512_shuffle_f32x4(tmp4170, tmp4170, 221);
__m512 tmp4185 = _mm512_shuffle_f32x4(tmp4171, tmp4171, 136);
__m512 tmp4186 = _mm512_shuffle_f32x4(tmp4171, tmp4171, 221);
__m512 tmp4187 = _mm512_shuffle_f32x4(tmp4172, tmp4172, 136);
__m512 tmp4188 = _mm512_shuffle_f32x4(tmp4172, tmp4172, 221);
tmp4053 = _mm512_shuffle_f32x4(tmp4173, tmp4181, 136);
tmp4061 = _mm512_shuffle_f32x4(tmp4173, tmp4181, 221);
tmp4054 = _mm512_shuffle_f32x4(tmp4175, tmp4183, 136);
tmp4062 = _mm512_shuffle_f32x4(tmp4175, tmp4183, 221);
tmp4055 = _mm512_shuffle_f32x4(tmp4177, tmp4185, 136);
tmp4063 = _mm512_shuffle_f32x4(tmp4177, tmp4185, 221);
tmp4056 = _mm512_shuffle_f32x4(tmp4179, tmp4187, 136);
tmp4064 = _mm512_shuffle_f32x4(tmp4179, tmp4187, 221);
tmp4057 = _mm512_shuffle_f32x4(tmp4174, tmp4182, 136);
__m512 tmp4105 = _mm512_shuffle_f32x4(tmp4174, tmp4182, 221);
tmp4058 = _mm512_shuffle_f32x4(tmp4176, tmp4184, 136);
__m512 tmp4106 = _mm512_shuffle_f32x4(tmp4176, tmp4184, 221);
tmp4059 = _mm512_shuffle_f32x4(tmp4178, tmp4186, 136);
__m512 tmp4107 = _mm512_shuffle_f32x4(tmp4178, tmp4186, 221);
tmp4060 = _mm512_shuffle_f32x4(tmp4180, tmp4188, 136);
__m512 tmp4108 = _mm512_shuffle_f32x4(tmp4180, tmp4188, 221);
// Post-transpose pass: reduce 8 vectors to 6, done for two independent sets
// whose statements are interleaved line by line.
//   set 1: c0..c7 = tmp4053..tmp4060
//   set 2: c0..c7 = tmp4061..tmp4064, tmp4105..tmp4108
// With s12=c1+c2, s34=c3+c4, s56=c5+c6, d12=c1-c2, d34=c3-c4, d56=c5-c6:
//   r0 = c0 + s12 +    s34 + 32*s56      (tmp4109 / tmp4129)
//   r1 =      d12 +  2*d34 + 16*d56      (tmp4115 / tmp4135)
//   r2 =      s12 +  4*s34 +  8*s56      (tmp4120 / tmp4140)
//   r3 =      d12 +  8*d34 +  4*d56      (tmp4122 / tmp4142)
//   r4 =      s12 + 16*s34 +  2*s56      (tmp4124 / tmp4144)
//   r5 =      d12 + 32*d34 +    d56 + c7 (tmp4126 / tmp4146)
// NOTE(review): the coefficients look like the output transform of a
// Winograd-style convolution — confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4113 = _mm512_add_ps(tmp4054, tmp4055);
__m512 tmp4133 = _mm512_add_ps(tmp4062, tmp4063);
__m512 tmp4112 = _mm512_add_ps(tmp4056, tmp4057);
__m512 tmp4132 = _mm512_add_ps(tmp4064, tmp4105);
__m512 tmp4118 = _mm512_sub_ps(tmp4056, tmp4057);
__m512 tmp4138 = _mm512_sub_ps(tmp4064, tmp4105);
__m512 tmp4117 = _mm512_sub_ps(tmp4054, tmp4055);
__m512 tmp4137 = _mm512_sub_ps(tmp4062, tmp4063);
__m512 tmp4114 = _mm512_add_ps(tmp4058, tmp4059);
__m512 tmp4134 = _mm512_add_ps(tmp4106, tmp4107);
__m512 tmp4119 = _mm512_sub_ps(tmp4058, tmp4059);
__m512 tmp4139 = _mm512_sub_ps(tmp4106, tmp4107);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) yields a*b + c.
__m512 tmp4116 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(2e+00f), tmp4117);
__m512 tmp4136 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(2e+00f), tmp4137);
__m512 tmp4123 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(8e+00f), tmp4117);
__m512 tmp4143 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(8e+00f), tmp4137);
__m512 tmp4111 = _mm512_add_ps(tmp4112, tmp4113);
__m512 tmp4131 = _mm512_add_ps(tmp4132, tmp4133);
__m512 tmp4115 = _mm512_fmadd_ps(tmp4119, _mm512_set1_ps(1.6e+01f), tmp4116);
__m512 tmp4135 = _mm512_fmadd_ps(tmp4139, _mm512_set1_ps(1.6e+01f), tmp4136);
__m512 tmp4122 = _mm512_fmadd_ps(tmp4119, _mm512_set1_ps(4e+00f), tmp4123);
__m512 tmp4142 = _mm512_fmadd_ps(tmp4139, _mm512_set1_ps(4e+00f), tmp4143);
__m512 tmp4128 = _mm512_add_ps(tmp4119, tmp4117);
__m512 tmp4148 = _mm512_add_ps(tmp4139, tmp4137);
__m512 tmp4121 = _mm512_fmadd_ps(tmp4112, _mm512_set1_ps(4e+00f), tmp4113);
__m512 tmp4141 = _mm512_fmadd_ps(tmp4132, _mm512_set1_ps(4e+00f), tmp4133);
__m512 tmp4125 = _mm512_fmadd_ps(tmp4112, _mm512_set1_ps(1.6e+01f), tmp4113);
__m512 tmp4145 = _mm512_fmadd_ps(tmp4132, _mm512_set1_ps(1.6e+01f), tmp4133);
// c0 enters only r0, c7 enters only r5.
__m512 tmp4110 = _mm512_add_ps(tmp4111, tmp4053);
__m512 tmp4130 = _mm512_add_ps(tmp4131, tmp4061);
__m512 tmp4127 = _mm512_add_ps(tmp4128, tmp4060);
__m512 tmp4147 = _mm512_add_ps(tmp4148, tmp4108);
__m512 tmp4109 = _mm512_fmadd_ps(tmp4114, _mm512_set1_ps(3.2e+01f), tmp4110);
__m512 tmp4129 = _mm512_fmadd_ps(tmp4134, _mm512_set1_ps(3.2e+01f), tmp4130);
__m512 tmp4120 = _mm512_fmadd_ps(tmp4114, _mm512_set1_ps(8e+00f), tmp4121);
__m512 tmp4140 = _mm512_fmadd_ps(tmp4134, _mm512_set1_ps(8e+00f), tmp4141);
__m512 tmp4126 = _mm512_fmadd_ps(tmp4118, _mm512_set1_ps(3.2e+01f), tmp4127);
__m512 tmp4146 = _mm512_fmadd_ps(tmp4138, _mm512_set1_ps(3.2e+01f), tmp4147);
__m512 tmp4124 = _mm512_fmadd_ps(tmp4114, _mm512_set1_ps(2e+00f), tmp4125);
__m512 tmp4144 = _mm512_fmadd_ps(tmp4134, _mm512_set1_ps(2e+00f), tmp4145);
// Name the 12 results of the preceding pass: out615..out620 come from the
// first register set, out621..out626 from the second.
__m512 out615 = tmp4109;
__m512 out621 = tmp4129;
__m512 out616 = tmp4115;
__m512 out622 = tmp4135;
__m512 out617 = tmp4120;
__m512 out623 = tmp4140;
__m512 out618 = tmp4122;
__m512 out624 = tmp4142;
__m512 out619 = tmp4124;
__m512 out625 = tmp4144;
__m512 out620 = tmp4126;
__m512 out626 = tmp4146;
// ReLU: element-wise max against zero.
out615 = _mm512_max_ps(_mm512_setzero_ps(), out615);
out621 = _mm512_max_ps(_mm512_setzero_ps(), out621);
out616 = _mm512_max_ps(_mm512_setzero_ps(), out616);
out622 = _mm512_max_ps(_mm512_setzero_ps(), out622);
out617 = _mm512_max_ps(_mm512_setzero_ps(), out617);
out623 = _mm512_max_ps(_mm512_setzero_ps(), out623);
out618 = _mm512_max_ps(_mm512_setzero_ps(), out618);
out624 = _mm512_max_ps(_mm512_setzero_ps(), out624);
out619 = _mm512_max_ps(_mm512_setzero_ps(), out619);
out625 = _mm512_max_ps(_mm512_setzero_ps(), out625);
out620 = _mm512_max_ps(_mm512_setzero_ps(), out620);
out626 = _mm512_max_ps(_mm512_setzero_ps(), out626);
// Masked unaligned stores into datPtr6. Mask 4095 (0xFFF) writes lanes 0..11
// only. The first set starts at constant offset 96 and the second at 12608;
// within each set consecutive vectors are 224 apart.
// NOTE(review): offsets presumably are bytes (12 floats = 48, 224 = one
// 56-float row); datPtr6 is declared outside this excerpt — confirm.
_mm512_mask_storeu_ps(datPtr6+96+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out615);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out621);
_mm512_mask_storeu_ps(datPtr6+320+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out616);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out622);
_mm512_mask_storeu_ps(datPtr6+544+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out617);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out623);
_mm512_mask_storeu_ps(datPtr6+768+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out618);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out624);
_mm512_mask_storeu_ps(datPtr6+992+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out619);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out625);
_mm512_mask_storeu_ps(datPtr6+1216+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out620);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out626);
// Load 16 vectors from sfPtr5 and split them into two sets of 8 inputs:
// in636..in643 (set 1) and in644..in651 (set 2).
// Each pair of loads sits 128 apart; _mm512_shuffle_f32x4 with imm 68 joins
// 128-bit lanes 0,1 of both loads, imm 238 joins lanes 2,3 of both loads.
// The four load groups start at constant offsets 512, 26112, 51712 and 77312
// (25600 apart); within a group the set-2 loads are 64 past the set-1 loads.
// NOTE(review): offsets presumably are bytes; sfPtr5 is declared outside this
// excerpt — confirm.
__m512 sf273 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf274 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in636 = _mm512_shuffle_f32x4(sf273, sf274, 68);
__m512 in637 = _mm512_shuffle_f32x4(sf273, sf274, 238);
__m512 sf275 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf276 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in644 = _mm512_shuffle_f32x4(sf275, sf276, 68);
__m512 in645 = _mm512_shuffle_f32x4(sf275, sf276, 238);
__m512 sf277 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf278 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in638 = _mm512_shuffle_f32x4(sf277, sf278, 68);
__m512 in639 = _mm512_shuffle_f32x4(sf277, sf278, 238);
__m512 sf279 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf280 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in646 = _mm512_shuffle_f32x4(sf279, sf280, 68);
__m512 in647 = _mm512_shuffle_f32x4(sf279, sf280, 238);
__m512 sf281 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf282 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in640 = _mm512_shuffle_f32x4(sf281, sf282, 68);
__m512 in641 = _mm512_shuffle_f32x4(sf281, sf282, 238);
__m512 sf283 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf284 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in648 = _mm512_shuffle_f32x4(sf283, sf284, 68);
__m512 in649 = _mm512_shuffle_f32x4(sf283, sf284, 238);
__m512 sf285 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf286 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in642 = _mm512_shuffle_f32x4(sf285, sf286, 68);
__m512 in643 = _mm512_shuffle_f32x4(sf285, sf286, 238);
__m512 sf287 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k66+768*l18);
__m512 sf288 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k66+768*l18);
__m512 in650 = _mm512_shuffle_f32x4(sf287, sf288, 68);
__m512 in651 = _mm512_shuffle_f32x4(sf287, sf288, 238);
// First pass: reduce 8 input vectors to 6, done for two independent sets
// whose statements are interleaved line by line.
//   set 1: c0..c7 = in636..in643
//   set 2: c0..c7 = in644..in651
// With s12=c1+c2, s34=c3+c4, s56=c5+c6, d12=c1-c2, d34=c3-c4, d56=c5-c6:
//   r0 = c0 + s12 +    s34 + 32*s56      (tmp4201 / tmp4221)
//   r1 =      d12 +  2*d34 + 16*d56      (tmp4207 / tmp4227)
//   r2 =      s12 +  4*s34 +  8*s56      (tmp4212 / tmp4232)
//   r3 =      d12 +  8*d34 +  4*d56      (tmp4214 / tmp4234)
//   r4 =      s12 + 16*s34 +  2*s56      (tmp4216 / tmp4236)
//   r5 =      d12 + 32*d34 +    d56 + c7 (tmp4218 / tmp4238)
// NOTE(review): the coefficients look like the output transform of a
// Winograd-style convolution — confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4205 = _mm512_add_ps(in637, in638);
__m512 tmp4225 = _mm512_add_ps(in645, in646);
__m512 tmp4204 = _mm512_add_ps(in639, in640);
__m512 tmp4224 = _mm512_add_ps(in647, in648);
__m512 tmp4210 = _mm512_sub_ps(in639, in640);
__m512 tmp4230 = _mm512_sub_ps(in647, in648);
__m512 tmp4209 = _mm512_sub_ps(in637, in638);
__m512 tmp4229 = _mm512_sub_ps(in645, in646);
__m512 tmp4206 = _mm512_add_ps(in641, in642);
__m512 tmp4226 = _mm512_add_ps(in649, in650);
__m512 tmp4211 = _mm512_sub_ps(in641, in642);
__m512 tmp4231 = _mm512_sub_ps(in649, in650);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) yields a*b + c.
__m512 tmp4208 = _mm512_fmadd_ps(tmp4210, _mm512_set1_ps(2e+00f), tmp4209);
__m512 tmp4228 = _mm512_fmadd_ps(tmp4230, _mm512_set1_ps(2e+00f), tmp4229);
__m512 tmp4215 = _mm512_fmadd_ps(tmp4210, _mm512_set1_ps(8e+00f), tmp4209);
__m512 tmp4235 = _mm512_fmadd_ps(tmp4230, _mm512_set1_ps(8e+00f), tmp4229);
__m512 tmp4203 = _mm512_add_ps(tmp4204, tmp4205);
__m512 tmp4223 = _mm512_add_ps(tmp4224, tmp4225);
__m512 tmp4207 = _mm512_fmadd_ps(tmp4211, _mm512_set1_ps(1.6e+01f), tmp4208);
__m512 tmp4227 = _mm512_fmadd_ps(tmp4231, _mm512_set1_ps(1.6e+01f), tmp4228);
__m512 tmp4214 = _mm512_fmadd_ps(tmp4211, _mm512_set1_ps(4e+00f), tmp4215);
__m512 tmp4234 = _mm512_fmadd_ps(tmp4231, _mm512_set1_ps(4e+00f), tmp4235);
__m512 tmp4220 = _mm512_add_ps(tmp4211, tmp4209);
__m512 tmp4240 = _mm512_add_ps(tmp4231, tmp4229);
__m512 tmp4213 = _mm512_fmadd_ps(tmp4204, _mm512_set1_ps(4e+00f), tmp4205);
__m512 tmp4233 = _mm512_fmadd_ps(tmp4224, _mm512_set1_ps(4e+00f), tmp4225);
__m512 tmp4217 = _mm512_fmadd_ps(tmp4204, _mm512_set1_ps(1.6e+01f), tmp4205);
__m512 tmp4237 = _mm512_fmadd_ps(tmp4224, _mm512_set1_ps(1.6e+01f), tmp4225);
// c0 enters only r0, c7 enters only r5.
__m512 tmp4202 = _mm512_add_ps(tmp4203, in636);
__m512 tmp4222 = _mm512_add_ps(tmp4223, in644);
__m512 tmp4219 = _mm512_add_ps(tmp4220, in643);
__m512 tmp4239 = _mm512_add_ps(tmp4240, in651);
__m512 tmp4201 = _mm512_fmadd_ps(tmp4206, _mm512_set1_ps(3.2e+01f), tmp4202);
__m512 tmp4221 = _mm512_fmadd_ps(tmp4226, _mm512_set1_ps(3.2e+01f), tmp4222);
__m512 tmp4212 = _mm512_fmadd_ps(tmp4206, _mm512_set1_ps(8e+00f), tmp4213);
__m512 tmp4232 = _mm512_fmadd_ps(tmp4226, _mm512_set1_ps(8e+00f), tmp4233);
__m512 tmp4218 = _mm512_fmadd_ps(tmp4210, _mm512_set1_ps(3.2e+01f), tmp4219);
__m512 tmp4238 = _mm512_fmadd_ps(tmp4230, _mm512_set1_ps(3.2e+01f), tmp4239);
__m512 tmp4216 = _mm512_fmadd_ps(tmp4206, _mm512_set1_ps(2e+00f), tmp4217);
__m512 tmp4236 = _mm512_fmadd_ps(tmp4226, _mm512_set1_ps(2e+00f), tmp4237);
// Transpose the 12 first-pass results (rows) into 16 column vectors.
// Collect them under consecutive names first: tmp4189..tmp4194 are the six
// results of set 1 (rows 0..5), tmp4195..tmp4200 those of set 2 (rows 6..11).
__m512 tmp4189 = tmp4201;
__m512 tmp4195 = tmp4221;
__m512 tmp4190 = tmp4207;
__m512 tmp4196 = tmp4227;
__m512 tmp4191 = tmp4212;
__m512 tmp4197 = tmp4232;
__m512 tmp4192 = tmp4214;
__m512 tmp4198 = tmp4234;
__m512 tmp4193 = tmp4216;
__m512 tmp4199 = tmp4236;
__m512 tmp4194 = tmp4218;
__m512 tmp4200 = tmp4238;
// Step 1: interleave the 32-bit elements of each adjacent row pair inside
// every 128-bit lane (lo = elements 0,1 of the lane; hi = elements 2,3).
__m512 tmp4285 = _mm512_unpacklo_ps(tmp4189, tmp4190);
__m512 tmp4286 = _mm512_unpackhi_ps(tmp4189, tmp4190);
__m512 tmp4287 = _mm512_unpacklo_ps(tmp4191, tmp4192);
__m512 tmp4288 = _mm512_unpackhi_ps(tmp4191, tmp4192);
__m512 tmp4289 = _mm512_unpacklo_ps(tmp4193, tmp4194);
__m512 tmp4290 = _mm512_unpackhi_ps(tmp4193, tmp4194);
__m512 tmp4291 = _mm512_unpacklo_ps(tmp4195, tmp4196);
__m512 tmp4292 = _mm512_unpackhi_ps(tmp4195, tmp4196);
__m512 tmp4293 = _mm512_unpacklo_ps(tmp4197, tmp4198);
__m512 tmp4294 = _mm512_unpackhi_ps(tmp4197, tmp4198);
__m512 tmp4295 = _mm512_unpacklo_ps(tmp4199, tmp4200);
__m512 tmp4296 = _mm512_unpackhi_ps(tmp4199, tmp4200);
// Step 2: join two interleaved pairs so that each 128-bit lane holds one
// in-lane column for four consecutive rows (imm 68 takes the low 64 bits of
// both operands' lanes, imm 238 the high 64 bits).
__m512 tmp4297 = _mm512_shuffle_ps(tmp4285, tmp4287, 68);
__m512 tmp4298 = _mm512_shuffle_ps(tmp4285, tmp4287, 238);
__m512 tmp4299 = _mm512_shuffle_ps(tmp4286, tmp4288, 68);
__m512 tmp4300 = _mm512_shuffle_ps(tmp4286, tmp4288, 238);
__m512 tmp4301 = _mm512_shuffle_ps(tmp4289, tmp4291, 68);
__m512 tmp4302 = _mm512_shuffle_ps(tmp4289, tmp4291, 238);
__m512 tmp4303 = _mm512_shuffle_ps(tmp4290, tmp4292, 68);
__m512 tmp4304 = _mm512_shuffle_ps(tmp4290, tmp4292, 238);
__m512 tmp4305 = _mm512_shuffle_ps(tmp4293, tmp4295, 68);
__m512 tmp4306 = _mm512_shuffle_ps(tmp4293, tmp4295, 238);
__m512 tmp4307 = _mm512_shuffle_ps(tmp4294, tmp4296, 68);
__m512 tmp4308 = _mm512_shuffle_ps(tmp4294, tmp4296, 238);
// Step 3: regroup whole 128-bit lanes. imm 136 keeps lanes 0 and 2 of each
// operand, imm 221 keeps lanes 1 and 3. Rows 0-3 are paired with rows 4-7;
// rows 8-11 have no partner and are shuffled with themselves.
__m512 tmp4309 = _mm512_shuffle_f32x4(tmp4297, tmp4301, 136);
__m512 tmp4310 = _mm512_shuffle_f32x4(tmp4297, tmp4301, 221);
__m512 tmp4311 = _mm512_shuffle_f32x4(tmp4298, tmp4302, 136);
__m512 tmp4312 = _mm512_shuffle_f32x4(tmp4298, tmp4302, 221);
__m512 tmp4313 = _mm512_shuffle_f32x4(tmp4299, tmp4303, 136);
__m512 tmp4314 = _mm512_shuffle_f32x4(tmp4299, tmp4303, 221);
__m512 tmp4315 = _mm512_shuffle_f32x4(tmp4300, tmp4304, 136);
__m512 tmp4316 = _mm512_shuffle_f32x4(tmp4300, tmp4304, 221);
__m512 tmp4317 = _mm512_shuffle_f32x4(tmp4305, tmp4305, 136);
__m512 tmp4318 = _mm512_shuffle_f32x4(tmp4305, tmp4305, 221);
__m512 tmp4319 = _mm512_shuffle_f32x4(tmp4306, tmp4306, 136);
__m512 tmp4320 = _mm512_shuffle_f32x4(tmp4306, tmp4306, 221);
__m512 tmp4321 = _mm512_shuffle_f32x4(tmp4307, tmp4307, 136);
__m512 tmp4322 = _mm512_shuffle_f32x4(tmp4307, tmp4307, 221);
__m512 tmp4323 = _mm512_shuffle_f32x4(tmp4308, tmp4308, 136);
__m512 tmp4324 = _mm512_shuffle_f32x4(tmp4308, tmp4308, 221);
// Step 4: final lane regrouping. Result: tmp4189..tmp4196 hold columns 0..7
// and tmp4197..tmp4200, tmp4241..tmp4244 hold columns 8..15 of the original
// 12 rows. Lanes 0..11 carry rows 0..11; lanes 12..15 repeat lanes 8..11.
tmp4189 = _mm512_shuffle_f32x4(tmp4309, tmp4317, 136);
tmp4197 = _mm512_shuffle_f32x4(tmp4309, tmp4317, 221);
tmp4190 = _mm512_shuffle_f32x4(tmp4311, tmp4319, 136);
tmp4198 = _mm512_shuffle_f32x4(tmp4311, tmp4319, 221);
tmp4191 = _mm512_shuffle_f32x4(tmp4313, tmp4321, 136);
tmp4199 = _mm512_shuffle_f32x4(tmp4313, tmp4321, 221);
tmp4192 = _mm512_shuffle_f32x4(tmp4315, tmp4323, 136);
tmp4200 = _mm512_shuffle_f32x4(tmp4315, tmp4323, 221);
tmp4193 = _mm512_shuffle_f32x4(tmp4310, tmp4318, 136);
__m512 tmp4241 = _mm512_shuffle_f32x4(tmp4310, tmp4318, 221);
tmp4194 = _mm512_shuffle_f32x4(tmp4312, tmp4320, 136);
__m512 tmp4242 = _mm512_shuffle_f32x4(tmp4312, tmp4320, 221);
tmp4195 = _mm512_shuffle_f32x4(tmp4314, tmp4322, 136);
__m512 tmp4243 = _mm512_shuffle_f32x4(tmp4314, tmp4322, 221);
tmp4196 = _mm512_shuffle_f32x4(tmp4316, tmp4324, 136);
__m512 tmp4244 = _mm512_shuffle_f32x4(tmp4316, tmp4324, 221);
// Post-transpose pass: reduce 8 vectors to 6, done for two independent sets
// whose statements are interleaved line by line.
//   set 1: c0..c7 = tmp4189..tmp4196
//   set 2: c0..c7 = tmp4197..tmp4200, tmp4241..tmp4244
// With s12=c1+c2, s34=c3+c4, s56=c5+c6, d12=c1-c2, d34=c3-c4, d56=c5-c6:
//   r0 = c0 + s12 +    s34 + 32*s56      (tmp4245 / tmp4265)
//   r1 =      d12 +  2*d34 + 16*d56      (tmp4251 / tmp4271)
//   r2 =      s12 +  4*s34 +  8*s56      (tmp4256 / tmp4276)
//   r3 =      d12 +  8*d34 +  4*d56      (tmp4258 / tmp4278)
//   r4 =      s12 + 16*s34 +  2*s56      (tmp4260 / tmp4280)
//   r5 =      d12 + 32*d34 +    d56 + c7 (tmp4262 / tmp4282)
// NOTE(review): the coefficients look like the output transform of a
// Winograd-style convolution — confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4249 = _mm512_add_ps(tmp4190, tmp4191);
__m512 tmp4269 = _mm512_add_ps(tmp4198, tmp4199);
__m512 tmp4248 = _mm512_add_ps(tmp4192, tmp4193);
__m512 tmp4268 = _mm512_add_ps(tmp4200, tmp4241);
__m512 tmp4254 = _mm512_sub_ps(tmp4192, tmp4193);
__m512 tmp4274 = _mm512_sub_ps(tmp4200, tmp4241);
__m512 tmp4253 = _mm512_sub_ps(tmp4190, tmp4191);
__m512 tmp4273 = _mm512_sub_ps(tmp4198, tmp4199);
__m512 tmp4250 = _mm512_add_ps(tmp4194, tmp4195);
__m512 tmp4270 = _mm512_add_ps(tmp4242, tmp4243);
__m512 tmp4255 = _mm512_sub_ps(tmp4194, tmp4195);
__m512 tmp4275 = _mm512_sub_ps(tmp4242, tmp4243);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) yields a*b + c.
__m512 tmp4252 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(2e+00f), tmp4253);
__m512 tmp4272 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(2e+00f), tmp4273);
__m512 tmp4259 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(8e+00f), tmp4253);
__m512 tmp4279 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(8e+00f), tmp4273);
__m512 tmp4247 = _mm512_add_ps(tmp4248, tmp4249);
__m512 tmp4267 = _mm512_add_ps(tmp4268, tmp4269);
__m512 tmp4251 = _mm512_fmadd_ps(tmp4255, _mm512_set1_ps(1.6e+01f), tmp4252);
__m512 tmp4271 = _mm512_fmadd_ps(tmp4275, _mm512_set1_ps(1.6e+01f), tmp4272);
__m512 tmp4258 = _mm512_fmadd_ps(tmp4255, _mm512_set1_ps(4e+00f), tmp4259);
__m512 tmp4278 = _mm512_fmadd_ps(tmp4275, _mm512_set1_ps(4e+00f), tmp4279);
__m512 tmp4264 = _mm512_add_ps(tmp4255, tmp4253);
__m512 tmp4284 = _mm512_add_ps(tmp4275, tmp4273);
__m512 tmp4257 = _mm512_fmadd_ps(tmp4248, _mm512_set1_ps(4e+00f), tmp4249);
__m512 tmp4277 = _mm512_fmadd_ps(tmp4268, _mm512_set1_ps(4e+00f), tmp4269);
__m512 tmp4261 = _mm512_fmadd_ps(tmp4248, _mm512_set1_ps(1.6e+01f), tmp4249);
__m512 tmp4281 = _mm512_fmadd_ps(tmp4268, _mm512_set1_ps(1.6e+01f), tmp4269);
// c0 enters only r0, c7 enters only r5.
__m512 tmp4246 = _mm512_add_ps(tmp4247, tmp4189);
__m512 tmp4266 = _mm512_add_ps(tmp4267, tmp4197);
__m512 tmp4263 = _mm512_add_ps(tmp4264, tmp4196);
__m512 tmp4283 = _mm512_add_ps(tmp4284, tmp4244);
__m512 tmp4245 = _mm512_fmadd_ps(tmp4250, _mm512_set1_ps(3.2e+01f), tmp4246);
__m512 tmp4265 = _mm512_fmadd_ps(tmp4270, _mm512_set1_ps(3.2e+01f), tmp4266);
__m512 tmp4256 = _mm512_fmadd_ps(tmp4250, _mm512_set1_ps(8e+00f), tmp4257);
__m512 tmp4276 = _mm512_fmadd_ps(tmp4270, _mm512_set1_ps(8e+00f), tmp4277);
__m512 tmp4262 = _mm512_fmadd_ps(tmp4254, _mm512_set1_ps(3.2e+01f), tmp4263);
__m512 tmp4282 = _mm512_fmadd_ps(tmp4274, _mm512_set1_ps(3.2e+01f), tmp4283);
__m512 tmp4260 = _mm512_fmadd_ps(tmp4250, _mm512_set1_ps(2e+00f), tmp4261);
__m512 tmp4280 = _mm512_fmadd_ps(tmp4270, _mm512_set1_ps(2e+00f), tmp4281);
// Name the 12 results of the preceding pass: out627..out632 come from the
// first register set, out633..out638 from the second.
__m512 out627 = tmp4245;
__m512 out633 = tmp4265;
__m512 out628 = tmp4251;
__m512 out634 = tmp4271;
__m512 out629 = tmp4256;
__m512 out635 = tmp4276;
__m512 out630 = tmp4258;
__m512 out636 = tmp4278;
__m512 out631 = tmp4260;
__m512 out637 = tmp4280;
__m512 out632 = tmp4262;
__m512 out638 = tmp4282;
// ReLU: element-wise max against zero.
out627 = _mm512_max_ps(_mm512_setzero_ps(), out627);
out633 = _mm512_max_ps(_mm512_setzero_ps(), out633);
out628 = _mm512_max_ps(_mm512_setzero_ps(), out628);
out634 = _mm512_max_ps(_mm512_setzero_ps(), out634);
out629 = _mm512_max_ps(_mm512_setzero_ps(), out629);
out635 = _mm512_max_ps(_mm512_setzero_ps(), out635);
out630 = _mm512_max_ps(_mm512_setzero_ps(), out630);
out636 = _mm512_max_ps(_mm512_setzero_ps(), out636);
out631 = _mm512_max_ps(_mm512_setzero_ps(), out631);
out637 = _mm512_max_ps(_mm512_setzero_ps(), out637);
out632 = _mm512_max_ps(_mm512_setzero_ps(), out632);
out638 = _mm512_max_ps(_mm512_setzero_ps(), out638);
// Masked unaligned stores into datPtr6. Mask 4095 (0xFFF) writes lanes 0..11
// only. The first set starts at constant offset 12656, the second 48 later at
// 12704; within each set consecutive vectors are 224 apart.
// NOTE(review): offsets presumably are bytes (12 floats = 48, 224 = one
// 56-float row); datPtr6 is declared outside this excerpt — confirm.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out627);
_mm512_mask_storeu_ps(datPtr6+12704+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out633);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out628);
_mm512_mask_storeu_ps(datPtr6+12928+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out634);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out629);
_mm512_mask_storeu_ps(datPtr6+13152+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out635);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out630);
_mm512_mask_storeu_ps(datPtr6+13376+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out636);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out631);
_mm512_mask_storeu_ps(datPtr6+13600+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out637);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out632);
_mm512_mask_storeu_ps(datPtr6+13824+50432*i19+224*toH25+4*toW25+50432*k66+25216*l18, 4095, out638);
// Closes scopes that were opened above this excerpt.
}
}
// Advance j13, which scales the sfPtr5 offsets by 1536 in the surrounding code.
++j13;
// NOTE(review): rel11 is declared and consumed outside this excerpt; its
// meaning cannot be determined from here.
rel11 = 4;
}
// Output position used by the store addresses of the loop below:
// toH26 is scaled by 224 and toW26 by 4 in those addresses.
ptrdiff_t toH26 = base11+12;
ptrdiff_t toW26 = 36;
// k67 starts at w33 (defined outside this excerpt) and the loop ends once it
// reaches 1: one iteration when w33 == 0, none when w33 == 1.
// NOTE(review): presumably w33 is only ever 0 or 1 — confirm at its definition.
ptrdiff_t k67 = 1*w33;
for (; k67 != 1; ++k67) {
// l19 takes the values 0 and 1.
ptrdiff_t l19 = 0;
for (; l19 != 2; ++l19) {
// Load 16 vectors from sfPtr5 and split them into two sets of 8 inputs:
// in652..in659 (set 1) and in660..in667 (set 2).
// Each pair of loads sits 128 apart; _mm512_shuffle_f32x4 with imm 68 joins
// 128-bit lanes 0,1 of both loads, imm 238 joins lanes 2,3 of both loads.
// The four load groups start at constant offsets 0, 25600, 51200 and 76800;
// within a group the set-2 loads are 64 past the set-1 loads.
// NOTE(review): offsets presumably are bytes; sfPtr5 is declared outside this
// excerpt — confirm.
__m512 sf289 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf290 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in652 = _mm512_shuffle_f32x4(sf289, sf290, 68);
__m512 in653 = _mm512_shuffle_f32x4(sf289, sf290, 238);
__m512 sf291 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf292 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in660 = _mm512_shuffle_f32x4(sf291, sf292, 68);
__m512 in661 = _mm512_shuffle_f32x4(sf291, sf292, 238);
__m512 sf293 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf294 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in654 = _mm512_shuffle_f32x4(sf293, sf294, 68);
__m512 in655 = _mm512_shuffle_f32x4(sf293, sf294, 238);
__m512 sf295 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf296 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in662 = _mm512_shuffle_f32x4(sf295, sf296, 68);
__m512 in663 = _mm512_shuffle_f32x4(sf295, sf296, 238);
__m512 sf297 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf298 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in656 = _mm512_shuffle_f32x4(sf297, sf298, 68);
__m512 in657 = _mm512_shuffle_f32x4(sf297, sf298, 238);
__m512 sf299 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf300 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in664 = _mm512_shuffle_f32x4(sf299, sf300, 68);
__m512 in665 = _mm512_shuffle_f32x4(sf299, sf300, 238);
__m512 sf301 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf302 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in658 = _mm512_shuffle_f32x4(sf301, sf302, 68);
__m512 in659 = _mm512_shuffle_f32x4(sf301, sf302, 238);
__m512 sf303 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf304 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in666 = _mm512_shuffle_f32x4(sf303, sf304, 68);
__m512 in667 = _mm512_shuffle_f32x4(sf303, sf304, 238);
// First pass: reduce 8 input vectors to 6, done for two independent sets
// whose statements are interleaved line by line.
//   set 1: c0..c7 = in652..in659
//   set 2: c0..c7 = in660..in667
// With s12=c1+c2, s34=c3+c4, s56=c5+c6, d12=c1-c2, d34=c3-c4, d56=c5-c6:
//   r0 = c0 + s12 +    s34 + 32*s56      (tmp4337 / tmp4357)
//   r1 =      d12 +  2*d34 + 16*d56      (tmp4343 / tmp4363)
//   r2 =      s12 +  4*s34 +  8*s56      (tmp4348 / tmp4368)
//   r3 =      d12 +  8*d34 +  4*d56      (tmp4350 / tmp4370)
//   r4 =      s12 + 16*s34 +  2*s56      (tmp4352 / tmp4372)
//   r5 =      d12 + 32*d34 +    d56 + c7 (tmp4354 / tmp4374)
// NOTE(review): the coefficients look like the output transform of a
// Winograd-style convolution — confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4341 = _mm512_add_ps(in653, in654);
__m512 tmp4361 = _mm512_add_ps(in661, in662);
__m512 tmp4340 = _mm512_add_ps(in655, in656);
__m512 tmp4360 = _mm512_add_ps(in663, in664);
__m512 tmp4346 = _mm512_sub_ps(in655, in656);
__m512 tmp4366 = _mm512_sub_ps(in663, in664);
__m512 tmp4345 = _mm512_sub_ps(in653, in654);
__m512 tmp4365 = _mm512_sub_ps(in661, in662);
__m512 tmp4342 = _mm512_add_ps(in657, in658);
__m512 tmp4362 = _mm512_add_ps(in665, in666);
__m512 tmp4347 = _mm512_sub_ps(in657, in658);
__m512 tmp4367 = _mm512_sub_ps(in665, in666);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) yields a*b + c.
__m512 tmp4344 = _mm512_fmadd_ps(tmp4346, _mm512_set1_ps(2e+00f), tmp4345);
__m512 tmp4364 = _mm512_fmadd_ps(tmp4366, _mm512_set1_ps(2e+00f), tmp4365);
__m512 tmp4351 = _mm512_fmadd_ps(tmp4346, _mm512_set1_ps(8e+00f), tmp4345);
__m512 tmp4371 = _mm512_fmadd_ps(tmp4366, _mm512_set1_ps(8e+00f), tmp4365);
__m512 tmp4339 = _mm512_add_ps(tmp4340, tmp4341);
__m512 tmp4359 = _mm512_add_ps(tmp4360, tmp4361);
__m512 tmp4343 = _mm512_fmadd_ps(tmp4347, _mm512_set1_ps(1.6e+01f), tmp4344);
__m512 tmp4363 = _mm512_fmadd_ps(tmp4367, _mm512_set1_ps(1.6e+01f), tmp4364);
__m512 tmp4350 = _mm512_fmadd_ps(tmp4347, _mm512_set1_ps(4e+00f), tmp4351);
__m512 tmp4370 = _mm512_fmadd_ps(tmp4367, _mm512_set1_ps(4e+00f), tmp4371);
__m512 tmp4356 = _mm512_add_ps(tmp4347, tmp4345);
__m512 tmp4376 = _mm512_add_ps(tmp4367, tmp4365);
__m512 tmp4349 = _mm512_fmadd_ps(tmp4340, _mm512_set1_ps(4e+00f), tmp4341);
__m512 tmp4369 = _mm512_fmadd_ps(tmp4360, _mm512_set1_ps(4e+00f), tmp4361);
__m512 tmp4353 = _mm512_fmadd_ps(tmp4340, _mm512_set1_ps(1.6e+01f), tmp4341);
__m512 tmp4373 = _mm512_fmadd_ps(tmp4360, _mm512_set1_ps(1.6e+01f), tmp4361);
// c0 enters only r0, c7 enters only r5.
__m512 tmp4338 = _mm512_add_ps(tmp4339, in652);
__m512 tmp4358 = _mm512_add_ps(tmp4359, in660);
__m512 tmp4355 = _mm512_add_ps(tmp4356, in659);
__m512 tmp4375 = _mm512_add_ps(tmp4376, in667);
__m512 tmp4337 = _mm512_fmadd_ps(tmp4342, _mm512_set1_ps(3.2e+01f), tmp4338);
__m512 tmp4357 = _mm512_fmadd_ps(tmp4362, _mm512_set1_ps(3.2e+01f), tmp4358);
__m512 tmp4348 = _mm512_fmadd_ps(tmp4342, _mm512_set1_ps(8e+00f), tmp4349);
__m512 tmp4368 = _mm512_fmadd_ps(tmp4362, _mm512_set1_ps(8e+00f), tmp4369);
__m512 tmp4354 = _mm512_fmadd_ps(tmp4346, _mm512_set1_ps(3.2e+01f), tmp4355);
__m512 tmp4374 = _mm512_fmadd_ps(tmp4366, _mm512_set1_ps(3.2e+01f), tmp4375);
__m512 tmp4352 = _mm512_fmadd_ps(tmp4342, _mm512_set1_ps(2e+00f), tmp4353);
__m512 tmp4372 = _mm512_fmadd_ps(tmp4362, _mm512_set1_ps(2e+00f), tmp4373);
// Transpose the 12 first-pass results (rows) into 16 column vectors.
// Collect them under consecutive names first: tmp4325..tmp4330 are the six
// results of set 1 (rows 0..5), tmp4331..tmp4336 those of set 2 (rows 6..11).
__m512 tmp4325 = tmp4337;
__m512 tmp4331 = tmp4357;
__m512 tmp4326 = tmp4343;
__m512 tmp4332 = tmp4363;
__m512 tmp4327 = tmp4348;
__m512 tmp4333 = tmp4368;
__m512 tmp4328 = tmp4350;
__m512 tmp4334 = tmp4370;
__m512 tmp4329 = tmp4352;
__m512 tmp4335 = tmp4372;
__m512 tmp4330 = tmp4354;
__m512 tmp4336 = tmp4374;
// Step 1: interleave the 32-bit elements of each adjacent row pair inside
// every 128-bit lane (lo = elements 0,1 of the lane; hi = elements 2,3).
__m512 tmp4421 = _mm512_unpacklo_ps(tmp4325, tmp4326);
__m512 tmp4422 = _mm512_unpackhi_ps(tmp4325, tmp4326);
__m512 tmp4423 = _mm512_unpacklo_ps(tmp4327, tmp4328);
__m512 tmp4424 = _mm512_unpackhi_ps(tmp4327, tmp4328);
__m512 tmp4425 = _mm512_unpacklo_ps(tmp4329, tmp4330);
__m512 tmp4426 = _mm512_unpackhi_ps(tmp4329, tmp4330);
__m512 tmp4427 = _mm512_unpacklo_ps(tmp4331, tmp4332);
__m512 tmp4428 = _mm512_unpackhi_ps(tmp4331, tmp4332);
__m512 tmp4429 = _mm512_unpacklo_ps(tmp4333, tmp4334);
__m512 tmp4430 = _mm512_unpackhi_ps(tmp4333, tmp4334);
__m512 tmp4431 = _mm512_unpacklo_ps(tmp4335, tmp4336);
__m512 tmp4432 = _mm512_unpackhi_ps(tmp4335, tmp4336);
// Step 2: join two interleaved pairs so that each 128-bit lane holds one
// in-lane column for four consecutive rows (imm 68 takes the low 64 bits of
// both operands' lanes, imm 238 the high 64 bits).
__m512 tmp4433 = _mm512_shuffle_ps(tmp4421, tmp4423, 68);
__m512 tmp4434 = _mm512_shuffle_ps(tmp4421, tmp4423, 238);
__m512 tmp4435 = _mm512_shuffle_ps(tmp4422, tmp4424, 68);
__m512 tmp4436 = _mm512_shuffle_ps(tmp4422, tmp4424, 238);
__m512 tmp4437 = _mm512_shuffle_ps(tmp4425, tmp4427, 68);
__m512 tmp4438 = _mm512_shuffle_ps(tmp4425, tmp4427, 238);
__m512 tmp4439 = _mm512_shuffle_ps(tmp4426, tmp4428, 68);
__m512 tmp4440 = _mm512_shuffle_ps(tmp4426, tmp4428, 238);
__m512 tmp4441 = _mm512_shuffle_ps(tmp4429, tmp4431, 68);
__m512 tmp4442 = _mm512_shuffle_ps(tmp4429, tmp4431, 238);
__m512 tmp4443 = _mm512_shuffle_ps(tmp4430, tmp4432, 68);
__m512 tmp4444 = _mm512_shuffle_ps(tmp4430, tmp4432, 238);
// Step 3: regroup whole 128-bit lanes. imm 136 keeps lanes 0 and 2 of each
// operand, imm 221 keeps lanes 1 and 3. Rows 0-3 are paired with rows 4-7;
// rows 8-11 have no partner and are shuffled with themselves.
__m512 tmp4445 = _mm512_shuffle_f32x4(tmp4433, tmp4437, 136);
__m512 tmp4446 = _mm512_shuffle_f32x4(tmp4433, tmp4437, 221);
__m512 tmp4447 = _mm512_shuffle_f32x4(tmp4434, tmp4438, 136);
__m512 tmp4448 = _mm512_shuffle_f32x4(tmp4434, tmp4438, 221);
__m512 tmp4449 = _mm512_shuffle_f32x4(tmp4435, tmp4439, 136);
__m512 tmp4450 = _mm512_shuffle_f32x4(tmp4435, tmp4439, 221);
__m512 tmp4451 = _mm512_shuffle_f32x4(tmp4436, tmp4440, 136);
__m512 tmp4452 = _mm512_shuffle_f32x4(tmp4436, tmp4440, 221);
__m512 tmp4453 = _mm512_shuffle_f32x4(tmp4441, tmp4441, 136);
__m512 tmp4454 = _mm512_shuffle_f32x4(tmp4441, tmp4441, 221);
__m512 tmp4455 = _mm512_shuffle_f32x4(tmp4442, tmp4442, 136);
__m512 tmp4456 = _mm512_shuffle_f32x4(tmp4442, tmp4442, 221);
__m512 tmp4457 = _mm512_shuffle_f32x4(tmp4443, tmp4443, 136);
__m512 tmp4458 = _mm512_shuffle_f32x4(tmp4443, tmp4443, 221);
__m512 tmp4459 = _mm512_shuffle_f32x4(tmp4444, tmp4444, 136);
__m512 tmp4460 = _mm512_shuffle_f32x4(tmp4444, tmp4444, 221);
// Step 4: final lane regrouping. Result: tmp4325..tmp4332 hold columns 0..7
// and tmp4333..tmp4336, tmp4377..tmp4380 hold columns 8..15 of the original
// 12 rows. Lanes 0..11 carry rows 0..11; lanes 12..15 repeat lanes 8..11.
tmp4325 = _mm512_shuffle_f32x4(tmp4445, tmp4453, 136);
tmp4333 = _mm512_shuffle_f32x4(tmp4445, tmp4453, 221);
tmp4326 = _mm512_shuffle_f32x4(tmp4447, tmp4455, 136);
tmp4334 = _mm512_shuffle_f32x4(tmp4447, tmp4455, 221);
tmp4327 = _mm512_shuffle_f32x4(tmp4449, tmp4457, 136);
tmp4335 = _mm512_shuffle_f32x4(tmp4449, tmp4457, 221);
tmp4328 = _mm512_shuffle_f32x4(tmp4451, tmp4459, 136);
tmp4336 = _mm512_shuffle_f32x4(tmp4451, tmp4459, 221);
tmp4329 = _mm512_shuffle_f32x4(tmp4446, tmp4454, 136);
__m512 tmp4377 = _mm512_shuffle_f32x4(tmp4446, tmp4454, 221);
tmp4330 = _mm512_shuffle_f32x4(tmp4448, tmp4456, 136);
__m512 tmp4378 = _mm512_shuffle_f32x4(tmp4448, tmp4456, 221);
tmp4331 = _mm512_shuffle_f32x4(tmp4450, tmp4458, 136);
__m512 tmp4379 = _mm512_shuffle_f32x4(tmp4450, tmp4458, 221);
tmp4332 = _mm512_shuffle_f32x4(tmp4452, tmp4460, 136);
__m512 tmp4380 = _mm512_shuffle_f32x4(tmp4452, tmp4460, 221);
// Post-transpose pass: reduce 8 vectors to 6, done for two independent sets
// whose statements are interleaved line by line.
//   set 1: c0..c7 = tmp4325..tmp4332
//   set 2: c0..c7 = tmp4333..tmp4336, tmp4377..tmp4380
// With s12=c1+c2, s34=c3+c4, s56=c5+c6, d12=c1-c2, d34=c3-c4, d56=c5-c6:
//   r0 = c0 + s12 +    s34 + 32*s56      (tmp4381 / tmp4401)
//   r1 =      d12 +  2*d34 + 16*d56      (tmp4387 / tmp4407)
//   r2 =      s12 +  4*s34 +  8*s56      (tmp4392 / tmp4412)
//   r3 =      d12 +  8*d34 +  4*d56      (tmp4394 / tmp4414)
//   r4 =      s12 + 16*s34 +  2*s56      (tmp4396 / tmp4416)
//   r5 =      d12 + 32*d34 +    d56 + c7 (tmp4398 / tmp4418)
// NOTE(review): the coefficients look like the output transform of a
// Winograd-style convolution — confirm against the generator.
// Pairwise sums and differences.
__m512 tmp4385 = _mm512_add_ps(tmp4326, tmp4327);
__m512 tmp4405 = _mm512_add_ps(tmp4334, tmp4335);
__m512 tmp4384 = _mm512_add_ps(tmp4328, tmp4329);
__m512 tmp4404 = _mm512_add_ps(tmp4336, tmp4377);
__m512 tmp4390 = _mm512_sub_ps(tmp4328, tmp4329);
__m512 tmp4410 = _mm512_sub_ps(tmp4336, tmp4377);
__m512 tmp4389 = _mm512_sub_ps(tmp4326, tmp4327);
__m512 tmp4409 = _mm512_sub_ps(tmp4334, tmp4335);
__m512 tmp4386 = _mm512_add_ps(tmp4330, tmp4331);
__m512 tmp4406 = _mm512_add_ps(tmp4378, tmp4379);
__m512 tmp4391 = _mm512_sub_ps(tmp4330, tmp4331);
__m512 tmp4411 = _mm512_sub_ps(tmp4378, tmp4379);
// Weighted accumulation; _mm512_fmadd_ps(a, b, c) yields a*b + c.
__m512 tmp4388 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(2e+00f), tmp4389);
__m512 tmp4408 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(2e+00f), tmp4409);
__m512 tmp4395 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(8e+00f), tmp4389);
__m512 tmp4415 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(8e+00f), tmp4409);
__m512 tmp4383 = _mm512_add_ps(tmp4384, tmp4385);
__m512 tmp4403 = _mm512_add_ps(tmp4404, tmp4405);
__m512 tmp4387 = _mm512_fmadd_ps(tmp4391, _mm512_set1_ps(1.6e+01f), tmp4388);
__m512 tmp4407 = _mm512_fmadd_ps(tmp4411, _mm512_set1_ps(1.6e+01f), tmp4408);
__m512 tmp4394 = _mm512_fmadd_ps(tmp4391, _mm512_set1_ps(4e+00f), tmp4395);
__m512 tmp4414 = _mm512_fmadd_ps(tmp4411, _mm512_set1_ps(4e+00f), tmp4415);
__m512 tmp4400 = _mm512_add_ps(tmp4391, tmp4389);
__m512 tmp4420 = _mm512_add_ps(tmp4411, tmp4409);
__m512 tmp4393 = _mm512_fmadd_ps(tmp4384, _mm512_set1_ps(4e+00f), tmp4385);
__m512 tmp4413 = _mm512_fmadd_ps(tmp4404, _mm512_set1_ps(4e+00f), tmp4405);
__m512 tmp4397 = _mm512_fmadd_ps(tmp4384, _mm512_set1_ps(1.6e+01f), tmp4385);
__m512 tmp4417 = _mm512_fmadd_ps(tmp4404, _mm512_set1_ps(1.6e+01f), tmp4405);
// c0 enters only r0, c7 enters only r5.
__m512 tmp4382 = _mm512_add_ps(tmp4383, tmp4325);
__m512 tmp4402 = _mm512_add_ps(tmp4403, tmp4333);
__m512 tmp4399 = _mm512_add_ps(tmp4400, tmp4332);
__m512 tmp4419 = _mm512_add_ps(tmp4420, tmp4380);
__m512 tmp4381 = _mm512_fmadd_ps(tmp4386, _mm512_set1_ps(3.2e+01f), tmp4382);
__m512 tmp4401 = _mm512_fmadd_ps(tmp4406, _mm512_set1_ps(3.2e+01f), tmp4402);
__m512 tmp4392 = _mm512_fmadd_ps(tmp4386, _mm512_set1_ps(8e+00f), tmp4393);
__m512 tmp4412 = _mm512_fmadd_ps(tmp4406, _mm512_set1_ps(8e+00f), tmp4413);
__m512 tmp4398 = _mm512_fmadd_ps(tmp4390, _mm512_set1_ps(3.2e+01f), tmp4399);
__m512 tmp4418 = _mm512_fmadd_ps(tmp4410, _mm512_set1_ps(3.2e+01f), tmp4419);
__m512 tmp4396 = _mm512_fmadd_ps(tmp4386, _mm512_set1_ps(2e+00f), tmp4397);
__m512 tmp4416 = _mm512_fmadd_ps(tmp4406, _mm512_set1_ps(2e+00f), tmp4417);
// Name the 12 results of the preceding pass: out639..out644 come from the
// first register set, out645..out650 from the second.
__m512 out639 = tmp4381;
__m512 out645 = tmp4401;
__m512 out640 = tmp4387;
__m512 out646 = tmp4407;
__m512 out641 = tmp4392;
__m512 out647 = tmp4412;
__m512 out642 = tmp4394;
__m512 out648 = tmp4414;
__m512 out643 = tmp4396;
__m512 out649 = tmp4416;
__m512 out644 = tmp4398;
__m512 out650 = tmp4418;
// ReLU: element-wise max against zero.
out639 = _mm512_max_ps(_mm512_setzero_ps(), out639);
out645 = _mm512_max_ps(_mm512_setzero_ps(), out645);
out640 = _mm512_max_ps(_mm512_setzero_ps(), out640);
out646 = _mm512_max_ps(_mm512_setzero_ps(), out646);
out641 = _mm512_max_ps(_mm512_setzero_ps(), out641);
out647 = _mm512_max_ps(_mm512_setzero_ps(), out647);
out642 = _mm512_max_ps(_mm512_setzero_ps(), out642);
out648 = _mm512_max_ps(_mm512_setzero_ps(), out648);
out643 = _mm512_max_ps(_mm512_setzero_ps(), out643);
out649 = _mm512_max_ps(_mm512_setzero_ps(), out649);
out644 = _mm512_max_ps(_mm512_setzero_ps(), out644);
out650 = _mm512_max_ps(_mm512_setzero_ps(), out650);
// Masked unaligned stores into datPtr6. The first set (constant offsets 0,
// 224, ...) uses mask 4095 (0xFFF) and writes lanes 0..11; the second set
// (offsets 48, 272, ...) uses mask 255 (0xFF) and writes only lanes 0..7.
// Within each set consecutive vectors are 224 apart.
// NOTE(review): the narrower second mask presumably trims at the right edge
// of the output row (toW26 = 36 here) — confirm. Offsets presumably are
// bytes; datPtr6 is declared outside this excerpt.
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out639);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out645);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out640);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out646);
_mm512_mask_storeu_ps(datPtr6+448+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out641);
_mm512_mask_storeu_ps(datPtr6+496+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out647);
_mm512_mask_storeu_ps(datPtr6+672+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out642);
_mm512_mask_storeu_ps(datPtr6+720+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out648);
_mm512_mask_storeu_ps(datPtr6+896+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out643);
_mm512_mask_storeu_ps(datPtr6+944+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out649);
_mm512_mask_storeu_ps(datPtr6+1120+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out644);
_mm512_mask_storeu_ps(datPtr6+1168+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out650);
// Load stage: read sixteen 512-bit vectors from the sfPtr5 buffer (four groups
// that start 25600 apart, four vectors per group) and split each pair into
// 256-bit halves. With immediate 68 (0x44) _mm512_shuffle_f32x4(a, b) yields
// [low half of a | low half of b]; with 238 (0xEE) it yields
// [high half of a | high half of b].
// in668..in675 become the eight inputs of one transform, in676..in683 the
// eight inputs of a second, independent one.
// NOTE(review): offsets look like byte offsets (adjacent vectors are 64 apart),
// which assumes sfPtr5 is a byte pointer -- confirm against its declaration.
__m512 sf305 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf306 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in668 = _mm512_shuffle_f32x4(sf305, sf306, 68);
__m512 in669 = _mm512_shuffle_f32x4(sf305, sf306, 238);
__m512 sf307 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf308 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in676 = _mm512_shuffle_f32x4(sf307, sf308, 68);
__m512 in677 = _mm512_shuffle_f32x4(sf307, sf308, 238);
// Second group (+25600).
__m512 sf309 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf310 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in670 = _mm512_shuffle_f32x4(sf309, sf310, 68);
__m512 in671 = _mm512_shuffle_f32x4(sf309, sf310, 238);
__m512 sf311 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf312 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in678 = _mm512_shuffle_f32x4(sf311, sf312, 68);
__m512 in679 = _mm512_shuffle_f32x4(sf311, sf312, 238);
// Third group (+51200).
__m512 sf313 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf314 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in672 = _mm512_shuffle_f32x4(sf313, sf314, 68);
__m512 in673 = _mm512_shuffle_f32x4(sf313, sf314, 238);
__m512 sf315 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf316 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in680 = _mm512_shuffle_f32x4(sf315, sf316, 68);
__m512 in681 = _mm512_shuffle_f32x4(sf315, sf316, 238);
// Fourth group (+76800).
__m512 sf317 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf318 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in674 = _mm512_shuffle_f32x4(sf317, sf318, 68);
__m512 in675 = _mm512_shuffle_f32x4(sf317, sf318, 238);
__m512 sf319 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf320 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in682 = _mm512_shuffle_f32x4(sf319, sf320, 68);
__m512 in683 = _mm512_shuffle_f32x4(sf319, sf320, 238);
// First 8-input / 6-output linear transform, evaluated for two independent
// input sets whose statements are interleaved line by line:
//   set A: x0..x7 = in668..in675  ->  y0..y5 land in tmp4461..tmp4466
//   set B: x0..x7 = in676..in683  ->  y0..y5 land in tmp4467..tmp4472
// Element-wise, each set computes
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the output-transform matrix of a
// Winograd F(6,3) convolution -- presumably that is what this is; confirm
// against the generator.
// Pairwise sums and differences of the inputs.
__m512 tmp4477 = _mm512_add_ps(in669, in670);
__m512 tmp4497 = _mm512_add_ps(in677, in678);
__m512 tmp4476 = _mm512_add_ps(in671, in672);
__m512 tmp4496 = _mm512_add_ps(in679, in680);
__m512 tmp4482 = _mm512_sub_ps(in671, in672);
__m512 tmp4502 = _mm512_sub_ps(in679, in680);
__m512 tmp4481 = _mm512_sub_ps(in669, in670);
__m512 tmp4501 = _mm512_sub_ps(in677, in678);
__m512 tmp4478 = _mm512_add_ps(in673, in674);
__m512 tmp4498 = _mm512_add_ps(in681, in682);
__m512 tmp4483 = _mm512_sub_ps(in673, in674);
__m512 tmp4503 = _mm512_sub_ps(in681, in682);
// Weighted accumulation with fused multiply-adds.
__m512 tmp4480 = _mm512_fmadd_ps(tmp4482, _mm512_set1_ps(2e+00f), tmp4481);
__m512 tmp4500 = _mm512_fmadd_ps(tmp4502, _mm512_set1_ps(2e+00f), tmp4501);
__m512 tmp4487 = _mm512_fmadd_ps(tmp4482, _mm512_set1_ps(8e+00f), tmp4481);
__m512 tmp4507 = _mm512_fmadd_ps(tmp4502, _mm512_set1_ps(8e+00f), tmp4501);
__m512 tmp4475 = _mm512_add_ps(tmp4476, tmp4477);
__m512 tmp4495 = _mm512_add_ps(tmp4496, tmp4497);
__m512 tmp4479 = _mm512_fmadd_ps(tmp4483, _mm512_set1_ps(1.6e+01f), tmp4480);
__m512 tmp4499 = _mm512_fmadd_ps(tmp4503, _mm512_set1_ps(1.6e+01f), tmp4500);
__m512 tmp4486 = _mm512_fmadd_ps(tmp4483, _mm512_set1_ps(4e+00f), tmp4487);
__m512 tmp4506 = _mm512_fmadd_ps(tmp4503, _mm512_set1_ps(4e+00f), tmp4507);
__m512 tmp4492 = _mm512_add_ps(tmp4483, tmp4481);
__m512 tmp4512 = _mm512_add_ps(tmp4503, tmp4501);
__m512 tmp4485 = _mm512_fmadd_ps(tmp4476, _mm512_set1_ps(4e+00f), tmp4477);
__m512 tmp4505 = _mm512_fmadd_ps(tmp4496, _mm512_set1_ps(4e+00f), tmp4497);
__m512 tmp4489 = _mm512_fmadd_ps(tmp4476, _mm512_set1_ps(1.6e+01f), tmp4477);
__m512 tmp4509 = _mm512_fmadd_ps(tmp4496, _mm512_set1_ps(1.6e+01f), tmp4497);
__m512 tmp4474 = _mm512_add_ps(tmp4475, in668);
__m512 tmp4494 = _mm512_add_ps(tmp4495, in676);
__m512 tmp4491 = _mm512_add_ps(tmp4492, in675);
__m512 tmp4511 = _mm512_add_ps(tmp4512, in683);
__m512 tmp4473 = _mm512_fmadd_ps(tmp4478, _mm512_set1_ps(3.2e+01f), tmp4474);
__m512 tmp4493 = _mm512_fmadd_ps(tmp4498, _mm512_set1_ps(3.2e+01f), tmp4494);
__m512 tmp4484 = _mm512_fmadd_ps(tmp4478, _mm512_set1_ps(8e+00f), tmp4485);
__m512 tmp4504 = _mm512_fmadd_ps(tmp4498, _mm512_set1_ps(8e+00f), tmp4505);
__m512 tmp4490 = _mm512_fmadd_ps(tmp4482, _mm512_set1_ps(3.2e+01f), tmp4491);
__m512 tmp4510 = _mm512_fmadd_ps(tmp4502, _mm512_set1_ps(3.2e+01f), tmp4511);
__m512 tmp4488 = _mm512_fmadd_ps(tmp4478, _mm512_set1_ps(2e+00f), tmp4489);
__m512 tmp4508 = _mm512_fmadd_ps(tmp4498, _mm512_set1_ps(2e+00f), tmp4509);
// Gather y0..y5 of set A into tmp4461..tmp4466 and of set B into
// tmp4467..tmp4472.
__m512 tmp4461 = tmp4473;
__m512 tmp4467 = tmp4493;
__m512 tmp4462 = tmp4479;
__m512 tmp4468 = tmp4499;
__m512 tmp4463 = tmp4484;
__m512 tmp4469 = tmp4504;
__m512 tmp4464 = tmp4486;
__m512 tmp4470 = tmp4506;
__m512 tmp4465 = tmp4488;
__m512 tmp4471 = tmp4508;
__m512 tmp4466 = tmp4490;
__m512 tmp4472 = tmp4510;
// Transpose stage: treat tmp4461..tmp4472 as the 12 rows of a 12x16 float
// matrix and produce its 16 columns as vectors.
//   1. unpacklo/unpackhi interleave neighbouring rows inside each 128-bit lane.
//   2. shuffle_ps (68 = low pairs, 238 = high pairs) completes a 4x4 transpose
//      per 128-bit lane for each group of four rows.
//   3. shuffle_f32x4 (136 = lanes 0,2 of each operand, 221 = lanes 1,3)
//      regroups the 128-bit lanes across the three row groups.
// Result: tmp4461..tmp4472 are overwritten with columns 0..11 and
// tmp4513..tmp4516 receive columns 12..15. In every result vector only floats
// 0..11 are distinct rows; floats 12..15 repeat floats 8..11 because the third
// row group is shuffled with itself.
__m512 tmp4557 = _mm512_unpacklo_ps(tmp4461, tmp4462);
__m512 tmp4558 = _mm512_unpackhi_ps(tmp4461, tmp4462);
__m512 tmp4559 = _mm512_unpacklo_ps(tmp4463, tmp4464);
__m512 tmp4560 = _mm512_unpackhi_ps(tmp4463, tmp4464);
__m512 tmp4561 = _mm512_unpacklo_ps(tmp4465, tmp4466);
__m512 tmp4562 = _mm512_unpackhi_ps(tmp4465, tmp4466);
__m512 tmp4563 = _mm512_unpacklo_ps(tmp4467, tmp4468);
__m512 tmp4564 = _mm512_unpackhi_ps(tmp4467, tmp4468);
__m512 tmp4565 = _mm512_unpacklo_ps(tmp4469, tmp4470);
__m512 tmp4566 = _mm512_unpackhi_ps(tmp4469, tmp4470);
__m512 tmp4567 = _mm512_unpacklo_ps(tmp4471, tmp4472);
__m512 tmp4568 = _mm512_unpackhi_ps(tmp4471, tmp4472);
// Per-lane 4x4 transposes: rows 0-3, rows 4-7, rows 8-11.
__m512 tmp4569 = _mm512_shuffle_ps(tmp4557, tmp4559, 68);
__m512 tmp4570 = _mm512_shuffle_ps(tmp4557, tmp4559, 238);
__m512 tmp4571 = _mm512_shuffle_ps(tmp4558, tmp4560, 68);
__m512 tmp4572 = _mm512_shuffle_ps(tmp4558, tmp4560, 238);
__m512 tmp4573 = _mm512_shuffle_ps(tmp4561, tmp4563, 68);
__m512 tmp4574 = _mm512_shuffle_ps(tmp4561, tmp4563, 238);
__m512 tmp4575 = _mm512_shuffle_ps(tmp4562, tmp4564, 68);
__m512 tmp4576 = _mm512_shuffle_ps(tmp4562, tmp4564, 238);
__m512 tmp4577 = _mm512_shuffle_ps(tmp4565, tmp4567, 68);
__m512 tmp4578 = _mm512_shuffle_ps(tmp4565, tmp4567, 238);
__m512 tmp4579 = _mm512_shuffle_ps(tmp4566, tmp4568, 68);
__m512 tmp4580 = _mm512_shuffle_ps(tmp4566, tmp4568, 238);
// Combine rows 0-3 with rows 4-7 by even / odd 128-bit lanes.
__m512 tmp4581 = _mm512_shuffle_f32x4(tmp4569, tmp4573, 136);
__m512 tmp4582 = _mm512_shuffle_f32x4(tmp4569, tmp4573, 221);
__m512 tmp4583 = _mm512_shuffle_f32x4(tmp4570, tmp4574, 136);
__m512 tmp4584 = _mm512_shuffle_f32x4(tmp4570, tmp4574, 221);
__m512 tmp4585 = _mm512_shuffle_f32x4(tmp4571, tmp4575, 136);
__m512 tmp4586 = _mm512_shuffle_f32x4(tmp4571, tmp4575, 221);
__m512 tmp4587 = _mm512_shuffle_f32x4(tmp4572, tmp4576, 136);
__m512 tmp4588 = _mm512_shuffle_f32x4(tmp4572, tmp4576, 221);
// Rows 8-11 have no partner group, so each vector is shuffled with itself.
__m512 tmp4589 = _mm512_shuffle_f32x4(tmp4577, tmp4577, 136);
__m512 tmp4590 = _mm512_shuffle_f32x4(tmp4577, tmp4577, 221);
__m512 tmp4591 = _mm512_shuffle_f32x4(tmp4578, tmp4578, 136);
__m512 tmp4592 = _mm512_shuffle_f32x4(tmp4578, tmp4578, 221);
__m512 tmp4593 = _mm512_shuffle_f32x4(tmp4579, tmp4579, 136);
__m512 tmp4594 = _mm512_shuffle_f32x4(tmp4579, tmp4579, 221);
__m512 tmp4595 = _mm512_shuffle_f32x4(tmp4580, tmp4580, 136);
__m512 tmp4596 = _mm512_shuffle_f32x4(tmp4580, tmp4580, 221);
// Final lane regrouping: each result is one column of the original matrix.
tmp4461 = _mm512_shuffle_f32x4(tmp4581, tmp4589, 136);
tmp4469 = _mm512_shuffle_f32x4(tmp4581, tmp4589, 221);
tmp4462 = _mm512_shuffle_f32x4(tmp4583, tmp4591, 136);
tmp4470 = _mm512_shuffle_f32x4(tmp4583, tmp4591, 221);
tmp4463 = _mm512_shuffle_f32x4(tmp4585, tmp4593, 136);
tmp4471 = _mm512_shuffle_f32x4(tmp4585, tmp4593, 221);
tmp4464 = _mm512_shuffle_f32x4(tmp4587, tmp4595, 136);
tmp4472 = _mm512_shuffle_f32x4(tmp4587, tmp4595, 221);
tmp4465 = _mm512_shuffle_f32x4(tmp4582, tmp4590, 136);
__m512 tmp4513 = _mm512_shuffle_f32x4(tmp4582, tmp4590, 221);
tmp4466 = _mm512_shuffle_f32x4(tmp4584, tmp4592, 136);
__m512 tmp4514 = _mm512_shuffle_f32x4(tmp4584, tmp4592, 221);
tmp4467 = _mm512_shuffle_f32x4(tmp4586, tmp4594, 136);
__m512 tmp4515 = _mm512_shuffle_f32x4(tmp4586, tmp4594, 221);
tmp4468 = _mm512_shuffle_f32x4(tmp4588, tmp4596, 136);
__m512 tmp4516 = _mm512_shuffle_f32x4(tmp4588, tmp4596, 221);
// Second 8-input / 6-output linear transform, evaluated for two independent
// input sets whose statements are interleaved line by line:
//   set A: x0..x7 = tmp4461..tmp4468
//          -> y0..y5 = tmp4517, tmp4523, tmp4528, tmp4530, tmp4532, tmp4534
//   set B: x0..x7 = tmp4469, tmp4470, tmp4471, tmp4472,
//                   tmp4513, tmp4514, tmp4515, tmp4516
//          -> y0..y5 = tmp4537, tmp4543, tmp4548, tmp4550, tmp4552, tmp4554
// Element-wise, each set computes
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the output-transform matrix of a
// Winograd F(6,3) convolution -- presumably this is its second (other-axis)
// pass; confirm against the generator.
__m512 tmp4521 = _mm512_add_ps(tmp4462, tmp4463);
__m512 tmp4541 = _mm512_add_ps(tmp4470, tmp4471);
__m512 tmp4520 = _mm512_add_ps(tmp4464, tmp4465);
__m512 tmp4540 = _mm512_add_ps(tmp4472, tmp4513);
__m512 tmp4526 = _mm512_sub_ps(tmp4464, tmp4465);
__m512 tmp4546 = _mm512_sub_ps(tmp4472, tmp4513);
__m512 tmp4525 = _mm512_sub_ps(tmp4462, tmp4463);
__m512 tmp4545 = _mm512_sub_ps(tmp4470, tmp4471);
__m512 tmp4522 = _mm512_add_ps(tmp4466, tmp4467);
__m512 tmp4542 = _mm512_add_ps(tmp4514, tmp4515);
__m512 tmp4527 = _mm512_sub_ps(tmp4466, tmp4467);
__m512 tmp4547 = _mm512_sub_ps(tmp4514, tmp4515);
// Weighted accumulation with fused multiply-adds.
__m512 tmp4524 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(2e+00f), tmp4525);
__m512 tmp4544 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(2e+00f), tmp4545);
__m512 tmp4531 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(8e+00f), tmp4525);
__m512 tmp4551 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(8e+00f), tmp4545);
__m512 tmp4519 = _mm512_add_ps(tmp4520, tmp4521);
__m512 tmp4539 = _mm512_add_ps(tmp4540, tmp4541);
__m512 tmp4523 = _mm512_fmadd_ps(tmp4527, _mm512_set1_ps(1.6e+01f), tmp4524);
__m512 tmp4543 = _mm512_fmadd_ps(tmp4547, _mm512_set1_ps(1.6e+01f), tmp4544);
__m512 tmp4530 = _mm512_fmadd_ps(tmp4527, _mm512_set1_ps(4e+00f), tmp4531);
__m512 tmp4550 = _mm512_fmadd_ps(tmp4547, _mm512_set1_ps(4e+00f), tmp4551);
__m512 tmp4536 = _mm512_add_ps(tmp4527, tmp4525);
__m512 tmp4556 = _mm512_add_ps(tmp4547, tmp4545);
__m512 tmp4529 = _mm512_fmadd_ps(tmp4520, _mm512_set1_ps(4e+00f), tmp4521);
__m512 tmp4549 = _mm512_fmadd_ps(tmp4540, _mm512_set1_ps(4e+00f), tmp4541);
__m512 tmp4533 = _mm512_fmadd_ps(tmp4520, _mm512_set1_ps(1.6e+01f), tmp4521);
__m512 tmp4553 = _mm512_fmadd_ps(tmp4540, _mm512_set1_ps(1.6e+01f), tmp4541);
__m512 tmp4518 = _mm512_add_ps(tmp4519, tmp4461);
__m512 tmp4538 = _mm512_add_ps(tmp4539, tmp4469);
__m512 tmp4535 = _mm512_add_ps(tmp4536, tmp4468);
__m512 tmp4555 = _mm512_add_ps(tmp4556, tmp4516);
__m512 tmp4517 = _mm512_fmadd_ps(tmp4522, _mm512_set1_ps(3.2e+01f), tmp4518);
__m512 tmp4537 = _mm512_fmadd_ps(tmp4542, _mm512_set1_ps(3.2e+01f), tmp4538);
__m512 tmp4528 = _mm512_fmadd_ps(tmp4522, _mm512_set1_ps(8e+00f), tmp4529);
__m512 tmp4548 = _mm512_fmadd_ps(tmp4542, _mm512_set1_ps(8e+00f), tmp4549);
__m512 tmp4534 = _mm512_fmadd_ps(tmp4526, _mm512_set1_ps(3.2e+01f), tmp4535);
__m512 tmp4554 = _mm512_fmadd_ps(tmp4546, _mm512_set1_ps(3.2e+01f), tmp4555);
__m512 tmp4532 = _mm512_fmadd_ps(tmp4522, _mm512_set1_ps(2e+00f), tmp4533);
__m512 tmp4552 = _mm512_fmadd_ps(tmp4542, _mm512_set1_ps(2e+00f), tmp4553);
// Output stage: copy twelve transform results into out651..out662, clamp each
// one with max(0, x) (ReLU), and write them with masked unaligned stores.
// Every store here uses mask 4095 (0xFFF), i.e. only the low 12 floats of each
// vector are written. out651..out656 go to offsets 1200, 1424, ... (step 224)
// and out657..out662 to offsets 12608, 12832, ... (step 224).
// NOTE(review): the datPtr6 offsets look like byte offsets (224 = 56 floats)
// -- confirm against the declaration of datPtr6.
__m512 out651 = tmp4517;
__m512 out657 = tmp4537;
__m512 out652 = tmp4523;
__m512 out658 = tmp4543;
__m512 out653 = tmp4528;
__m512 out659 = tmp4548;
__m512 out654 = tmp4530;
__m512 out660 = tmp4550;
__m512 out655 = tmp4532;
__m512 out661 = tmp4552;
__m512 out656 = tmp4534;
__m512 out662 = tmp4554;
// ReLU: replace every negative element with zero.
out651 = _mm512_max_ps(_mm512_setzero_ps(), out651);
out657 = _mm512_max_ps(_mm512_setzero_ps(), out657);
out652 = _mm512_max_ps(_mm512_setzero_ps(), out652);
out658 = _mm512_max_ps(_mm512_setzero_ps(), out658);
out653 = _mm512_max_ps(_mm512_setzero_ps(), out653);
out659 = _mm512_max_ps(_mm512_setzero_ps(), out659);
out654 = _mm512_max_ps(_mm512_setzero_ps(), out654);
out660 = _mm512_max_ps(_mm512_setzero_ps(), out660);
out655 = _mm512_max_ps(_mm512_setzero_ps(), out655);
out661 = _mm512_max_ps(_mm512_setzero_ps(), out661);
out656 = _mm512_max_ps(_mm512_setzero_ps(), out656);
out662 = _mm512_max_ps(_mm512_setzero_ps(), out662);
// Masked stores, 12 floats each.
_mm512_mask_storeu_ps(datPtr6+1200+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out651);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out657);
_mm512_mask_storeu_ps(datPtr6+1424+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out652);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out658);
_mm512_mask_storeu_ps(datPtr6+1648+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out653);
_mm512_mask_storeu_ps(datPtr6+13056+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out659);
_mm512_mask_storeu_ps(datPtr6+1872+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out654);
_mm512_mask_storeu_ps(datPtr6+13280+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out660);
_mm512_mask_storeu_ps(datPtr6+2096+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out655);
_mm512_mask_storeu_ps(datPtr6+13504+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out661);
_mm512_mask_storeu_ps(datPtr6+2320+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out656);
_mm512_mask_storeu_ps(datPtr6+13728+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out662);
// Load stage: read sixteen 512-bit vectors from the sfPtr5 buffer (four groups
// that start 25600 apart, four vectors per group) and split each pair into
// 256-bit halves. With immediate 68 (0x44) _mm512_shuffle_f32x4(a, b) yields
// [low half of a | low half of b]; with 238 (0xEE) it yields
// [high half of a | high half of b].
// in684..in691 become the eight inputs of one transform, in692..in699 the
// eight inputs of a second, independent one.
// NOTE(review): offsets look like byte offsets (adjacent vectors are 64 apart),
// which assumes sfPtr5 is a byte pointer -- confirm against its declaration.
__m512 sf321 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf322 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in684 = _mm512_shuffle_f32x4(sf321, sf322, 68);
__m512 in685 = _mm512_shuffle_f32x4(sf321, sf322, 238);
__m512 sf323 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf324 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in692 = _mm512_shuffle_f32x4(sf323, sf324, 68);
__m512 in693 = _mm512_shuffle_f32x4(sf323, sf324, 238);
// Second group (+25600).
__m512 sf325 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf326 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in686 = _mm512_shuffle_f32x4(sf325, sf326, 68);
__m512 in687 = _mm512_shuffle_f32x4(sf325, sf326, 238);
__m512 sf327 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf328 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in694 = _mm512_shuffle_f32x4(sf327, sf328, 68);
__m512 in695 = _mm512_shuffle_f32x4(sf327, sf328, 238);
// Third group (+51200).
__m512 sf329 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf330 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in688 = _mm512_shuffle_f32x4(sf329, sf330, 68);
__m512 in689 = _mm512_shuffle_f32x4(sf329, sf330, 238);
__m512 sf331 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf332 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in696 = _mm512_shuffle_f32x4(sf331, sf332, 68);
__m512 in697 = _mm512_shuffle_f32x4(sf331, sf332, 238);
// Fourth group (+76800).
__m512 sf333 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf334 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in690 = _mm512_shuffle_f32x4(sf333, sf334, 68);
__m512 in691 = _mm512_shuffle_f32x4(sf333, sf334, 238);
__m512 sf335 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k67+768*l19);
__m512 sf336 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k67+768*l19);
__m512 in698 = _mm512_shuffle_f32x4(sf335, sf336, 68);
__m512 in699 = _mm512_shuffle_f32x4(sf335, sf336, 238);
// First 8-input / 6-output linear transform, evaluated for two independent
// input sets whose statements are interleaved line by line:
//   set A: x0..x7 = in684..in691  ->  y0..y5 land in tmp4597..tmp4602
//   set B: x0..x7 = in692..in699  ->  y0..y5 land in tmp4603..tmp4608
// Element-wise, each set computes
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the output-transform matrix of a
// Winograd F(6,3) convolution -- presumably that is what this is; confirm
// against the generator.
// Pairwise sums and differences of the inputs.
__m512 tmp4613 = _mm512_add_ps(in685, in686);
__m512 tmp4633 = _mm512_add_ps(in693, in694);
__m512 tmp4612 = _mm512_add_ps(in687, in688);
__m512 tmp4632 = _mm512_add_ps(in695, in696);
__m512 tmp4618 = _mm512_sub_ps(in687, in688);
__m512 tmp4638 = _mm512_sub_ps(in695, in696);
__m512 tmp4617 = _mm512_sub_ps(in685, in686);
__m512 tmp4637 = _mm512_sub_ps(in693, in694);
__m512 tmp4614 = _mm512_add_ps(in689, in690);
__m512 tmp4634 = _mm512_add_ps(in697, in698);
__m512 tmp4619 = _mm512_sub_ps(in689, in690);
__m512 tmp4639 = _mm512_sub_ps(in697, in698);
// Weighted accumulation with fused multiply-adds.
__m512 tmp4616 = _mm512_fmadd_ps(tmp4618, _mm512_set1_ps(2e+00f), tmp4617);
__m512 tmp4636 = _mm512_fmadd_ps(tmp4638, _mm512_set1_ps(2e+00f), tmp4637);
__m512 tmp4623 = _mm512_fmadd_ps(tmp4618, _mm512_set1_ps(8e+00f), tmp4617);
__m512 tmp4643 = _mm512_fmadd_ps(tmp4638, _mm512_set1_ps(8e+00f), tmp4637);
__m512 tmp4611 = _mm512_add_ps(tmp4612, tmp4613);
__m512 tmp4631 = _mm512_add_ps(tmp4632, tmp4633);
__m512 tmp4615 = _mm512_fmadd_ps(tmp4619, _mm512_set1_ps(1.6e+01f), tmp4616);
__m512 tmp4635 = _mm512_fmadd_ps(tmp4639, _mm512_set1_ps(1.6e+01f), tmp4636);
__m512 tmp4622 = _mm512_fmadd_ps(tmp4619, _mm512_set1_ps(4e+00f), tmp4623);
__m512 tmp4642 = _mm512_fmadd_ps(tmp4639, _mm512_set1_ps(4e+00f), tmp4643);
__m512 tmp4628 = _mm512_add_ps(tmp4619, tmp4617);
__m512 tmp4648 = _mm512_add_ps(tmp4639, tmp4637);
__m512 tmp4621 = _mm512_fmadd_ps(tmp4612, _mm512_set1_ps(4e+00f), tmp4613);
__m512 tmp4641 = _mm512_fmadd_ps(tmp4632, _mm512_set1_ps(4e+00f), tmp4633);
__m512 tmp4625 = _mm512_fmadd_ps(tmp4612, _mm512_set1_ps(1.6e+01f), tmp4613);
__m512 tmp4645 = _mm512_fmadd_ps(tmp4632, _mm512_set1_ps(1.6e+01f), tmp4633);
__m512 tmp4610 = _mm512_add_ps(tmp4611, in684);
__m512 tmp4630 = _mm512_add_ps(tmp4631, in692);
__m512 tmp4627 = _mm512_add_ps(tmp4628, in691);
__m512 tmp4647 = _mm512_add_ps(tmp4648, in699);
__m512 tmp4609 = _mm512_fmadd_ps(tmp4614, _mm512_set1_ps(3.2e+01f), tmp4610);
__m512 tmp4629 = _mm512_fmadd_ps(tmp4634, _mm512_set1_ps(3.2e+01f), tmp4630);
__m512 tmp4620 = _mm512_fmadd_ps(tmp4614, _mm512_set1_ps(8e+00f), tmp4621);
__m512 tmp4640 = _mm512_fmadd_ps(tmp4634, _mm512_set1_ps(8e+00f), tmp4641);
__m512 tmp4626 = _mm512_fmadd_ps(tmp4618, _mm512_set1_ps(3.2e+01f), tmp4627);
__m512 tmp4646 = _mm512_fmadd_ps(tmp4638, _mm512_set1_ps(3.2e+01f), tmp4647);
__m512 tmp4624 = _mm512_fmadd_ps(tmp4614, _mm512_set1_ps(2e+00f), tmp4625);
__m512 tmp4644 = _mm512_fmadd_ps(tmp4634, _mm512_set1_ps(2e+00f), tmp4645);
// Gather y0..y5 of set A into tmp4597..tmp4602 and of set B into
// tmp4603..tmp4608.
__m512 tmp4597 = tmp4609;
__m512 tmp4603 = tmp4629;
__m512 tmp4598 = tmp4615;
__m512 tmp4604 = tmp4635;
__m512 tmp4599 = tmp4620;
__m512 tmp4605 = tmp4640;
__m512 tmp4600 = tmp4622;
__m512 tmp4606 = tmp4642;
__m512 tmp4601 = tmp4624;
__m512 tmp4607 = tmp4644;
__m512 tmp4602 = tmp4626;
__m512 tmp4608 = tmp4646;
// Transpose stage: treat tmp4597..tmp4608 as the 12 rows of a 12x16 float
// matrix and produce its 16 columns as vectors.
//   1. unpacklo/unpackhi interleave neighbouring rows inside each 128-bit lane.
//   2. shuffle_ps (68 = low pairs, 238 = high pairs) completes a 4x4 transpose
//      per 128-bit lane for each group of four rows.
//   3. shuffle_f32x4 (136 = lanes 0,2 of each operand, 221 = lanes 1,3)
//      regroups the 128-bit lanes across the three row groups.
// Result: tmp4597..tmp4608 are overwritten with columns 0..11 and
// tmp4649..tmp4652 receive columns 12..15. In every result vector only floats
// 0..11 are distinct rows; floats 12..15 repeat floats 8..11 because the third
// row group is shuffled with itself.
__m512 tmp4693 = _mm512_unpacklo_ps(tmp4597, tmp4598);
__m512 tmp4694 = _mm512_unpackhi_ps(tmp4597, tmp4598);
__m512 tmp4695 = _mm512_unpacklo_ps(tmp4599, tmp4600);
__m512 tmp4696 = _mm512_unpackhi_ps(tmp4599, tmp4600);
__m512 tmp4697 = _mm512_unpacklo_ps(tmp4601, tmp4602);
__m512 tmp4698 = _mm512_unpackhi_ps(tmp4601, tmp4602);
__m512 tmp4699 = _mm512_unpacklo_ps(tmp4603, tmp4604);
__m512 tmp4700 = _mm512_unpackhi_ps(tmp4603, tmp4604);
__m512 tmp4701 = _mm512_unpacklo_ps(tmp4605, tmp4606);
__m512 tmp4702 = _mm512_unpackhi_ps(tmp4605, tmp4606);
__m512 tmp4703 = _mm512_unpacklo_ps(tmp4607, tmp4608);
__m512 tmp4704 = _mm512_unpackhi_ps(tmp4607, tmp4608);
// Per-lane 4x4 transposes: rows 0-3, rows 4-7, rows 8-11.
__m512 tmp4705 = _mm512_shuffle_ps(tmp4693, tmp4695, 68);
__m512 tmp4706 = _mm512_shuffle_ps(tmp4693, tmp4695, 238);
__m512 tmp4707 = _mm512_shuffle_ps(tmp4694, tmp4696, 68);
__m512 tmp4708 = _mm512_shuffle_ps(tmp4694, tmp4696, 238);
__m512 tmp4709 = _mm512_shuffle_ps(tmp4697, tmp4699, 68);
__m512 tmp4710 = _mm512_shuffle_ps(tmp4697, tmp4699, 238);
__m512 tmp4711 = _mm512_shuffle_ps(tmp4698, tmp4700, 68);
__m512 tmp4712 = _mm512_shuffle_ps(tmp4698, tmp4700, 238);
__m512 tmp4713 = _mm512_shuffle_ps(tmp4701, tmp4703, 68);
__m512 tmp4714 = _mm512_shuffle_ps(tmp4701, tmp4703, 238);
__m512 tmp4715 = _mm512_shuffle_ps(tmp4702, tmp4704, 68);
__m512 tmp4716 = _mm512_shuffle_ps(tmp4702, tmp4704, 238);
// Combine rows 0-3 with rows 4-7 by even / odd 128-bit lanes.
__m512 tmp4717 = _mm512_shuffle_f32x4(tmp4705, tmp4709, 136);
__m512 tmp4718 = _mm512_shuffle_f32x4(tmp4705, tmp4709, 221);
__m512 tmp4719 = _mm512_shuffle_f32x4(tmp4706, tmp4710, 136);
__m512 tmp4720 = _mm512_shuffle_f32x4(tmp4706, tmp4710, 221);
__m512 tmp4721 = _mm512_shuffle_f32x4(tmp4707, tmp4711, 136);
__m512 tmp4722 = _mm512_shuffle_f32x4(tmp4707, tmp4711, 221);
__m512 tmp4723 = _mm512_shuffle_f32x4(tmp4708, tmp4712, 136);
__m512 tmp4724 = _mm512_shuffle_f32x4(tmp4708, tmp4712, 221);
// Rows 8-11 have no partner group, so each vector is shuffled with itself.
__m512 tmp4725 = _mm512_shuffle_f32x4(tmp4713, tmp4713, 136);
__m512 tmp4726 = _mm512_shuffle_f32x4(tmp4713, tmp4713, 221);
__m512 tmp4727 = _mm512_shuffle_f32x4(tmp4714, tmp4714, 136);
__m512 tmp4728 = _mm512_shuffle_f32x4(tmp4714, tmp4714, 221);
__m512 tmp4729 = _mm512_shuffle_f32x4(tmp4715, tmp4715, 136);
__m512 tmp4730 = _mm512_shuffle_f32x4(tmp4715, tmp4715, 221);
__m512 tmp4731 = _mm512_shuffle_f32x4(tmp4716, tmp4716, 136);
__m512 tmp4732 = _mm512_shuffle_f32x4(tmp4716, tmp4716, 221);
// Final lane regrouping: each result is one column of the original matrix.
tmp4597 = _mm512_shuffle_f32x4(tmp4717, tmp4725, 136);
tmp4605 = _mm512_shuffle_f32x4(tmp4717, tmp4725, 221);
tmp4598 = _mm512_shuffle_f32x4(tmp4719, tmp4727, 136);
tmp4606 = _mm512_shuffle_f32x4(tmp4719, tmp4727, 221);
tmp4599 = _mm512_shuffle_f32x4(tmp4721, tmp4729, 136);
tmp4607 = _mm512_shuffle_f32x4(tmp4721, tmp4729, 221);
tmp4600 = _mm512_shuffle_f32x4(tmp4723, tmp4731, 136);
tmp4608 = _mm512_shuffle_f32x4(tmp4723, tmp4731, 221);
tmp4601 = _mm512_shuffle_f32x4(tmp4718, tmp4726, 136);
__m512 tmp4649 = _mm512_shuffle_f32x4(tmp4718, tmp4726, 221);
tmp4602 = _mm512_shuffle_f32x4(tmp4720, tmp4728, 136);
__m512 tmp4650 = _mm512_shuffle_f32x4(tmp4720, tmp4728, 221);
tmp4603 = _mm512_shuffle_f32x4(tmp4722, tmp4730, 136);
__m512 tmp4651 = _mm512_shuffle_f32x4(tmp4722, tmp4730, 221);
tmp4604 = _mm512_shuffle_f32x4(tmp4724, tmp4732, 136);
__m512 tmp4652 = _mm512_shuffle_f32x4(tmp4724, tmp4732, 221);
// Second 8-input / 6-output linear transform, evaluated for two independent
// input sets whose statements are interleaved line by line:
//   set A: x0..x7 = tmp4597..tmp4604
//          -> y0..y5 = tmp4653, tmp4659, tmp4664, tmp4666, tmp4668, tmp4670
//   set B: x0..x7 = tmp4605, tmp4606, tmp4607, tmp4608,
//                   tmp4649, tmp4650, tmp4651, tmp4652
//          -> y0..y5 = tmp4673, tmp4679, tmp4684, tmp4686, tmp4688, tmp4690
// Element-wise, each set computes
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the output-transform matrix of a
// Winograd F(6,3) convolution -- presumably this is its second (other-axis)
// pass; confirm against the generator.
__m512 tmp4657 = _mm512_add_ps(tmp4598, tmp4599);
__m512 tmp4677 = _mm512_add_ps(tmp4606, tmp4607);
__m512 tmp4656 = _mm512_add_ps(tmp4600, tmp4601);
__m512 tmp4676 = _mm512_add_ps(tmp4608, tmp4649);
__m512 tmp4662 = _mm512_sub_ps(tmp4600, tmp4601);
__m512 tmp4682 = _mm512_sub_ps(tmp4608, tmp4649);
__m512 tmp4661 = _mm512_sub_ps(tmp4598, tmp4599);
__m512 tmp4681 = _mm512_sub_ps(tmp4606, tmp4607);
__m512 tmp4658 = _mm512_add_ps(tmp4602, tmp4603);
__m512 tmp4678 = _mm512_add_ps(tmp4650, tmp4651);
__m512 tmp4663 = _mm512_sub_ps(tmp4602, tmp4603);
__m512 tmp4683 = _mm512_sub_ps(tmp4650, tmp4651);
// Weighted accumulation with fused multiply-adds.
__m512 tmp4660 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(2e+00f), tmp4661);
__m512 tmp4680 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(2e+00f), tmp4681);
__m512 tmp4667 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(8e+00f), tmp4661);
__m512 tmp4687 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(8e+00f), tmp4681);
__m512 tmp4655 = _mm512_add_ps(tmp4656, tmp4657);
__m512 tmp4675 = _mm512_add_ps(tmp4676, tmp4677);
__m512 tmp4659 = _mm512_fmadd_ps(tmp4663, _mm512_set1_ps(1.6e+01f), tmp4660);
__m512 tmp4679 = _mm512_fmadd_ps(tmp4683, _mm512_set1_ps(1.6e+01f), tmp4680);
__m512 tmp4666 = _mm512_fmadd_ps(tmp4663, _mm512_set1_ps(4e+00f), tmp4667);
__m512 tmp4686 = _mm512_fmadd_ps(tmp4683, _mm512_set1_ps(4e+00f), tmp4687);
__m512 tmp4672 = _mm512_add_ps(tmp4663, tmp4661);
__m512 tmp4692 = _mm512_add_ps(tmp4683, tmp4681);
__m512 tmp4665 = _mm512_fmadd_ps(tmp4656, _mm512_set1_ps(4e+00f), tmp4657);
__m512 tmp4685 = _mm512_fmadd_ps(tmp4676, _mm512_set1_ps(4e+00f), tmp4677);
__m512 tmp4669 = _mm512_fmadd_ps(tmp4656, _mm512_set1_ps(1.6e+01f), tmp4657);
__m512 tmp4689 = _mm512_fmadd_ps(tmp4676, _mm512_set1_ps(1.6e+01f), tmp4677);
__m512 tmp4654 = _mm512_add_ps(tmp4655, tmp4597);
__m512 tmp4674 = _mm512_add_ps(tmp4675, tmp4605);
__m512 tmp4671 = _mm512_add_ps(tmp4672, tmp4604);
__m512 tmp4691 = _mm512_add_ps(tmp4692, tmp4652);
__m512 tmp4653 = _mm512_fmadd_ps(tmp4658, _mm512_set1_ps(3.2e+01f), tmp4654);
__m512 tmp4673 = _mm512_fmadd_ps(tmp4678, _mm512_set1_ps(3.2e+01f), tmp4674);
__m512 tmp4664 = _mm512_fmadd_ps(tmp4658, _mm512_set1_ps(8e+00f), tmp4665);
__m512 tmp4684 = _mm512_fmadd_ps(tmp4678, _mm512_set1_ps(8e+00f), tmp4685);
__m512 tmp4670 = _mm512_fmadd_ps(tmp4662, _mm512_set1_ps(3.2e+01f), tmp4671);
__m512 tmp4690 = _mm512_fmadd_ps(tmp4682, _mm512_set1_ps(3.2e+01f), tmp4691);
__m512 tmp4668 = _mm512_fmadd_ps(tmp4658, _mm512_set1_ps(2e+00f), tmp4669);
__m512 tmp4688 = _mm512_fmadd_ps(tmp4678, _mm512_set1_ps(2e+00f), tmp4689);
// Output stage: copy twelve transform results into out663..out674, clamp each
// one with max(0, x) (ReLU), and write them with masked unaligned stores.
// out663..out668 use mask 255 (0xFF, low 8 floats) at offsets 12656, 12880, ...
// (step 224); out669..out674 use mask 4095 (0xFFF, low 12 floats) at offsets
// 13808, 14032, ... (step 224).
// NOTE(review): the datPtr6 offsets look like byte offsets (224 = 56 floats)
// -- confirm against the declaration of datPtr6.
__m512 out663 = tmp4653;
__m512 out669 = tmp4673;
__m512 out664 = tmp4659;
__m512 out670 = tmp4679;
__m512 out665 = tmp4664;
__m512 out671 = tmp4684;
__m512 out666 = tmp4666;
__m512 out672 = tmp4686;
__m512 out667 = tmp4668;
__m512 out673 = tmp4688;
__m512 out668 = tmp4670;
__m512 out674 = tmp4690;
// ReLU: replace every negative element with zero.
out663 = _mm512_max_ps(_mm512_setzero_ps(), out663);
out669 = _mm512_max_ps(_mm512_setzero_ps(), out669);
out664 = _mm512_max_ps(_mm512_setzero_ps(), out664);
out670 = _mm512_max_ps(_mm512_setzero_ps(), out670);
out665 = _mm512_max_ps(_mm512_setzero_ps(), out665);
out671 = _mm512_max_ps(_mm512_setzero_ps(), out671);
out666 = _mm512_max_ps(_mm512_setzero_ps(), out666);
out672 = _mm512_max_ps(_mm512_setzero_ps(), out672);
out667 = _mm512_max_ps(_mm512_setzero_ps(), out667);
out673 = _mm512_max_ps(_mm512_setzero_ps(), out673);
out668 = _mm512_max_ps(_mm512_setzero_ps(), out668);
out674 = _mm512_max_ps(_mm512_setzero_ps(), out674);
// Masked stores: 8 floats for the first group, 12 floats for the second.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out663);
_mm512_mask_storeu_ps(datPtr6+13808+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out669);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out664);
_mm512_mask_storeu_ps(datPtr6+14032+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out670);
_mm512_mask_storeu_ps(datPtr6+13104+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out665);
_mm512_mask_storeu_ps(datPtr6+14256+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out671);
_mm512_mask_storeu_ps(datPtr6+13328+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out666);
_mm512_mask_storeu_ps(datPtr6+14480+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out672);
_mm512_mask_storeu_ps(datPtr6+13552+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out667);
_mm512_mask_storeu_ps(datPtr6+14704+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out673);
_mm512_mask_storeu_ps(datPtr6+13776+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 255, out668);
_mm512_mask_storeu_ps(datPtr6+14928+50432*i19+224*toH26+4*toW26+50432*k67+25216*l19, 4095, out674);
}
}
++j13;
}
j13 = 15;
}
// rel12 is j13 measured relative to 15; the branch below is taken when
// j13 < 16.
ptrdiff_t rel12 = j13-15;
ptrdiff_t base12 = 54;
if (rel12 < 1) {
// toH27 = 54 and toW27 = 0 for this branch.
// NOTE(review): presumably the output row / column of the tile handled here
// -- confirm against the store addresses further down.
ptrdiff_t toH27 = base12+0;
ptrdiff_t toW27 = 0;
// k68 starts at w33 and the loop ends when it reaches 1, so a start value of 1
// skips the body entirely.
// NOTE(review): w33 is defined outside this excerpt; this assumes it is 0 or
// 1 -- confirm.
ptrdiff_t k68 = 1*w33;
for (; k68 != 1; ++k68) {
// l20 takes the values 0 and 1.
ptrdiff_t l20 = 0;
for (; l20 != 2; ++l20) {
// Load stage: read sixteen 512-bit vectors from the sfPtr5 buffer (four groups
// that start 25600 apart, four vectors per group) and split each pair into
// 256-bit halves. With immediate 68 (0x44) _mm512_shuffle_f32x4(a, b) yields
// [low half of a | low half of b]; with 238 (0xEE) it yields
// [high half of a | high half of b].
// in700..in707 become the eight inputs of one transform, in708..in715 the
// eight inputs of a second, independent one.
// NOTE(review): offsets look like byte offsets (adjacent vectors are 64 apart),
// which assumes sfPtr5 is a byte pointer -- confirm against its declaration.
__m512 sf337 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf338 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in700 = _mm512_shuffle_f32x4(sf337, sf338, 68);
__m512 in701 = _mm512_shuffle_f32x4(sf337, sf338, 238);
__m512 sf339 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf340 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in708 = _mm512_shuffle_f32x4(sf339, sf340, 68);
__m512 in709 = _mm512_shuffle_f32x4(sf339, sf340, 238);
// Second group (+25600).
__m512 sf341 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf342 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in702 = _mm512_shuffle_f32x4(sf341, sf342, 68);
__m512 in703 = _mm512_shuffle_f32x4(sf341, sf342, 238);
__m512 sf343 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf344 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in710 = _mm512_shuffle_f32x4(sf343, sf344, 68);
__m512 in711 = _mm512_shuffle_f32x4(sf343, sf344, 238);
// Third group (+51200).
__m512 sf345 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf346 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in704 = _mm512_shuffle_f32x4(sf345, sf346, 68);
__m512 in705 = _mm512_shuffle_f32x4(sf345, sf346, 238);
__m512 sf347 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf348 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in712 = _mm512_shuffle_f32x4(sf347, sf348, 68);
__m512 in713 = _mm512_shuffle_f32x4(sf347, sf348, 238);
// Fourth group (+76800).
__m512 sf349 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf350 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in706 = _mm512_shuffle_f32x4(sf349, sf350, 68);
__m512 in707 = _mm512_shuffle_f32x4(sf349, sf350, 238);
__m512 sf351 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf352 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in714 = _mm512_shuffle_f32x4(sf351, sf352, 68);
__m512 in715 = _mm512_shuffle_f32x4(sf351, sf352, 238);
// First 8-input / 6-output linear transform, evaluated for two independent
// input sets whose statements are interleaved line by line:
//   set A: x0..x7 = in700..in707  ->  y0..y5 land in tmp4733..tmp4738
//   set B: x0..x7 = in708..in715  ->  y0..y5 land in tmp4739..tmp4744
// Element-wise, each set computes
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): these coefficients match the output-transform matrix of a
// Winograd F(6,3) convolution -- presumably that is what this is; confirm
// against the generator.
// Pairwise sums and differences of the inputs.
__m512 tmp4749 = _mm512_add_ps(in701, in702);
__m512 tmp4769 = _mm512_add_ps(in709, in710);
__m512 tmp4748 = _mm512_add_ps(in703, in704);
__m512 tmp4768 = _mm512_add_ps(in711, in712);
__m512 tmp4754 = _mm512_sub_ps(in703, in704);
__m512 tmp4774 = _mm512_sub_ps(in711, in712);
__m512 tmp4753 = _mm512_sub_ps(in701, in702);
__m512 tmp4773 = _mm512_sub_ps(in709, in710);
__m512 tmp4750 = _mm512_add_ps(in705, in706);
__m512 tmp4770 = _mm512_add_ps(in713, in714);
__m512 tmp4755 = _mm512_sub_ps(in705, in706);
__m512 tmp4775 = _mm512_sub_ps(in713, in714);
// Weighted accumulation with fused multiply-adds.
__m512 tmp4752 = _mm512_fmadd_ps(tmp4754, _mm512_set1_ps(2e+00f), tmp4753);
__m512 tmp4772 = _mm512_fmadd_ps(tmp4774, _mm512_set1_ps(2e+00f), tmp4773);
__m512 tmp4759 = _mm512_fmadd_ps(tmp4754, _mm512_set1_ps(8e+00f), tmp4753);
__m512 tmp4779 = _mm512_fmadd_ps(tmp4774, _mm512_set1_ps(8e+00f), tmp4773);
__m512 tmp4747 = _mm512_add_ps(tmp4748, tmp4749);
__m512 tmp4767 = _mm512_add_ps(tmp4768, tmp4769);
__m512 tmp4751 = _mm512_fmadd_ps(tmp4755, _mm512_set1_ps(1.6e+01f), tmp4752);
__m512 tmp4771 = _mm512_fmadd_ps(tmp4775, _mm512_set1_ps(1.6e+01f), tmp4772);
__m512 tmp4758 = _mm512_fmadd_ps(tmp4755, _mm512_set1_ps(4e+00f), tmp4759);
__m512 tmp4778 = _mm512_fmadd_ps(tmp4775, _mm512_set1_ps(4e+00f), tmp4779);
__m512 tmp4764 = _mm512_add_ps(tmp4755, tmp4753);
__m512 tmp4784 = _mm512_add_ps(tmp4775, tmp4773);
__m512 tmp4757 = _mm512_fmadd_ps(tmp4748, _mm512_set1_ps(4e+00f), tmp4749);
__m512 tmp4777 = _mm512_fmadd_ps(tmp4768, _mm512_set1_ps(4e+00f), tmp4769);
__m512 tmp4761 = _mm512_fmadd_ps(tmp4748, _mm512_set1_ps(1.6e+01f), tmp4749);
__m512 tmp4781 = _mm512_fmadd_ps(tmp4768, _mm512_set1_ps(1.6e+01f), tmp4769);
__m512 tmp4746 = _mm512_add_ps(tmp4747, in700);
__m512 tmp4766 = _mm512_add_ps(tmp4767, in708);
__m512 tmp4763 = _mm512_add_ps(tmp4764, in707);
__m512 tmp4783 = _mm512_add_ps(tmp4784, in715);
__m512 tmp4745 = _mm512_fmadd_ps(tmp4750, _mm512_set1_ps(3.2e+01f), tmp4746);
__m512 tmp4765 = _mm512_fmadd_ps(tmp4770, _mm512_set1_ps(3.2e+01f), tmp4766);
__m512 tmp4756 = _mm512_fmadd_ps(tmp4750, _mm512_set1_ps(8e+00f), tmp4757);
__m512 tmp4776 = _mm512_fmadd_ps(tmp4770, _mm512_set1_ps(8e+00f), tmp4777);
__m512 tmp4762 = _mm512_fmadd_ps(tmp4754, _mm512_set1_ps(3.2e+01f), tmp4763);
__m512 tmp4782 = _mm512_fmadd_ps(tmp4774, _mm512_set1_ps(3.2e+01f), tmp4783);
__m512 tmp4760 = _mm512_fmadd_ps(tmp4750, _mm512_set1_ps(2e+00f), tmp4761);
__m512 tmp4780 = _mm512_fmadd_ps(tmp4770, _mm512_set1_ps(2e+00f), tmp4781);
// Gather y0..y5 of set A into tmp4733..tmp4738 and of set B into
// tmp4739..tmp4744.
__m512 tmp4733 = tmp4745;
__m512 tmp4739 = tmp4765;
__m512 tmp4734 = tmp4751;
__m512 tmp4740 = tmp4771;
__m512 tmp4735 = tmp4756;
__m512 tmp4741 = tmp4776;
__m512 tmp4736 = tmp4758;
__m512 tmp4742 = tmp4778;
__m512 tmp4737 = tmp4760;
__m512 tmp4743 = tmp4780;
__m512 tmp4738 = tmp4762;
__m512 tmp4744 = tmp4782;
__m512 tmp4811 = _mm512_unpacklo_ps(tmp4733, tmp4734);
__m512 tmp4812 = _mm512_unpackhi_ps(tmp4733, tmp4734);
__m512 tmp4813 = _mm512_unpacklo_ps(tmp4735, tmp4736);
__m512 tmp4814 = _mm512_unpackhi_ps(tmp4735, tmp4736);
__m512 tmp4815 = _mm512_unpacklo_ps(tmp4737, tmp4738);
__m512 tmp4816 = _mm512_unpackhi_ps(tmp4737, tmp4738);
__m512 tmp4817 = _mm512_unpacklo_ps(tmp4739, tmp4740);
__m512 tmp4818 = _mm512_unpackhi_ps(tmp4739, tmp4740);
__m512 tmp4819 = _mm512_unpacklo_ps(tmp4741, tmp4742);
__m512 tmp4820 = _mm512_unpackhi_ps(tmp4741, tmp4742);
__m512 tmp4821 = _mm512_unpacklo_ps(tmp4743, tmp4744);
__m512 tmp4822 = _mm512_unpackhi_ps(tmp4743, tmp4744);
__m512 tmp4823 = _mm512_shuffle_ps(tmp4811, tmp4813, 68);
__m512 tmp4824 = _mm512_shuffle_ps(tmp4811, tmp4813, 238);
__m512 tmp4825 = _mm512_shuffle_ps(tmp4812, tmp4814, 68);
__m512 tmp4826 = _mm512_shuffle_ps(tmp4812, tmp4814, 238);
__m512 tmp4827 = _mm512_shuffle_ps(tmp4815, tmp4817, 68);
__m512 tmp4828 = _mm512_shuffle_ps(tmp4815, tmp4817, 238);
__m512 tmp4829 = _mm512_shuffle_ps(tmp4816, tmp4818, 68);
__m512 tmp4830 = _mm512_shuffle_ps(tmp4816, tmp4818, 238);
__m512 tmp4831 = _mm512_shuffle_ps(tmp4819, tmp4821, 68);
__m512 tmp4832 = _mm512_shuffle_ps(tmp4819, tmp4821, 238);
__m512 tmp4833 = _mm512_shuffle_ps(tmp4820, tmp4822, 68);
__m512 tmp4834 = _mm512_shuffle_ps(tmp4820, tmp4822, 238);
__m512 tmp4835 = _mm512_shuffle_f32x4(tmp4823, tmp4827, 136);
__m512 tmp4836 = _mm512_shuffle_f32x4(tmp4823, tmp4827, 221);
__m512 tmp4837 = _mm512_shuffle_f32x4(tmp4824, tmp4828, 136);
__m512 tmp4838 = _mm512_shuffle_f32x4(tmp4824, tmp4828, 221);
__m512 tmp4839 = _mm512_shuffle_f32x4(tmp4825, tmp4829, 136);
__m512 tmp4840 = _mm512_shuffle_f32x4(tmp4825, tmp4829, 221);
__m512 tmp4841 = _mm512_shuffle_f32x4(tmp4826, tmp4830, 136);
__m512 tmp4842 = _mm512_shuffle_f32x4(tmp4826, tmp4830, 221);
__m512 tmp4843 = _mm512_shuffle_f32x4(tmp4831, tmp4831, 136);
__m512 tmp4844 = _mm512_shuffle_f32x4(tmp4831, tmp4831, 221);
__m512 tmp4845 = _mm512_shuffle_f32x4(tmp4832, tmp4832, 136);
__m512 tmp4846 = _mm512_shuffle_f32x4(tmp4832, tmp4832, 221);
__m512 tmp4847 = _mm512_shuffle_f32x4(tmp4833, tmp4833, 136);
__m512 tmp4848 = _mm512_shuffle_f32x4(tmp4833, tmp4833, 221);
__m512 tmp4849 = _mm512_shuffle_f32x4(tmp4834, tmp4834, 136);
__m512 tmp4850 = _mm512_shuffle_f32x4(tmp4834, tmp4834, 221);
tmp4733 = _mm512_shuffle_f32x4(tmp4835, tmp4843, 136);
tmp4741 = _mm512_shuffle_f32x4(tmp4835, tmp4843, 221);
tmp4734 = _mm512_shuffle_f32x4(tmp4837, tmp4845, 136);
tmp4742 = _mm512_shuffle_f32x4(tmp4837, tmp4845, 221);
tmp4735 = _mm512_shuffle_f32x4(tmp4839, tmp4847, 136);
tmp4743 = _mm512_shuffle_f32x4(tmp4839, tmp4847, 221);
tmp4736 = _mm512_shuffle_f32x4(tmp4841, tmp4849, 136);
tmp4744 = _mm512_shuffle_f32x4(tmp4841, tmp4849, 221);
tmp4737 = _mm512_shuffle_f32x4(tmp4836, tmp4844, 136);
__m512 tmp4785 = _mm512_shuffle_f32x4(tmp4836, tmp4844, 221);
tmp4738 = _mm512_shuffle_f32x4(tmp4838, tmp4846, 136);
__m512 tmp4786 = _mm512_shuffle_f32x4(tmp4838, tmp4846, 221);
tmp4739 = _mm512_shuffle_f32x4(tmp4840, tmp4848, 136);
__m512 tmp4787 = _mm512_shuffle_f32x4(tmp4840, tmp4848, 221);
tmp4740 = _mm512_shuffle_f32x4(tmp4842, tmp4850, 136);
__m512 tmp4788 = _mm512_shuffle_f32x4(tmp4842, tmp4850, 221);
(void)tmp4740;
(void)tmp4788;
__m512 tmp4793 = _mm512_add_ps(tmp4734, tmp4735);
__m512 tmp4804 = _mm512_add_ps(tmp4742, tmp4743);
__m512 tmp4792 = _mm512_add_ps(tmp4736, tmp4737);
__m512 tmp4803 = _mm512_add_ps(tmp4744, tmp4785);
__m512 tmp4798 = _mm512_sub_ps(tmp4736, tmp4737);
__m512 tmp4809 = _mm512_sub_ps(tmp4744, tmp4785);
__m512 tmp4797 = _mm512_sub_ps(tmp4734, tmp4735);
__m512 tmp4808 = _mm512_sub_ps(tmp4742, tmp4743);
__m512 tmp4794 = _mm512_add_ps(tmp4738, tmp4739);
__m512 tmp4805 = _mm512_add_ps(tmp4786, tmp4787);
__m512 tmp4799 = _mm512_sub_ps(tmp4738, tmp4739);
__m512 tmp4810 = _mm512_sub_ps(tmp4786, tmp4787);
__m512 tmp4796 = _mm512_fmadd_ps(tmp4798, _mm512_set1_ps(2e+00f), tmp4797);
__m512 tmp4807 = _mm512_fmadd_ps(tmp4809, _mm512_set1_ps(2e+00f), tmp4808);
__m512 tmp4791 = _mm512_add_ps(tmp4792, tmp4793);
__m512 tmp4802 = _mm512_add_ps(tmp4803, tmp4804);
__m512 tmp4795 = _mm512_fmadd_ps(tmp4799, _mm512_set1_ps(1.6e+01f), tmp4796);
__m512 tmp4806 = _mm512_fmadd_ps(tmp4810, _mm512_set1_ps(1.6e+01f), tmp4807);
__m512 tmp4790 = _mm512_add_ps(tmp4791, tmp4733);
__m512 tmp4801 = _mm512_add_ps(tmp4802, tmp4741);
__m512 tmp4789 = _mm512_fmadd_ps(tmp4794, _mm512_set1_ps(3.2e+01f), tmp4790);
__m512 tmp4800 = _mm512_fmadd_ps(tmp4805, _mm512_set1_ps(3.2e+01f), tmp4801);
__m512 out675 = tmp4789;
__m512 out677 = tmp4800;
__m512 out676 = tmp4795;
__m512 out678 = tmp4806;
out675 = _mm512_max_ps(_mm512_setzero_ps(), out675);
out677 = _mm512_max_ps(_mm512_setzero_ps(), out677);
out676 = _mm512_max_ps(_mm512_setzero_ps(), out676);
out678 = _mm512_max_ps(_mm512_setzero_ps(), out678);
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out675);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out677);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out676);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out678);
// ---- Tile read at base offset 256 of sfPtr5 ------------------------------
// Reads sixteen 512-bit vectors through sfPtr5, applies an 8-input ->
// 6-output linear combination, transposes the twelve results, applies the
// first two rows of the same combination again, clamps at zero and writes
// four 12-lane vectors through datPtr6.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd
// output transform and the clamp like a fused ReLU -- confirm against the
// generator; this block alone does not establish either.
// Offsets are in sfPtr5's element units; the spacing of 64 between
// neighbouring loads suggests bytes (one 512-bit vector each) -- TODO confirm
// the declared type of sfPtr5.
//
// Loads come in four groups whose bases are 25600 apart (256, 25856, 51456,
// 77056). Inside a group the loads at +0/+128 are paired and the loads at
// +64/+192 are paired. For each pair (a, b):
//   shuffle_f32x4(a, b, 68)  -> 128-bit lanes {a0, a1, b0, b1} (low halves)
//   shuffle_f32x4(a, b, 238) -> 128-bit lanes {a2, a3, b2, b3} (high halves)
__m512 sf353 = _mm512_loadu_ps(sfPtr5+256+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf354 = _mm512_loadu_ps(sfPtr5+384+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in716 = _mm512_shuffle_f32x4(sf353, sf354, 68);
__m512 in717 = _mm512_shuffle_f32x4(sf353, sf354, 238);
__m512 sf355 = _mm512_loadu_ps(sfPtr5+320+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf356 = _mm512_loadu_ps(sfPtr5+448+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in724 = _mm512_shuffle_f32x4(sf355, sf356, 68);
__m512 in725 = _mm512_shuffle_f32x4(sf355, sf356, 238);
// Group at base 25856.
__m512 sf357 = _mm512_loadu_ps(sfPtr5+25856+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf358 = _mm512_loadu_ps(sfPtr5+25984+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in718 = _mm512_shuffle_f32x4(sf357, sf358, 68);
__m512 in719 = _mm512_shuffle_f32x4(sf357, sf358, 238);
__m512 sf359 = _mm512_loadu_ps(sfPtr5+25920+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf360 = _mm512_loadu_ps(sfPtr5+26048+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in726 = _mm512_shuffle_f32x4(sf359, sf360, 68);
__m512 in727 = _mm512_shuffle_f32x4(sf359, sf360, 238);
// Group at base 51456.
__m512 sf361 = _mm512_loadu_ps(sfPtr5+51456+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf362 = _mm512_loadu_ps(sfPtr5+51584+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in720 = _mm512_shuffle_f32x4(sf361, sf362, 68);
__m512 in721 = _mm512_shuffle_f32x4(sf361, sf362, 238);
__m512 sf363 = _mm512_loadu_ps(sfPtr5+51520+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf364 = _mm512_loadu_ps(sfPtr5+51648+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in728 = _mm512_shuffle_f32x4(sf363, sf364, 68);
__m512 in729 = _mm512_shuffle_f32x4(sf363, sf364, 238);
// Group at base 77056.
__m512 sf365 = _mm512_loadu_ps(sfPtr5+77056+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf366 = _mm512_loadu_ps(sfPtr5+77184+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in722 = _mm512_shuffle_f32x4(sf365, sf366, 68);
__m512 in723 = _mm512_shuffle_f32x4(sf365, sf366, 238);
__m512 sf367 = _mm512_loadu_ps(sfPtr5+77120+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf368 = _mm512_loadu_ps(sfPtr5+77248+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in730 = _mm512_shuffle_f32x4(sf367, sf368, 68);
__m512 in731 = _mm512_shuffle_f32x4(sf367, sf368, 238);
// First pass. in716..in723 are one input set and in724..in731 a second set;
// both go through identical arithmetic on alternating lines. For the first
// set, with
//   s1 = in717+in718, s2 = in719+in720, s3 = in721+in722
//   d1 = in717-in718, d2 = in719-in720, d3 = in721-in722
// the six results are
//   tmp4863 = in716 + s1 +    s2 + 32*s3
//   tmp4869 =         d1 +  2*d2 + 16*d3
//   tmp4874 =         s1 +  4*s2 +  8*s3
//   tmp4876 =         d1 +  8*d2 +  4*d3
//   tmp4878 =         s1 + 16*s2 +  2*s3
//   tmp4880 = in723 + d1 + 32*d2 +    d3
// (second set: tmp4883, tmp4889, tmp4894, tmp4896, tmp4898, tmp4900).
__m512 tmp4867 = _mm512_add_ps(in717, in718);
__m512 tmp4887 = _mm512_add_ps(in725, in726);
__m512 tmp4866 = _mm512_add_ps(in719, in720);
__m512 tmp4886 = _mm512_add_ps(in727, in728);
__m512 tmp4872 = _mm512_sub_ps(in719, in720);
__m512 tmp4892 = _mm512_sub_ps(in727, in728);
__m512 tmp4871 = _mm512_sub_ps(in717, in718);
__m512 tmp4891 = _mm512_sub_ps(in725, in726);
__m512 tmp4868 = _mm512_add_ps(in721, in722);
__m512 tmp4888 = _mm512_add_ps(in729, in730);
__m512 tmp4873 = _mm512_sub_ps(in721, in722);
__m512 tmp4893 = _mm512_sub_ps(in729, in730);
__m512 tmp4870 = _mm512_fmadd_ps(tmp4872, _mm512_set1_ps(2e+00f), tmp4871);
__m512 tmp4890 = _mm512_fmadd_ps(tmp4892, _mm512_set1_ps(2e+00f), tmp4891);
__m512 tmp4877 = _mm512_fmadd_ps(tmp4872, _mm512_set1_ps(8e+00f), tmp4871);
__m512 tmp4897 = _mm512_fmadd_ps(tmp4892, _mm512_set1_ps(8e+00f), tmp4891);
__m512 tmp4865 = _mm512_add_ps(tmp4866, tmp4867);
__m512 tmp4885 = _mm512_add_ps(tmp4886, tmp4887);
__m512 tmp4869 = _mm512_fmadd_ps(tmp4873, _mm512_set1_ps(1.6e+01f), tmp4870);
__m512 tmp4889 = _mm512_fmadd_ps(tmp4893, _mm512_set1_ps(1.6e+01f), tmp4890);
__m512 tmp4876 = _mm512_fmadd_ps(tmp4873, _mm512_set1_ps(4e+00f), tmp4877);
__m512 tmp4896 = _mm512_fmadd_ps(tmp4893, _mm512_set1_ps(4e+00f), tmp4897);
__m512 tmp4882 = _mm512_add_ps(tmp4873, tmp4871);
__m512 tmp4902 = _mm512_add_ps(tmp4893, tmp4891);
__m512 tmp4875 = _mm512_fmadd_ps(tmp4866, _mm512_set1_ps(4e+00f), tmp4867);
__m512 tmp4895 = _mm512_fmadd_ps(tmp4886, _mm512_set1_ps(4e+00f), tmp4887);
__m512 tmp4879 = _mm512_fmadd_ps(tmp4866, _mm512_set1_ps(1.6e+01f), tmp4867);
__m512 tmp4899 = _mm512_fmadd_ps(tmp4886, _mm512_set1_ps(1.6e+01f), tmp4887);
__m512 tmp4864 = _mm512_add_ps(tmp4865, in716);
__m512 tmp4884 = _mm512_add_ps(tmp4885, in724);
__m512 tmp4881 = _mm512_add_ps(tmp4882, in723);
__m512 tmp4901 = _mm512_add_ps(tmp4902, in731);
__m512 tmp4863 = _mm512_fmadd_ps(tmp4868, _mm512_set1_ps(3.2e+01f), tmp4864);
__m512 tmp4883 = _mm512_fmadd_ps(tmp4888, _mm512_set1_ps(3.2e+01f), tmp4884);
__m512 tmp4874 = _mm512_fmadd_ps(tmp4868, _mm512_set1_ps(8e+00f), tmp4875);
__m512 tmp4894 = _mm512_fmadd_ps(tmp4888, _mm512_set1_ps(8e+00f), tmp4895);
__m512 tmp4880 = _mm512_fmadd_ps(tmp4872, _mm512_set1_ps(3.2e+01f), tmp4881);
__m512 tmp4900 = _mm512_fmadd_ps(tmp4892, _mm512_set1_ps(3.2e+01f), tmp4901);
__m512 tmp4878 = _mm512_fmadd_ps(tmp4868, _mm512_set1_ps(2e+00f), tmp4879);
__m512 tmp4898 = _mm512_fmadd_ps(tmp4888, _mm512_set1_ps(2e+00f), tmp4899);
// Gather the twelve first-pass results as matrix rows: rows 0-5 are
// tmp4851..tmp4856 (first set), rows 6-11 are tmp4857..tmp4862 (second set).
// These names are overwritten by the transpose further down.
__m512 tmp4851 = tmp4863;
__m512 tmp4857 = tmp4883;
__m512 tmp4852 = tmp4869;
__m512 tmp4858 = tmp4889;
__m512 tmp4853 = tmp4874;
__m512 tmp4859 = tmp4894;
__m512 tmp4854 = tmp4876;
__m512 tmp4860 = tmp4896;
__m512 tmp4855 = tmp4878;
__m512 tmp4861 = tmp4898;
__m512 tmp4856 = tmp4880;
__m512 tmp4862 = tmp4900;
// Transpose of the 12x16 matrix (row = vector, column = lane).
// Step 1: interleave neighbouring rows pairwise inside each 128-bit lane.
__m512 tmp4929 = _mm512_unpacklo_ps(tmp4851, tmp4852);
__m512 tmp4930 = _mm512_unpackhi_ps(tmp4851, tmp4852);
__m512 tmp4931 = _mm512_unpacklo_ps(tmp4853, tmp4854);
__m512 tmp4932 = _mm512_unpackhi_ps(tmp4853, tmp4854);
__m512 tmp4933 = _mm512_unpacklo_ps(tmp4855, tmp4856);
__m512 tmp4934 = _mm512_unpackhi_ps(tmp4855, tmp4856);
__m512 tmp4935 = _mm512_unpacklo_ps(tmp4857, tmp4858);
__m512 tmp4936 = _mm512_unpackhi_ps(tmp4857, tmp4858);
__m512 tmp4937 = _mm512_unpacklo_ps(tmp4859, tmp4860);
__m512 tmp4938 = _mm512_unpackhi_ps(tmp4859, tmp4860);
__m512 tmp4939 = _mm512_unpacklo_ps(tmp4861, tmp4862);
__m512 tmp4940 = _mm512_unpackhi_ps(tmp4861, tmp4862);
// Step 2: combine row pairs so every 128-bit lane holds four consecutive
// rows of a single column (68 -> {a0,a1,b0,b1}, 238 -> {a2,a3,b2,b3}).
__m512 tmp4941 = _mm512_shuffle_ps(tmp4929, tmp4931, 68);
__m512 tmp4942 = _mm512_shuffle_ps(tmp4929, tmp4931, 238);
__m512 tmp4943 = _mm512_shuffle_ps(tmp4930, tmp4932, 68);
__m512 tmp4944 = _mm512_shuffle_ps(tmp4930, tmp4932, 238);
__m512 tmp4945 = _mm512_shuffle_ps(tmp4933, tmp4935, 68);
__m512 tmp4946 = _mm512_shuffle_ps(tmp4933, tmp4935, 238);
__m512 tmp4947 = _mm512_shuffle_ps(tmp4934, tmp4936, 68);
__m512 tmp4948 = _mm512_shuffle_ps(tmp4934, tmp4936, 238);
__m512 tmp4949 = _mm512_shuffle_ps(tmp4937, tmp4939, 68);
__m512 tmp4950 = _mm512_shuffle_ps(tmp4937, tmp4939, 238);
__m512 tmp4951 = _mm512_shuffle_ps(tmp4938, tmp4940, 68);
__m512 tmp4952 = _mm512_shuffle_ps(tmp4938, tmp4940, 238);
// Step 3: regroup whole 128-bit lanes (136 -> {a0,a2,b0,b2},
// 221 -> {a1,a3,b1,b3}). Rows 8-11 have no partner vector, so the same
// register is passed as both operands.
__m512 tmp4953 = _mm512_shuffle_f32x4(tmp4941, tmp4945, 136);
__m512 tmp4954 = _mm512_shuffle_f32x4(tmp4941, tmp4945, 221);
__m512 tmp4955 = _mm512_shuffle_f32x4(tmp4942, tmp4946, 136);
__m512 tmp4956 = _mm512_shuffle_f32x4(tmp4942, tmp4946, 221);
__m512 tmp4957 = _mm512_shuffle_f32x4(tmp4943, tmp4947, 136);
__m512 tmp4958 = _mm512_shuffle_f32x4(tmp4943, tmp4947, 221);
__m512 tmp4959 = _mm512_shuffle_f32x4(tmp4944, tmp4948, 136);
__m512 tmp4960 = _mm512_shuffle_f32x4(tmp4944, tmp4948, 221);
__m512 tmp4961 = _mm512_shuffle_f32x4(tmp4949, tmp4949, 136);
__m512 tmp4962 = _mm512_shuffle_f32x4(tmp4949, tmp4949, 221);
__m512 tmp4963 = _mm512_shuffle_f32x4(tmp4950, tmp4950, 136);
__m512 tmp4964 = _mm512_shuffle_f32x4(tmp4950, tmp4950, 221);
__m512 tmp4965 = _mm512_shuffle_f32x4(tmp4951, tmp4951, 136);
__m512 tmp4966 = _mm512_shuffle_f32x4(tmp4951, tmp4951, 221);
__m512 tmp4967 = _mm512_shuffle_f32x4(tmp4952, tmp4952, 136);
__m512 tmp4968 = _mm512_shuffle_f32x4(tmp4952, tmp4952, 221);
// Step 4: final lane regroup. Column c of the matrix lands in lanes 0-11 of
//   tmp4851..tmp4858                      for c = 0..7
//   tmp4859..tmp4862, tmp4903..tmp4906    for c = 8..15
// Lanes 12-15 repeat lanes 8-11; the masked stores below never write them.
tmp4851 = _mm512_shuffle_f32x4(tmp4953, tmp4961, 136);
tmp4859 = _mm512_shuffle_f32x4(tmp4953, tmp4961, 221);
tmp4852 = _mm512_shuffle_f32x4(tmp4955, tmp4963, 136);
tmp4860 = _mm512_shuffle_f32x4(tmp4955, tmp4963, 221);
tmp4853 = _mm512_shuffle_f32x4(tmp4957, tmp4965, 136);
tmp4861 = _mm512_shuffle_f32x4(tmp4957, tmp4965, 221);
tmp4854 = _mm512_shuffle_f32x4(tmp4959, tmp4967, 136);
tmp4862 = _mm512_shuffle_f32x4(tmp4959, tmp4967, 221);
tmp4855 = _mm512_shuffle_f32x4(tmp4954, tmp4962, 136);
__m512 tmp4903 = _mm512_shuffle_f32x4(tmp4954, tmp4962, 221);
tmp4856 = _mm512_shuffle_f32x4(tmp4956, tmp4964, 136);
__m512 tmp4904 = _mm512_shuffle_f32x4(tmp4956, tmp4964, 221);
tmp4857 = _mm512_shuffle_f32x4(tmp4958, tmp4966, 136);
__m512 tmp4905 = _mm512_shuffle_f32x4(tmp4958, tmp4966, 221);
tmp4858 = _mm512_shuffle_f32x4(tmp4960, tmp4968, 136);
__m512 tmp4906 = _mm512_shuffle_f32x4(tmp4960, tmp4968, 221);
// Columns 7 and 15 are not read by the second pass; the void casts only
// reference them (presumably to silence unused-variable warnings).
(void)tmp4858;
(void)tmp4906;
// Second pass on columns c0..c6 (tmp4851..tmp4857); columns 8..14 are
// handled identically on the alternating lines. Only the first two
// first-pass formulas are evaluated:
//   tmp4907 = c0 + (c1+c2) +   (c3+c4) + 32*(c5+c6)
//   tmp4913 =      (c1-c2) + 2*(c3-c4) + 16*(c5-c6)
// (columns 8..14: tmp4918 and tmp4924).
__m512 tmp4911 = _mm512_add_ps(tmp4852, tmp4853);
__m512 tmp4922 = _mm512_add_ps(tmp4860, tmp4861);
__m512 tmp4910 = _mm512_add_ps(tmp4854, tmp4855);
__m512 tmp4921 = _mm512_add_ps(tmp4862, tmp4903);
__m512 tmp4916 = _mm512_sub_ps(tmp4854, tmp4855);
__m512 tmp4927 = _mm512_sub_ps(tmp4862, tmp4903);
__m512 tmp4915 = _mm512_sub_ps(tmp4852, tmp4853);
__m512 tmp4926 = _mm512_sub_ps(tmp4860, tmp4861);
__m512 tmp4912 = _mm512_add_ps(tmp4856, tmp4857);
__m512 tmp4923 = _mm512_add_ps(tmp4904, tmp4905);
__m512 tmp4917 = _mm512_sub_ps(tmp4856, tmp4857);
__m512 tmp4928 = _mm512_sub_ps(tmp4904, tmp4905);
__m512 tmp4914 = _mm512_fmadd_ps(tmp4916, _mm512_set1_ps(2e+00f), tmp4915);
__m512 tmp4925 = _mm512_fmadd_ps(tmp4927, _mm512_set1_ps(2e+00f), tmp4926);
__m512 tmp4909 = _mm512_add_ps(tmp4910, tmp4911);
__m512 tmp4920 = _mm512_add_ps(tmp4921, tmp4922);
__m512 tmp4913 = _mm512_fmadd_ps(tmp4917, _mm512_set1_ps(1.6e+01f), tmp4914);
__m512 tmp4924 = _mm512_fmadd_ps(tmp4928, _mm512_set1_ps(1.6e+01f), tmp4925);
__m512 tmp4908 = _mm512_add_ps(tmp4909, tmp4851);
__m512 tmp4919 = _mm512_add_ps(tmp4920, tmp4859);
__m512 tmp4907 = _mm512_fmadd_ps(tmp4912, _mm512_set1_ps(3.2e+01f), tmp4908);
__m512 tmp4918 = _mm512_fmadd_ps(tmp4923, _mm512_set1_ps(3.2e+01f), tmp4919);
__m512 out679 = tmp4907;
__m512 out681 = tmp4918;
__m512 out680 = tmp4913;
__m512 out682 = tmp4924;
// Clamp every lane at zero: out = max(0, out).
out679 = _mm512_max_ps(_mm512_setzero_ps(), out679);
out681 = _mm512_max_ps(_mm512_setzero_ps(), out681);
out680 = _mm512_max_ps(_mm512_setzero_ps(), out680);
out682 = _mm512_max_ps(_mm512_setzero_ps(), out682);
// Mask 4095 (0xFFF) writes lanes 0-11 only. out679/out680 go to offsets 96
// and 320 (= 96 + 224); out681/out682 go to 12608 and 12832 (= 12608 + 224).
// NOTE(review): 224 equals the toH27 multiplier, so each pair presumably
// covers two adjacent output rows -- confirm against the datPtr6 layout.
_mm512_mask_storeu_ps(datPtr6+96+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out679);
_mm512_mask_storeu_ps(datPtr6+12608+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out681);
_mm512_mask_storeu_ps(datPtr6+320+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out680);
_mm512_mask_storeu_ps(datPtr6+12832+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out682);
// ---- Tile read at base offset 512 of sfPtr5 ------------------------------
// Reads sixteen 512-bit vectors through sfPtr5, applies an 8-input ->
// 6-output linear combination, transposes the twelve results, applies the
// first two rows of the same combination again, clamps at zero and writes
// four 12-lane vectors through datPtr6.
// NOTE(review): the 1/2/4/8/16/32 coefficient pattern looks like a Winograd
// output transform and the clamp like a fused ReLU -- confirm against the
// generator; this block alone does not establish either.
// Offsets are in sfPtr5's element units; the spacing of 64 between
// neighbouring loads suggests bytes (one 512-bit vector each) -- TODO confirm
// the declared type of sfPtr5.
//
// Loads come in four groups whose bases are 25600 apart (512, 26112, 51712,
// 77312). Inside a group the loads at +0/+128 are paired and the loads at
// +64/+192 are paired. For each pair (a, b):
//   shuffle_f32x4(a, b, 68)  -> 128-bit lanes {a0, a1, b0, b1} (low halves)
//   shuffle_f32x4(a, b, 238) -> 128-bit lanes {a2, a3, b2, b3} (high halves)
__m512 sf369 = _mm512_loadu_ps(sfPtr5+512+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf370 = _mm512_loadu_ps(sfPtr5+640+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in732 = _mm512_shuffle_f32x4(sf369, sf370, 68);
__m512 in733 = _mm512_shuffle_f32x4(sf369, sf370, 238);
__m512 sf371 = _mm512_loadu_ps(sfPtr5+576+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf372 = _mm512_loadu_ps(sfPtr5+704+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in740 = _mm512_shuffle_f32x4(sf371, sf372, 68);
__m512 in741 = _mm512_shuffle_f32x4(sf371, sf372, 238);
// Group at base 26112.
__m512 sf373 = _mm512_loadu_ps(sfPtr5+26112+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf374 = _mm512_loadu_ps(sfPtr5+26240+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in734 = _mm512_shuffle_f32x4(sf373, sf374, 68);
__m512 in735 = _mm512_shuffle_f32x4(sf373, sf374, 238);
__m512 sf375 = _mm512_loadu_ps(sfPtr5+26176+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf376 = _mm512_loadu_ps(sfPtr5+26304+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in742 = _mm512_shuffle_f32x4(sf375, sf376, 68);
__m512 in743 = _mm512_shuffle_f32x4(sf375, sf376, 238);
// Group at base 51712.
__m512 sf377 = _mm512_loadu_ps(sfPtr5+51712+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf378 = _mm512_loadu_ps(sfPtr5+51840+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in736 = _mm512_shuffle_f32x4(sf377, sf378, 68);
__m512 in737 = _mm512_shuffle_f32x4(sf377, sf378, 238);
__m512 sf379 = _mm512_loadu_ps(sfPtr5+51776+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf380 = _mm512_loadu_ps(sfPtr5+51904+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in744 = _mm512_shuffle_f32x4(sf379, sf380, 68);
__m512 in745 = _mm512_shuffle_f32x4(sf379, sf380, 238);
// Group at base 77312.
__m512 sf381 = _mm512_loadu_ps(sfPtr5+77312+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf382 = _mm512_loadu_ps(sfPtr5+77440+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in738 = _mm512_shuffle_f32x4(sf381, sf382, 68);
__m512 in739 = _mm512_shuffle_f32x4(sf381, sf382, 238);
__m512 sf383 = _mm512_loadu_ps(sfPtr5+77376+102400*i19+1536*j13+1536*k68+768*l20);
__m512 sf384 = _mm512_loadu_ps(sfPtr5+77504+102400*i19+1536*j13+1536*k68+768*l20);
__m512 in746 = _mm512_shuffle_f32x4(sf383, sf384, 68);
__m512 in747 = _mm512_shuffle_f32x4(sf383, sf384, 238);
// First pass. in732..in739 are one input set and in740..in747 a second set;
// both go through identical arithmetic on alternating lines. For the first
// set, with
//   s1 = in733+in734, s2 = in735+in736, s3 = in737+in738
//   d1 = in733-in734, d2 = in735-in736, d3 = in737-in738
// the six results are
//   tmp4981 = in732 + s1 +    s2 + 32*s3
//   tmp4987 =         d1 +  2*d2 + 16*d3
//   tmp4992 =         s1 +  4*s2 +  8*s3
//   tmp4994 =         d1 +  8*d2 +  4*d3
//   tmp4996 =         s1 + 16*s2 +  2*s3
//   tmp4998 = in739 + d1 + 32*d2 +    d3
// (second set: tmp5001, tmp5007, tmp5012, tmp5014, tmp5016, tmp5018).
__m512 tmp4985 = _mm512_add_ps(in733, in734);
__m512 tmp5005 = _mm512_add_ps(in741, in742);
__m512 tmp4984 = _mm512_add_ps(in735, in736);
__m512 tmp5004 = _mm512_add_ps(in743, in744);
__m512 tmp4990 = _mm512_sub_ps(in735, in736);
__m512 tmp5010 = _mm512_sub_ps(in743, in744);
__m512 tmp4989 = _mm512_sub_ps(in733, in734);
__m512 tmp5009 = _mm512_sub_ps(in741, in742);
__m512 tmp4986 = _mm512_add_ps(in737, in738);
__m512 tmp5006 = _mm512_add_ps(in745, in746);
__m512 tmp4991 = _mm512_sub_ps(in737, in738);
__m512 tmp5011 = _mm512_sub_ps(in745, in746);
__m512 tmp4988 = _mm512_fmadd_ps(tmp4990, _mm512_set1_ps(2e+00f), tmp4989);
__m512 tmp5008 = _mm512_fmadd_ps(tmp5010, _mm512_set1_ps(2e+00f), tmp5009);
__m512 tmp4995 = _mm512_fmadd_ps(tmp4990, _mm512_set1_ps(8e+00f), tmp4989);
__m512 tmp5015 = _mm512_fmadd_ps(tmp5010, _mm512_set1_ps(8e+00f), tmp5009);
__m512 tmp4983 = _mm512_add_ps(tmp4984, tmp4985);
__m512 tmp5003 = _mm512_add_ps(tmp5004, tmp5005);
__m512 tmp4987 = _mm512_fmadd_ps(tmp4991, _mm512_set1_ps(1.6e+01f), tmp4988);
__m512 tmp5007 = _mm512_fmadd_ps(tmp5011, _mm512_set1_ps(1.6e+01f), tmp5008);
__m512 tmp4994 = _mm512_fmadd_ps(tmp4991, _mm512_set1_ps(4e+00f), tmp4995);
__m512 tmp5014 = _mm512_fmadd_ps(tmp5011, _mm512_set1_ps(4e+00f), tmp5015);
__m512 tmp5000 = _mm512_add_ps(tmp4991, tmp4989);
__m512 tmp5020 = _mm512_add_ps(tmp5011, tmp5009);
__m512 tmp4993 = _mm512_fmadd_ps(tmp4984, _mm512_set1_ps(4e+00f), tmp4985);
__m512 tmp5013 = _mm512_fmadd_ps(tmp5004, _mm512_set1_ps(4e+00f), tmp5005);
__m512 tmp4997 = _mm512_fmadd_ps(tmp4984, _mm512_set1_ps(1.6e+01f), tmp4985);
__m512 tmp5017 = _mm512_fmadd_ps(tmp5004, _mm512_set1_ps(1.6e+01f), tmp5005);
__m512 tmp4982 = _mm512_add_ps(tmp4983, in732);
__m512 tmp5002 = _mm512_add_ps(tmp5003, in740);
__m512 tmp4999 = _mm512_add_ps(tmp5000, in739);
__m512 tmp5019 = _mm512_add_ps(tmp5020, in747);
__m512 tmp4981 = _mm512_fmadd_ps(tmp4986, _mm512_set1_ps(3.2e+01f), tmp4982);
__m512 tmp5001 = _mm512_fmadd_ps(tmp5006, _mm512_set1_ps(3.2e+01f), tmp5002);
__m512 tmp4992 = _mm512_fmadd_ps(tmp4986, _mm512_set1_ps(8e+00f), tmp4993);
__m512 tmp5012 = _mm512_fmadd_ps(tmp5006, _mm512_set1_ps(8e+00f), tmp5013);
__m512 tmp4998 = _mm512_fmadd_ps(tmp4990, _mm512_set1_ps(3.2e+01f), tmp4999);
__m512 tmp5018 = _mm512_fmadd_ps(tmp5010, _mm512_set1_ps(3.2e+01f), tmp5019);
__m512 tmp4996 = _mm512_fmadd_ps(tmp4986, _mm512_set1_ps(2e+00f), tmp4997);
__m512 tmp5016 = _mm512_fmadd_ps(tmp5006, _mm512_set1_ps(2e+00f), tmp5017);
// Gather the twelve first-pass results as matrix rows: rows 0-5 are
// tmp4969..tmp4974 (first set), rows 6-11 are tmp4975..tmp4980 (second set).
// These names are overwritten by the transpose further down.
__m512 tmp4969 = tmp4981;
__m512 tmp4975 = tmp5001;
__m512 tmp4970 = tmp4987;
__m512 tmp4976 = tmp5007;
__m512 tmp4971 = tmp4992;
__m512 tmp4977 = tmp5012;
__m512 tmp4972 = tmp4994;
__m512 tmp4978 = tmp5014;
__m512 tmp4973 = tmp4996;
__m512 tmp4979 = tmp5016;
__m512 tmp4974 = tmp4998;
__m512 tmp4980 = tmp5018;
// Transpose of the 12x16 matrix (row = vector, column = lane).
// Step 1: interleave neighbouring rows pairwise inside each 128-bit lane.
__m512 tmp5047 = _mm512_unpacklo_ps(tmp4969, tmp4970);
__m512 tmp5048 = _mm512_unpackhi_ps(tmp4969, tmp4970);
__m512 tmp5049 = _mm512_unpacklo_ps(tmp4971, tmp4972);
__m512 tmp5050 = _mm512_unpackhi_ps(tmp4971, tmp4972);
__m512 tmp5051 = _mm512_unpacklo_ps(tmp4973, tmp4974);
__m512 tmp5052 = _mm512_unpackhi_ps(tmp4973, tmp4974);
__m512 tmp5053 = _mm512_unpacklo_ps(tmp4975, tmp4976);
__m512 tmp5054 = _mm512_unpackhi_ps(tmp4975, tmp4976);
__m512 tmp5055 = _mm512_unpacklo_ps(tmp4977, tmp4978);
__m512 tmp5056 = _mm512_unpackhi_ps(tmp4977, tmp4978);
__m512 tmp5057 = _mm512_unpacklo_ps(tmp4979, tmp4980);
__m512 tmp5058 = _mm512_unpackhi_ps(tmp4979, tmp4980);
// Step 2: combine row pairs so every 128-bit lane holds four consecutive
// rows of a single column (68 -> {a0,a1,b0,b1}, 238 -> {a2,a3,b2,b3}).
__m512 tmp5059 = _mm512_shuffle_ps(tmp5047, tmp5049, 68);
__m512 tmp5060 = _mm512_shuffle_ps(tmp5047, tmp5049, 238);
__m512 tmp5061 = _mm512_shuffle_ps(tmp5048, tmp5050, 68);
__m512 tmp5062 = _mm512_shuffle_ps(tmp5048, tmp5050, 238);
__m512 tmp5063 = _mm512_shuffle_ps(tmp5051, tmp5053, 68);
__m512 tmp5064 = _mm512_shuffle_ps(tmp5051, tmp5053, 238);
__m512 tmp5065 = _mm512_shuffle_ps(tmp5052, tmp5054, 68);
__m512 tmp5066 = _mm512_shuffle_ps(tmp5052, tmp5054, 238);
__m512 tmp5067 = _mm512_shuffle_ps(tmp5055, tmp5057, 68);
__m512 tmp5068 = _mm512_shuffle_ps(tmp5055, tmp5057, 238);
__m512 tmp5069 = _mm512_shuffle_ps(tmp5056, tmp5058, 68);
__m512 tmp5070 = _mm512_shuffle_ps(tmp5056, tmp5058, 238);
// Step 3: regroup whole 128-bit lanes (136 -> {a0,a2,b0,b2},
// 221 -> {a1,a3,b1,b3}). Rows 8-11 have no partner vector, so the same
// register is passed as both operands.
__m512 tmp5071 = _mm512_shuffle_f32x4(tmp5059, tmp5063, 136);
__m512 tmp5072 = _mm512_shuffle_f32x4(tmp5059, tmp5063, 221);
__m512 tmp5073 = _mm512_shuffle_f32x4(tmp5060, tmp5064, 136);
__m512 tmp5074 = _mm512_shuffle_f32x4(tmp5060, tmp5064, 221);
__m512 tmp5075 = _mm512_shuffle_f32x4(tmp5061, tmp5065, 136);
__m512 tmp5076 = _mm512_shuffle_f32x4(tmp5061, tmp5065, 221);
__m512 tmp5077 = _mm512_shuffle_f32x4(tmp5062, tmp5066, 136);
__m512 tmp5078 = _mm512_shuffle_f32x4(tmp5062, tmp5066, 221);
__m512 tmp5079 = _mm512_shuffle_f32x4(tmp5067, tmp5067, 136);
__m512 tmp5080 = _mm512_shuffle_f32x4(tmp5067, tmp5067, 221);
__m512 tmp5081 = _mm512_shuffle_f32x4(tmp5068, tmp5068, 136);
__m512 tmp5082 = _mm512_shuffle_f32x4(tmp5068, tmp5068, 221);
__m512 tmp5083 = _mm512_shuffle_f32x4(tmp5069, tmp5069, 136);
__m512 tmp5084 = _mm512_shuffle_f32x4(tmp5069, tmp5069, 221);
__m512 tmp5085 = _mm512_shuffle_f32x4(tmp5070, tmp5070, 136);
__m512 tmp5086 = _mm512_shuffle_f32x4(tmp5070, tmp5070, 221);
// Step 4: final lane regroup. Column c of the matrix lands in lanes 0-11 of
//   tmp4969..tmp4976                      for c = 0..7
//   tmp4977..tmp4980, tmp5021..tmp5024    for c = 8..15
// Lanes 12-15 repeat lanes 8-11; the masked stores below never write them.
tmp4969 = _mm512_shuffle_f32x4(tmp5071, tmp5079, 136);
tmp4977 = _mm512_shuffle_f32x4(tmp5071, tmp5079, 221);
tmp4970 = _mm512_shuffle_f32x4(tmp5073, tmp5081, 136);
tmp4978 = _mm512_shuffle_f32x4(tmp5073, tmp5081, 221);
tmp4971 = _mm512_shuffle_f32x4(tmp5075, tmp5083, 136);
tmp4979 = _mm512_shuffle_f32x4(tmp5075, tmp5083, 221);
tmp4972 = _mm512_shuffle_f32x4(tmp5077, tmp5085, 136);
tmp4980 = _mm512_shuffle_f32x4(tmp5077, tmp5085, 221);
tmp4973 = _mm512_shuffle_f32x4(tmp5072, tmp5080, 136);
__m512 tmp5021 = _mm512_shuffle_f32x4(tmp5072, tmp5080, 221);
tmp4974 = _mm512_shuffle_f32x4(tmp5074, tmp5082, 136);
__m512 tmp5022 = _mm512_shuffle_f32x4(tmp5074, tmp5082, 221);
tmp4975 = _mm512_shuffle_f32x4(tmp5076, tmp5084, 136);
__m512 tmp5023 = _mm512_shuffle_f32x4(tmp5076, tmp5084, 221);
tmp4976 = _mm512_shuffle_f32x4(tmp5078, tmp5086, 136);
__m512 tmp5024 = _mm512_shuffle_f32x4(tmp5078, tmp5086, 221);
// Columns 7 and 15 are not read by the second pass; the void casts only
// reference them (presumably to silence unused-variable warnings).
(void)tmp4976;
(void)tmp5024;
// Second pass on columns c0..c6 (tmp4969..tmp4975); columns 8..14 are
// handled identically on the alternating lines. Only the first two
// first-pass formulas are evaluated:
//   tmp5025 = c0 + (c1+c2) +   (c3+c4) + 32*(c5+c6)
//   tmp5031 =      (c1-c2) + 2*(c3-c4) + 16*(c5-c6)
// (columns 8..14: tmp5036 and tmp5042).
__m512 tmp5029 = _mm512_add_ps(tmp4970, tmp4971);
__m512 tmp5040 = _mm512_add_ps(tmp4978, tmp4979);
__m512 tmp5028 = _mm512_add_ps(tmp4972, tmp4973);
__m512 tmp5039 = _mm512_add_ps(tmp4980, tmp5021);
__m512 tmp5034 = _mm512_sub_ps(tmp4972, tmp4973);
__m512 tmp5045 = _mm512_sub_ps(tmp4980, tmp5021);
__m512 tmp5033 = _mm512_sub_ps(tmp4970, tmp4971);
__m512 tmp5044 = _mm512_sub_ps(tmp4978, tmp4979);
__m512 tmp5030 = _mm512_add_ps(tmp4974, tmp4975);
__m512 tmp5041 = _mm512_add_ps(tmp5022, tmp5023);
__m512 tmp5035 = _mm512_sub_ps(tmp4974, tmp4975);
__m512 tmp5046 = _mm512_sub_ps(tmp5022, tmp5023);
__m512 tmp5032 = _mm512_fmadd_ps(tmp5034, _mm512_set1_ps(2e+00f), tmp5033);
__m512 tmp5043 = _mm512_fmadd_ps(tmp5045, _mm512_set1_ps(2e+00f), tmp5044);
__m512 tmp5027 = _mm512_add_ps(tmp5028, tmp5029);
__m512 tmp5038 = _mm512_add_ps(tmp5039, tmp5040);
__m512 tmp5031 = _mm512_fmadd_ps(tmp5035, _mm512_set1_ps(1.6e+01f), tmp5032);
__m512 tmp5042 = _mm512_fmadd_ps(tmp5046, _mm512_set1_ps(1.6e+01f), tmp5043);
__m512 tmp5026 = _mm512_add_ps(tmp5027, tmp4969);
__m512 tmp5037 = _mm512_add_ps(tmp5038, tmp4977);
__m512 tmp5025 = _mm512_fmadd_ps(tmp5030, _mm512_set1_ps(3.2e+01f), tmp5026);
__m512 tmp5036 = _mm512_fmadd_ps(tmp5041, _mm512_set1_ps(3.2e+01f), tmp5037);
__m512 out683 = tmp5025;
__m512 out685 = tmp5036;
__m512 out684 = tmp5031;
__m512 out686 = tmp5042;
// Clamp every lane at zero: out = max(0, out).
out683 = _mm512_max_ps(_mm512_setzero_ps(), out683);
out685 = _mm512_max_ps(_mm512_setzero_ps(), out685);
out684 = _mm512_max_ps(_mm512_setzero_ps(), out684);
out686 = _mm512_max_ps(_mm512_setzero_ps(), out686);
// Mask 4095 (0xFFF) writes lanes 0-11 only. out683/out684 go to offsets
// 12656 and 12880 (= 12656 + 224); out685/out686 go to 12704 and 12928
// (= 12704 + 224).
// NOTE(review): 224 equals the toH27 multiplier, so each pair presumably
// covers two adjacent output rows -- confirm against the datPtr6 layout.
_mm512_mask_storeu_ps(datPtr6+12656+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out683);
_mm512_mask_storeu_ps(datPtr6+12704+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out685);
_mm512_mask_storeu_ps(datPtr6+12880+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out684);
_mm512_mask_storeu_ps(datPtr6+12928+50432*i19+224*toH27+4*toW27+50432*k68+25216*l20, 4095, out686);
}
}
++j13;
rel12 = 1;
}
ptrdiff_t toH28 = base12+0;
ptrdiff_t toW28 = 36;
ptrdiff_t k69 = 1*w33;
for (; k69 != 1; ++k69) {
ptrdiff_t l21 = 0;
for (; l21 != 4; ++l21) {
__m512 sf385 = _mm512_loadu_ps(sfPtr5+0+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf386 = _mm512_loadu_ps(sfPtr5+128+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in748 = _mm512_shuffle_f32x4(sf385, sf386, 68);
__m512 in749 = _mm512_shuffle_f32x4(sf385, sf386, 238);
__m512 sf387 = _mm512_loadu_ps(sfPtr5+64+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf388 = _mm512_loadu_ps(sfPtr5+192+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in756 = _mm512_shuffle_f32x4(sf387, sf388, 68);
__m512 in757 = _mm512_shuffle_f32x4(sf387, sf388, 238);
__m512 sf389 = _mm512_loadu_ps(sfPtr5+25600+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf390 = _mm512_loadu_ps(sfPtr5+25728+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in750 = _mm512_shuffle_f32x4(sf389, sf390, 68);
__m512 in751 = _mm512_shuffle_f32x4(sf389, sf390, 238);
__m512 sf391 = _mm512_loadu_ps(sfPtr5+25664+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf392 = _mm512_loadu_ps(sfPtr5+25792+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in758 = _mm512_shuffle_f32x4(sf391, sf392, 68);
__m512 in759 = _mm512_shuffle_f32x4(sf391, sf392, 238);
__m512 sf393 = _mm512_loadu_ps(sfPtr5+51200+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf394 = _mm512_loadu_ps(sfPtr5+51328+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in752 = _mm512_shuffle_f32x4(sf393, sf394, 68);
__m512 in753 = _mm512_shuffle_f32x4(sf393, sf394, 238);
__m512 sf395 = _mm512_loadu_ps(sfPtr5+51264+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf396 = _mm512_loadu_ps(sfPtr5+51392+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in760 = _mm512_shuffle_f32x4(sf395, sf396, 68);
__m512 in761 = _mm512_shuffle_f32x4(sf395, sf396, 238);
__m512 sf397 = _mm512_loadu_ps(sfPtr5+76800+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf398 = _mm512_loadu_ps(sfPtr5+76928+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in754 = _mm512_shuffle_f32x4(sf397, sf398, 68);
__m512 in755 = _mm512_shuffle_f32x4(sf397, sf398, 238);
__m512 sf399 = _mm512_loadu_ps(sfPtr5+76864+102400*i19+1536*j13+1024*k69+256*l21);
__m512 sf400 = _mm512_loadu_ps(sfPtr5+76992+102400*i19+1536*j13+1024*k69+256*l21);
__m512 in762 = _mm512_shuffle_f32x4(sf399, sf400, 68);
__m512 in763 = _mm512_shuffle_f32x4(sf399, sf400, 238);
__m512 tmp5103 = _mm512_add_ps(in749, in750);
__m512 tmp5123 = _mm512_add_ps(in757, in758);
__m512 tmp5102 = _mm512_add_ps(in751, in752);
__m512 tmp5122 = _mm512_add_ps(in759, in760);
__m512 tmp5108 = _mm512_sub_ps(in751, in752);
__m512 tmp5128 = _mm512_sub_ps(in759, in760);
__m512 tmp5107 = _mm512_sub_ps(in749, in750);
__m512 tmp5127 = _mm512_sub_ps(in757, in758);
__m512 tmp5104 = _mm512_add_ps(in753, in754);
__m512 tmp5124 = _mm512_add_ps(in761, in762);
__m512 tmp5109 = _mm512_sub_ps(in753, in754);
__m512 tmp5129 = _mm512_sub_ps(in761, in762);
__m512 tmp5106 = _mm512_fmadd_ps(tmp5108, _mm512_set1_ps(2e+00f), tmp5107);
__m512 tmp5126 = _mm512_fmadd_ps(tmp5128, _mm512_set1_ps(2e+00f), tmp5127);
__m512 tmp5113 = _mm512_fmadd_ps(tmp5108, _mm512_set1_ps(8e+00f), tmp5107);
__m512 tmp5133 = _mm512_fmadd_ps(tmp5128, _mm512_set1_ps(8e+00f), tmp5127);
__m512 tmp5101 = _mm512_add_ps(tmp5102, tmp5103);
__m512 tmp5121 = _mm512_add_ps(tmp5122, tmp5123);
__m512 tmp5105 = _mm512_fmadd_ps(tmp5109, _mm512_set1_ps(1.6e+01f), tmp5106);
__m512 tmp5125 = _mm512_fmadd_ps(tmp5129, _mm512_set1_ps(1.6e+01f), tmp5126);
__m512 tmp5112 = _mm512_fmadd_ps(tmp5109, _mm512_set1_ps(4e+00f), tmp5113);
__m512 tmp5132 = _mm512_fmadd_ps(tmp5129, _mm512_set1_ps(4e+00f), tmp5133);
__m512 tmp5118 = _mm512_add_ps(tmp5109, tmp5107);
__m512 tmp5138 = _mm512_add_ps(tmp5129, tmp5127);
__m512 tmp5111 = _mm512_fmadd_ps(tmp5102, _mm512_set1_ps(4e+00f), tmp5103);
__m512 tmp5131 = _mm512_fmadd_ps(tmp5122, _mm512_set1_ps(4e+00f), tmp5123);
__m512 tmp5115 = _mm512_fmadd_ps(tmp5102, _mm512_set1_ps(1.6e+01f), tmp5103);
__m512 tmp5135 = _mm512_fmadd_ps(tmp5122, _mm512_set1_ps(1.6e+01f), tmp5123);
__m512 tmp5100 = _mm512_add_ps(tmp5101, in748);
__m512 tmp5120 = _mm512_add_ps(tmp5121, in756);
__m512 tmp5117 = _mm512_add_ps(tmp5118, in755);
__m512 tmp5137 = _mm512_add_ps(tmp5138, in763);
__m512 tmp5099 = _mm512_fmadd_ps(tmp5104, _mm512_set1_ps(3.2e+01f), tmp5100);
__m512 tmp5119 = _mm512_fmadd_ps(tmp5124, _mm512_set1_ps(3.2e+01f), tmp5120);
__m512 tmp5110 = _mm512_fmadd_ps(tmp5104, _mm512_set1_ps(8e+00f), tmp5111);
__m512 tmp5130 = _mm512_fmadd_ps(tmp5124, _mm512_set1_ps(8e+00f), tmp5131);
__m512 tmp5116 = _mm512_fmadd_ps(tmp5108, _mm512_set1_ps(3.2e+01f), tmp5117);
__m512 tmp5136 = _mm512_fmadd_ps(tmp5128, _mm512_set1_ps(3.2e+01f), tmp5137);
__m512 tmp5114 = _mm512_fmadd_ps(tmp5104, _mm512_set1_ps(2e+00f), tmp5115);
__m512 tmp5134 = _mm512_fmadd_ps(tmp5124, _mm512_set1_ps(2e+00f), tmp5135);
__m512 tmp5087 = tmp5099;
__m512 tmp5093 = tmp5119;
__m512 tmp5088 = tmp5105;
__m512 tmp5094 = tmp5125;
__m512 tmp5089 = tmp5110;
__m512 tmp5095 = tmp5130;
__m512 tmp5090 = tmp5112;
__m512 tmp5096 = tmp5132;
__m512 tmp5091 = tmp5114;
__m512 tmp5097 = tmp5134;
__m512 tmp5092 = tmp5116;
__m512 tmp5098 = tmp5136;
__m512 tmp5165 = _mm512_unpacklo_ps(tmp5087, tmp5088);
__m512 tmp5166 = _mm512_unpackhi_ps(tmp5087, tmp5088);
__m512 tmp5167 = _mm512_unpacklo_ps(tmp5089, tmp5090);
__m512 tmp5168 = _mm512_unpackhi_ps(tmp5089, tmp5090);
__m512 tmp5169 = _mm512_unpacklo_ps(tmp5091, tmp5092);
__m512 tmp5170 = _mm512_unpackhi_ps(tmp5091, tmp5092);
__m512 tmp5171 = _mm512_unpacklo_ps(tmp5093, tmp5094);
__m512 tmp5172 = _mm512_unpackhi_ps(tmp5093, tmp5094);
__m512 tmp5173 = _mm512_unpacklo_ps(tmp5095, tmp5096);
__m512 tmp5174 = _mm512_unpackhi_ps(tmp5095, tmp5096);
__m512 tmp5175 = _mm512_unpacklo_ps(tmp5097, tmp5098);
__m512 tmp5176 = _mm512_unpackhi_ps(tmp5097, tmp5098);
__m512 tmp5177 = _mm512_shuffle_ps(tmp5165, tmp5167, 68);
__m512 tmp5178 = _mm512_shuffle_ps(tmp5165, tmp5167, 238);
__m512 tmp5179 = _mm512_shuffle_ps(tmp5166, tmp5168, 68);
__m512 tmp5180 = _mm512_shuffle_ps(tmp5166, tmp5168, 238);
__m512 tmp5181 = _mm512_shuffle_ps(tmp5169, tmp5171, 68);
__m512 tmp5182 = _mm512_shuffle_ps(tmp5169, tmp5171, 238);
__m512 tmp5183 = _mm512_shuffle_ps(tmp5170, tmp5172, 68);
__m512 tmp5184 = _mm512_shuffle_ps(tmp5170, tmp5172, 238);
__m512 tmp5185 = _mm512_shuffle_ps(tmp5173, tmp5175, 68);
__m512 tmp5186 = _mm512_shuffle_ps(tmp5173, tmp5175, 238);
__m512 tmp5187 = _mm512_shuffle_ps(tmp5174, tmp5176, 68);
__m512 tmp5188 = _mm512_shuffle_ps(tmp5174, tmp5176, 238);
__m512 tmp5189 = _mm512_shuffle_f32x4(tmp5177, tmp5181, 136);
__m512 tmp5190 = _mm512_shuffle_f32x4(tmp5177, tmp5181, 221);
__m512 tmp5191 = _mm512_shuffle_f32x4(tmp5178, tmp5182, 136);
__m512 tmp5192 = _mm512_shuffle_f32x4(tmp5178, tmp5182, 221);
__m512 tmp5193 = _mm512_shuffle_f32x4(tmp5179, tmp5183, 136);
__m512 tmp5194 = _mm512_shuffle_f32x4(tmp5179, tmp5183, 221);
__m512 tmp5195 = _mm512_shuffle_f32x4(tmp5180, tmp5184, 136);
__m512 tmp5196 = _mm512_shuffle_f32x4(tmp5180, tmp5184, 221);
__m512 tmp5197 = _mm512_shuffle_f32x4(tmp5185, tmp5185, 136);
__m512 tmp5198 = _mm512_shuffle_f32x4(tmp5185, tmp5185, 221);
__m512 tmp5199 = _mm512_shuffle_f32x4(tmp5186, tmp5186, 136);
__m512 tmp5200 = _mm512_shuffle_f32x4(tmp5186, tmp5186, 221);
__m512 tmp5201 = _mm512_shuffle_f32x4(tmp5187, tmp5187, 136);
__m512 tmp5202 = _mm512_shuffle_f32x4(tmp5187, tmp5187, 221);
__m512 tmp5203 = _mm512_shuffle_f32x4(tmp5188, tmp5188, 136);
__m512 tmp5204 = _mm512_shuffle_f32x4(tmp5188, tmp5188, 221);
tmp5087 = _mm512_shuffle_f32x4(tmp5189, tmp5197, 136);
tmp5095 = _mm512_shuffle_f32x4(tmp5189, tmp5197, 221);
tmp5088 = _mm512_shuffle_f32x4(tmp5191, tmp5199, 136);
tmp5096 = _mm512_shuffle_f32x4(tmp5191, tmp5199, 221);
tmp5089 = _mm512_shuffle_f32x4(tmp5193, tmp5201, 136);
tmp5097 = _mm512_shuffle_f32x4(tmp5193, tmp5201, 221);
tmp5090 = _mm512_shuffle_f32x4(tmp5195, tmp5203, 136);
tmp5098 = _mm512_shuffle_f32x4(tmp5195, tmp5203, 221);
tmp5091 = _mm512_shuffle_f32x4(tmp5190, tmp5198, 136);
__m512 tmp5139 = _mm512_shuffle_f32x4(tmp5190, tmp5198, 221);
tmp5092 = _mm512_shuffle_f32x4(tmp5192, tmp5200, 136);
__m512 tmp5140 = _mm512_shuffle_f32x4(tmp5192, tmp5200, 221);
tmp5093 = _mm512_shuffle_f32x4(tmp5194, tmp5202, 136);
__m512 tmp5141 = _mm512_shuffle_f32x4(tmp5194, tmp5202, 221);
tmp5094 = _mm512_shuffle_f32x4(tmp5196, tmp5204, 136);
__m512 tmp5142 = _mm512_shuffle_f32x4(tmp5196, tmp5204, 221);
(void)tmp5094;
(void)tmp5142;
__m512 tmp5147 = _mm512_add_ps(tmp5088, tmp5089);
__m512 tmp5158 = _mm512_add_ps(tmp5096, tmp5097);
__m512 tmp5146 = _mm512_add_ps(tmp5090, tmp5091);
__m512 tmp5157 = _mm512_add_ps(tmp5098, tmp5139);
__m512 tmp5152 = _mm512_sub_ps(tmp5090, tmp5091);
__m512 tmp5163 = _mm512_sub_ps(tmp5098, tmp5139);
__m512 tmp5151 = _mm512_sub_ps(tmp5088, tmp5089);
__m512 tmp5162 = _mm512_sub_ps(tmp5096, tmp5097);
__m512 tmp5148 = _mm512_add_ps(tmp5092, tmp5093);
__m512 tmp5159 = _mm512_add_ps(tmp5140, tmp5141);
__m512 tmp5153 = _mm512_sub_ps(tmp5092, tmp5093);
__m512 tmp5164 = _mm512_sub_ps(tmp5140, tmp5141);
__m512 tmp5150 = _mm512_fmadd_ps(tmp5152, _mm512_set1_ps(2e+00f), tmp5151);
__m512 tmp5161 = _mm512_fmadd_ps(tmp5163, _mm512_set1_ps(2e+00f), tmp5162);
__m512 tmp5145 = _mm512_add_ps(tmp5146, tmp5147);
__m512 tmp5156 = _mm512_add_ps(tmp5157, tmp5158);
__m512 tmp5149 = _mm512_fmadd_ps(tmp5153, _mm512_set1_ps(1.6e+01f), tmp5150);
__m512 tmp5160 = _mm512_fmadd_ps(tmp5164, _mm512_set1_ps(1.6e+01f), tmp5161);
__m512 tmp5144 = _mm512_add_ps(tmp5145, tmp5087);
__m512 tmp5155 = _mm512_add_ps(tmp5156, tmp5095);
__m512 tmp5143 = _mm512_fmadd_ps(tmp5148, _mm512_set1_ps(3.2e+01f), tmp5144);
__m512 tmp5154 = _mm512_fmadd_ps(tmp5159, _mm512_set1_ps(3.2e+01f), tmp5155);
__m512 out687 = tmp5143;
__m512 out689 = tmp5154;
__m512 out688 = tmp5149;
__m512 out690 = tmp5160;
out687 = _mm512_max_ps(_mm512_setzero_ps(), out687);
out689 = _mm512_max_ps(_mm512_setzero_ps(), out689);
out688 = _mm512_max_ps(_mm512_setzero_ps(), out688);
out690 = _mm512_max_ps(_mm512_setzero_ps(), out690);
_mm512_mask_storeu_ps(datPtr6+0+50432*i19+224*toH28+4*toW28+50432*k69+12608*l21, 4095, out687);
_mm512_mask_storeu_ps(datPtr6+48+50432*i19+224*toH28+4*toW28+50432*k69+12608*l21, 255, out689);
_mm512_mask_storeu_ps(datPtr6+224+50432*i19+224*toH28+4*toW28+50432*k69+12608*l21, 4095, out688);
_mm512_mask_storeu_ps(datPtr6+272+50432*i19+224*toH28+4*toW28+50432*k69+12608*l21, 255, out690);
}
}
++j13;
}
}

// Runs ResNeXt50ThreeConsumeSums1Callee1 on the given thread team.
// The task is described as three-dimensional (nd1 = 3) with hull entries
// 1, 1 and 16; the tensor table is handed to the callee through any1.
static void ResNeXt50ThreeConsumeSums1(ResNeXt50ThreaderTeam1* team25, char** tensors23) {
	enum { hullRank = 3 };
	static const int hullDims[hullRank] = {1, 1, 16};
	ResNeXt50ThreaderTask1 job;
	// Only the first hullRank hull entries are written, exactly as before.
	for (int axis = 0; axis < hullRank; ++axis) {
		job.hull1[axis] = hullDims[axis];
	}
	job.nd1 = hullRank;
	job.any1 = tensors23;
	job.callee1 = ResNeXt50ThreeConsumeSums1Callee1;
	ResNeXt50ThreaderDo1(team25, &job);
}

// Task callee that repacks filter weights and biases into the buffer at
// tensors[3]. For each of the 32 values of i26 it
//   (a) loads sixteen 9-float filters (four per s18 step), scales them by
//       multipliers read from tensors[2], runs two passes of a linear
//       transform with a lane rearrangement in between, converts the results
//       to half precision and stores them past the first 512 bytes of
//       tensors[3];
//   (b) loads four bias floats from tensors[1], applies the multiplier/addend
//       pairs from tensors[2], and stores them in the first 512 bytes of
//       tensors[3].
// task40: task descriptor; any1 carries the char** tensor table.
// pt25:   task coordinate; unused here (ResNeXt50ThreeArrangeFilts2 launches
//         this callee with a 1 x 1 x 1 hull).
// NOTE(review): the constants look like a Winograd-style transform of 3x3
// filters for a 32-group convolution -- confirm against the generator.
static void ResNeXt50ThreeArrangeFilts2Callee1(ResNeXt50ThreaderTask1* task40, int64_t* pt25) {
char** tensors38 = task40->any1;
// These three indices are constant zero in this instance, so every offset
// term multiplied by them below vanishes.
ptrdiff_t b50 = 0;
ptrdiff_t g13 = 0;
ptrdiff_t e12 = 0;
(void)pt25;
// Output: biases at the start of tensors[3]; packed filters begin 512 bytes in
// (32 iterations x 16 bytes of bias are written ahead of them).
char*restrict bfPtr6 = tensors38[3]+512*e12;
char*restrict wfPtr6 = tensors38[3]+512+6488064*e12;
// Inputs: raw weights, raw biases, and interleaved (multiplier, addend) float
// pairs (see the even/odd permutes in the bias section below).
char*restrict wtPtr7 = tensors38[0]+14256*e12;
char*restrict biasPtr7 = tensors38[1];
char*restrict bnPtr8 = tensors38[2];
// i26 runs over 0..31 inclusive.
ptrdiff_t i26 = 32*g13;
ptrdiff_t ii14 = i26+31;
for (; i26 <= ii14; ++i26) {
ptrdiff_t j20 = 1*b50;
// Always taken, because b50 is zero.
if (j20 == 0) {
ptrdiff_t k78 = 0+1*j20;
ptrdiff_t cut8 = 0;
// Broadcast the first float of four consecutive pairs in bnPtr8
// (pair indices 4*i26+0 .. 4*i26+3); used as per-filter-set multipliers.
__m512 postMul19 = _mm512_set1_ps(((float*)bnPtr8+(ptrdiff_t)2*(0+4*i26+4*j20))[0]);
__m512 postMul20 = _mm512_set1_ps(((float*)bnPtr8+(ptrdiff_t)2*(1+4*i26+4*j20))[0]);
__m512 postMul21 = _mm512_set1_ps(((float*)bnPtr8+(ptrdiff_t)2*(2+4*i26+4*j20))[0]);
__m512 postMul22 = _mm512_set1_ps(((float*)bnPtr8+(ptrdiff_t)2*(3+4*i26+4*j20))[0]);
ptrdiff_t s18 = 0;
for (; s18 != 4; ++s18) {
// Mask 511 loads 9 floats (remaining lanes zeroed). The four loads are
// 144 bytes apart; each s18 step advances by 36 bytes (9 floats).
__m512 wt203 = _mm512_maskz_loadu_ps(511, wtPtr7+0+576*i26+576*j20+36*s18);
__m512 wt204 = _mm512_maskz_loadu_ps(511, wtPtr7+144+576*i26+576*j20+36*s18);
__m512 wt205 = _mm512_maskz_loadu_ps(511, wtPtr7+288+576*i26+576*j20+36*s18);
__m512 wt206 = _mm512_maskz_loadu_ps(511, wtPtr7+432+576*i26+576*j20+36*s18);
// Fold the multipliers into the weights before transforming.
wt203 = _mm512_mul_ps(wt203, postMul19);
wt204 = _mm512_mul_ps(wt204, postMul20);
wt205 = _mm512_mul_ps(wt205, postMul21);
wt206 = _mm512_mul_ps(wt206, postMul22);
// Two-source permute indices (values >= 16 select from the second operand).
// Both patterns alternate three-lane groups from the first and second source;
// pm108 starts at lane 0 of each source, pm109 starts at lane 6.
__m512i pm108 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm109 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
// Merge the four filters into three registers in764/in765/in766.
__m512 tmp5493 = _mm512_permutex2var_ps(wt203, pm108, wt205);
__m512 tmp5494 = _mm512_permutex2var_ps(wt204, pm108, wt206);
__m512 tmp5495 = _mm512_permutex2var_ps(wt203, pm109, wt205);
__m512 tmp5496 = _mm512_permutex2var_ps(wt204, pm109, wt206);
__m512 in764 = _mm512_permutex2var_ps(tmp5493, pm108, tmp5494);
__m512 in765 = _mm512_permutex2var_ps(tmp5493, pm109, tmp5494);
__m512 in766 = _mm512_permutex2var_ps(tmp5495, pm108, tmp5496);
// First transform pass: eight linear combinations of the three inputs using
// the coefficients 1, 2 and 4 (in764 and in766 also pass through unchanged).
__m512 tmp5497 = _mm512_fmadd_ps(in764, _mm512_set1_ps(4e+00f), in766);
__m512 tmp5498 = _mm512_add_ps(in764, in766);
__m512 tmp5499 = _mm512_fmadd_ps(in766, _mm512_set1_ps(4e+00f), in764);
__m512 tmp5500 = _mm512_add_ps(in765, tmp5498);
__m512 tmp5501 = _mm512_fmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5499);
tmp5499 = _mm512_fnmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5499);
__m512 tmp5502 = _mm512_fnmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5497);
tmp5497 = _mm512_fmadd_ps(in765, _mm512_set1_ps(2e+00f), tmp5497);
tmp5498 = _mm512_sub_ps(tmp5498, in765);
// Lane rearrangement across the eight first-pass results using
// unpack / shuffle_ps / shuffle_f32x4.
// NOTE(review): appears to be a transpose so the second pass can act along
// the other filter axis -- confirm.
__m512 tmp5519 = _mm512_unpacklo_ps(in764, tmp5500);
__m512 tmp5520 = _mm512_unpackhi_ps(in764, tmp5500);
__m512 tmp5521 = _mm512_unpacklo_ps(tmp5498, tmp5501);
__m512 tmp5522 = _mm512_unpackhi_ps(tmp5498, tmp5501);
__m512 tmp5523 = _mm512_unpacklo_ps(tmp5499, tmp5497);
__m512 tmp5524 = _mm512_unpackhi_ps(tmp5499, tmp5497);
__m512 tmp5525 = _mm512_unpacklo_ps(tmp5502, in766);
__m512 tmp5526 = _mm512_unpackhi_ps(tmp5502, in766);
__m512 tmp5527 = _mm512_shuffle_ps(tmp5519, tmp5521, 68);
__m512 tmp5528 = _mm512_shuffle_ps(tmp5519, tmp5521, 238);
__m512 tmp5529 = _mm512_shuffle_ps(tmp5520, tmp5522, 68);
__m512 tmp5530 = _mm512_shuffle_ps(tmp5520, tmp5522, 238);
__m512 tmp5531 = _mm512_shuffle_ps(tmp5523, tmp5525, 68);
__m512 tmp5532 = _mm512_shuffle_ps(tmp5523, tmp5525, 238);
__m512 tmp5533 = _mm512_shuffle_ps(tmp5524, tmp5526, 68);
__m512 tmp5534 = _mm512_shuffle_ps(tmp5524, tmp5526, 238);
__m512 tmp5535 = _mm512_shuffle_f32x4(tmp5527, tmp5531, 136);
__m512 tmp5536 = _mm512_shuffle_f32x4(tmp5527, tmp5531, 221);
__m512 tmp5537 = _mm512_shuffle_f32x4(tmp5528, tmp5532, 136);
__m512 tmp5538 = _mm512_shuffle_f32x4(tmp5528, tmp5532, 221);
__m512 tmp5539 = _mm512_shuffle_f32x4(tmp5529, tmp5533, 136);
__m512 tmp5540 = _mm512_shuffle_f32x4(tmp5529, tmp5533, 221);
__m512 tmp5541 = _mm512_shuffle_f32x4(tmp5530, tmp5534, 136);
__m512 tmp5542 = _mm512_shuffle_f32x4(tmp5530, tmp5534, 221);
// The first-pass variables are reused from here on to hold rearranged data.
in764 = _mm512_shuffle_f32x4(tmp5535, tmp5535, 136);
__m512 tmp5503 = _mm512_shuffle_f32x4(tmp5535, tmp5535, 221);
tmp5500 = _mm512_shuffle_f32x4(tmp5537, tmp5537, 136);
__m512 tmp5504 = _mm512_shuffle_f32x4(tmp5537, tmp5537, 221);
tmp5498 = _mm512_shuffle_f32x4(tmp5539, tmp5539, 136);
__m512 tmp5505 = _mm512_shuffle_f32x4(tmp5539, tmp5539, 221);
tmp5501 = _mm512_shuffle_f32x4(tmp5541, tmp5541, 136);
__m512 tmp5506 = _mm512_shuffle_f32x4(tmp5541, tmp5541, 221);
tmp5499 = _mm512_shuffle_f32x4(tmp5536, tmp5536, 136);
tmp5497 = _mm512_shuffle_f32x4(tmp5538, tmp5538, 136);
tmp5502 = _mm512_shuffle_f32x4(tmp5540, tmp5540, 136);
in766 = _mm512_shuffle_f32x4(tmp5542, tmp5542, 136);
// Pair up 256-bit halves so that six registers feed the second pass
// (two triples: in764/tmp5500/tmp5498 and tmp5502/in766/tmp5503).
in764 = _mm512_shuffle_f32x4(in764, tmp5501, 68);
tmp5500 = _mm512_shuffle_f32x4(tmp5500, tmp5499, 68);
tmp5498 = _mm512_shuffle_f32x4(tmp5498, tmp5497, 68);
tmp5502 = _mm512_shuffle_f32x4(tmp5502, tmp5504, 68);
in766 = _mm512_shuffle_f32x4(in766, tmp5505, 68);
tmp5503 = _mm512_shuffle_f32x4(tmp5503, tmp5506, 68);
// Second transform pass: the same 1/2/4 combinations as the first pass,
// applied to each of the two triples (statements for the triples interleaved).
__m512 tmp5507 = _mm512_fmadd_ps(in764, _mm512_set1_ps(4e+00f), tmp5498);
__m512 tmp5513 = _mm512_fmadd_ps(tmp5502, _mm512_set1_ps(4e+00f), tmp5503);
__m512 tmp5508 = _mm512_add_ps(in764, tmp5498);
__m512 tmp5514 = _mm512_add_ps(tmp5502, tmp5503);
__m512 tmp5509 = _mm512_fmadd_ps(tmp5498, _mm512_set1_ps(4e+00f), in764);
__m512 tmp5515 = _mm512_fmadd_ps(tmp5503, _mm512_set1_ps(4e+00f), tmp5502);
__m512 tmp5510 = _mm512_add_ps(tmp5500, tmp5508);
__m512 tmp5516 = _mm512_add_ps(in766, tmp5514);
__m512 tmp5511 = _mm512_fmadd_ps(tmp5500, _mm512_set1_ps(2e+00f), tmp5509);
__m512 tmp5517 = _mm512_fmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5515);
tmp5509 = _mm512_fnmadd_ps(tmp5500, _mm512_set1_ps(2e+00f), tmp5509);
tmp5515 = _mm512_fnmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5515);
__m512 tmp5512 = _mm512_fnmadd_ps(tmp5500, _mm512_set1_ps(2e+00f), tmp5507);
__m512 tmp5518 = _mm512_fnmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5513);
tmp5507 = _mm512_fmadd_ps(tmp5500, _mm512_set1_ps(2e+00f), tmp5507);
tmp5513 = _mm512_fmadd_ps(in766, _mm512_set1_ps(2e+00f), tmp5513);
tmp5508 = _mm512_sub_ps(tmp5508, tmp5500);
tmp5514 = _mm512_sub_ps(tmp5514, in766);
// Per-lane scaling of all sixteen results. Each constant vector repeats the
// same eight-lane pattern in both 256-bit halves.
// NOTE(review): the factors look like products of the transform's row and
// column normalisers (1, -2/9, 1/90, 1/180) -- confirm.
in764 = _mm512_mul_ps(in764, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5510 = _mm512_mul_ps(tmp5510, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5508 = _mm512_mul_ps(tmp5508, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5511 = _mm512_mul_ps(tmp5511, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5509 = _mm512_mul_ps(tmp5509, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5507 = _mm512_mul_ps(tmp5507, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5512 = _mm512_mul_ps(tmp5512, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5498 = _mm512_mul_ps(tmp5498, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5502 = _mm512_mul_ps(tmp5502, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp5516 = _mm512_mul_ps(tmp5516, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5514 = _mm512_mul_ps(tmp5514, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp5517 = _mm512_mul_ps(tmp5517, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5515 = _mm512_mul_ps(tmp5515, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp5513 = _mm512_mul_ps(tmp5513, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5518 = _mm512_mul_ps(tmp5518, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp5503 = _mm512_mul_ps(tmp5503, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup 256-bit halves: selector 68 joins the low halves of both operands,
// selector 238 joins the high halves.
__m512 out691 = _mm512_shuffle_f32x4(in764, tmp5510, 68);
__m512 out695 = _mm512_shuffle_f32x4(in764, tmp5510, 238);
__m512 out692 = _mm512_shuffle_f32x4(tmp5508, tmp5511, 68);
__m512 out696 = _mm512_shuffle_f32x4(tmp5508, tmp5511, 238);
__m512 out693 = _mm512_shuffle_f32x4(tmp5509, tmp5507, 68);
__m512 out697 = _mm512_shuffle_f32x4(tmp5509, tmp5507, 238);
__m512 out694 = _mm512_shuffle_f32x4(tmp5512, tmp5498, 68);
__m512 out698 = _mm512_shuffle_f32x4(tmp5512, tmp5498, 238);
__m512 out699 = _mm512_shuffle_f32x4(tmp5502, tmp5516, 68);
__m512 out703 = _mm512_shuffle_f32x4(tmp5502, tmp5516, 238);
__m512 out700 = _mm512_shuffle_f32x4(tmp5514, tmp5517, 68);
__m512 out704 = _mm512_shuffle_f32x4(tmp5514, tmp5517, 238);
__m512 out701 = _mm512_shuffle_f32x4(tmp5515, tmp5513, 68);
__m512 out705 = _mm512_shuffle_f32x4(tmp5515, tmp5513, 238);
__m512 out702 = _mm512_shuffle_f32x4(tmp5518, tmp5503, 68);
__m512 out706 = _mm512_shuffle_f32x4(tmp5518, tmp5503, 238);
// Byte offsets of four consecutive 32-byte slots; with cut8 == 0 these
// evaluate to 0, 32, 64 and 96.
ptrdiff_t off5 = 32*cut8;
ptrdiff_t off6 = (size_t)(cut8+1)/4*512+(size_t)(cut8+1)%4*32;
ptrdiff_t off7 = (size_t)(cut8+2)/4*512+(size_t)(cut8+2)%4*32;
ptrdiff_t off8 = (size_t)(cut8+3)/4*512+(size_t)(cut8+3)%4*32;
// Convert each result to 16 half-precision values (round to nearest, no
// exceptions); the 256-bit result sits in the low half of a 512-bit register.
__m512i wf57 = _mm512_castsi256_si512(_mm512_cvtps_ph(out691, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf58 = _mm512_castsi256_si512(_mm512_cvtps_ph(out695, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf59 = _mm512_castsi256_si512(_mm512_cvtps_ph(out699, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf60 = _mm512_castsi256_si512(_mm512_cvtps_ph(out703, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf61 = _mm512_castsi256_si512(_mm512_cvtps_ph(out692, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf62 = _mm512_castsi256_si512(_mm512_cvtps_ph(out696, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf63 = _mm512_castsi256_si512(_mm512_cvtps_ph(out700, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf64 = _mm512_castsi256_si512(_mm512_cvtps_ph(out704, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf65 = _mm512_castsi256_si512(_mm512_cvtps_ph(out693, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf66 = _mm512_castsi256_si512(_mm512_cvtps_ph(out697, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf67 = _mm512_castsi256_si512(_mm512_cvtps_ph(out701, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf68 = _mm512_castsi256_si512(_mm512_cvtps_ph(out705, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf69 = _mm512_castsi256_si512(_mm512_cvtps_ph(out694, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf70 = _mm512_castsi256_si512(_mm512_cvtps_ph(out698, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf71 = _mm512_castsi256_si512(_mm512_cvtps_ph(out702, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf72 = _mm512_castsi256_si512(_mm512_cvtps_ph(out706, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 on 32-bit elements writes the low 32 bytes (the 16 halves).
// Layout per i26: 2048 bytes, split into four 512-byte sections, each holding
// four 128-byte s18 slices of four 32-byte slots.
_mm512_mask_storeu_epi32(wfPtr6+0+2048*i26+512*k78+off5+128*s18, 255, wf57);
_mm512_mask_storeu_epi32(wfPtr6+0+2048*i26+512*k78+off6+128*s18, 255, wf58);
_mm512_mask_storeu_epi32(wfPtr6+0+2048*i26+512*k78+off7+128*s18, 255, wf59);
_mm512_mask_storeu_epi32(wfPtr6+0+2048*i26+512*k78+off8+128*s18, 255, wf60);
_mm512_mask_storeu_epi32(wfPtr6+512+2048*i26+512*k78+off5+128*s18, 255, wf61);
_mm512_mask_storeu_epi32(wfPtr6+512+2048*i26+512*k78+off6+128*s18, 255, wf62);
_mm512_mask_storeu_epi32(wfPtr6+512+2048*i26+512*k78+off7+128*s18, 255, wf63);
_mm512_mask_storeu_epi32(wfPtr6+512+2048*i26+512*k78+off8+128*s18, 255, wf64);
_mm512_mask_storeu_epi32(wfPtr6+1024+2048*i26+512*k78+off5+128*s18, 255, wf65);
_mm512_mask_storeu_epi32(wfPtr6+1024+2048*i26+512*k78+off6+128*s18, 255, wf66);
_mm512_mask_storeu_epi32(wfPtr6+1024+2048*i26+512*k78+off7+128*s18, 255, wf67);
_mm512_mask_storeu_epi32(wfPtr6+1024+2048*i26+512*k78+off8+128*s18, 255, wf68);
_mm512_mask_storeu_epi32(wfPtr6+1536+2048*i26+512*k78+off5+128*s18, 255, wf69);
_mm512_mask_storeu_epi32(wfPtr6+1536+2048*i26+512*k78+off6+128*s18, 255, wf70);
_mm512_mask_storeu_epi32(wfPtr6+1536+2048*i26+512*k78+off7+128*s18, 255, wf71);
_mm512_mask_storeu_epi32(wfPtr6+1536+2048*i26+512*k78+off8+128*s18, 255, wf72);
}
// Bias handling for the four values belonging to this i26.
__m512 bias3 = _mm512_setzero_ps();
// Always taken, because e12 is zero.
if (!e12) {
// Load four bias floats (mask 15).
bias3 = _mm512_maskz_loadu_ps(15, biasPtr7-0+16*i26+16*j20);
// Even indices gather the multipliers, odd indices the addends, from the
// eight floats (four pairs) loaded into mas7.
__m512i pmMul13 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd13 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas7 = _mm512_maskz_loadu_ps(255, bnPtr8+(ptrdiff_t)8*(0+4*i26+4*j20));
__m512 postMul23 = _mm512_permutexvar_ps(pmMul13, mas7);
__m512 postAdd13 = _mm512_permutexvar_ps(pmAdd13, mas7);
// bias = bias * multiplier + addend
bias3 = _mm512_fmadd_ps(bias3, postMul23, postAdd13);
}
// Only the low four lanes are written (16 bytes per i26).
_mm512_mask_storeu_ps(bfPtr6-0+16*i26+16*j20, 15, bias3);
// Dead store: j20 is re-declared at the top of the next iteration and is not
// read again before then.
j20 = 1;
}
}
}

// Runs ResNeXt50ThreeArrangeFilts2Callee1 on the given thread team.
// The task is described as three-dimensional (nd1 = 3) with every hull entry
// equal to 1; the tensor table is handed to the callee through any1.
static void ResNeXt50ThreeArrangeFilts2(ResNeXt50ThreaderTeam1* team32, char** tensors37) {
	enum { hullRank = 3 };
	ResNeXt50ThreaderTask1 job;
	// Only the first hullRank hull entries are written, exactly as before.
	for (int axis = 0; axis < hullRank; ++axis) {
		job.hull1[axis] = 1;
	}
	job.nd1 = hullRank;
	job.any1 = tensors37;
	job.callee1 = ResNeXt50ThreeArrangeFilts2Callee1;
	ResNeXt50ThreaderDo1(team32, &job);
}

static void ResNeXt50ThreeArrangeDats2Callee1(ResNeXt50ThreaderTask1* task42, int64_t* pt26) {
char** tensors40 = task42->any1;
ptrdiff_t s19 = 0;
ptrdiff_t c19 = 0;
ptrdiff_t g14 = pt26[2];
ptrdiff_t e13 = 0;
char*restrict datPtr12 = tensors40[0]-228+4992768*e13;
char*restrict dfPtr6 = tensors40[1]+324403200*e13;
ptrdiff_t i27 = 2*g14;
ptrdiff_t ii15 = i27+1;
for (; i27 <= ii15; ++i27) {
ptrdiff_t j21 = 17*c19;
if (j21 < 2) {
ptrdiff_t rel13 = j21-0;
ptrdiff_t base13 = 0;
if (rel13 < 1) {
ptrdiff_t h29 = base13+0;
ptrdiff_t w36 = 0;
ptrdiff_t k79 = 0;
for (; k79 != 2; ++k79) {
__m512 dat1291 = _mm512_maskz_loadu_ps(8191, datPtr12+228+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1292 = _mm512_maskz_loadu_ps(16383, datPtr12+272+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512i pm110 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in767 = _mm512_permutexvar_ps(pm110, dat1291);
__m512i pm111 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in774 = _mm512_permutexvar_ps(pm111, dat1292);
__m512 dat1293 = _mm512_maskz_loadu_ps(8191, datPtr12+452+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1294 = _mm512_maskz_loadu_ps(16383, datPtr12+496+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in768 = _mm512_permutexvar_ps(pm110, dat1293);
__m512 in775 = _mm512_permutexvar_ps(pm111, dat1294);
__m512 dat1295 = _mm512_maskz_loadu_ps(8191, datPtr12+676+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1296 = _mm512_maskz_loadu_ps(16383, datPtr12+720+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in769 = _mm512_permutexvar_ps(pm110, dat1295);
__m512 in776 = _mm512_permutexvar_ps(pm111, dat1296);
__m512 dat1297 = _mm512_maskz_loadu_ps(8191, datPtr12+900+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1298 = _mm512_maskz_loadu_ps(16383, datPtr12+944+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in770 = _mm512_permutexvar_ps(pm110, dat1297);
__m512 in777 = _mm512_permutexvar_ps(pm111, dat1298);
__m512 dat1299 = _mm512_maskz_loadu_ps(8191, datPtr12+1124+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1300 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in771 = _mm512_permutexvar_ps(pm110, dat1299);
__m512 in778 = _mm512_permutexvar_ps(pm111, dat1300);
__m512 dat1301 = _mm512_maskz_loadu_ps(8191, datPtr12+1348+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1302 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in772 = _mm512_permutexvar_ps(pm110, dat1301);
__m512 in779 = _mm512_permutexvar_ps(pm111, dat1302);
__m512 dat1303 = _mm512_maskz_loadu_ps(8191, datPtr12+1572+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1304 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in773 = _mm512_permutexvar_ps(pm110, dat1303);
__m512 in780 = _mm512_permutexvar_ps(pm111, dat1304);
__m512 tmp5543 = _mm512_add_ps(in767, in771);
__m512 tmp5548 = _mm512_add_ps(in774, in778);
__m512 tmp5544 = _mm512_sub_ps(in770, in768);
__m512 tmp5549 = _mm512_sub_ps(in777, in775);
__m512 tmp5545 = _mm512_add_ps(in768, in772);
__m512 tmp5550 = _mm512_add_ps(in775, in779);
__m512 tmp5546 = _mm512_sub_ps(_mm512_setzero_ps(), in772);
__m512 tmp5551 = _mm512_sub_ps(_mm512_setzero_ps(), in779);
tmp5543 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-4.25e+00f), tmp5543);
tmp5548 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-4.25e+00f), tmp5548);
tmp5545 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-4.25e+00f), tmp5545);
tmp5550 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-4.25e+00f), tmp5550);
tmp5546 = _mm512_fmadd_ps(tmp5544, _mm512_set1_ps(5.25e+00f), tmp5546);
tmp5551 = _mm512_fmadd_ps(tmp5549, _mm512_set1_ps(5.25e+00f), tmp5551);
tmp5544 = _mm512_fmadd_ps(in768, _mm512_set1_ps(2.5e-01f), in772);
tmp5549 = _mm512_fmadd_ps(in775, _mm512_set1_ps(2.5e-01f), in779);
in768 = _mm512_fmadd_ps(in768, _mm512_set1_ps(4e+00f), in772);
in775 = _mm512_fmadd_ps(in775, _mm512_set1_ps(4e+00f), in779);
__m512 tmp5547 = _mm512_sub_ps(tmp5545, tmp5543);
__m512 tmp5552 = _mm512_sub_ps(tmp5550, tmp5548);
tmp5545 = _mm512_add_ps(tmp5543, tmp5545);
tmp5550 = _mm512_add_ps(tmp5548, tmp5550);
tmp5543 = _mm512_fmadd_ps(in767, _mm512_set1_ps(2.5e-01f), in771);
tmp5548 = _mm512_fmadd_ps(in774, _mm512_set1_ps(2.5e-01f), in778);
tmp5544 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-1.25e+00f), tmp5544);
tmp5549 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-1.25e+00f), tmp5549);
in770 = _mm512_fmadd_ps(in770, _mm512_set1_ps(-5e+00f), in768);
in777 = _mm512_fmadd_ps(in777, _mm512_set1_ps(-5e+00f), in775);
tmp5543 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-1.25e+00f), tmp5543);
tmp5548 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-1.25e+00f), tmp5548);
in772 = _mm512_fmadd_ps(tmp5543, _mm512_set1_ps(2e+00f), tmp5544);
in779 = _mm512_fmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5549);
tmp5544 = _mm512_fnmadd_ps(tmp5543, _mm512_set1_ps(2e+00f), tmp5544);
tmp5549 = _mm512_fnmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), tmp5549);
tmp5543 = _mm512_fmadd_ps(in771, _mm512_set1_ps(2.5e-01f), in767);
tmp5548 = _mm512_fmadd_ps(in778, _mm512_set1_ps(2.5e-01f), in774);
in767 = _mm512_sub_ps(in773, in767);
in774 = _mm512_sub_ps(in780, in774);
tmp5543 = _mm512_fmadd_ps(in769, _mm512_set1_ps(-1.25e+00f), tmp5543);
tmp5548 = _mm512_fmadd_ps(in776, _mm512_set1_ps(-1.25e+00f), tmp5548);
in769 = _mm512_sub_ps(in769, in771);
in776 = _mm512_sub_ps(in776, in778);
in769 = _mm512_fmadd_ps(in769, _mm512_set1_ps(5.25e+00f), in767);
in776 = _mm512_fmadd_ps(in776, _mm512_set1_ps(5.25e+00f), in774);
in768 = _mm512_fmadd_ps(tmp5543, _mm512_set1_ps(2e+00f), in770);
in775 = _mm512_fmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), in777);
in770 = _mm512_fnmadd_ps(tmp5543, _mm512_set1_ps(2e+00f), in770);
in777 = _mm512_fnmadd_ps(tmp5548, _mm512_set1_ps(2e+00f), in777);
__m512 tmp5561 = _mm512_unpacklo_ps(tmp5546, tmp5545);
__m512 tmp5562 = _mm512_unpackhi_ps(tmp5546, tmp5545);
__m512 tmp5563 = _mm512_unpacklo_ps(tmp5547, in772);
__m512 tmp5564 = _mm512_unpackhi_ps(tmp5547, in772);
__m512 tmp5565 = _mm512_unpacklo_ps(tmp5544, in768);
__m512 tmp5566 = _mm512_unpackhi_ps(tmp5544, in768);
__m512 tmp5567 = _mm512_unpacklo_ps(in770, in769);
__m512 tmp5568 = _mm512_unpackhi_ps(in770, in769);
__m512 tmp5569 = _mm512_unpacklo_ps(tmp5551, tmp5550);
__m512 tmp5570 = _mm512_unpackhi_ps(tmp5551, tmp5550);
__m512 tmp5571 = _mm512_unpacklo_ps(tmp5552, in779);
__m512 tmp5572 = _mm512_unpackhi_ps(tmp5552, in779);
__m512 tmp5573 = _mm512_unpacklo_ps(tmp5549, in775);
__m512 tmp5574 = _mm512_unpackhi_ps(tmp5549, in775);
__m512 tmp5575 = _mm512_unpacklo_ps(in777, in776);
__m512 tmp5576 = _mm512_unpackhi_ps(in777, in776);
__m512 tmp5577 = _mm512_shuffle_ps(tmp5561, tmp5563, 68);
__m512 tmp5578 = _mm512_shuffle_ps(tmp5561, tmp5563, 238);
__m512 tmp5579 = _mm512_shuffle_ps(tmp5562, tmp5564, 68);
__m512 tmp5580 = _mm512_shuffle_ps(tmp5562, tmp5564, 238);
__m512 tmp5581 = _mm512_shuffle_ps(tmp5565, tmp5567, 68);
__m512 tmp5582 = _mm512_shuffle_ps(tmp5565, tmp5567, 238);
__m512 tmp5583 = _mm512_shuffle_ps(tmp5566, tmp5568, 68);
__m512 tmp5584 = _mm512_shuffle_ps(tmp5566, tmp5568, 238);
__m512 tmp5585 = _mm512_shuffle_ps(tmp5569, tmp5571, 68);
__m512 tmp5586 = _mm512_shuffle_ps(tmp5569, tmp5571, 238);
__m512 tmp5587 = _mm512_shuffle_ps(tmp5570, tmp5572, 68);
__m512 tmp5588 = _mm512_shuffle_ps(tmp5570, tmp5572, 238);
__m512 tmp5589 = _mm512_shuffle_ps(tmp5573, tmp5575, 68);
__m512 tmp5590 = _mm512_shuffle_ps(tmp5573, tmp5575, 238);
__m512 tmp5591 = _mm512_shuffle_ps(tmp5574, tmp5576, 68);
__m512 tmp5592 = _mm512_shuffle_ps(tmp5574, tmp5576, 238);
__m512 tmp5593 = _mm512_shuffle_f32x4(tmp5577, tmp5581, 136);
__m512 tmp5594 = _mm512_shuffle_f32x4(tmp5577, tmp5581, 221);
__m512 tmp5595 = _mm512_shuffle_f32x4(tmp5578, tmp5582, 136);
__m512 tmp5596 = _mm512_shuffle_f32x4(tmp5578, tmp5582, 221);
__m512 tmp5597 = _mm512_shuffle_f32x4(tmp5579, tmp5583, 136);
__m512 tmp5598 = _mm512_shuffle_f32x4(tmp5579, tmp5583, 221);
__m512 tmp5599 = _mm512_shuffle_f32x4(tmp5580, tmp5584, 136);
__m512 tmp5600 = _mm512_shuffle_f32x4(tmp5580, tmp5584, 221);
__m512 tmp5601 = _mm512_shuffle_f32x4(tmp5585, tmp5589, 136);
__m512 tmp5602 = _mm512_shuffle_f32x4(tmp5585, tmp5589, 221);
__m512 tmp5603 = _mm512_shuffle_f32x4(tmp5586, tmp5590, 136);
__m512 tmp5604 = _mm512_shuffle_f32x4(tmp5586, tmp5590, 221);
__m512 tmp5605 = _mm512_shuffle_f32x4(tmp5587, tmp5591, 136);
__m512 tmp5606 = _mm512_shuffle_f32x4(tmp5587, tmp5591, 221);
__m512 tmp5607 = _mm512_shuffle_f32x4(tmp5588, tmp5592, 136);
__m512 tmp5608 = _mm512_shuffle_f32x4(tmp5588, tmp5592, 221);
tmp5546 = _mm512_shuffle_f32x4(tmp5593, tmp5601, 136);
tmp5551 = _mm512_shuffle_f32x4(tmp5593, tmp5601, 221);
tmp5545 = _mm512_shuffle_f32x4(tmp5595, tmp5603, 136);
tmp5550 = _mm512_shuffle_f32x4(tmp5595, tmp5603, 221);
tmp5547 = _mm512_shuffle_f32x4(tmp5597, tmp5605, 136);
tmp5552 = _mm512_shuffle_f32x4(tmp5597, tmp5605, 221);
in772 = _mm512_shuffle_f32x4(tmp5599, tmp5607, 136);
in779 = _mm512_shuffle_f32x4(tmp5599, tmp5607, 221);
tmp5544 = _mm512_shuffle_f32x4(tmp5594, tmp5602, 136);
tmp5549 = _mm512_shuffle_f32x4(tmp5594, tmp5602, 221);
in768 = _mm512_shuffle_f32x4(tmp5596, tmp5604, 136);
in775 = _mm512_shuffle_f32x4(tmp5596, tmp5604, 221);
in770 = _mm512_shuffle_f32x4(tmp5598, tmp5606, 136);
in777 = _mm512_shuffle_f32x4(tmp5598, tmp5606, 221);
in769 = _mm512_shuffle_f32x4(tmp5600, tmp5608, 136);
in776 = _mm512_shuffle_f32x4(tmp5600, tmp5608, 221);
// 8-point linear transform applied to two register sets in lockstep
// (statements alternate between set A and set B; inputs come from preceding code).
//   set A inputs d0..d7 = tmp5546, tmp5545, tmp5547, in772, tmp5544, in768, in770, in769
//   set B inputs d0..d7 = tmp5551, tmp5550, tmp5552, in779, tmp5549, in775, in777, in776
// Effective result (FMA rounding aside):
//   r0     = d0 - d6 + 5.25*(d4 - d2)
//   r1, r2 = (d2 + d6 - 4.25*d4) +/- (d1 + d5 - 4.25*d3)
//   r3, r4 = (0.25*d2 - 1.25*d4 + d6) +/- 2*(0.25*d1 - 1.25*d3 + d5)
//   r5, r6 = (4*d2 - 5*d4 + d6) +/- 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7     = d7 - d1 + 5.25*(d3 - d5)
// Results r0..r7 are left in (registers are reused in place, so order matters):
//   set A: tmp5546, tmp5555, tmp5556, in770, tmp5554, tmp5547, tmp5544, in772
//   set B: tmp5551, tmp5559, tmp5560, in777, tmp5558, tmp5552, tmp5549, in779
// NOTE(review): the coefficients match the Winograd F(6,3) input transform — presumably
// that is what this is; confirm against the generator.
__m512 tmp5553 = _mm512_add_ps(tmp5545, in768);
__m512 tmp5557 = _mm512_add_ps(tmp5550, in775);
__m512 tmp5554 = _mm512_sub_ps(tmp5544, tmp5547);
__m512 tmp5558 = _mm512_sub_ps(tmp5549, tmp5552);
__m512 tmp5555 = _mm512_add_ps(tmp5547, in770);
__m512 tmp5559 = _mm512_add_ps(tmp5552, in777);
tmp5546 = _mm512_sub_ps(tmp5546, in770);
tmp5551 = _mm512_sub_ps(tmp5551, in777);
tmp5553 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-4.25e+00f), tmp5553);
tmp5557 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-4.25e+00f), tmp5557);
tmp5555 = _mm512_fmadd_ps(tmp5544, _mm512_set1_ps(-4.25e+00f), tmp5555);
tmp5559 = _mm512_fmadd_ps(tmp5549, _mm512_set1_ps(-4.25e+00f), tmp5559);
tmp5546 = _mm512_fmadd_ps(tmp5554, _mm512_set1_ps(5.25e+00f), tmp5546);
tmp5551 = _mm512_fmadd_ps(tmp5558, _mm512_set1_ps(5.25e+00f), tmp5551);
tmp5554 = _mm512_fmadd_ps(tmp5547, _mm512_set1_ps(2.5e-01f), in770);
tmp5558 = _mm512_fmadd_ps(tmp5552, _mm512_set1_ps(2.5e-01f), in777);
tmp5547 = _mm512_fmadd_ps(tmp5547, _mm512_set1_ps(4e+00f), in770);
tmp5552 = _mm512_fmadd_ps(tmp5552, _mm512_set1_ps(4e+00f), in777);
// r2 (difference) and r1 (sum) of the two partial sums.
__m512 tmp5556 = _mm512_sub_ps(tmp5555, tmp5553);
__m512 tmp5560 = _mm512_sub_ps(tmp5559, tmp5557);
tmp5555 = _mm512_add_ps(tmp5553, tmp5555);
tmp5559 = _mm512_add_ps(tmp5557, tmp5559);
tmp5553 = _mm512_fmadd_ps(tmp5545, _mm512_set1_ps(2.5e-01f), in768);
tmp5557 = _mm512_fmadd_ps(tmp5550, _mm512_set1_ps(2.5e-01f), in775);
tmp5554 = _mm512_fmadd_ps(tmp5544, _mm512_set1_ps(-1.25e+00f), tmp5554);
tmp5558 = _mm512_fmadd_ps(tmp5549, _mm512_set1_ps(-1.25e+00f), tmp5558);
tmp5544 = _mm512_fmadd_ps(tmp5544, _mm512_set1_ps(-5e+00f), tmp5547);
tmp5549 = _mm512_fmadd_ps(tmp5549, _mm512_set1_ps(-5e+00f), tmp5552);
tmp5553 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-1.25e+00f), tmp5553);
tmp5557 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-1.25e+00f), tmp5557);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in770 = _mm512_fmadd_ps(tmp5553, _mm512_set1_ps(2e+00f), tmp5554);
in777 = _mm512_fmadd_ps(tmp5557, _mm512_set1_ps(2e+00f), tmp5558);
tmp5554 = _mm512_fnmadd_ps(tmp5553, _mm512_set1_ps(2e+00f), tmp5554);
tmp5558 = _mm512_fnmadd_ps(tmp5557, _mm512_set1_ps(2e+00f), tmp5558);
tmp5553 = _mm512_fmadd_ps(in768, _mm512_set1_ps(2.5e-01f), tmp5545);
tmp5557 = _mm512_fmadd_ps(in775, _mm512_set1_ps(2.5e-01f), tmp5550);
tmp5545 = _mm512_sub_ps(in769, tmp5545);
tmp5550 = _mm512_sub_ps(in776, tmp5550);
tmp5553 = _mm512_fmadd_ps(in772, _mm512_set1_ps(-1.25e+00f), tmp5553);
tmp5557 = _mm512_fmadd_ps(in779, _mm512_set1_ps(-1.25e+00f), tmp5557);
// r7.
in772 = _mm512_sub_ps(in772, in768);
in779 = _mm512_sub_ps(in779, in775);
in772 = _mm512_fmadd_ps(in772, _mm512_set1_ps(5.25e+00f), tmp5545);
in779 = _mm512_fmadd_ps(in779, _mm512_set1_ps(5.25e+00f), tmp5550);
// r5 (fmadd) and r6 (fnmadd).
tmp5547 = _mm512_fmadd_ps(tmp5553, _mm512_set1_ps(2e+00f), tmp5544);
tmp5552 = _mm512_fmadd_ps(tmp5557, _mm512_set1_ps(2e+00f), tmp5549);
tmp5544 = _mm512_fnmadd_ps(tmp5553, _mm512_set1_ps(2e+00f), tmp5544);
tmp5549 = _mm512_fnmadd_ps(tmp5557, _mm512_set1_ps(2e+00f), tmp5549);
// Pack pairs of result registers and store them to dfPtr6.
// _mm512_shuffle_f32x4(a, b, 68)  yields [a lanes 0-7 | b lanes 0-7];
// _mm512_shuffle_f32x4(a, b, 238) yields [a lanes 8-15 | b lanes 8-15].
// out707..out710 / out715..out718 are the low / high halves of the first register set,
// out711..out714 / out719..out722 the low / high halves of the second set.
__m512 out707 = _mm512_shuffle_f32x4(tmp5546, tmp5555, 68);
__m512 out715 = _mm512_shuffle_f32x4(tmp5546, tmp5555, 238);
__m512 out708 = _mm512_shuffle_f32x4(tmp5556, in770, 68);
__m512 out716 = _mm512_shuffle_f32x4(tmp5556, in770, 238);
__m512 out709 = _mm512_shuffle_f32x4(tmp5554, tmp5547, 68);
__m512 out717 = _mm512_shuffle_f32x4(tmp5554, tmp5547, 238);
__m512 out710 = _mm512_shuffle_f32x4(tmp5544, in772, 68);
__m512 out718 = _mm512_shuffle_f32x4(tmp5544, in772, 238);
__m512 out711 = _mm512_shuffle_f32x4(tmp5551, tmp5559, 68);
__m512 out719 = _mm512_shuffle_f32x4(tmp5551, tmp5559, 238);
__m512 out712 = _mm512_shuffle_f32x4(tmp5560, in777, 68);
__m512 out720 = _mm512_shuffle_f32x4(tmp5560, in777, 238);
__m512 out713 = _mm512_shuffle_f32x4(tmp5558, tmp5552, 68);
__m512 out721 = _mm512_shuffle_f32x4(tmp5558, tmp5552, 238);
__m512 out714 = _mm512_shuffle_f32x4(tmp5549, in779, 68);
__m512 out722 = _mm512_shuffle_f32x4(tmp5549, in779, 238);
// Store layout: the constant offset advances by 25600 per packed register pair
// (0, 25600, 51200, 76800); within a pair the four vectors go to +0, +64, +128, +192
// (first set low, second set low, first set high, second set high).
// NOTE(review): offsets look like byte offsets (64 = one 512-bit vector), which would
// make dfPtr6 a byte pointer — confirm at its declaration.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k79, out707);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k79, out715);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k79, out711);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k79, out719);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k79, out708);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k79, out716);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k79, out712);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k79, out720);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k79, out709);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k79, out717);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k79, out713);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k79, out721);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k79, out710);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k79, out718);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k79, out714);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k79, out722);
// Load seven rows for each of two register sets from datPtr12 and rearrange lanes.
// Consecutive rows are 224 apart in the constant offset (320, 544, ..., 1664 for the
// first set; 12836, 13060, ..., 14180 for the second).
// NOTE(review): the 4*w36 term suggests these are byte offsets into float data — confirm
// the type of datPtr12 at its declaration.
// Mask 16383 loads lanes 0-13 (lanes 14-15 zeroed); mask 8191 loads lanes 0-12
// (lanes 13-15 zeroed).
__m512 dat1305 = _mm512_maskz_loadu_ps(16383, datPtr12+320+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1306 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
// pm112: result lanes 0-7 take source lanes 0-7, result lanes 8-15 take source lanes 6-13
// (two 8-wide windows that overlap by two elements).
__m512i pm112 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in781 = _mm512_permutexvar_ps(pm112, dat1305);
// pm113: result lane 0 takes source lane 15 (zeroed by the load mask), lanes 1-7 take
// source lanes 0-6, lanes 8-15 take source lanes 5-12 — i.e. the same two windows
// shifted by one with a leading zero.
__m512i pm113 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in788 = _mm512_permutexvar_ps(pm113, dat1306);
__m512 dat1307 = _mm512_maskz_loadu_ps(16383, datPtr12+544+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1308 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in782 = _mm512_permutexvar_ps(pm112, dat1307);
__m512 in789 = _mm512_permutexvar_ps(pm113, dat1308);
__m512 dat1309 = _mm512_maskz_loadu_ps(16383, datPtr12+768+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1310 = _mm512_maskz_loadu_ps(8191, datPtr12+13284+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in783 = _mm512_permutexvar_ps(pm112, dat1309);
__m512 in790 = _mm512_permutexvar_ps(pm113, dat1310);
__m512 dat1311 = _mm512_maskz_loadu_ps(16383, datPtr12+992+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1312 = _mm512_maskz_loadu_ps(8191, datPtr12+13508+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in784 = _mm512_permutexvar_ps(pm112, dat1311);
__m512 in791 = _mm512_permutexvar_ps(pm113, dat1312);
__m512 dat1313 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1314 = _mm512_maskz_loadu_ps(8191, datPtr12+13732+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in785 = _mm512_permutexvar_ps(pm112, dat1313);
__m512 in792 = _mm512_permutexvar_ps(pm113, dat1314);
__m512 dat1315 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1316 = _mm512_maskz_loadu_ps(8191, datPtr12+13956+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in786 = _mm512_permutexvar_ps(pm112, dat1315);
__m512 in793 = _mm512_permutexvar_ps(pm113, dat1316);
__m512 dat1317 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1318 = _mm512_maskz_loadu_ps(8191, datPtr12+14180+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in787 = _mm512_permutexvar_ps(pm112, dat1317);
__m512 in794 = _mm512_permutexvar_ps(pm113, dat1318);
// First pass of the 8-point transform, applied across the seven loaded rows.
// Only inputs d1..d7 exist (set A: in781..in787, set B: in788..in794); the d0 term is
// taken as zero, which is why r0 starts from _mm512_setzero_ps() minus d6.
// Two register sets are processed in lockstep (statements alternate A, B).
// Effective result (FMA rounding aside):
//   r0     = -d6 + 5.25*(d4 - d2)
//   r1, r2 = (d2 + d6 - 4.25*d4) +/- (d1 + d5 - 4.25*d3)
//   r3, r4 = (0.25*d2 - 1.25*d4 + d6) +/- 2*(0.25*d1 - 1.25*d3 + d5)
//   r5, r6 = (4*d2 - 5*d4 + d6) +/- 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7     = d7 - d1 + 5.25*(d3 - d5)
// Results r0..r7 are left in (registers are reused in place, so order matters):
//   set A: tmp5612, tmp5611, tmp5613, in786, tmp5610, in782, in784, in783
//   set B: tmp5617, tmp5616, tmp5618, in793, tmp5615, in789, in791, in790
// NOTE(review): the coefficients match the Winograd F(6,3) input transform — presumably
// that is what this is; confirm against the generator.
__m512 tmp5609 = _mm512_add_ps(in781, in785);
__m512 tmp5614 = _mm512_add_ps(in788, in792);
__m512 tmp5610 = _mm512_sub_ps(in784, in782);
__m512 tmp5615 = _mm512_sub_ps(in791, in789);
__m512 tmp5611 = _mm512_add_ps(in782, in786);
__m512 tmp5616 = _mm512_add_ps(in789, in793);
// d0 is implicitly zero here: 0 - d6.
__m512 tmp5612 = _mm512_sub_ps(_mm512_setzero_ps(), in786);
__m512 tmp5617 = _mm512_sub_ps(_mm512_setzero_ps(), in793);
tmp5609 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-4.25e+00f), tmp5609);
tmp5614 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-4.25e+00f), tmp5614);
tmp5611 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-4.25e+00f), tmp5611);
tmp5616 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-4.25e+00f), tmp5616);
tmp5612 = _mm512_fmadd_ps(tmp5610, _mm512_set1_ps(5.25e+00f), tmp5612);
tmp5617 = _mm512_fmadd_ps(tmp5615, _mm512_set1_ps(5.25e+00f), tmp5617);
tmp5610 = _mm512_fmadd_ps(in782, _mm512_set1_ps(2.5e-01f), in786);
tmp5615 = _mm512_fmadd_ps(in789, _mm512_set1_ps(2.5e-01f), in793);
in782 = _mm512_fmadd_ps(in782, _mm512_set1_ps(4e+00f), in786);
in789 = _mm512_fmadd_ps(in789, _mm512_set1_ps(4e+00f), in793);
// r2 (difference) and r1 (sum) of the two partial sums.
__m512 tmp5613 = _mm512_sub_ps(tmp5611, tmp5609);
__m512 tmp5618 = _mm512_sub_ps(tmp5616, tmp5614);
tmp5611 = _mm512_add_ps(tmp5609, tmp5611);
tmp5616 = _mm512_add_ps(tmp5614, tmp5616);
tmp5609 = _mm512_fmadd_ps(in781, _mm512_set1_ps(2.5e-01f), in785);
tmp5614 = _mm512_fmadd_ps(in788, _mm512_set1_ps(2.5e-01f), in792);
tmp5610 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-1.25e+00f), tmp5610);
tmp5615 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-1.25e+00f), tmp5615);
in784 = _mm512_fmadd_ps(in784, _mm512_set1_ps(-5e+00f), in782);
in791 = _mm512_fmadd_ps(in791, _mm512_set1_ps(-5e+00f), in789);
tmp5609 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-1.25e+00f), tmp5609);
tmp5614 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-1.25e+00f), tmp5614);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in786 = _mm512_fmadd_ps(tmp5609, _mm512_set1_ps(2e+00f), tmp5610);
in793 = _mm512_fmadd_ps(tmp5614, _mm512_set1_ps(2e+00f), tmp5615);
tmp5610 = _mm512_fnmadd_ps(tmp5609, _mm512_set1_ps(2e+00f), tmp5610);
tmp5615 = _mm512_fnmadd_ps(tmp5614, _mm512_set1_ps(2e+00f), tmp5615);
tmp5609 = _mm512_fmadd_ps(in785, _mm512_set1_ps(2.5e-01f), in781);
tmp5614 = _mm512_fmadd_ps(in792, _mm512_set1_ps(2.5e-01f), in788);
in781 = _mm512_sub_ps(in787, in781);
in788 = _mm512_sub_ps(in794, in788);
tmp5609 = _mm512_fmadd_ps(in783, _mm512_set1_ps(-1.25e+00f), tmp5609);
tmp5614 = _mm512_fmadd_ps(in790, _mm512_set1_ps(-1.25e+00f), tmp5614);
// r7.
in783 = _mm512_sub_ps(in783, in785);
in790 = _mm512_sub_ps(in790, in792);
in783 = _mm512_fmadd_ps(in783, _mm512_set1_ps(5.25e+00f), in781);
in790 = _mm512_fmadd_ps(in790, _mm512_set1_ps(5.25e+00f), in788);
// r5 (fmadd) and r6 (fnmadd).
in782 = _mm512_fmadd_ps(tmp5609, _mm512_set1_ps(2e+00f), in784);
in789 = _mm512_fmadd_ps(tmp5614, _mm512_set1_ps(2e+00f), in791);
in784 = _mm512_fnmadd_ps(tmp5609, _mm512_set1_ps(2e+00f), in784);
in791 = _mm512_fnmadd_ps(tmp5614, _mm512_set1_ps(2e+00f), in791);
// Transpose the 16x16 float matrix held in 16 registers:
//   rows 0-7:  tmp5612, tmp5611, tmp5613, in786, tmp5610, in782, in784, in783
//   rows 8-15: tmp5617, tmp5616, tmp5618, in793, tmp5615, in789, in791, in790
// After the last stage the same names, in the same order, hold columns 0-7 and 8-15.
// Stage 1: interleave adjacent register pairs within each 128-bit lane.
__m512 tmp5627 = _mm512_unpacklo_ps(tmp5612, tmp5611);
__m512 tmp5628 = _mm512_unpackhi_ps(tmp5612, tmp5611);
__m512 tmp5629 = _mm512_unpacklo_ps(tmp5613, in786);
__m512 tmp5630 = _mm512_unpackhi_ps(tmp5613, in786);
__m512 tmp5631 = _mm512_unpacklo_ps(tmp5610, in782);
__m512 tmp5632 = _mm512_unpackhi_ps(tmp5610, in782);
__m512 tmp5633 = _mm512_unpacklo_ps(in784, in783);
__m512 tmp5634 = _mm512_unpackhi_ps(in784, in783);
__m512 tmp5635 = _mm512_unpacklo_ps(tmp5617, tmp5616);
__m512 tmp5636 = _mm512_unpackhi_ps(tmp5617, tmp5616);
__m512 tmp5637 = _mm512_unpacklo_ps(tmp5618, in793);
__m512 tmp5638 = _mm512_unpackhi_ps(tmp5618, in793);
__m512 tmp5639 = _mm512_unpacklo_ps(tmp5615, in789);
__m512 tmp5640 = _mm512_unpackhi_ps(tmp5615, in789);
__m512 tmp5641 = _mm512_unpacklo_ps(in791, in790);
__m512 tmp5642 = _mm512_unpackhi_ps(in791, in790);
// Stage 2: shuffle_ps 68 / 238 takes the low / high element pairs of both operands, so
// each 128-bit lane now holds one column element from four consecutive rows.
__m512 tmp5643 = _mm512_shuffle_ps(tmp5627, tmp5629, 68);
__m512 tmp5644 = _mm512_shuffle_ps(tmp5627, tmp5629, 238);
__m512 tmp5645 = _mm512_shuffle_ps(tmp5628, tmp5630, 68);
__m512 tmp5646 = _mm512_shuffle_ps(tmp5628, tmp5630, 238);
__m512 tmp5647 = _mm512_shuffle_ps(tmp5631, tmp5633, 68);
__m512 tmp5648 = _mm512_shuffle_ps(tmp5631, tmp5633, 238);
__m512 tmp5649 = _mm512_shuffle_ps(tmp5632, tmp5634, 68);
__m512 tmp5650 = _mm512_shuffle_ps(tmp5632, tmp5634, 238);
__m512 tmp5651 = _mm512_shuffle_ps(tmp5635, tmp5637, 68);
__m512 tmp5652 = _mm512_shuffle_ps(tmp5635, tmp5637, 238);
__m512 tmp5653 = _mm512_shuffle_ps(tmp5636, tmp5638, 68);
__m512 tmp5654 = _mm512_shuffle_ps(tmp5636, tmp5638, 238);
__m512 tmp5655 = _mm512_shuffle_ps(tmp5639, tmp5641, 68);
__m512 tmp5656 = _mm512_shuffle_ps(tmp5639, tmp5641, 238);
__m512 tmp5657 = _mm512_shuffle_ps(tmp5640, tmp5642, 68);
__m512 tmp5658 = _mm512_shuffle_ps(tmp5640, tmp5642, 238);
// Stage 3: shuffle_f32x4 136 / 221 picks the even / odd 128-bit lanes of both operands.
__m512 tmp5659 = _mm512_shuffle_f32x4(tmp5643, tmp5647, 136);
__m512 tmp5660 = _mm512_shuffle_f32x4(tmp5643, tmp5647, 221);
__m512 tmp5661 = _mm512_shuffle_f32x4(tmp5644, tmp5648, 136);
__m512 tmp5662 = _mm512_shuffle_f32x4(tmp5644, tmp5648, 221);
__m512 tmp5663 = _mm512_shuffle_f32x4(tmp5645, tmp5649, 136);
__m512 tmp5664 = _mm512_shuffle_f32x4(tmp5645, tmp5649, 221);
__m512 tmp5665 = _mm512_shuffle_f32x4(tmp5646, tmp5650, 136);
__m512 tmp5666 = _mm512_shuffle_f32x4(tmp5646, tmp5650, 221);
__m512 tmp5667 = _mm512_shuffle_f32x4(tmp5651, tmp5655, 136);
__m512 tmp5668 = _mm512_shuffle_f32x4(tmp5651, tmp5655, 221);
__m512 tmp5669 = _mm512_shuffle_f32x4(tmp5652, tmp5656, 136);
__m512 tmp5670 = _mm512_shuffle_f32x4(tmp5652, tmp5656, 221);
__m512 tmp5671 = _mm512_shuffle_f32x4(tmp5653, tmp5657, 136);
__m512 tmp5672 = _mm512_shuffle_f32x4(tmp5653, tmp5657, 221);
__m512 tmp5673 = _mm512_shuffle_f32x4(tmp5654, tmp5658, 136);
__m512 tmp5674 = _mm512_shuffle_f32x4(tmp5654, tmp5658, 221);
// Stage 4: same lane selection again, combining the rows 0-7 and rows 8-15 halves;
// results overwrite the original register names.
tmp5612 = _mm512_shuffle_f32x4(tmp5659, tmp5667, 136);
tmp5617 = _mm512_shuffle_f32x4(tmp5659, tmp5667, 221);
tmp5611 = _mm512_shuffle_f32x4(tmp5661, tmp5669, 136);
tmp5616 = _mm512_shuffle_f32x4(tmp5661, tmp5669, 221);
tmp5613 = _mm512_shuffle_f32x4(tmp5663, tmp5671, 136);
tmp5618 = _mm512_shuffle_f32x4(tmp5663, tmp5671, 221);
in786 = _mm512_shuffle_f32x4(tmp5665, tmp5673, 136);
in793 = _mm512_shuffle_f32x4(tmp5665, tmp5673, 221);
tmp5610 = _mm512_shuffle_f32x4(tmp5660, tmp5668, 136);
tmp5615 = _mm512_shuffle_f32x4(tmp5660, tmp5668, 221);
in782 = _mm512_shuffle_f32x4(tmp5662, tmp5670, 136);
in789 = _mm512_shuffle_f32x4(tmp5662, tmp5670, 221);
in784 = _mm512_shuffle_f32x4(tmp5664, tmp5672, 136);
in791 = _mm512_shuffle_f32x4(tmp5664, tmp5672, 221);
in783 = _mm512_shuffle_f32x4(tmp5666, tmp5674, 136);
in790 = _mm512_shuffle_f32x4(tmp5666, tmp5674, 221);
// Second pass of the 8-point transform, applied to the transposed registers.
// Two register sets are processed in lockstep (statements alternate A, B).
//   set A inputs d0..d7 = tmp5612, tmp5611, tmp5613, in786, tmp5610, in782, in784, in783
//   set B inputs d0..d7 = tmp5617, tmp5616, tmp5618, in793, tmp5615, in789, in791, in790
// Effective result (FMA rounding aside):
//   r0     = d0 - d6 + 5.25*(d4 - d2)
//   r1, r2 = (d2 + d6 - 4.25*d4) +/- (d1 + d5 - 4.25*d3)
//   r3, r4 = (0.25*d2 - 1.25*d4 + d6) +/- 2*(0.25*d1 - 1.25*d3 + d5)
//   r5, r6 = (4*d2 - 5*d4 + d6) +/- 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7     = d7 - d1 + 5.25*(d3 - d5)
// Results r0..r7 are left in (registers are reused in place, so order matters):
//   set A: tmp5612, tmp5621, tmp5622, in784, tmp5620, tmp5613, tmp5610, in786
//   set B: tmp5617, tmp5625, tmp5626, in791, tmp5624, tmp5618, tmp5615, in793
__m512 tmp5619 = _mm512_add_ps(tmp5611, in782);
__m512 tmp5623 = _mm512_add_ps(tmp5616, in789);
__m512 tmp5620 = _mm512_sub_ps(tmp5610, tmp5613);
__m512 tmp5624 = _mm512_sub_ps(tmp5615, tmp5618);
__m512 tmp5621 = _mm512_add_ps(tmp5613, in784);
__m512 tmp5625 = _mm512_add_ps(tmp5618, in791);
tmp5612 = _mm512_sub_ps(tmp5612, in784);
tmp5617 = _mm512_sub_ps(tmp5617, in791);
tmp5619 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-4.25e+00f), tmp5619);
tmp5623 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-4.25e+00f), tmp5623);
tmp5621 = _mm512_fmadd_ps(tmp5610, _mm512_set1_ps(-4.25e+00f), tmp5621);
tmp5625 = _mm512_fmadd_ps(tmp5615, _mm512_set1_ps(-4.25e+00f), tmp5625);
tmp5612 = _mm512_fmadd_ps(tmp5620, _mm512_set1_ps(5.25e+00f), tmp5612);
tmp5617 = _mm512_fmadd_ps(tmp5624, _mm512_set1_ps(5.25e+00f), tmp5617);
tmp5620 = _mm512_fmadd_ps(tmp5613, _mm512_set1_ps(2.5e-01f), in784);
tmp5624 = _mm512_fmadd_ps(tmp5618, _mm512_set1_ps(2.5e-01f), in791);
tmp5613 = _mm512_fmadd_ps(tmp5613, _mm512_set1_ps(4e+00f), in784);
tmp5618 = _mm512_fmadd_ps(tmp5618, _mm512_set1_ps(4e+00f), in791);
// r2 (difference) and r1 (sum) of the two partial sums.
__m512 tmp5622 = _mm512_sub_ps(tmp5621, tmp5619);
__m512 tmp5626 = _mm512_sub_ps(tmp5625, tmp5623);
tmp5621 = _mm512_add_ps(tmp5619, tmp5621);
tmp5625 = _mm512_add_ps(tmp5623, tmp5625);
tmp5619 = _mm512_fmadd_ps(tmp5611, _mm512_set1_ps(2.5e-01f), in782);
tmp5623 = _mm512_fmadd_ps(tmp5616, _mm512_set1_ps(2.5e-01f), in789);
tmp5620 = _mm512_fmadd_ps(tmp5610, _mm512_set1_ps(-1.25e+00f), tmp5620);
tmp5624 = _mm512_fmadd_ps(tmp5615, _mm512_set1_ps(-1.25e+00f), tmp5624);
tmp5610 = _mm512_fmadd_ps(tmp5610, _mm512_set1_ps(-5e+00f), tmp5613);
tmp5615 = _mm512_fmadd_ps(tmp5615, _mm512_set1_ps(-5e+00f), tmp5618);
tmp5619 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-1.25e+00f), tmp5619);
tmp5623 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-1.25e+00f), tmp5623);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in784 = _mm512_fmadd_ps(tmp5619, _mm512_set1_ps(2e+00f), tmp5620);
in791 = _mm512_fmadd_ps(tmp5623, _mm512_set1_ps(2e+00f), tmp5624);
tmp5620 = _mm512_fnmadd_ps(tmp5619, _mm512_set1_ps(2e+00f), tmp5620);
tmp5624 = _mm512_fnmadd_ps(tmp5623, _mm512_set1_ps(2e+00f), tmp5624);
tmp5619 = _mm512_fmadd_ps(in782, _mm512_set1_ps(2.5e-01f), tmp5611);
tmp5623 = _mm512_fmadd_ps(in789, _mm512_set1_ps(2.5e-01f), tmp5616);
tmp5611 = _mm512_sub_ps(in783, tmp5611);
tmp5616 = _mm512_sub_ps(in790, tmp5616);
tmp5619 = _mm512_fmadd_ps(in786, _mm512_set1_ps(-1.25e+00f), tmp5619);
tmp5623 = _mm512_fmadd_ps(in793, _mm512_set1_ps(-1.25e+00f), tmp5623);
// r7.
in786 = _mm512_sub_ps(in786, in782);
in793 = _mm512_sub_ps(in793, in789);
in786 = _mm512_fmadd_ps(in786, _mm512_set1_ps(5.25e+00f), tmp5611);
in793 = _mm512_fmadd_ps(in793, _mm512_set1_ps(5.25e+00f), tmp5616);
// r5 (fmadd) and r6 (fnmadd).
tmp5613 = _mm512_fmadd_ps(tmp5619, _mm512_set1_ps(2e+00f), tmp5610);
tmp5618 = _mm512_fmadd_ps(tmp5623, _mm512_set1_ps(2e+00f), tmp5615);
tmp5610 = _mm512_fnmadd_ps(tmp5619, _mm512_set1_ps(2e+00f), tmp5610);
tmp5615 = _mm512_fnmadd_ps(tmp5623, _mm512_set1_ps(2e+00f), tmp5615);
// Pack pairs of result registers and store them to dfPtr6.
// _mm512_shuffle_f32x4(a, b, 68)  yields [a lanes 0-7 | b lanes 0-7];
// _mm512_shuffle_f32x4(a, b, 238) yields [a lanes 8-15 | b lanes 8-15].
// out723..out726 / out731..out734 are the low / high halves of the first register set,
// out727..out730 / out735..out738 the low / high halves of the second set.
__m512 out723 = _mm512_shuffle_f32x4(tmp5612, tmp5621, 68);
__m512 out731 = _mm512_shuffle_f32x4(tmp5612, tmp5621, 238);
__m512 out724 = _mm512_shuffle_f32x4(tmp5622, in784, 68);
__m512 out732 = _mm512_shuffle_f32x4(tmp5622, in784, 238);
__m512 out725 = _mm512_shuffle_f32x4(tmp5620, tmp5613, 68);
__m512 out733 = _mm512_shuffle_f32x4(tmp5620, tmp5613, 238);
__m512 out726 = _mm512_shuffle_f32x4(tmp5610, in786, 68);
__m512 out734 = _mm512_shuffle_f32x4(tmp5610, in786, 238);
__m512 out727 = _mm512_shuffle_f32x4(tmp5617, tmp5625, 68);
__m512 out735 = _mm512_shuffle_f32x4(tmp5617, tmp5625, 238);
__m512 out728 = _mm512_shuffle_f32x4(tmp5626, in791, 68);
__m512 out736 = _mm512_shuffle_f32x4(tmp5626, in791, 238);
__m512 out729 = _mm512_shuffle_f32x4(tmp5624, tmp5618, 68);
__m512 out737 = _mm512_shuffle_f32x4(tmp5624, tmp5618, 238);
__m512 out730 = _mm512_shuffle_f32x4(tmp5615, in793, 68);
__m512 out738 = _mm512_shuffle_f32x4(tmp5615, in793, 238);
// Store layout: the constant offset advances by 25600 per packed register pair
// (256, 25856, 51456, 77056); within a pair the four vectors go to +0, +64, +128, +192
// (first set low, second set low, first set high, second set high).
// NOTE(review): offsets look like byte offsets (64 = one 512-bit vector), which would
// make dfPtr6 a byte pointer — confirm at its declaration.
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k79, out723);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k79, out731);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k79, out727);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k79, out735);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k79, out724);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k79, out732);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k79, out728);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k79, out736);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k79, out725);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k79, out733);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k79, out729);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k79, out737);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k79, out726);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k79, out734);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k79, out730);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k79, out738);
// Load seven rows for each of two register sets from datPtr12 and rearrange lanes.
// Consecutive rows are 224 apart in the constant offset (12880, 13104, ..., 14224 for the
// first set; 12928, 13152, ..., 14272 for the second, i.e. 48 further along each row).
// NOTE(review): the 4*w36 term suggests these are byte offsets into float data — confirm
// the type of datPtr12 at its declaration.
// Mask 16383 loads lanes 0-13 and zeroes lanes 14-15.
__m512 dat1319 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1320 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
// pm114: result lanes 0-7 take source lanes 0-7, result lanes 8-15 take source lanes 6-13
// (two 8-wide windows that overlap by two elements). Both register sets use it.
__m512i pm114 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in795 = _mm512_permutexvar_ps(pm114, dat1319);
__m512 in802 = _mm512_permutexvar_ps(pm114, dat1320);
__m512 dat1321 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1322 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in796 = _mm512_permutexvar_ps(pm114, dat1321);
__m512 in803 = _mm512_permutexvar_ps(pm114, dat1322);
__m512 dat1323 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1324 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in797 = _mm512_permutexvar_ps(pm114, dat1323);
__m512 in804 = _mm512_permutexvar_ps(pm114, dat1324);
__m512 dat1325 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1326 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in798 = _mm512_permutexvar_ps(pm114, dat1325);
__m512 in805 = _mm512_permutexvar_ps(pm114, dat1326);
__m512 dat1327 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1328 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in799 = _mm512_permutexvar_ps(pm114, dat1327);
__m512 in806 = _mm512_permutexvar_ps(pm114, dat1328);
__m512 dat1329 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1330 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in800 = _mm512_permutexvar_ps(pm114, dat1329);
__m512 in807 = _mm512_permutexvar_ps(pm114, dat1330);
__m512 dat1331 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 dat1332 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+50432*i27+224*h29+4*w36+50432*s19+25216*k79);
__m512 in801 = _mm512_permutexvar_ps(pm114, dat1331);
__m512 in808 = _mm512_permutexvar_ps(pm114, dat1332);
// First pass of the 8-point transform, applied across the seven loaded rows.
// Only inputs d1..d7 exist (set A: in795..in801, set B: in802..in808); the d0 term is
// taken as zero, which is why r0 starts from _mm512_setzero_ps() minus d6.
// Two register sets are processed in lockstep (statements alternate A, B).
// Effective result (FMA rounding aside):
//   r0     = -d6 + 5.25*(d4 - d2)
//   r1, r2 = (d2 + d6 - 4.25*d4) +/- (d1 + d5 - 4.25*d3)
//   r3, r4 = (0.25*d2 - 1.25*d4 + d6) +/- 2*(0.25*d1 - 1.25*d3 + d5)
//   r5, r6 = (4*d2 - 5*d4 + d6) +/- 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7     = d7 - d1 + 5.25*(d3 - d5)
// Results r0..r7 are left in (registers are reused in place, so order matters):
//   set A: tmp5678, tmp5677, tmp5679, in800, tmp5676, in796, in798, in797
//   set B: tmp5683, tmp5682, tmp5684, in807, tmp5681, in803, in805, in804
// NOTE(review): the coefficients match the Winograd F(6,3) input transform — presumably
// that is what this is; confirm against the generator.
__m512 tmp5675 = _mm512_add_ps(in795, in799);
__m512 tmp5680 = _mm512_add_ps(in802, in806);
__m512 tmp5676 = _mm512_sub_ps(in798, in796);
__m512 tmp5681 = _mm512_sub_ps(in805, in803);
__m512 tmp5677 = _mm512_add_ps(in796, in800);
__m512 tmp5682 = _mm512_add_ps(in803, in807);
// d0 is implicitly zero here: 0 - d6.
__m512 tmp5678 = _mm512_sub_ps(_mm512_setzero_ps(), in800);
__m512 tmp5683 = _mm512_sub_ps(_mm512_setzero_ps(), in807);
tmp5675 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-4.25e+00f), tmp5675);
tmp5680 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-4.25e+00f), tmp5680);
tmp5677 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-4.25e+00f), tmp5677);
tmp5682 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-4.25e+00f), tmp5682);
tmp5678 = _mm512_fmadd_ps(tmp5676, _mm512_set1_ps(5.25e+00f), tmp5678);
tmp5683 = _mm512_fmadd_ps(tmp5681, _mm512_set1_ps(5.25e+00f), tmp5683);
tmp5676 = _mm512_fmadd_ps(in796, _mm512_set1_ps(2.5e-01f), in800);
tmp5681 = _mm512_fmadd_ps(in803, _mm512_set1_ps(2.5e-01f), in807);
in796 = _mm512_fmadd_ps(in796, _mm512_set1_ps(4e+00f), in800);
in803 = _mm512_fmadd_ps(in803, _mm512_set1_ps(4e+00f), in807);
// r2 (difference) and r1 (sum) of the two partial sums.
__m512 tmp5679 = _mm512_sub_ps(tmp5677, tmp5675);
__m512 tmp5684 = _mm512_sub_ps(tmp5682, tmp5680);
tmp5677 = _mm512_add_ps(tmp5675, tmp5677);
tmp5682 = _mm512_add_ps(tmp5680, tmp5682);
tmp5675 = _mm512_fmadd_ps(in795, _mm512_set1_ps(2.5e-01f), in799);
tmp5680 = _mm512_fmadd_ps(in802, _mm512_set1_ps(2.5e-01f), in806);
tmp5676 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-1.25e+00f), tmp5676);
tmp5681 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-1.25e+00f), tmp5681);
in798 = _mm512_fmadd_ps(in798, _mm512_set1_ps(-5e+00f), in796);
in805 = _mm512_fmadd_ps(in805, _mm512_set1_ps(-5e+00f), in803);
tmp5675 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-1.25e+00f), tmp5675);
tmp5680 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-1.25e+00f), tmp5680);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in800 = _mm512_fmadd_ps(tmp5675, _mm512_set1_ps(2e+00f), tmp5676);
in807 = _mm512_fmadd_ps(tmp5680, _mm512_set1_ps(2e+00f), tmp5681);
tmp5676 = _mm512_fnmadd_ps(tmp5675, _mm512_set1_ps(2e+00f), tmp5676);
tmp5681 = _mm512_fnmadd_ps(tmp5680, _mm512_set1_ps(2e+00f), tmp5681);
tmp5675 = _mm512_fmadd_ps(in799, _mm512_set1_ps(2.5e-01f), in795);
tmp5680 = _mm512_fmadd_ps(in806, _mm512_set1_ps(2.5e-01f), in802);
in795 = _mm512_sub_ps(in801, in795);
in802 = _mm512_sub_ps(in808, in802);
tmp5675 = _mm512_fmadd_ps(in797, _mm512_set1_ps(-1.25e+00f), tmp5675);
tmp5680 = _mm512_fmadd_ps(in804, _mm512_set1_ps(-1.25e+00f), tmp5680);
// r7.
in797 = _mm512_sub_ps(in797, in799);
in804 = _mm512_sub_ps(in804, in806);
in797 = _mm512_fmadd_ps(in797, _mm512_set1_ps(5.25e+00f), in795);
in804 = _mm512_fmadd_ps(in804, _mm512_set1_ps(5.25e+00f), in802);
// r5 (fmadd) and r6 (fnmadd).
in796 = _mm512_fmadd_ps(tmp5675, _mm512_set1_ps(2e+00f), in798);
in803 = _mm512_fmadd_ps(tmp5680, _mm512_set1_ps(2e+00f), in805);
in798 = _mm512_fnmadd_ps(tmp5675, _mm512_set1_ps(2e+00f), in798);
in805 = _mm512_fnmadd_ps(tmp5680, _mm512_set1_ps(2e+00f), in805);
// Transpose the 16x16 float matrix held in 16 registers:
//   rows 0-7:  tmp5678, tmp5677, tmp5679, in800, tmp5676, in796, in798, in797
//   rows 8-15: tmp5683, tmp5682, tmp5684, in807, tmp5681, in803, in805, in804
// After the last stage the same names, in the same order, hold columns 0-7 and 8-15.
// Stage 1: interleave adjacent register pairs within each 128-bit lane.
__m512 tmp5693 = _mm512_unpacklo_ps(tmp5678, tmp5677);
__m512 tmp5694 = _mm512_unpackhi_ps(tmp5678, tmp5677);
__m512 tmp5695 = _mm512_unpacklo_ps(tmp5679, in800);
__m512 tmp5696 = _mm512_unpackhi_ps(tmp5679, in800);
__m512 tmp5697 = _mm512_unpacklo_ps(tmp5676, in796);
__m512 tmp5698 = _mm512_unpackhi_ps(tmp5676, in796);
__m512 tmp5699 = _mm512_unpacklo_ps(in798, in797);
__m512 tmp5700 = _mm512_unpackhi_ps(in798, in797);
__m512 tmp5701 = _mm512_unpacklo_ps(tmp5683, tmp5682);
__m512 tmp5702 = _mm512_unpackhi_ps(tmp5683, tmp5682);
__m512 tmp5703 = _mm512_unpacklo_ps(tmp5684, in807);
__m512 tmp5704 = _mm512_unpackhi_ps(tmp5684, in807);
__m512 tmp5705 = _mm512_unpacklo_ps(tmp5681, in803);
__m512 tmp5706 = _mm512_unpackhi_ps(tmp5681, in803);
__m512 tmp5707 = _mm512_unpacklo_ps(in805, in804);
__m512 tmp5708 = _mm512_unpackhi_ps(in805, in804);
// Stage 2: shuffle_ps 68 / 238 takes the low / high element pairs of both operands, so
// each 128-bit lane now holds one column element from four consecutive rows.
__m512 tmp5709 = _mm512_shuffle_ps(tmp5693, tmp5695, 68);
__m512 tmp5710 = _mm512_shuffle_ps(tmp5693, tmp5695, 238);
__m512 tmp5711 = _mm512_shuffle_ps(tmp5694, tmp5696, 68);
__m512 tmp5712 = _mm512_shuffle_ps(tmp5694, tmp5696, 238);
__m512 tmp5713 = _mm512_shuffle_ps(tmp5697, tmp5699, 68);
__m512 tmp5714 = _mm512_shuffle_ps(tmp5697, tmp5699, 238);
__m512 tmp5715 = _mm512_shuffle_ps(tmp5698, tmp5700, 68);
__m512 tmp5716 = _mm512_shuffle_ps(tmp5698, tmp5700, 238);
__m512 tmp5717 = _mm512_shuffle_ps(tmp5701, tmp5703, 68);
__m512 tmp5718 = _mm512_shuffle_ps(tmp5701, tmp5703, 238);
__m512 tmp5719 = _mm512_shuffle_ps(tmp5702, tmp5704, 68);
__m512 tmp5720 = _mm512_shuffle_ps(tmp5702, tmp5704, 238);
__m512 tmp5721 = _mm512_shuffle_ps(tmp5705, tmp5707, 68);
__m512 tmp5722 = _mm512_shuffle_ps(tmp5705, tmp5707, 238);
__m512 tmp5723 = _mm512_shuffle_ps(tmp5706, tmp5708, 68);
__m512 tmp5724 = _mm512_shuffle_ps(tmp5706, tmp5708, 238);
// Stage 3: shuffle_f32x4 136 / 221 picks the even / odd 128-bit lanes of both operands.
__m512 tmp5725 = _mm512_shuffle_f32x4(tmp5709, tmp5713, 136);
__m512 tmp5726 = _mm512_shuffle_f32x4(tmp5709, tmp5713, 221);
__m512 tmp5727 = _mm512_shuffle_f32x4(tmp5710, tmp5714, 136);
__m512 tmp5728 = _mm512_shuffle_f32x4(tmp5710, tmp5714, 221);
__m512 tmp5729 = _mm512_shuffle_f32x4(tmp5711, tmp5715, 136);
__m512 tmp5730 = _mm512_shuffle_f32x4(tmp5711, tmp5715, 221);
__m512 tmp5731 = _mm512_shuffle_f32x4(tmp5712, tmp5716, 136);
__m512 tmp5732 = _mm512_shuffle_f32x4(tmp5712, tmp5716, 221);
__m512 tmp5733 = _mm512_shuffle_f32x4(tmp5717, tmp5721, 136);
__m512 tmp5734 = _mm512_shuffle_f32x4(tmp5717, tmp5721, 221);
__m512 tmp5735 = _mm512_shuffle_f32x4(tmp5718, tmp5722, 136);
__m512 tmp5736 = _mm512_shuffle_f32x4(tmp5718, tmp5722, 221);
__m512 tmp5737 = _mm512_shuffle_f32x4(tmp5719, tmp5723, 136);
__m512 tmp5738 = _mm512_shuffle_f32x4(tmp5719, tmp5723, 221);
__m512 tmp5739 = _mm512_shuffle_f32x4(tmp5720, tmp5724, 136);
__m512 tmp5740 = _mm512_shuffle_f32x4(tmp5720, tmp5724, 221);
// Stage 4: same lane selection again, combining the rows 0-7 and rows 8-15 halves;
// results overwrite the original register names.
tmp5678 = _mm512_shuffle_f32x4(tmp5725, tmp5733, 136);
tmp5683 = _mm512_shuffle_f32x4(tmp5725, tmp5733, 221);
tmp5677 = _mm512_shuffle_f32x4(tmp5727, tmp5735, 136);
tmp5682 = _mm512_shuffle_f32x4(tmp5727, tmp5735, 221);
tmp5679 = _mm512_shuffle_f32x4(tmp5729, tmp5737, 136);
tmp5684 = _mm512_shuffle_f32x4(tmp5729, tmp5737, 221);
in800 = _mm512_shuffle_f32x4(tmp5731, tmp5739, 136);
in807 = _mm512_shuffle_f32x4(tmp5731, tmp5739, 221);
tmp5676 = _mm512_shuffle_f32x4(tmp5726, tmp5734, 136);
tmp5681 = _mm512_shuffle_f32x4(tmp5726, tmp5734, 221);
in796 = _mm512_shuffle_f32x4(tmp5728, tmp5736, 136);
in803 = _mm512_shuffle_f32x4(tmp5728, tmp5736, 221);
in798 = _mm512_shuffle_f32x4(tmp5730, tmp5738, 136);
in805 = _mm512_shuffle_f32x4(tmp5730, tmp5738, 221);
in797 = _mm512_shuffle_f32x4(tmp5732, tmp5740, 136);
in804 = _mm512_shuffle_f32x4(tmp5732, tmp5740, 221);
// Second pass of the 8-point transform, applied to the transposed registers.
// Two register sets are processed in lockstep (statements alternate A, B).
//   set A inputs d0..d7 = tmp5678, tmp5677, tmp5679, in800, tmp5676, in796, in798, in797
//   set B inputs d0..d7 = tmp5683, tmp5682, tmp5684, in807, tmp5681, in803, in805, in804
// Effective result (FMA rounding aside):
//   r0     = d0 - d6 + 5.25*(d4 - d2)
//   r1, r2 = (d2 + d6 - 4.25*d4) +/- (d1 + d5 - 4.25*d3)
//   r3, r4 = (0.25*d2 - 1.25*d4 + d6) +/- 2*(0.25*d1 - 1.25*d3 + d5)
//   r5, r6 = (4*d2 - 5*d4 + d6) +/- 2*(d1 - 1.25*d3 + 0.25*d5)
//   r7     = d7 - d1 + 5.25*(d3 - d5)
// Results r0..r7 are left in (registers are reused in place, so order matters):
//   set A: tmp5678, tmp5687, tmp5688, in798, tmp5686, tmp5679, tmp5676, in800
//   set B: tmp5683, tmp5691, tmp5692, in805, tmp5690, tmp5684, tmp5681, in807
__m512 tmp5685 = _mm512_add_ps(tmp5677, in796);
__m512 tmp5689 = _mm512_add_ps(tmp5682, in803);
__m512 tmp5686 = _mm512_sub_ps(tmp5676, tmp5679);
__m512 tmp5690 = _mm512_sub_ps(tmp5681, tmp5684);
__m512 tmp5687 = _mm512_add_ps(tmp5679, in798);
__m512 tmp5691 = _mm512_add_ps(tmp5684, in805);
tmp5678 = _mm512_sub_ps(tmp5678, in798);
tmp5683 = _mm512_sub_ps(tmp5683, in805);
tmp5685 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-4.25e+00f), tmp5685);
tmp5689 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-4.25e+00f), tmp5689);
tmp5687 = _mm512_fmadd_ps(tmp5676, _mm512_set1_ps(-4.25e+00f), tmp5687);
tmp5691 = _mm512_fmadd_ps(tmp5681, _mm512_set1_ps(-4.25e+00f), tmp5691);
tmp5678 = _mm512_fmadd_ps(tmp5686, _mm512_set1_ps(5.25e+00f), tmp5678);
tmp5683 = _mm512_fmadd_ps(tmp5690, _mm512_set1_ps(5.25e+00f), tmp5683);
tmp5686 = _mm512_fmadd_ps(tmp5679, _mm512_set1_ps(2.5e-01f), in798);
tmp5690 = _mm512_fmadd_ps(tmp5684, _mm512_set1_ps(2.5e-01f), in805);
tmp5679 = _mm512_fmadd_ps(tmp5679, _mm512_set1_ps(4e+00f), in798);
tmp5684 = _mm512_fmadd_ps(tmp5684, _mm512_set1_ps(4e+00f), in805);
// r2 (difference) and r1 (sum) of the two partial sums.
__m512 tmp5688 = _mm512_sub_ps(tmp5687, tmp5685);
__m512 tmp5692 = _mm512_sub_ps(tmp5691, tmp5689);
tmp5687 = _mm512_add_ps(tmp5685, tmp5687);
tmp5691 = _mm512_add_ps(tmp5689, tmp5691);
tmp5685 = _mm512_fmadd_ps(tmp5677, _mm512_set1_ps(2.5e-01f), in796);
tmp5689 = _mm512_fmadd_ps(tmp5682, _mm512_set1_ps(2.5e-01f), in803);
tmp5686 = _mm512_fmadd_ps(tmp5676, _mm512_set1_ps(-1.25e+00f), tmp5686);
tmp5690 = _mm512_fmadd_ps(tmp5681, _mm512_set1_ps(-1.25e+00f), tmp5690);
tmp5676 = _mm512_fmadd_ps(tmp5676, _mm512_set1_ps(-5e+00f), tmp5679);
tmp5681 = _mm512_fmadd_ps(tmp5681, _mm512_set1_ps(-5e+00f), tmp5684);
tmp5685 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-1.25e+00f), tmp5685);
tmp5689 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-1.25e+00f), tmp5689);
// r3 (fmadd) and r4 (fnmadd) share the same two partial sums.
in798 = _mm512_fmadd_ps(tmp5685, _mm512_set1_ps(2e+00f), tmp5686);
in805 = _mm512_fmadd_ps(tmp5689, _mm512_set1_ps(2e+00f), tmp5690);
tmp5686 = _mm512_fnmadd_ps(tmp5685, _mm512_set1_ps(2e+00f), tmp5686);
tmp5690 = _mm512_fnmadd_ps(tmp5689, _mm512_set1_ps(2e+00f), tmp5690);
tmp5685 = _mm512_fmadd_ps(in796, _mm512_set1_ps(2.5e-01f), tmp5677);
tmp5689 = _mm512_fmadd_ps(in803, _mm512_set1_ps(2.5e-01f), tmp5682);
tmp5677 = _mm512_sub_ps(in797, tmp5677);
tmp5682 = _mm512_sub_ps(in804, tmp5682);
tmp5685 = _mm512_fmadd_ps(in800, _mm512_set1_ps(-1.25e+00f), tmp5685);
tmp5689 = _mm512_fmadd_ps(in807, _mm512_set1_ps(-1.25e+00f), tmp5689);
// r7.
in800 = _mm512_sub_ps(in800, in796);
in807 = _mm512_sub_ps(in807, in803);
in800 = _mm512_fmadd_ps(in800, _mm512_set1_ps(5.25e+00f), tmp5677);
in807 = _mm512_fmadd_ps(in807, _mm512_set1_ps(5.25e+00f), tmp5682);
// r5 (fmadd) and r6 (fnmadd).
tmp5679 = _mm512_fmadd_ps(tmp5685, _mm512_set1_ps(2e+00f), tmp5676);
tmp5684 = _mm512_fmadd_ps(tmp5689, _mm512_set1_ps(2e+00f), tmp5681);
tmp5676 = _mm512_fnmadd_ps(tmp5685, _mm512_set1_ps(2e+00f), tmp5676);
tmp5681 = _mm512_fnmadd_ps(tmp5689, _mm512_set1_ps(2e+00f), tmp5681);
// Pack pairs of result registers and store them to dfPtr6.
// _mm512_shuffle_f32x4(a, b, 68)  yields [a lanes 0-7 | b lanes 0-7];
// _mm512_shuffle_f32x4(a, b, 238) yields [a lanes 8-15 | b lanes 8-15].
// out739..out742 / out747..out750 are the low / high halves of the first register set,
// out743..out746 / out751..out754 the low / high halves of the second set.
__m512 out739 = _mm512_shuffle_f32x4(tmp5678, tmp5687, 68);
__m512 out747 = _mm512_shuffle_f32x4(tmp5678, tmp5687, 238);
__m512 out740 = _mm512_shuffle_f32x4(tmp5688, in798, 68);
__m512 out748 = _mm512_shuffle_f32x4(tmp5688, in798, 238);
__m512 out741 = _mm512_shuffle_f32x4(tmp5686, tmp5679, 68);
__m512 out749 = _mm512_shuffle_f32x4(tmp5686, tmp5679, 238);
__m512 out742 = _mm512_shuffle_f32x4(tmp5676, in800, 68);
__m512 out750 = _mm512_shuffle_f32x4(tmp5676, in800, 238);
__m512 out743 = _mm512_shuffle_f32x4(tmp5683, tmp5691, 68);
__m512 out751 = _mm512_shuffle_f32x4(tmp5683, tmp5691, 238);
__m512 out744 = _mm512_shuffle_f32x4(tmp5692, in805, 68);
__m512 out752 = _mm512_shuffle_f32x4(tmp5692, in805, 238);
__m512 out745 = _mm512_shuffle_f32x4(tmp5690, tmp5684, 68);
__m512 out753 = _mm512_shuffle_f32x4(tmp5690, tmp5684, 238);
__m512 out746 = _mm512_shuffle_f32x4(tmp5681, in807, 68);
__m512 out754 = _mm512_shuffle_f32x4(tmp5681, in807, 238);
// Store layout: the constant offset advances by 25600 per packed register pair
// (512, 26112, 51712, 77312); within a pair the four vectors go to +0, +64, +128, +192
// (first set low, second set low, first set high, second set high).
// NOTE(review): offsets look like byte offsets (64 = one 512-bit vector), which would
// make dfPtr6 a byte pointer — confirm at its declaration.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k79, out739);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k79, out747);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k79, out743);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k79, out751);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k79, out740);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k79, out748);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k79, out744);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k79, out752);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k79, out741);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k79, out749);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k79, out745);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k79, out753);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k79, out742);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k79, out750);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k79, out746);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k79, out754);
}
++j21;
rel13 = 1;
}
ptrdiff_t h30 = base13+0;
ptrdiff_t w37 = 36;
ptrdiff_t k80 = 0;
for (; k80 != 2; ++k80) {
// Load seven rows for each of two register sets from datPtr12 and rearrange lanes.
// Consecutive rows are 224 apart in the constant offset (224, 448, ..., 1568 for the
// first set; 272, 496, ..., 1616 for the second, i.e. 48 further along each row).
// NOTE(review): the 4*w37 term suggests these are byte offsets into float data — confirm
// the type of datPtr12 at its declaration.
// Mask 16383 loads lanes 0-13 (lanes 14-15 zeroed); mask 511 loads only lanes 0-8
// (lanes 9-15 zeroed).
__m512 dat1333 = _mm512_maskz_loadu_ps(16383, datPtr12+224+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1334 = _mm512_maskz_loadu_ps(511, datPtr12+272+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
// pm115: result lanes 0-7 take source lanes 0-7, result lanes 8-15 take source lanes 6-13
// (two 8-wide windows that overlap by two elements).
__m512i pm115 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in809 = _mm512_permutexvar_ps(pm115, dat1333);
// pm116: result lanes 0-7 take source lanes 0-7, lanes 8-10 take source lanes 6-8, and
// lanes 11-15 all take source lane 15, which the 511 mask has zeroed — so the second
// window is filled with zeros past the loaded data.
__m512i pm116 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in816 = _mm512_permutexvar_ps(pm116, dat1334);
__m512 dat1335 = _mm512_maskz_loadu_ps(16383, datPtr12+448+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1336 = _mm512_maskz_loadu_ps(511, datPtr12+496+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in810 = _mm512_permutexvar_ps(pm115, dat1335);
__m512 in817 = _mm512_permutexvar_ps(pm116, dat1336);
__m512 dat1337 = _mm512_maskz_loadu_ps(16383, datPtr12+672+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1338 = _mm512_maskz_loadu_ps(511, datPtr12+720+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in811 = _mm512_permutexvar_ps(pm115, dat1337);
__m512 in818 = _mm512_permutexvar_ps(pm116, dat1338);
__m512 dat1339 = _mm512_maskz_loadu_ps(16383, datPtr12+896+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1340 = _mm512_maskz_loadu_ps(511, datPtr12+944+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in812 = _mm512_permutexvar_ps(pm115, dat1339);
__m512 in819 = _mm512_permutexvar_ps(pm116, dat1340);
__m512 dat1341 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1342 = _mm512_maskz_loadu_ps(511, datPtr12+1168+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in813 = _mm512_permutexvar_ps(pm115, dat1341);
__m512 in820 = _mm512_permutexvar_ps(pm116, dat1342);
__m512 dat1343 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1344 = _mm512_maskz_loadu_ps(511, datPtr12+1392+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in814 = _mm512_permutexvar_ps(pm115, dat1343);
__m512 in821 = _mm512_permutexvar_ps(pm116, dat1344);
__m512 dat1345 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1346 = _mm512_maskz_loadu_ps(511, datPtr12+1616+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in815 = _mm512_permutexvar_ps(pm115, dat1345);
__m512 in822 = _mm512_permutexvar_ps(pm116, dat1346);
// First arithmetic pass: combines the 7 loaded vectors of each group (in809..in815 and
// in816..in822) into 8 vectors per group using only add/sub/fma with the constants
// -4.25, 5.25, 0.25, 4, -1.25, -5 and 2. Statements alternate between the two groups.
// Results for group one end up in tmp5744, tmp5743, tmp5745, in814, tmp5742, in810, in812, in811;
// for group two in tmp5749, tmp5748, tmp5750, in821, tmp5747, in817, in819, in818.
// Input variables are overwritten along the way, so statement order matters.
// NOTE(review): the coefficient set resembles a Winograd F(6,3)-style input transform -- confirm
// against the generator before relying on that reading.
__m512 tmp5741 = _mm512_add_ps(in809, in813);
__m512 tmp5746 = _mm512_add_ps(in816, in820);
__m512 tmp5742 = _mm512_sub_ps(in812, in810);
__m512 tmp5747 = _mm512_sub_ps(in819, in817);
__m512 tmp5743 = _mm512_add_ps(in810, in814);
__m512 tmp5748 = _mm512_add_ps(in817, in821);
// 0 - x: no loaded vector is subtracted from here.
// NOTE(review): suggests an implicit all-zero leading row (e.g. padding) -- confirm.
__m512 tmp5744 = _mm512_sub_ps(_mm512_setzero_ps(), in814);
__m512 tmp5749 = _mm512_sub_ps(_mm512_setzero_ps(), in821);
tmp5741 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-4.25e+00f), tmp5741);
tmp5746 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-4.25e+00f), tmp5746);
tmp5743 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-4.25e+00f), tmp5743);
tmp5748 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-4.25e+00f), tmp5748);
tmp5744 = _mm512_fmadd_ps(tmp5742, _mm512_set1_ps(5.25e+00f), tmp5744);
tmp5749 = _mm512_fmadd_ps(tmp5747, _mm512_set1_ps(5.25e+00f), tmp5749);
tmp5742 = _mm512_fmadd_ps(in810, _mm512_set1_ps(2.5e-01f), in814);
tmp5747 = _mm512_fmadd_ps(in817, _mm512_set1_ps(2.5e-01f), in821);
in810 = _mm512_fmadd_ps(in810, _mm512_set1_ps(4e+00f), in814);
in817 = _mm512_fmadd_ps(in817, _mm512_set1_ps(4e+00f), in821);
__m512 tmp5745 = _mm512_sub_ps(tmp5743, tmp5741);
__m512 tmp5750 = _mm512_sub_ps(tmp5748, tmp5746);
tmp5743 = _mm512_add_ps(tmp5741, tmp5743);
tmp5748 = _mm512_add_ps(tmp5746, tmp5748);
tmp5741 = _mm512_fmadd_ps(in809, _mm512_set1_ps(2.5e-01f), in813);
tmp5746 = _mm512_fmadd_ps(in816, _mm512_set1_ps(2.5e-01f), in820);
tmp5742 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-1.25e+00f), tmp5742);
tmp5747 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-1.25e+00f), tmp5747);
in812 = _mm512_fmadd_ps(in812, _mm512_set1_ps(-5e+00f), in810);
in819 = _mm512_fmadd_ps(in819, _mm512_set1_ps(-5e+00f), in817);
tmp5741 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-1.25e+00f), tmp5741);
tmp5746 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-1.25e+00f), tmp5746);
in814 = _mm512_fmadd_ps(tmp5741, _mm512_set1_ps(2e+00f), tmp5742);
in821 = _mm512_fmadd_ps(tmp5746, _mm512_set1_ps(2e+00f), tmp5747);
tmp5742 = _mm512_fnmadd_ps(tmp5741, _mm512_set1_ps(2e+00f), tmp5742);
tmp5747 = _mm512_fnmadd_ps(tmp5746, _mm512_set1_ps(2e+00f), tmp5747);
tmp5741 = _mm512_fmadd_ps(in813, _mm512_set1_ps(2.5e-01f), in809);
tmp5746 = _mm512_fmadd_ps(in820, _mm512_set1_ps(2.5e-01f), in816);
in809 = _mm512_sub_ps(in815, in809);
in816 = _mm512_sub_ps(in822, in816);
tmp5741 = _mm512_fmadd_ps(in811, _mm512_set1_ps(-1.25e+00f), tmp5741);
tmp5746 = _mm512_fmadd_ps(in818, _mm512_set1_ps(-1.25e+00f), tmp5746);
in811 = _mm512_sub_ps(in811, in813);
in818 = _mm512_sub_ps(in818, in820);
in811 = _mm512_fmadd_ps(in811, _mm512_set1_ps(5.25e+00f), in809);
in818 = _mm512_fmadd_ps(in818, _mm512_set1_ps(5.25e+00f), in816);
in810 = _mm512_fmadd_ps(tmp5741, _mm512_set1_ps(2e+00f), in812);
in817 = _mm512_fmadd_ps(tmp5746, _mm512_set1_ps(2e+00f), in819);
in812 = _mm512_fnmadd_ps(tmp5741, _mm512_set1_ps(2e+00f), in812);
in819 = _mm512_fnmadd_ps(tmp5746, _mm512_set1_ps(2e+00f), in819);
// Interleave network over the 16 first-pass vectors, in four rounds:
//   1. unpacklo/unpackhi interleave 32-bit elements of adjacent vector pairs;
//   2. shuffle_ps 68 (0x44) / 238 (0xEE) take the low / high 64-bit halves of each 128-bit lane;
//   3. shuffle_f32x4 136 (0x88) / 221 (0xDD) take the even / odd 128-bit lanes of both operands;
//   4. a second shuffle_f32x4 round merges the two groups and writes back into the
//      first-pass variable names.
// NOTE(review): this has the usual shape of an in-register transpose between two 1-D passes -- confirm.
__m512 tmp5759 = _mm512_unpacklo_ps(tmp5744, tmp5743);
__m512 tmp5760 = _mm512_unpackhi_ps(tmp5744, tmp5743);
__m512 tmp5761 = _mm512_unpacklo_ps(tmp5745, in814);
__m512 tmp5762 = _mm512_unpackhi_ps(tmp5745, in814);
__m512 tmp5763 = _mm512_unpacklo_ps(tmp5742, in810);
__m512 tmp5764 = _mm512_unpackhi_ps(tmp5742, in810);
__m512 tmp5765 = _mm512_unpacklo_ps(in812, in811);
__m512 tmp5766 = _mm512_unpackhi_ps(in812, in811);
__m512 tmp5767 = _mm512_unpacklo_ps(tmp5749, tmp5748);
__m512 tmp5768 = _mm512_unpackhi_ps(tmp5749, tmp5748);
__m512 tmp5769 = _mm512_unpacklo_ps(tmp5750, in821);
__m512 tmp5770 = _mm512_unpackhi_ps(tmp5750, in821);
__m512 tmp5771 = _mm512_unpacklo_ps(tmp5747, in817);
__m512 tmp5772 = _mm512_unpackhi_ps(tmp5747, in817);
__m512 tmp5773 = _mm512_unpacklo_ps(in819, in818);
__m512 tmp5774 = _mm512_unpackhi_ps(in819, in818);
__m512 tmp5775 = _mm512_shuffle_ps(tmp5759, tmp5761, 68);
__m512 tmp5776 = _mm512_shuffle_ps(tmp5759, tmp5761, 238);
__m512 tmp5777 = _mm512_shuffle_ps(tmp5760, tmp5762, 68);
__m512 tmp5778 = _mm512_shuffle_ps(tmp5760, tmp5762, 238);
__m512 tmp5779 = _mm512_shuffle_ps(tmp5763, tmp5765, 68);
__m512 tmp5780 = _mm512_shuffle_ps(tmp5763, tmp5765, 238);
__m512 tmp5781 = _mm512_shuffle_ps(tmp5764, tmp5766, 68);
__m512 tmp5782 = _mm512_shuffle_ps(tmp5764, tmp5766, 238);
__m512 tmp5783 = _mm512_shuffle_ps(tmp5767, tmp5769, 68);
__m512 tmp5784 = _mm512_shuffle_ps(tmp5767, tmp5769, 238);
__m512 tmp5785 = _mm512_shuffle_ps(tmp5768, tmp5770, 68);
__m512 tmp5786 = _mm512_shuffle_ps(tmp5768, tmp5770, 238);
__m512 tmp5787 = _mm512_shuffle_ps(tmp5771, tmp5773, 68);
__m512 tmp5788 = _mm512_shuffle_ps(tmp5771, tmp5773, 238);
__m512 tmp5789 = _mm512_shuffle_ps(tmp5772, tmp5774, 68);
__m512 tmp5790 = _mm512_shuffle_ps(tmp5772, tmp5774, 238);
__m512 tmp5791 = _mm512_shuffle_f32x4(tmp5775, tmp5779, 136);
__m512 tmp5792 = _mm512_shuffle_f32x4(tmp5775, tmp5779, 221);
__m512 tmp5793 = _mm512_shuffle_f32x4(tmp5776, tmp5780, 136);
__m512 tmp5794 = _mm512_shuffle_f32x4(tmp5776, tmp5780, 221);
__m512 tmp5795 = _mm512_shuffle_f32x4(tmp5777, tmp5781, 136);
__m512 tmp5796 = _mm512_shuffle_f32x4(tmp5777, tmp5781, 221);
__m512 tmp5797 = _mm512_shuffle_f32x4(tmp5778, tmp5782, 136);
__m512 tmp5798 = _mm512_shuffle_f32x4(tmp5778, tmp5782, 221);
__m512 tmp5799 = _mm512_shuffle_f32x4(tmp5783, tmp5787, 136);
__m512 tmp5800 = _mm512_shuffle_f32x4(tmp5783, tmp5787, 221);
__m512 tmp5801 = _mm512_shuffle_f32x4(tmp5784, tmp5788, 136);
__m512 tmp5802 = _mm512_shuffle_f32x4(tmp5784, tmp5788, 221);
__m512 tmp5803 = _mm512_shuffle_f32x4(tmp5785, tmp5789, 136);
__m512 tmp5804 = _mm512_shuffle_f32x4(tmp5785, tmp5789, 221);
__m512 tmp5805 = _mm512_shuffle_f32x4(tmp5786, tmp5790, 136);
__m512 tmp5806 = _mm512_shuffle_f32x4(tmp5786, tmp5790, 221);
// Final round: reuse the first-pass variable names for the rearranged vectors.
tmp5744 = _mm512_shuffle_f32x4(tmp5791, tmp5799, 136);
tmp5749 = _mm512_shuffle_f32x4(tmp5791, tmp5799, 221);
tmp5743 = _mm512_shuffle_f32x4(tmp5793, tmp5801, 136);
tmp5748 = _mm512_shuffle_f32x4(tmp5793, tmp5801, 221);
tmp5745 = _mm512_shuffle_f32x4(tmp5795, tmp5803, 136);
tmp5750 = _mm512_shuffle_f32x4(tmp5795, tmp5803, 221);
in814 = _mm512_shuffle_f32x4(tmp5797, tmp5805, 136);
in821 = _mm512_shuffle_f32x4(tmp5797, tmp5805, 221);
tmp5742 = _mm512_shuffle_f32x4(tmp5792, tmp5800, 136);
tmp5747 = _mm512_shuffle_f32x4(tmp5792, tmp5800, 221);
in810 = _mm512_shuffle_f32x4(tmp5794, tmp5802, 136);
in817 = _mm512_shuffle_f32x4(tmp5794, tmp5802, 221);
in812 = _mm512_shuffle_f32x4(tmp5796, tmp5804, 136);
in819 = _mm512_shuffle_f32x4(tmp5796, tmp5804, 221);
in811 = _mm512_shuffle_f32x4(tmp5798, tmp5806, 136);
in818 = _mm512_shuffle_f32x4(tmp5798, tmp5806, 221);
// Second arithmetic pass over the rearranged vectors, using the same constants as the
// first pass (-4.25, 5.25, 0.25, 4, -1.25, -5, 2). All 8 inputs per group now carry data,
// so the leading term is a real difference (tmp5744 - in812) rather than 0 - x.
// Results for group one: tmp5744, tmp5753, tmp5754, in812, tmp5752, tmp5745, tmp5742, in814;
// for group two: tmp5749, tmp5757, tmp5758, in819, tmp5756, tmp5750, tmp5747, in821.
// Inputs are overwritten in place, so statement order matters.
__m512 tmp5751 = _mm512_add_ps(tmp5743, in810);
__m512 tmp5755 = _mm512_add_ps(tmp5748, in817);
__m512 tmp5752 = _mm512_sub_ps(tmp5742, tmp5745);
__m512 tmp5756 = _mm512_sub_ps(tmp5747, tmp5750);
__m512 tmp5753 = _mm512_add_ps(tmp5745, in812);
__m512 tmp5757 = _mm512_add_ps(tmp5750, in819);
tmp5744 = _mm512_sub_ps(tmp5744, in812);
tmp5749 = _mm512_sub_ps(tmp5749, in819);
tmp5751 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-4.25e+00f), tmp5751);
tmp5755 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-4.25e+00f), tmp5755);
tmp5753 = _mm512_fmadd_ps(tmp5742, _mm512_set1_ps(-4.25e+00f), tmp5753);
tmp5757 = _mm512_fmadd_ps(tmp5747, _mm512_set1_ps(-4.25e+00f), tmp5757);
tmp5744 = _mm512_fmadd_ps(tmp5752, _mm512_set1_ps(5.25e+00f), tmp5744);
tmp5749 = _mm512_fmadd_ps(tmp5756, _mm512_set1_ps(5.25e+00f), tmp5749);
tmp5752 = _mm512_fmadd_ps(tmp5745, _mm512_set1_ps(2.5e-01f), in812);
tmp5756 = _mm512_fmadd_ps(tmp5750, _mm512_set1_ps(2.5e-01f), in819);
tmp5745 = _mm512_fmadd_ps(tmp5745, _mm512_set1_ps(4e+00f), in812);
tmp5750 = _mm512_fmadd_ps(tmp5750, _mm512_set1_ps(4e+00f), in819);
__m512 tmp5754 = _mm512_sub_ps(tmp5753, tmp5751);
__m512 tmp5758 = _mm512_sub_ps(tmp5757, tmp5755);
tmp5753 = _mm512_add_ps(tmp5751, tmp5753);
tmp5757 = _mm512_add_ps(tmp5755, tmp5757);
tmp5751 = _mm512_fmadd_ps(tmp5743, _mm512_set1_ps(2.5e-01f), in810);
tmp5755 = _mm512_fmadd_ps(tmp5748, _mm512_set1_ps(2.5e-01f), in817);
tmp5752 = _mm512_fmadd_ps(tmp5742, _mm512_set1_ps(-1.25e+00f), tmp5752);
tmp5756 = _mm512_fmadd_ps(tmp5747, _mm512_set1_ps(-1.25e+00f), tmp5756);
tmp5742 = _mm512_fmadd_ps(tmp5742, _mm512_set1_ps(-5e+00f), tmp5745);
tmp5747 = _mm512_fmadd_ps(tmp5747, _mm512_set1_ps(-5e+00f), tmp5750);
tmp5751 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-1.25e+00f), tmp5751);
tmp5755 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-1.25e+00f), tmp5755);
in812 = _mm512_fmadd_ps(tmp5751, _mm512_set1_ps(2e+00f), tmp5752);
in819 = _mm512_fmadd_ps(tmp5755, _mm512_set1_ps(2e+00f), tmp5756);
tmp5752 = _mm512_fnmadd_ps(tmp5751, _mm512_set1_ps(2e+00f), tmp5752);
tmp5756 = _mm512_fnmadd_ps(tmp5755, _mm512_set1_ps(2e+00f), tmp5756);
tmp5751 = _mm512_fmadd_ps(in810, _mm512_set1_ps(2.5e-01f), tmp5743);
tmp5755 = _mm512_fmadd_ps(in817, _mm512_set1_ps(2.5e-01f), tmp5748);
tmp5743 = _mm512_sub_ps(in811, tmp5743);
tmp5748 = _mm512_sub_ps(in818, tmp5748);
tmp5751 = _mm512_fmadd_ps(in814, _mm512_set1_ps(-1.25e+00f), tmp5751);
tmp5755 = _mm512_fmadd_ps(in821, _mm512_set1_ps(-1.25e+00f), tmp5755);
in814 = _mm512_sub_ps(in814, in810);
in821 = _mm512_sub_ps(in821, in817);
in814 = _mm512_fmadd_ps(in814, _mm512_set1_ps(5.25e+00f), tmp5743);
in821 = _mm512_fmadd_ps(in821, _mm512_set1_ps(5.25e+00f), tmp5748);
tmp5745 = _mm512_fmadd_ps(tmp5751, _mm512_set1_ps(2e+00f), tmp5742);
tmp5750 = _mm512_fmadd_ps(tmp5755, _mm512_set1_ps(2e+00f), tmp5747);
tmp5742 = _mm512_fnmadd_ps(tmp5751, _mm512_set1_ps(2e+00f), tmp5742);
tmp5747 = _mm512_fnmadd_ps(tmp5755, _mm512_set1_ps(2e+00f), tmp5747);
// Pack and store. shuffle_f32x4 with 68 (0x44) joins the low 256 bits of both operands;
// 238 (0xEE) joins their high 256 bits, so each out vector pairs halves of two second-pass results.
__m512 out755 = _mm512_shuffle_f32x4(tmp5744, tmp5753, 68);
__m512 out763 = _mm512_shuffle_f32x4(tmp5744, tmp5753, 238);
__m512 out756 = _mm512_shuffle_f32x4(tmp5754, in812, 68);
__m512 out764 = _mm512_shuffle_f32x4(tmp5754, in812, 238);
__m512 out757 = _mm512_shuffle_f32x4(tmp5752, tmp5745, 68);
__m512 out765 = _mm512_shuffle_f32x4(tmp5752, tmp5745, 238);
__m512 out758 = _mm512_shuffle_f32x4(tmp5742, in814, 68);
__m512 out766 = _mm512_shuffle_f32x4(tmp5742, in814, 238);
__m512 out759 = _mm512_shuffle_f32x4(tmp5749, tmp5757, 68);
__m512 out767 = _mm512_shuffle_f32x4(tmp5749, tmp5757, 238);
__m512 out760 = _mm512_shuffle_f32x4(tmp5758, in819, 68);
__m512 out768 = _mm512_shuffle_f32x4(tmp5758, in819, 238);
__m512 out761 = _mm512_shuffle_f32x4(tmp5756, tmp5750, 68);
__m512 out769 = _mm512_shuffle_f32x4(tmp5756, tmp5750, 238);
__m512 out762 = _mm512_shuffle_f32x4(tmp5747, in821, 68);
__m512 out770 = _mm512_shuffle_f32x4(tmp5747, in821, 238);
// Sixteen unaligned stores into dfPtr6: four regions 25600 apart (0, 25600, 51200, 76800),
// each receiving four vectors at +0, +128, +64, +192 (issued in that order).
// The shared address term is 102400*i27+1536*j21+1536*s19+768*k80.
// NOTE(review): the 64 spacing equals one 512-bit vector if dfPtr6 is byte-addressed -- confirm.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k80, out755);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k80, out763);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k80, out759);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k80, out767);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k80, out756);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k80, out764);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k80, out760);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k80, out768);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k80, out757);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k80, out765);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k80, out761);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k80, out769);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k80, out758);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k80, out766);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k80, out762);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k80, out770);
// Load stage for the next pair of groups.
// Group one (in823..in830): 8 loads at constant offsets 1204, 1428, ..., 2772 (step 224),
// mask 8191 (0x1FFF) keeps lanes 0..12 and zeroes lanes 13..15.
// Group two (in831..in837): 7 loads at constant offsets 12832, 13056, ..., 14176 (step 224),
// mask 16383 (0x3FFF) keeps lanes 0..13.
// All addresses share 50432*i27+224*h30+4*w37+50432*s19+25216*k80.
__m512 dat1347 = _mm512_maskz_loadu_ps(8191, datPtr12+1204+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
// pm117: result lane 0 <- source lane 15 (zeroed by the 8191 mask), lanes 1..7 <- source lanes 0..6,
// lanes 8..15 <- source lanes 5..12.
__m512i pm117 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in823 = _mm512_permutexvar_ps(pm117, dat1347);
__m512 dat1348 = _mm512_maskz_loadu_ps(8191, datPtr12+1428+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1349 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in824 = _mm512_permutexvar_ps(pm117, dat1348);
// pm118: result lanes 0..7 <- source lanes 0..7, lanes 8..15 <- source lanes 6..13.
__m512i pm118 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in831 = _mm512_permutexvar_ps(pm118, dat1349);
__m512 dat1350 = _mm512_maskz_loadu_ps(8191, datPtr12+1652+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1351 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in825 = _mm512_permutexvar_ps(pm117, dat1350);
__m512 in832 = _mm512_permutexvar_ps(pm118, dat1351);
__m512 dat1352 = _mm512_maskz_loadu_ps(8191, datPtr12+1876+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1353 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in826 = _mm512_permutexvar_ps(pm117, dat1352);
__m512 in833 = _mm512_permutexvar_ps(pm118, dat1353);
__m512 dat1354 = _mm512_maskz_loadu_ps(8191, datPtr12+2100+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1355 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in827 = _mm512_permutexvar_ps(pm117, dat1354);
__m512 in834 = _mm512_permutexvar_ps(pm118, dat1355);
__m512 dat1356 = _mm512_maskz_loadu_ps(8191, datPtr12+2324+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1357 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in828 = _mm512_permutexvar_ps(pm117, dat1356);
__m512 in835 = _mm512_permutexvar_ps(pm118, dat1357);
__m512 dat1358 = _mm512_maskz_loadu_ps(8191, datPtr12+2548+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1359 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in829 = _mm512_permutexvar_ps(pm117, dat1358);
__m512 in836 = _mm512_permutexvar_ps(pm118, dat1359);
__m512 dat1360 = _mm512_maskz_loadu_ps(8191, datPtr12+2772+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1361 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in830 = _mm512_permutexvar_ps(pm117, dat1360);
__m512 in837 = _mm512_permutexvar_ps(pm118, dat1361);
// First arithmetic pass for in823..in830 (8 inputs) and in831..in837 (7 inputs), using the
// constants -4.25, 5.25, 0.25, 4, -1.25, -5 and 2. Statements alternate between the two groups.
// Results for group one: in823, tmp5809, tmp5810, in829, tmp5808, in825, in827, in826;
// for group two: tmp5814, tmp5813, tmp5815, in836, tmp5812, in832, in834, in833.
// Inputs are overwritten in place, so statement order matters.
__m512 tmp5807 = _mm512_add_ps(in824, in828);
__m512 tmp5811 = _mm512_add_ps(in831, in835);
__m512 tmp5808 = _mm512_sub_ps(in827, in825);
__m512 tmp5812 = _mm512_sub_ps(in834, in832);
__m512 tmp5809 = _mm512_add_ps(in825, in829);
__m512 tmp5813 = _mm512_add_ps(in832, in836);
// Group one has a loaded leading vector (in823); group two has none, so it starts from 0 - in836.
in823 = _mm512_sub_ps(in823, in829);
__m512 tmp5814 = _mm512_sub_ps(_mm512_setzero_ps(), in836);
tmp5807 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-4.25e+00f), tmp5807);
tmp5811 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-4.25e+00f), tmp5811);
tmp5809 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-4.25e+00f), tmp5809);
tmp5813 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-4.25e+00f), tmp5813);
in823 = _mm512_fmadd_ps(tmp5808, _mm512_set1_ps(5.25e+00f), in823);
tmp5814 = _mm512_fmadd_ps(tmp5812, _mm512_set1_ps(5.25e+00f), tmp5814);
tmp5808 = _mm512_fmadd_ps(in825, _mm512_set1_ps(2.5e-01f), in829);
tmp5812 = _mm512_fmadd_ps(in832, _mm512_set1_ps(2.5e-01f), in836);
in825 = _mm512_fmadd_ps(in825, _mm512_set1_ps(4e+00f), in829);
in832 = _mm512_fmadd_ps(in832, _mm512_set1_ps(4e+00f), in836);
__m512 tmp5810 = _mm512_sub_ps(tmp5809, tmp5807);
__m512 tmp5815 = _mm512_sub_ps(tmp5813, tmp5811);
tmp5809 = _mm512_add_ps(tmp5807, tmp5809);
tmp5813 = _mm512_add_ps(tmp5811, tmp5813);
tmp5807 = _mm512_fmadd_ps(in824, _mm512_set1_ps(2.5e-01f), in828);
tmp5811 = _mm512_fmadd_ps(in831, _mm512_set1_ps(2.5e-01f), in835);
tmp5808 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-1.25e+00f), tmp5808);
tmp5812 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-1.25e+00f), tmp5812);
in827 = _mm512_fmadd_ps(in827, _mm512_set1_ps(-5e+00f), in825);
in834 = _mm512_fmadd_ps(in834, _mm512_set1_ps(-5e+00f), in832);
tmp5807 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-1.25e+00f), tmp5807);
tmp5811 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-1.25e+00f), tmp5811);
in829 = _mm512_fmadd_ps(tmp5807, _mm512_set1_ps(2e+00f), tmp5808);
in836 = _mm512_fmadd_ps(tmp5811, _mm512_set1_ps(2e+00f), tmp5812);
tmp5808 = _mm512_fnmadd_ps(tmp5807, _mm512_set1_ps(2e+00f), tmp5808);
tmp5812 = _mm512_fnmadd_ps(tmp5811, _mm512_set1_ps(2e+00f), tmp5812);
tmp5807 = _mm512_fmadd_ps(in828, _mm512_set1_ps(2.5e-01f), in824);
tmp5811 = _mm512_fmadd_ps(in835, _mm512_set1_ps(2.5e-01f), in831);
in824 = _mm512_sub_ps(in830, in824);
in831 = _mm512_sub_ps(in837, in831);
tmp5807 = _mm512_fmadd_ps(in826, _mm512_set1_ps(-1.25e+00f), tmp5807);
tmp5811 = _mm512_fmadd_ps(in833, _mm512_set1_ps(-1.25e+00f), tmp5811);
in826 = _mm512_sub_ps(in826, in828);
in833 = _mm512_sub_ps(in833, in835);
in826 = _mm512_fmadd_ps(in826, _mm512_set1_ps(5.25e+00f), in824);
in833 = _mm512_fmadd_ps(in833, _mm512_set1_ps(5.25e+00f), in831);
in825 = _mm512_fmadd_ps(tmp5807, _mm512_set1_ps(2e+00f), in827);
in832 = _mm512_fmadd_ps(tmp5811, _mm512_set1_ps(2e+00f), in834);
in827 = _mm512_fnmadd_ps(tmp5807, _mm512_set1_ps(2e+00f), in827);
in834 = _mm512_fnmadd_ps(tmp5811, _mm512_set1_ps(2e+00f), in834);
// Interleave network over the 16 first-pass vectors of this tile:
// 32-bit unpacklo/unpackhi, then shuffle_ps 68/238 (low/high 64-bit halves per 128-bit lane),
// then two rounds of shuffle_f32x4 136/221 (even/odd 128-bit lanes of both operands).
// The last round writes back into the first-pass variable names.
// NOTE(review): shaped like an in-register transpose between the two arithmetic passes -- confirm.
__m512 tmp5824 = _mm512_unpacklo_ps(in823, tmp5809);
__m512 tmp5825 = _mm512_unpackhi_ps(in823, tmp5809);
__m512 tmp5826 = _mm512_unpacklo_ps(tmp5810, in829);
__m512 tmp5827 = _mm512_unpackhi_ps(tmp5810, in829);
__m512 tmp5828 = _mm512_unpacklo_ps(tmp5808, in825);
__m512 tmp5829 = _mm512_unpackhi_ps(tmp5808, in825);
__m512 tmp5830 = _mm512_unpacklo_ps(in827, in826);
__m512 tmp5831 = _mm512_unpackhi_ps(in827, in826);
__m512 tmp5832 = _mm512_unpacklo_ps(tmp5814, tmp5813);
__m512 tmp5833 = _mm512_unpackhi_ps(tmp5814, tmp5813);
__m512 tmp5834 = _mm512_unpacklo_ps(tmp5815, in836);
__m512 tmp5835 = _mm512_unpackhi_ps(tmp5815, in836);
__m512 tmp5836 = _mm512_unpacklo_ps(tmp5812, in832);
__m512 tmp5837 = _mm512_unpackhi_ps(tmp5812, in832);
__m512 tmp5838 = _mm512_unpacklo_ps(in834, in833);
__m512 tmp5839 = _mm512_unpackhi_ps(in834, in833);
__m512 tmp5840 = _mm512_shuffle_ps(tmp5824, tmp5826, 68);
__m512 tmp5841 = _mm512_shuffle_ps(tmp5824, tmp5826, 238);
__m512 tmp5842 = _mm512_shuffle_ps(tmp5825, tmp5827, 68);
__m512 tmp5843 = _mm512_shuffle_ps(tmp5825, tmp5827, 238);
__m512 tmp5844 = _mm512_shuffle_ps(tmp5828, tmp5830, 68);
__m512 tmp5845 = _mm512_shuffle_ps(tmp5828, tmp5830, 238);
__m512 tmp5846 = _mm512_shuffle_ps(tmp5829, tmp5831, 68);
__m512 tmp5847 = _mm512_shuffle_ps(tmp5829, tmp5831, 238);
__m512 tmp5848 = _mm512_shuffle_ps(tmp5832, tmp5834, 68);
__m512 tmp5849 = _mm512_shuffle_ps(tmp5832, tmp5834, 238);
__m512 tmp5850 = _mm512_shuffle_ps(tmp5833, tmp5835, 68);
__m512 tmp5851 = _mm512_shuffle_ps(tmp5833, tmp5835, 238);
__m512 tmp5852 = _mm512_shuffle_ps(tmp5836, tmp5838, 68);
__m512 tmp5853 = _mm512_shuffle_ps(tmp5836, tmp5838, 238);
__m512 tmp5854 = _mm512_shuffle_ps(tmp5837, tmp5839, 68);
__m512 tmp5855 = _mm512_shuffle_ps(tmp5837, tmp5839, 238);
__m512 tmp5856 = _mm512_shuffle_f32x4(tmp5840, tmp5844, 136);
__m512 tmp5857 = _mm512_shuffle_f32x4(tmp5840, tmp5844, 221);
__m512 tmp5858 = _mm512_shuffle_f32x4(tmp5841, tmp5845, 136);
__m512 tmp5859 = _mm512_shuffle_f32x4(tmp5841, tmp5845, 221);
__m512 tmp5860 = _mm512_shuffle_f32x4(tmp5842, tmp5846, 136);
__m512 tmp5861 = _mm512_shuffle_f32x4(tmp5842, tmp5846, 221);
__m512 tmp5862 = _mm512_shuffle_f32x4(tmp5843, tmp5847, 136);
__m512 tmp5863 = _mm512_shuffle_f32x4(tmp5843, tmp5847, 221);
__m512 tmp5864 = _mm512_shuffle_f32x4(tmp5848, tmp5852, 136);
__m512 tmp5865 = _mm512_shuffle_f32x4(tmp5848, tmp5852, 221);
__m512 tmp5866 = _mm512_shuffle_f32x4(tmp5849, tmp5853, 136);
__m512 tmp5867 = _mm512_shuffle_f32x4(tmp5849, tmp5853, 221);
__m512 tmp5868 = _mm512_shuffle_f32x4(tmp5850, tmp5854, 136);
__m512 tmp5869 = _mm512_shuffle_f32x4(tmp5850, tmp5854, 221);
__m512 tmp5870 = _mm512_shuffle_f32x4(tmp5851, tmp5855, 136);
__m512 tmp5871 = _mm512_shuffle_f32x4(tmp5851, tmp5855, 221);
// Final round: reuse the first-pass variable names for the rearranged vectors.
in823 = _mm512_shuffle_f32x4(tmp5856, tmp5864, 136);
tmp5814 = _mm512_shuffle_f32x4(tmp5856, tmp5864, 221);
tmp5809 = _mm512_shuffle_f32x4(tmp5858, tmp5866, 136);
tmp5813 = _mm512_shuffle_f32x4(tmp5858, tmp5866, 221);
tmp5810 = _mm512_shuffle_f32x4(tmp5860, tmp5868, 136);
tmp5815 = _mm512_shuffle_f32x4(tmp5860, tmp5868, 221);
in829 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 136);
in836 = _mm512_shuffle_f32x4(tmp5862, tmp5870, 221);
tmp5808 = _mm512_shuffle_f32x4(tmp5857, tmp5865, 136);
tmp5812 = _mm512_shuffle_f32x4(tmp5857, tmp5865, 221);
in825 = _mm512_shuffle_f32x4(tmp5859, tmp5867, 136);
in832 = _mm512_shuffle_f32x4(tmp5859, tmp5867, 221);
in827 = _mm512_shuffle_f32x4(tmp5861, tmp5869, 136);
in834 = _mm512_shuffle_f32x4(tmp5861, tmp5869, 221);
in826 = _mm512_shuffle_f32x4(tmp5863, tmp5871, 136);
in833 = _mm512_shuffle_f32x4(tmp5863, tmp5871, 221);
// Second arithmetic pass over the rearranged vectors of this tile, with the same constants
// (-4.25, 5.25, 0.25, 4, -1.25, -5, 2). Both groups now start from a real difference
// (in823 - in827 and tmp5814 - in834).
// Results for group one: in823, tmp5818, tmp5819, in827, tmp5817, tmp5810, tmp5808, in829;
// for group two: tmp5814, tmp5822, tmp5823, in834, tmp5821, tmp5815, tmp5812, in836.
// Inputs are overwritten in place, so statement order matters.
__m512 tmp5816 = _mm512_add_ps(tmp5809, in825);
__m512 tmp5820 = _mm512_add_ps(tmp5813, in832);
__m512 tmp5817 = _mm512_sub_ps(tmp5808, tmp5810);
__m512 tmp5821 = _mm512_sub_ps(tmp5812, tmp5815);
__m512 tmp5818 = _mm512_add_ps(tmp5810, in827);
__m512 tmp5822 = _mm512_add_ps(tmp5815, in834);
in823 = _mm512_sub_ps(in823, in827);
tmp5814 = _mm512_sub_ps(tmp5814, in834);
tmp5816 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-4.25e+00f), tmp5816);
tmp5820 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-4.25e+00f), tmp5820);
tmp5818 = _mm512_fmadd_ps(tmp5808, _mm512_set1_ps(-4.25e+00f), tmp5818);
tmp5822 = _mm512_fmadd_ps(tmp5812, _mm512_set1_ps(-4.25e+00f), tmp5822);
in823 = _mm512_fmadd_ps(tmp5817, _mm512_set1_ps(5.25e+00f), in823);
tmp5814 = _mm512_fmadd_ps(tmp5821, _mm512_set1_ps(5.25e+00f), tmp5814);
tmp5817 = _mm512_fmadd_ps(tmp5810, _mm512_set1_ps(2.5e-01f), in827);
tmp5821 = _mm512_fmadd_ps(tmp5815, _mm512_set1_ps(2.5e-01f), in834);
tmp5810 = _mm512_fmadd_ps(tmp5810, _mm512_set1_ps(4e+00f), in827);
tmp5815 = _mm512_fmadd_ps(tmp5815, _mm512_set1_ps(4e+00f), in834);
__m512 tmp5819 = _mm512_sub_ps(tmp5818, tmp5816);
__m512 tmp5823 = _mm512_sub_ps(tmp5822, tmp5820);
tmp5818 = _mm512_add_ps(tmp5816, tmp5818);
tmp5822 = _mm512_add_ps(tmp5820, tmp5822);
tmp5816 = _mm512_fmadd_ps(tmp5809, _mm512_set1_ps(2.5e-01f), in825);
tmp5820 = _mm512_fmadd_ps(tmp5813, _mm512_set1_ps(2.5e-01f), in832);
tmp5817 = _mm512_fmadd_ps(tmp5808, _mm512_set1_ps(-1.25e+00f), tmp5817);
tmp5821 = _mm512_fmadd_ps(tmp5812, _mm512_set1_ps(-1.25e+00f), tmp5821);
tmp5808 = _mm512_fmadd_ps(tmp5808, _mm512_set1_ps(-5e+00f), tmp5810);
tmp5812 = _mm512_fmadd_ps(tmp5812, _mm512_set1_ps(-5e+00f), tmp5815);
tmp5816 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-1.25e+00f), tmp5816);
tmp5820 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-1.25e+00f), tmp5820);
in827 = _mm512_fmadd_ps(tmp5816, _mm512_set1_ps(2e+00f), tmp5817);
in834 = _mm512_fmadd_ps(tmp5820, _mm512_set1_ps(2e+00f), tmp5821);
tmp5817 = _mm512_fnmadd_ps(tmp5816, _mm512_set1_ps(2e+00f), tmp5817);
tmp5821 = _mm512_fnmadd_ps(tmp5820, _mm512_set1_ps(2e+00f), tmp5821);
tmp5816 = _mm512_fmadd_ps(in825, _mm512_set1_ps(2.5e-01f), tmp5809);
tmp5820 = _mm512_fmadd_ps(in832, _mm512_set1_ps(2.5e-01f), tmp5813);
tmp5809 = _mm512_sub_ps(in826, tmp5809);
tmp5813 = _mm512_sub_ps(in833, tmp5813);
tmp5816 = _mm512_fmadd_ps(in829, _mm512_set1_ps(-1.25e+00f), tmp5816);
tmp5820 = _mm512_fmadd_ps(in836, _mm512_set1_ps(-1.25e+00f), tmp5820);
in829 = _mm512_sub_ps(in829, in825);
in836 = _mm512_sub_ps(in836, in832);
in829 = _mm512_fmadd_ps(in829, _mm512_set1_ps(5.25e+00f), tmp5809);
in836 = _mm512_fmadd_ps(in836, _mm512_set1_ps(5.25e+00f), tmp5813);
tmp5810 = _mm512_fmadd_ps(tmp5816, _mm512_set1_ps(2e+00f), tmp5808);
tmp5815 = _mm512_fmadd_ps(tmp5820, _mm512_set1_ps(2e+00f), tmp5812);
tmp5808 = _mm512_fnmadd_ps(tmp5816, _mm512_set1_ps(2e+00f), tmp5808);
tmp5812 = _mm512_fnmadd_ps(tmp5820, _mm512_set1_ps(2e+00f), tmp5812);
// Pack and store for this tile. shuffle_f32x4 with 68 (0x44) joins the low 256 bits of both
// operands; 238 (0xEE) joins their high 256 bits.
__m512 out771 = _mm512_shuffle_f32x4(in823, tmp5818, 68);
__m512 out779 = _mm512_shuffle_f32x4(in823, tmp5818, 238);
__m512 out772 = _mm512_shuffle_f32x4(tmp5819, in827, 68);
__m512 out780 = _mm512_shuffle_f32x4(tmp5819, in827, 238);
__m512 out773 = _mm512_shuffle_f32x4(tmp5817, tmp5810, 68);
__m512 out781 = _mm512_shuffle_f32x4(tmp5817, tmp5810, 238);
__m512 out774 = _mm512_shuffle_f32x4(tmp5808, in829, 68);
__m512 out782 = _mm512_shuffle_f32x4(tmp5808, in829, 238);
__m512 out775 = _mm512_shuffle_f32x4(tmp5814, tmp5822, 68);
__m512 out783 = _mm512_shuffle_f32x4(tmp5814, tmp5822, 238);
__m512 out776 = _mm512_shuffle_f32x4(tmp5823, in834, 68);
__m512 out784 = _mm512_shuffle_f32x4(tmp5823, in834, 238);
__m512 out777 = _mm512_shuffle_f32x4(tmp5821, tmp5815, 68);
__m512 out785 = _mm512_shuffle_f32x4(tmp5821, tmp5815, 238);
__m512 out778 = _mm512_shuffle_f32x4(tmp5812, in836, 68);
__m512 out786 = _mm512_shuffle_f32x4(tmp5812, in836, 238);
// Sixteen unaligned stores into dfPtr6: four regions 25600 apart (256, 25856, 51456, 77056),
// each receiving four vectors at +0, +128, +64, +192 relative to the region start.
// The shared address term is 102400*i27+1536*j21+1536*s19+768*k80.
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k80, out771);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k80, out779);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k80, out775);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k80, out783);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k80, out772);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k80, out780);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k80, out776);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k80, out784);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k80, out773);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k80, out781);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k80, out777);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k80, out785);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k80, out774);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k80, out782);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k80, out778);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k80, out786);
// Load stage for the next pair of groups.
// Group one (in838..in844): 7 loads at constant offsets 12880, 13104, ..., 14224 (step 224),
// mask 511 (0x1FF) keeps lanes 0..8 and zeroes lanes 9..15.
// Group two (in845..in852): 8 loads at constant offsets 13812, 14036, ..., 15380 (step 224),
// mask 8191 (0x1FFF) keeps lanes 0..12 and zeroes lanes 13..15.
// All addresses share 50432*i27+224*h30+4*w37+50432*s19+25216*k80.
__m512 dat1362 = _mm512_maskz_loadu_ps(8191, datPtr12+13812+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
// pm119: result lane 0 <- source lane 15 (zeroed by the 8191 mask), lanes 1..7 <- source lanes 0..6,
// lanes 8..15 <- source lanes 5..12.
__m512i pm119 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in845 = _mm512_permutexvar_ps(pm119, dat1362);
__m512 dat1363 = _mm512_maskz_loadu_ps(511, datPtr12+12880+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1364 = _mm512_maskz_loadu_ps(8191, datPtr12+14036+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
// pm120: result lanes 0..7 <- source lanes 0..7, lanes 8..10 <- source lanes 6..8,
// lanes 11..15 <- source lane 15, which the 511 mask has already zeroed.
__m512i pm120 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in838 = _mm512_permutexvar_ps(pm120, dat1363);
__m512 in846 = _mm512_permutexvar_ps(pm119, dat1364);
__m512 dat1365 = _mm512_maskz_loadu_ps(511, datPtr12+13104+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1366 = _mm512_maskz_loadu_ps(8191, datPtr12+14260+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in839 = _mm512_permutexvar_ps(pm120, dat1365);
__m512 in847 = _mm512_permutexvar_ps(pm119, dat1366);
__m512 dat1367 = _mm512_maskz_loadu_ps(511, datPtr12+13328+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1368 = _mm512_maskz_loadu_ps(8191, datPtr12+14484+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in840 = _mm512_permutexvar_ps(pm120, dat1367);
__m512 in848 = _mm512_permutexvar_ps(pm119, dat1368);
__m512 dat1369 = _mm512_maskz_loadu_ps(511, datPtr12+13552+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1370 = _mm512_maskz_loadu_ps(8191, datPtr12+14708+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in841 = _mm512_permutexvar_ps(pm120, dat1369);
__m512 in849 = _mm512_permutexvar_ps(pm119, dat1370);
__m512 dat1371 = _mm512_maskz_loadu_ps(511, datPtr12+13776+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1372 = _mm512_maskz_loadu_ps(8191, datPtr12+14932+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in842 = _mm512_permutexvar_ps(pm120, dat1371);
__m512 in850 = _mm512_permutexvar_ps(pm119, dat1372);
__m512 dat1373 = _mm512_maskz_loadu_ps(511, datPtr12+14000+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1374 = _mm512_maskz_loadu_ps(8191, datPtr12+15156+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in843 = _mm512_permutexvar_ps(pm120, dat1373);
__m512 in851 = _mm512_permutexvar_ps(pm119, dat1374);
__m512 dat1375 = _mm512_maskz_loadu_ps(511, datPtr12+14224+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 dat1376 = _mm512_maskz_loadu_ps(8191, datPtr12+15380+50432*i27+224*h30+4*w37+50432*s19+25216*k80);
__m512 in844 = _mm512_permutexvar_ps(pm120, dat1375);
__m512 in852 = _mm512_permutexvar_ps(pm119, dat1376);
// First arithmetic pass for in838..in844 (7 inputs) and in845..in852 (8 inputs), using the
// constants -4.25, 5.25, 0.25, 4, -1.25, -5 and 2. Statements alternate between the two groups.
// Results for group one: tmp5875, tmp5874, tmp5876, in843, tmp5873, in839, in841, in840;
// for group two: in845, tmp5879, tmp5880, in851, tmp5878, in847, in849, in848.
// Inputs are overwritten in place, so statement order matters.
__m512 tmp5872 = _mm512_add_ps(in838, in842);
__m512 tmp5877 = _mm512_add_ps(in846, in850);
__m512 tmp5873 = _mm512_sub_ps(in841, in839);
__m512 tmp5878 = _mm512_sub_ps(in849, in847);
__m512 tmp5874 = _mm512_add_ps(in839, in843);
__m512 tmp5879 = _mm512_add_ps(in847, in851);
// Group one has no loaded leading vector, so it starts from 0 - in843; group two uses in845.
__m512 tmp5875 = _mm512_sub_ps(_mm512_setzero_ps(), in843);
in845 = _mm512_sub_ps(in845, in851);
tmp5872 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-4.25e+00f), tmp5872);
tmp5877 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-4.25e+00f), tmp5877);
tmp5874 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-4.25e+00f), tmp5874);
tmp5879 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-4.25e+00f), tmp5879);
tmp5875 = _mm512_fmadd_ps(tmp5873, _mm512_set1_ps(5.25e+00f), tmp5875);
in845 = _mm512_fmadd_ps(tmp5878, _mm512_set1_ps(5.25e+00f), in845);
tmp5873 = _mm512_fmadd_ps(in839, _mm512_set1_ps(2.5e-01f), in843);
tmp5878 = _mm512_fmadd_ps(in847, _mm512_set1_ps(2.5e-01f), in851);
in839 = _mm512_fmadd_ps(in839, _mm512_set1_ps(4e+00f), in843);
in847 = _mm512_fmadd_ps(in847, _mm512_set1_ps(4e+00f), in851);
__m512 tmp5876 = _mm512_sub_ps(tmp5874, tmp5872);
__m512 tmp5880 = _mm512_sub_ps(tmp5879, tmp5877);
tmp5874 = _mm512_add_ps(tmp5872, tmp5874);
tmp5879 = _mm512_add_ps(tmp5877, tmp5879);
tmp5872 = _mm512_fmadd_ps(in838, _mm512_set1_ps(2.5e-01f), in842);
tmp5877 = _mm512_fmadd_ps(in846, _mm512_set1_ps(2.5e-01f), in850);
tmp5873 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-1.25e+00f), tmp5873);
tmp5878 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-1.25e+00f), tmp5878);
in841 = _mm512_fmadd_ps(in841, _mm512_set1_ps(-5e+00f), in839);
in849 = _mm512_fmadd_ps(in849, _mm512_set1_ps(-5e+00f), in847);
tmp5872 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-1.25e+00f), tmp5872);
tmp5877 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-1.25e+00f), tmp5877);
in843 = _mm512_fmadd_ps(tmp5872, _mm512_set1_ps(2e+00f), tmp5873);
in851 = _mm512_fmadd_ps(tmp5877, _mm512_set1_ps(2e+00f), tmp5878);
tmp5873 = _mm512_fnmadd_ps(tmp5872, _mm512_set1_ps(2e+00f), tmp5873);
tmp5878 = _mm512_fnmadd_ps(tmp5877, _mm512_set1_ps(2e+00f), tmp5878);
tmp5872 = _mm512_fmadd_ps(in842, _mm512_set1_ps(2.5e-01f), in838);
tmp5877 = _mm512_fmadd_ps(in850, _mm512_set1_ps(2.5e-01f), in846);
in838 = _mm512_sub_ps(in844, in838);
in846 = _mm512_sub_ps(in852, in846);
tmp5872 = _mm512_fmadd_ps(in840, _mm512_set1_ps(-1.25e+00f), tmp5872);
tmp5877 = _mm512_fmadd_ps(in848, _mm512_set1_ps(-1.25e+00f), tmp5877);
in840 = _mm512_sub_ps(in840, in842);
in848 = _mm512_sub_ps(in848, in850);
in840 = _mm512_fmadd_ps(in840, _mm512_set1_ps(5.25e+00f), in838);
in848 = _mm512_fmadd_ps(in848, _mm512_set1_ps(5.25e+00f), in846);
in839 = _mm512_fmadd_ps(tmp5872, _mm512_set1_ps(2e+00f), in841);
in847 = _mm512_fmadd_ps(tmp5877, _mm512_set1_ps(2e+00f), in849);
in841 = _mm512_fnmadd_ps(tmp5872, _mm512_set1_ps(2e+00f), in841);
in849 = _mm512_fnmadd_ps(tmp5877, _mm512_set1_ps(2e+00f), in849);
__m512 tmp5889 = _mm512_unpacklo_ps(tmp5875, tmp5874);
__m512 tmp5890 = _mm512_unpackhi_ps(tmp5875, tmp5874);
__m512 tmp5891 = _mm512_unpacklo_ps(tmp5876, in843);
__m512 tmp5892 = _mm512_unpackhi_ps(tmp5876, in843);
__m512 tmp5893 = _mm512_unpacklo_ps(tmp5873, in839);
__m512 tmp5894 = _mm512_unpackhi_ps(tmp5873, in839);
__m512 tmp5895 = _mm512_unpacklo_ps(in841, in840);
__m512 tmp5896 = _mm512_unpackhi_ps(in841, in840);
__m512 tmp5897 = _mm512_unpacklo_ps(in845, tmp5879);
__m512 tmp5898 = _mm512_unpackhi_ps(in845, tmp5879);
__m512 tmp5899 = _mm512_unpacklo_ps(tmp5880, in851);
__m512 tmp5900 = _mm512_unpackhi_ps(tmp5880, in851);
__m512 tmp5901 = _mm512_unpacklo_ps(tmp5878, in847);
__m512 tmp5902 = _mm512_unpackhi_ps(tmp5878, in847);
__m512 tmp5903 = _mm512_unpacklo_ps(in849, in848);
__m512 tmp5904 = _mm512_unpackhi_ps(in849, in848);
__m512 tmp5905 = _mm512_shuffle_ps(tmp5889, tmp5891, 68);
__m512 tmp5906 = _mm512_shuffle_ps(tmp5889, tmp5891, 238);
__m512 tmp5907 = _mm512_shuffle_ps(tmp5890, tmp5892, 68);
__m512 tmp5908 = _mm512_shuffle_ps(tmp5890, tmp5892, 238);
__m512 tmp5909 = _mm512_shuffle_ps(tmp5893, tmp5895, 68);
__m512 tmp5910 = _mm512_shuffle_ps(tmp5893, tmp5895, 238);
__m512 tmp5911 = _mm512_shuffle_ps(tmp5894, tmp5896, 68);
__m512 tmp5912 = _mm512_shuffle_ps(tmp5894, tmp5896, 238);
__m512 tmp5913 = _mm512_shuffle_ps(tmp5897, tmp5899, 68);
__m512 tmp5914 = _mm512_shuffle_ps(tmp5897, tmp5899, 238);
__m512 tmp5915 = _mm512_shuffle_ps(tmp5898, tmp5900, 68);
__m512 tmp5916 = _mm512_shuffle_ps(tmp5898, tmp5900, 238);
__m512 tmp5917 = _mm512_shuffle_ps(tmp5901, tmp5903, 68);
__m512 tmp5918 = _mm512_shuffle_ps(tmp5901, tmp5903, 238);
__m512 tmp5919 = _mm512_shuffle_ps(tmp5902, tmp5904, 68);
__m512 tmp5920 = _mm512_shuffle_ps(tmp5902, tmp5904, 238);
__m512 tmp5921 = _mm512_shuffle_f32x4(tmp5905, tmp5909, 136);
__m512 tmp5922 = _mm512_shuffle_f32x4(tmp5905, tmp5909, 221);
__m512 tmp5923 = _mm512_shuffle_f32x4(tmp5906, tmp5910, 136);
__m512 tmp5924 = _mm512_shuffle_f32x4(tmp5906, tmp5910, 221);
__m512 tmp5925 = _mm512_shuffle_f32x4(tmp5907, tmp5911, 136);
__m512 tmp5926 = _mm512_shuffle_f32x4(tmp5907, tmp5911, 221);
__m512 tmp5927 = _mm512_shuffle_f32x4(tmp5908, tmp5912, 136);
__m512 tmp5928 = _mm512_shuffle_f32x4(tmp5908, tmp5912, 221);
__m512 tmp5929 = _mm512_shuffle_f32x4(tmp5913, tmp5917, 136);
__m512 tmp5930 = _mm512_shuffle_f32x4(tmp5913, tmp5917, 221);
__m512 tmp5931 = _mm512_shuffle_f32x4(tmp5914, tmp5918, 136);
__m512 tmp5932 = _mm512_shuffle_f32x4(tmp5914, tmp5918, 221);
__m512 tmp5933 = _mm512_shuffle_f32x4(tmp5915, tmp5919, 136);
__m512 tmp5934 = _mm512_shuffle_f32x4(tmp5915, tmp5919, 221);
__m512 tmp5935 = _mm512_shuffle_f32x4(tmp5916, tmp5920, 136);
__m512 tmp5936 = _mm512_shuffle_f32x4(tmp5916, tmp5920, 221);
tmp5875 = _mm512_shuffle_f32x4(tmp5921, tmp5929, 136);
in845 = _mm512_shuffle_f32x4(tmp5921, tmp5929, 221);
tmp5874 = _mm512_shuffle_f32x4(tmp5923, tmp5931, 136);
tmp5879 = _mm512_shuffle_f32x4(tmp5923, tmp5931, 221);
tmp5876 = _mm512_shuffle_f32x4(tmp5925, tmp5933, 136);
tmp5880 = _mm512_shuffle_f32x4(tmp5925, tmp5933, 221);
in843 = _mm512_shuffle_f32x4(tmp5927, tmp5935, 136);
in851 = _mm512_shuffle_f32x4(tmp5927, tmp5935, 221);
tmp5873 = _mm512_shuffle_f32x4(tmp5922, tmp5930, 136);
tmp5878 = _mm512_shuffle_f32x4(tmp5922, tmp5930, 221);
in839 = _mm512_shuffle_f32x4(tmp5924, tmp5932, 136);
in847 = _mm512_shuffle_f32x4(tmp5924, tmp5932, 221);
in841 = _mm512_shuffle_f32x4(tmp5926, tmp5934, 136);
in849 = _mm512_shuffle_f32x4(tmp5926, tmp5934, 221);
in840 = _mm512_shuffle_f32x4(tmp5928, tmp5936, 136);
in848 = _mm512_shuffle_f32x4(tmp5928, tmp5936, 221);
__m512 tmp5881 = _mm512_add_ps(tmp5874, in839);
__m512 tmp5885 = _mm512_add_ps(tmp5879, in847);
__m512 tmp5882 = _mm512_sub_ps(tmp5873, tmp5876);
__m512 tmp5886 = _mm512_sub_ps(tmp5878, tmp5880);
__m512 tmp5883 = _mm512_add_ps(tmp5876, in841);
__m512 tmp5887 = _mm512_add_ps(tmp5880, in849);
tmp5875 = _mm512_sub_ps(tmp5875, in841);
in845 = _mm512_sub_ps(in845, in849);
tmp5881 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-4.25e+00f), tmp5881);
tmp5885 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-4.25e+00f), tmp5885);
tmp5883 = _mm512_fmadd_ps(tmp5873, _mm512_set1_ps(-4.25e+00f), tmp5883);
tmp5887 = _mm512_fmadd_ps(tmp5878, _mm512_set1_ps(-4.25e+00f), tmp5887);
tmp5875 = _mm512_fmadd_ps(tmp5882, _mm512_set1_ps(5.25e+00f), tmp5875);
in845 = _mm512_fmadd_ps(tmp5886, _mm512_set1_ps(5.25e+00f), in845);
tmp5882 = _mm512_fmadd_ps(tmp5876, _mm512_set1_ps(2.5e-01f), in841);
tmp5886 = _mm512_fmadd_ps(tmp5880, _mm512_set1_ps(2.5e-01f), in849);
tmp5876 = _mm512_fmadd_ps(tmp5876, _mm512_set1_ps(4e+00f), in841);
tmp5880 = _mm512_fmadd_ps(tmp5880, _mm512_set1_ps(4e+00f), in849);
__m512 tmp5884 = _mm512_sub_ps(tmp5883, tmp5881);
__m512 tmp5888 = _mm512_sub_ps(tmp5887, tmp5885);
tmp5883 = _mm512_add_ps(tmp5881, tmp5883);
tmp5887 = _mm512_add_ps(tmp5885, tmp5887);
tmp5881 = _mm512_fmadd_ps(tmp5874, _mm512_set1_ps(2.5e-01f), in839);
tmp5885 = _mm512_fmadd_ps(tmp5879, _mm512_set1_ps(2.5e-01f), in847);
tmp5882 = _mm512_fmadd_ps(tmp5873, _mm512_set1_ps(-1.25e+00f), tmp5882);
tmp5886 = _mm512_fmadd_ps(tmp5878, _mm512_set1_ps(-1.25e+00f), tmp5886);
tmp5873 = _mm512_fmadd_ps(tmp5873, _mm512_set1_ps(-5e+00f), tmp5876);
tmp5878 = _mm512_fmadd_ps(tmp5878, _mm512_set1_ps(-5e+00f), tmp5880);
tmp5881 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-1.25e+00f), tmp5881);
tmp5885 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-1.25e+00f), tmp5885);
in841 = _mm512_fmadd_ps(tmp5881, _mm512_set1_ps(2e+00f), tmp5882);
in849 = _mm512_fmadd_ps(tmp5885, _mm512_set1_ps(2e+00f), tmp5886);
tmp5882 = _mm512_fnmadd_ps(tmp5881, _mm512_set1_ps(2e+00f), tmp5882);
tmp5886 = _mm512_fnmadd_ps(tmp5885, _mm512_set1_ps(2e+00f), tmp5886);
tmp5881 = _mm512_fmadd_ps(in839, _mm512_set1_ps(2.5e-01f), tmp5874);
tmp5885 = _mm512_fmadd_ps(in847, _mm512_set1_ps(2.5e-01f), tmp5879);
tmp5874 = _mm512_sub_ps(in840, tmp5874);
tmp5879 = _mm512_sub_ps(in848, tmp5879);
tmp5881 = _mm512_fmadd_ps(in843, _mm512_set1_ps(-1.25e+00f), tmp5881);
tmp5885 = _mm512_fmadd_ps(in851, _mm512_set1_ps(-1.25e+00f), tmp5885);
in843 = _mm512_sub_ps(in843, in839);
in851 = _mm512_sub_ps(in851, in847);
in843 = _mm512_fmadd_ps(in843, _mm512_set1_ps(5.25e+00f), tmp5874);
in851 = _mm512_fmadd_ps(in851, _mm512_set1_ps(5.25e+00f), tmp5879);
tmp5876 = _mm512_fmadd_ps(tmp5881, _mm512_set1_ps(2e+00f), tmp5873);
tmp5880 = _mm512_fmadd_ps(tmp5885, _mm512_set1_ps(2e+00f), tmp5878);
tmp5873 = _mm512_fnmadd_ps(tmp5881, _mm512_set1_ps(2e+00f), tmp5873);
tmp5878 = _mm512_fnmadd_ps(tmp5885, _mm512_set1_ps(2e+00f), tmp5878);
__m512 out787 = _mm512_shuffle_f32x4(tmp5875, tmp5883, 68);
__m512 out795 = _mm512_shuffle_f32x4(tmp5875, tmp5883, 238);
__m512 out788 = _mm512_shuffle_f32x4(tmp5884, in841, 68);
__m512 out796 = _mm512_shuffle_f32x4(tmp5884, in841, 238);
__m512 out789 = _mm512_shuffle_f32x4(tmp5882, tmp5876, 68);
__m512 out797 = _mm512_shuffle_f32x4(tmp5882, tmp5876, 238);
__m512 out790 = _mm512_shuffle_f32x4(tmp5873, in843, 68);
__m512 out798 = _mm512_shuffle_f32x4(tmp5873, in843, 238);
__m512 out791 = _mm512_shuffle_f32x4(in845, tmp5887, 68);
__m512 out799 = _mm512_shuffle_f32x4(in845, tmp5887, 238);
__m512 out792 = _mm512_shuffle_f32x4(tmp5888, in849, 68);
__m512 out800 = _mm512_shuffle_f32x4(tmp5888, in849, 238);
__m512 out793 = _mm512_shuffle_f32x4(tmp5886, tmp5880, 68);
__m512 out801 = _mm512_shuffle_f32x4(tmp5886, tmp5880, 238);
__m512 out794 = _mm512_shuffle_f32x4(tmp5878, in851, 68);
__m512 out802 = _mm512_shuffle_f32x4(tmp5878, in851, 238);
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k80, out787);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k80, out795);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k80, out791);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k80, out799);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k80, out788);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k80, out796);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k80, out792);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k80, out800);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k80, out789);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k80, out797);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k80, out793);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k80, out801);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k80, out790);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k80, out798);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k80, out794);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k80, out802);
}
// NOTE(review): the increment below is overwritten by the assignment that
// immediately follows it, so it has no lasting effect at this point.
// Confirm that this is intentional (generated code) and not a missing
// statement between the two lines.
++j21;
j21 = 2;
}
// Entered only while j21 is below 15.
if (j21 < 15) {
// rel14: position of (j21-2) inside a run of 5.
ptrdiff_t rel14 = (size_t)(j21-2)%5;
// base14: 6, plus 18 for every complete run of 5 contained in (j21-2).
ptrdiff_t base14 = 6+(size_t)(j21-2)/5*18;
// The loop header has no condition; every further iteration restarts rel14
// at 0 and advances base14 by 18.
for (; ; rel14 = 0, base14 += 18) {
if (rel14 < 2) {
if (rel14 < 1) {
// h31 and w38 enter the load addresses below as 224*h31 and 4*w38.
ptrdiff_t h31 = base14+0;
ptrdiff_t w38 = 12;
ptrdiff_t k81 = 0;
// Two iterations; k81 contributes 25216*k81 to every load address and
// 768*k81 to every store address in the loop body.
for (; k81 != 2; ++k81) {
// Eight pairs of masked loads follow, at offsets 0, 224, ..., 1568 from the
// common base; the second load of each pair is 48 further on.
// Mask 16383 (0x3FFF) enables lanes 0-13; the maskz form zero-fills lanes
// 14 and 15.
// NOTE(review): the 224 / 4 / 48 constants look like byte offsets for rows
// of 56 floats and a 12-float column step - confirm against the declaration
// of datPtr12, which is not visible here.
__m512 dat1377 = _mm512_maskz_loadu_ps(16383, datPtr12+0+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1378 = _mm512_maskz_loadu_ps(16383, datPtr12+48+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
// pm121 keeps source lanes 0-7 in lanes 0-7 and places source lanes 6-13 in
// lanes 8-15, so every permuted vector carries two 8-element windows that
// share two source elements (6 and 7).
__m512i pm121 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in853 = _mm512_permutexvar_ps(pm121, dat1377);
__m512 in861 = _mm512_permutexvar_ps(pm121, dat1378);
__m512 dat1379 = _mm512_maskz_loadu_ps(16383, datPtr12+224+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1380 = _mm512_maskz_loadu_ps(16383, datPtr12+272+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in854 = _mm512_permutexvar_ps(pm121, dat1379);
__m512 in862 = _mm512_permutexvar_ps(pm121, dat1380);
__m512 dat1381 = _mm512_maskz_loadu_ps(16383, datPtr12+448+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1382 = _mm512_maskz_loadu_ps(16383, datPtr12+496+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in855 = _mm512_permutexvar_ps(pm121, dat1381);
__m512 in863 = _mm512_permutexvar_ps(pm121, dat1382);
__m512 dat1383 = _mm512_maskz_loadu_ps(16383, datPtr12+672+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1384 = _mm512_maskz_loadu_ps(16383, datPtr12+720+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in856 = _mm512_permutexvar_ps(pm121, dat1383);
__m512 in864 = _mm512_permutexvar_ps(pm121, dat1384);
__m512 dat1385 = _mm512_maskz_loadu_ps(16383, datPtr12+896+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1386 = _mm512_maskz_loadu_ps(16383, datPtr12+944+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in857 = _mm512_permutexvar_ps(pm121, dat1385);
__m512 in865 = _mm512_permutexvar_ps(pm121, dat1386);
__m512 dat1387 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1388 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in858 = _mm512_permutexvar_ps(pm121, dat1387);
__m512 in866 = _mm512_permutexvar_ps(pm121, dat1388);
__m512 dat1389 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1390 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in859 = _mm512_permutexvar_ps(pm121, dat1389);
__m512 in867 = _mm512_permutexvar_ps(pm121, dat1390);
__m512 dat1391 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1392 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in860 = _mm512_permutexvar_ps(pm121, dat1391);
__m512 in868 = _mm512_permutexvar_ps(pm121, dat1392);
// 8-point linear transform taken across the eight vectors in853..in860
// (lane-wise), with the identical sequence interleaved for in861..in868.
// Writing d0..d7 for the values of in853..in860 on entry, the results are:
//   in853   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp5939 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp5940 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in859   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp5938 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in855   =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in857   = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in856   = -d1 + 5.25*d3 - 5.25*d5 + d7
// NOTE(review): these coefficients coincide with the 8x8 input-transform
// matrix commonly published for Winograd F(6x6, 3x3) convolution; presumably
// that is the intent - confirm against the generator.
__m512 tmp5937 = _mm512_add_ps(in854, in858);
__m512 tmp5941 = _mm512_add_ps(in862, in866);
__m512 tmp5938 = _mm512_sub_ps(in857, in855);
__m512 tmp5942 = _mm512_sub_ps(in865, in863);
__m512 tmp5939 = _mm512_add_ps(in855, in859);
__m512 tmp5943 = _mm512_add_ps(in863, in867);
in853 = _mm512_sub_ps(in853, in859);
in861 = _mm512_sub_ps(in861, in867);
tmp5937 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-4.25e+00f), tmp5937);
tmp5941 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-4.25e+00f), tmp5941);
tmp5939 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-4.25e+00f), tmp5939);
tmp5943 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-4.25e+00f), tmp5943);
in853 = _mm512_fmadd_ps(tmp5938, _mm512_set1_ps(5.25e+00f), in853);
in861 = _mm512_fmadd_ps(tmp5942, _mm512_set1_ps(5.25e+00f), in861);
// tmp5938/tmp5942 are reused from here on as the even-index partial sum.
tmp5938 = _mm512_fmadd_ps(in855, _mm512_set1_ps(2.5e-01f), in859);
tmp5942 = _mm512_fmadd_ps(in863, _mm512_set1_ps(2.5e-01f), in867);
in855 = _mm512_fmadd_ps(in855, _mm512_set1_ps(4e+00f), in859);
in863 = _mm512_fmadd_ps(in863, _mm512_set1_ps(4e+00f), in867);
// Difference and sum of the even-index and odd-index partial sums.
__m512 tmp5940 = _mm512_sub_ps(tmp5939, tmp5937);
__m512 tmp5944 = _mm512_sub_ps(tmp5943, tmp5941);
tmp5939 = _mm512_add_ps(tmp5937, tmp5939);
tmp5943 = _mm512_add_ps(tmp5941, tmp5943);
// tmp5937/tmp5941 are reused as the odd-index partial sum for the next pair.
tmp5937 = _mm512_fmadd_ps(in854, _mm512_set1_ps(2.5e-01f), in858);
tmp5941 = _mm512_fmadd_ps(in862, _mm512_set1_ps(2.5e-01f), in866);
tmp5938 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-1.25e+00f), tmp5938);
tmp5942 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-1.25e+00f), tmp5942);
in857 = _mm512_fmadd_ps(in857, _mm512_set1_ps(-5e+00f), in855);
in865 = _mm512_fmadd_ps(in865, _mm512_set1_ps(-5e+00f), in863);
tmp5937 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-1.25e+00f), tmp5937);
tmp5941 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-1.25e+00f), tmp5941);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in859 = _mm512_fmadd_ps(tmp5937, _mm512_set1_ps(2e+00f), tmp5938);
in867 = _mm512_fmadd_ps(tmp5941, _mm512_set1_ps(2e+00f), tmp5942);
tmp5938 = _mm512_fnmadd_ps(tmp5937, _mm512_set1_ps(2e+00f), tmp5938);
tmp5942 = _mm512_fnmadd_ps(tmp5941, _mm512_set1_ps(2e+00f), tmp5942);
tmp5937 = _mm512_fmadd_ps(in858, _mm512_set1_ps(2.5e-01f), in854);
tmp5941 = _mm512_fmadd_ps(in866, _mm512_set1_ps(2.5e-01f), in862);
in854 = _mm512_sub_ps(in860, in854);
in862 = _mm512_sub_ps(in868, in862);
tmp5937 = _mm512_fmadd_ps(in856, _mm512_set1_ps(-1.25e+00f), tmp5937);
tmp5941 = _mm512_fmadd_ps(in864, _mm512_set1_ps(-1.25e+00f), tmp5941);
in856 = _mm512_sub_ps(in856, in858);
in864 = _mm512_sub_ps(in864, in866);
in856 = _mm512_fmadd_ps(in856, _mm512_set1_ps(5.25e+00f), in854);
in864 = _mm512_fmadd_ps(in864, _mm512_set1_ps(5.25e+00f), in862);
in855 = _mm512_fmadd_ps(tmp5937, _mm512_set1_ps(2e+00f), in857);
in863 = _mm512_fmadd_ps(tmp5941, _mm512_set1_ps(2e+00f), in865);
in857 = _mm512_fnmadd_ps(tmp5937, _mm512_set1_ps(2e+00f), in857);
in865 = _mm512_fnmadd_ps(tmp5941, _mm512_set1_ps(2e+00f), in865);
// Transposes 8x8 blocks held across two sets of eight vectors:
//   set A: in853, tmp5939, tmp5940, in859, tmp5938, in855, in857, in856
//   set B: in861, tmp5943, tmp5944, in867, tmp5942, in863, in865, in864
// Each input vector contributes lanes 0-7 to one block and lanes 8-15 to
// another. On exit the first vector of each result pair (in853, tmp5939,
// tmp5940, in859, tmp5938, in855, in857, in856) holds column c of set A's
// lanes-0-7 block in its lanes 0-7 and column c of set B's lanes-0-7 block
// in its lanes 8-15 (c = 0..7 in the order listed); the second vector of
// each pair (in861, tmp5943, ...) holds the same for the lanes-8-15 blocks.
// Stage 1: interleave adjacent vector pairs within every 128-bit lane.
__m512 tmp5953 = _mm512_unpacklo_ps(in853, tmp5939);
__m512 tmp5954 = _mm512_unpackhi_ps(in853, tmp5939);
__m512 tmp5955 = _mm512_unpacklo_ps(tmp5940, in859);
__m512 tmp5956 = _mm512_unpackhi_ps(tmp5940, in859);
__m512 tmp5957 = _mm512_unpacklo_ps(tmp5938, in855);
__m512 tmp5958 = _mm512_unpackhi_ps(tmp5938, in855);
__m512 tmp5959 = _mm512_unpacklo_ps(in857, in856);
__m512 tmp5960 = _mm512_unpackhi_ps(in857, in856);
__m512 tmp5961 = _mm512_unpacklo_ps(in861, tmp5943);
__m512 tmp5962 = _mm512_unpackhi_ps(in861, tmp5943);
__m512 tmp5963 = _mm512_unpacklo_ps(tmp5944, in867);
__m512 tmp5964 = _mm512_unpackhi_ps(tmp5944, in867);
__m512 tmp5965 = _mm512_unpacklo_ps(tmp5942, in863);
__m512 tmp5966 = _mm512_unpackhi_ps(tmp5942, in863);
__m512 tmp5967 = _mm512_unpacklo_ps(in865, in864);
__m512 tmp5968 = _mm512_unpackhi_ps(in865, in864);
// Stage 2: immediate 68 joins the low two floats of both operands in each
// 128-bit lane, 238 joins the high two, giving four same-index elements
// from four consecutive vectors per 128-bit lane.
__m512 tmp5969 = _mm512_shuffle_ps(tmp5953, tmp5955, 68);
__m512 tmp5970 = _mm512_shuffle_ps(tmp5953, tmp5955, 238);
__m512 tmp5971 = _mm512_shuffle_ps(tmp5954, tmp5956, 68);
__m512 tmp5972 = _mm512_shuffle_ps(tmp5954, tmp5956, 238);
__m512 tmp5973 = _mm512_shuffle_ps(tmp5957, tmp5959, 68);
__m512 tmp5974 = _mm512_shuffle_ps(tmp5957, tmp5959, 238);
__m512 tmp5975 = _mm512_shuffle_ps(tmp5958, tmp5960, 68);
__m512 tmp5976 = _mm512_shuffle_ps(tmp5958, tmp5960, 238);
__m512 tmp5977 = _mm512_shuffle_ps(tmp5961, tmp5963, 68);
__m512 tmp5978 = _mm512_shuffle_ps(tmp5961, tmp5963, 238);
__m512 tmp5979 = _mm512_shuffle_ps(tmp5962, tmp5964, 68);
__m512 tmp5980 = _mm512_shuffle_ps(tmp5962, tmp5964, 238);
__m512 tmp5981 = _mm512_shuffle_ps(tmp5965, tmp5967, 68);
__m512 tmp5982 = _mm512_shuffle_ps(tmp5965, tmp5967, 238);
__m512 tmp5983 = _mm512_shuffle_ps(tmp5966, tmp5968, 68);
__m512 tmp5984 = _mm512_shuffle_ps(tmp5966, tmp5968, 238);
// Stage 3: immediate 136 gathers 128-bit lanes 0 and 2 of each operand,
// 221 gathers lanes 1 and 3, merging the first four and last four vectors
// of a set.
__m512 tmp5985 = _mm512_shuffle_f32x4(tmp5969, tmp5973, 136);
__m512 tmp5986 = _mm512_shuffle_f32x4(tmp5969, tmp5973, 221);
__m512 tmp5987 = _mm512_shuffle_f32x4(tmp5970, tmp5974, 136);
__m512 tmp5988 = _mm512_shuffle_f32x4(tmp5970, tmp5974, 221);
__m512 tmp5989 = _mm512_shuffle_f32x4(tmp5971, tmp5975, 136);
__m512 tmp5990 = _mm512_shuffle_f32x4(tmp5971, tmp5975, 221);
__m512 tmp5991 = _mm512_shuffle_f32x4(tmp5972, tmp5976, 136);
__m512 tmp5992 = _mm512_shuffle_f32x4(tmp5972, tmp5976, 221);
__m512 tmp5993 = _mm512_shuffle_f32x4(tmp5977, tmp5981, 136);
__m512 tmp5994 = _mm512_shuffle_f32x4(tmp5977, tmp5981, 221);
__m512 tmp5995 = _mm512_shuffle_f32x4(tmp5978, tmp5982, 136);
__m512 tmp5996 = _mm512_shuffle_f32x4(tmp5978, tmp5982, 221);
__m512 tmp5997 = _mm512_shuffle_f32x4(tmp5979, tmp5983, 136);
__m512 tmp5998 = _mm512_shuffle_f32x4(tmp5979, tmp5983, 221);
__m512 tmp5999 = _mm512_shuffle_f32x4(tmp5980, tmp5984, 136);
__m512 tmp6000 = _mm512_shuffle_f32x4(tmp5980, tmp5984, 221);
// Stage 4: the same two immediates combine set A (first operand) with
// set B (second operand); results overwrite the original variables.
in853 = _mm512_shuffle_f32x4(tmp5985, tmp5993, 136);
in861 = _mm512_shuffle_f32x4(tmp5985, tmp5993, 221);
tmp5939 = _mm512_shuffle_f32x4(tmp5987, tmp5995, 136);
tmp5943 = _mm512_shuffle_f32x4(tmp5987, tmp5995, 221);
tmp5940 = _mm512_shuffle_f32x4(tmp5989, tmp5997, 136);
tmp5944 = _mm512_shuffle_f32x4(tmp5989, tmp5997, 221);
in859 = _mm512_shuffle_f32x4(tmp5991, tmp5999, 136);
in867 = _mm512_shuffle_f32x4(tmp5991, tmp5999, 221);
tmp5938 = _mm512_shuffle_f32x4(tmp5986, tmp5994, 136);
tmp5942 = _mm512_shuffle_f32x4(tmp5986, tmp5994, 221);
in855 = _mm512_shuffle_f32x4(tmp5988, tmp5996, 136);
in863 = _mm512_shuffle_f32x4(tmp5988, tmp5996, 221);
in857 = _mm512_shuffle_f32x4(tmp5990, tmp5998, 136);
in865 = _mm512_shuffle_f32x4(tmp5990, tmp5998, 221);
in856 = _mm512_shuffle_f32x4(tmp5992, tmp6000, 136);
in864 = _mm512_shuffle_f32x4(tmp5992, tmp6000, 221);
// 8-point linear transform taken across the eight vectors
// in853, tmp5939, tmp5940, in859, tmp5938, in855, in857, in856 (lane-wise),
// interleaved with the identical sequence for
// in861, tmp5943, tmp5944, in867, tmp5942, in863, in865, in864.
// Writing d0..d7 for those eight values on entry, the results are:
//   in853   = d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp5947 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp5948 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in857   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp5946 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   tmp5940 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   tmp5938 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in859   = -d1 + 5.25*d3 - 5.25*d5 + d7
__m512 tmp5945 = _mm512_add_ps(tmp5939, in855);
__m512 tmp5949 = _mm512_add_ps(tmp5943, in863);
__m512 tmp5946 = _mm512_sub_ps(tmp5938, tmp5940);
__m512 tmp5950 = _mm512_sub_ps(tmp5942, tmp5944);
__m512 tmp5947 = _mm512_add_ps(tmp5940, in857);
__m512 tmp5951 = _mm512_add_ps(tmp5944, in865);
in853 = _mm512_sub_ps(in853, in857);
in861 = _mm512_sub_ps(in861, in865);
tmp5945 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-4.25e+00f), tmp5945);
tmp5949 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-4.25e+00f), tmp5949);
tmp5947 = _mm512_fmadd_ps(tmp5938, _mm512_set1_ps(-4.25e+00f), tmp5947);
tmp5951 = _mm512_fmadd_ps(tmp5942, _mm512_set1_ps(-4.25e+00f), tmp5951);
in853 = _mm512_fmadd_ps(tmp5946, _mm512_set1_ps(5.25e+00f), in853);
in861 = _mm512_fmadd_ps(tmp5950, _mm512_set1_ps(5.25e+00f), in861);
// tmp5946/tmp5950 are reused from here on as the even-index partial sum.
tmp5946 = _mm512_fmadd_ps(tmp5940, _mm512_set1_ps(2.5e-01f), in857);
tmp5950 = _mm512_fmadd_ps(tmp5944, _mm512_set1_ps(2.5e-01f), in865);
tmp5940 = _mm512_fmadd_ps(tmp5940, _mm512_set1_ps(4e+00f), in857);
tmp5944 = _mm512_fmadd_ps(tmp5944, _mm512_set1_ps(4e+00f), in865);
// Difference and sum of the even-index and odd-index partial sums.
__m512 tmp5948 = _mm512_sub_ps(tmp5947, tmp5945);
__m512 tmp5952 = _mm512_sub_ps(tmp5951, tmp5949);
tmp5947 = _mm512_add_ps(tmp5945, tmp5947);
tmp5951 = _mm512_add_ps(tmp5949, tmp5951);
// tmp5945/tmp5949 are reused as the odd-index partial sum for the next pair.
tmp5945 = _mm512_fmadd_ps(tmp5939, _mm512_set1_ps(2.5e-01f), in855);
tmp5949 = _mm512_fmadd_ps(tmp5943, _mm512_set1_ps(2.5e-01f), in863);
tmp5946 = _mm512_fmadd_ps(tmp5938, _mm512_set1_ps(-1.25e+00f), tmp5946);
tmp5950 = _mm512_fmadd_ps(tmp5942, _mm512_set1_ps(-1.25e+00f), tmp5950);
tmp5938 = _mm512_fmadd_ps(tmp5938, _mm512_set1_ps(-5e+00f), tmp5940);
tmp5942 = _mm512_fmadd_ps(tmp5942, _mm512_set1_ps(-5e+00f), tmp5944);
tmp5945 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-1.25e+00f), tmp5945);
tmp5949 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-1.25e+00f), tmp5949);
// fmadd gives even + 2*odd, fnmadd gives even - 2*odd.
in857 = _mm512_fmadd_ps(tmp5945, _mm512_set1_ps(2e+00f), tmp5946);
in865 = _mm512_fmadd_ps(tmp5949, _mm512_set1_ps(2e+00f), tmp5950);
tmp5946 = _mm512_fnmadd_ps(tmp5945, _mm512_set1_ps(2e+00f), tmp5946);
tmp5950 = _mm512_fnmadd_ps(tmp5949, _mm512_set1_ps(2e+00f), tmp5950);
tmp5945 = _mm512_fmadd_ps(in855, _mm512_set1_ps(2.5e-01f), tmp5939);
tmp5949 = _mm512_fmadd_ps(in863, _mm512_set1_ps(2.5e-01f), tmp5943);
tmp5939 = _mm512_sub_ps(in856, tmp5939);
tmp5943 = _mm512_sub_ps(in864, tmp5943);
tmp5945 = _mm512_fmadd_ps(in859, _mm512_set1_ps(-1.25e+00f), tmp5945);
tmp5949 = _mm512_fmadd_ps(in867, _mm512_set1_ps(-1.25e+00f), tmp5949);
in859 = _mm512_sub_ps(in859, in855);
in867 = _mm512_sub_ps(in867, in863);
in859 = _mm512_fmadd_ps(in859, _mm512_set1_ps(5.25e+00f), tmp5939);
in867 = _mm512_fmadd_ps(in867, _mm512_set1_ps(5.25e+00f), tmp5943);
tmp5940 = _mm512_fmadd_ps(tmp5945, _mm512_set1_ps(2e+00f), tmp5938);
tmp5944 = _mm512_fmadd_ps(tmp5949, _mm512_set1_ps(2e+00f), tmp5942);
tmp5938 = _mm512_fnmadd_ps(tmp5945, _mm512_set1_ps(2e+00f), tmp5938);
tmp5942 = _mm512_fnmadd_ps(tmp5949, _mm512_set1_ps(2e+00f), tmp5942);
// Regroup the sixteen result vectors by 256-bit halves before storing:
// immediate 68 concatenates the low 256 bits of both operands, immediate
// 238 concatenates their high 256 bits. Each out* vector therefore carries
// the same half of two consecutive result vectors.
__m512 out803 = _mm512_shuffle_f32x4(in853, tmp5947, 68);
__m512 out811 = _mm512_shuffle_f32x4(in853, tmp5947, 238);
__m512 out804 = _mm512_shuffle_f32x4(tmp5948, in857, 68);
__m512 out812 = _mm512_shuffle_f32x4(tmp5948, in857, 238);
__m512 out805 = _mm512_shuffle_f32x4(tmp5946, tmp5940, 68);
__m512 out813 = _mm512_shuffle_f32x4(tmp5946, tmp5940, 238);
__m512 out806 = _mm512_shuffle_f32x4(tmp5938, in859, 68);
__m512 out814 = _mm512_shuffle_f32x4(tmp5938, in859, 238);
__m512 out807 = _mm512_shuffle_f32x4(in861, tmp5951, 68);
__m512 out815 = _mm512_shuffle_f32x4(in861, tmp5951, 238);
__m512 out808 = _mm512_shuffle_f32x4(tmp5952, in865, 68);
__m512 out816 = _mm512_shuffle_f32x4(tmp5952, in865, 238);
__m512 out809 = _mm512_shuffle_f32x4(tmp5950, tmp5944, 68);
__m512 out817 = _mm512_shuffle_f32x4(tmp5950, tmp5944, 238);
__m512 out810 = _mm512_shuffle_f32x4(tmp5942, in867, 68);
__m512 out818 = _mm512_shuffle_f32x4(tmp5942, in867, 238);
// Sixteen unaligned stores in four groups whose base offsets are 25600 apart
// (0, 25600, 51200, 76800). Inside a group the four vectors land at
// +0, +64, +128 and +192, issued in the order +0, +128, +64, +192.
// The address also depends on i27, j21, s19 and the loop counter k81.
// NOTE(review): offsets look like byte counts (64 = one 512-bit vector) -
// confirm against the declaration of dfPtr6, which is not visible here.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k81, out803);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k81, out811);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k81, out807);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k81, out815);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k81, out804);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k81, out812);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k81, out808);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k81, out816);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k81, out805);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k81, out813);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k81, out809);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k81, out817);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k81, out806);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k81, out814);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k81, out810);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k81, out818);
__m512 dat1393 = _mm512_maskz_loadu_ps(16383, datPtr12+96+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1394 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512i pm122 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in869 = _mm512_permutexvar_ps(pm122, dat1393);
__m512 in877 = _mm512_permutexvar_ps(pm122, dat1394);
__m512 dat1395 = _mm512_maskz_loadu_ps(16383, datPtr12+320+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1396 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in870 = _mm512_permutexvar_ps(pm122, dat1395);
__m512 in878 = _mm512_permutexvar_ps(pm122, dat1396);
__m512 dat1397 = _mm512_maskz_loadu_ps(16383, datPtr12+544+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1398 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in871 = _mm512_permutexvar_ps(pm122, dat1397);
__m512 in879 = _mm512_permutexvar_ps(pm122, dat1398);
__m512 dat1399 = _mm512_maskz_loadu_ps(16383, datPtr12+768+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1400 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in872 = _mm512_permutexvar_ps(pm122, dat1399);
__m512 in880 = _mm512_permutexvar_ps(pm122, dat1400);
__m512 dat1401 = _mm512_maskz_loadu_ps(16383, datPtr12+992+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1402 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in873 = _mm512_permutexvar_ps(pm122, dat1401);
__m512 in881 = _mm512_permutexvar_ps(pm122, dat1402);
__m512 dat1403 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1404 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in874 = _mm512_permutexvar_ps(pm122, dat1403);
__m512 in882 = _mm512_permutexvar_ps(pm122, dat1404);
__m512 dat1405 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1406 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in875 = _mm512_permutexvar_ps(pm122, dat1405);
__m512 in883 = _mm512_permutexvar_ps(pm122, dat1406);
__m512 dat1407 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1408 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in876 = _mm512_permutexvar_ps(pm122, dat1407);
__m512 in884 = _mm512_permutexvar_ps(pm122, dat1408);
__m512 tmp6001 = _mm512_add_ps(in870, in874);
__m512 tmp6005 = _mm512_add_ps(in878, in882);
__m512 tmp6002 = _mm512_sub_ps(in873, in871);
__m512 tmp6006 = _mm512_sub_ps(in881, in879);
__m512 tmp6003 = _mm512_add_ps(in871, in875);
__m512 tmp6007 = _mm512_add_ps(in879, in883);
in869 = _mm512_sub_ps(in869, in875);
in877 = _mm512_sub_ps(in877, in883);
tmp6001 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-4.25e+00f), tmp6001);
tmp6005 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-4.25e+00f), tmp6005);
tmp6003 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-4.25e+00f), tmp6003);
tmp6007 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-4.25e+00f), tmp6007);
in869 = _mm512_fmadd_ps(tmp6002, _mm512_set1_ps(5.25e+00f), in869);
in877 = _mm512_fmadd_ps(tmp6006, _mm512_set1_ps(5.25e+00f), in877);
tmp6002 = _mm512_fmadd_ps(in871, _mm512_set1_ps(2.5e-01f), in875);
tmp6006 = _mm512_fmadd_ps(in879, _mm512_set1_ps(2.5e-01f), in883);
in871 = _mm512_fmadd_ps(in871, _mm512_set1_ps(4e+00f), in875);
in879 = _mm512_fmadd_ps(in879, _mm512_set1_ps(4e+00f), in883);
__m512 tmp6004 = _mm512_sub_ps(tmp6003, tmp6001);
__m512 tmp6008 = _mm512_sub_ps(tmp6007, tmp6005);
tmp6003 = _mm512_add_ps(tmp6001, tmp6003);
tmp6007 = _mm512_add_ps(tmp6005, tmp6007);
tmp6001 = _mm512_fmadd_ps(in870, _mm512_set1_ps(2.5e-01f), in874);
tmp6005 = _mm512_fmadd_ps(in878, _mm512_set1_ps(2.5e-01f), in882);
tmp6002 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-1.25e+00f), tmp6002);
tmp6006 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-1.25e+00f), tmp6006);
in873 = _mm512_fmadd_ps(in873, _mm512_set1_ps(-5e+00f), in871);
in881 = _mm512_fmadd_ps(in881, _mm512_set1_ps(-5e+00f), in879);
tmp6001 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-1.25e+00f), tmp6001);
tmp6005 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-1.25e+00f), tmp6005);
in875 = _mm512_fmadd_ps(tmp6001, _mm512_set1_ps(2e+00f), tmp6002);
in883 = _mm512_fmadd_ps(tmp6005, _mm512_set1_ps(2e+00f), tmp6006);
tmp6002 = _mm512_fnmadd_ps(tmp6001, _mm512_set1_ps(2e+00f), tmp6002);
tmp6006 = _mm512_fnmadd_ps(tmp6005, _mm512_set1_ps(2e+00f), tmp6006);
tmp6001 = _mm512_fmadd_ps(in874, _mm512_set1_ps(2.5e-01f), in870);
tmp6005 = _mm512_fmadd_ps(in882, _mm512_set1_ps(2.5e-01f), in878);
in870 = _mm512_sub_ps(in876, in870);
in878 = _mm512_sub_ps(in884, in878);
tmp6001 = _mm512_fmadd_ps(in872, _mm512_set1_ps(-1.25e+00f), tmp6001);
tmp6005 = _mm512_fmadd_ps(in880, _mm512_set1_ps(-1.25e+00f), tmp6005);
in872 = _mm512_sub_ps(in872, in874);
in880 = _mm512_sub_ps(in880, in882);
in872 = _mm512_fmadd_ps(in872, _mm512_set1_ps(5.25e+00f), in870);
in880 = _mm512_fmadd_ps(in880, _mm512_set1_ps(5.25e+00f), in878);
in871 = _mm512_fmadd_ps(tmp6001, _mm512_set1_ps(2e+00f), in873);
in879 = _mm512_fmadd_ps(tmp6005, _mm512_set1_ps(2e+00f), in881);
in873 = _mm512_fnmadd_ps(tmp6001, _mm512_set1_ps(2e+00f), in873);
in881 = _mm512_fnmadd_ps(tmp6005, _mm512_set1_ps(2e+00f), in881);
__m512 tmp6017 = _mm512_unpacklo_ps(in869, tmp6003);
__m512 tmp6018 = _mm512_unpackhi_ps(in869, tmp6003);
__m512 tmp6019 = _mm512_unpacklo_ps(tmp6004, in875);
__m512 tmp6020 = _mm512_unpackhi_ps(tmp6004, in875);
__m512 tmp6021 = _mm512_unpacklo_ps(tmp6002, in871);
__m512 tmp6022 = _mm512_unpackhi_ps(tmp6002, in871);
__m512 tmp6023 = _mm512_unpacklo_ps(in873, in872);
__m512 tmp6024 = _mm512_unpackhi_ps(in873, in872);
__m512 tmp6025 = _mm512_unpacklo_ps(in877, tmp6007);
__m512 tmp6026 = _mm512_unpackhi_ps(in877, tmp6007);
__m512 tmp6027 = _mm512_unpacklo_ps(tmp6008, in883);
__m512 tmp6028 = _mm512_unpackhi_ps(tmp6008, in883);
__m512 tmp6029 = _mm512_unpacklo_ps(tmp6006, in879);
__m512 tmp6030 = _mm512_unpackhi_ps(tmp6006, in879);
__m512 tmp6031 = _mm512_unpacklo_ps(in881, in880);
__m512 tmp6032 = _mm512_unpackhi_ps(in881, in880);
__m512 tmp6033 = _mm512_shuffle_ps(tmp6017, tmp6019, 68);
__m512 tmp6034 = _mm512_shuffle_ps(tmp6017, tmp6019, 238);
__m512 tmp6035 = _mm512_shuffle_ps(tmp6018, tmp6020, 68);
__m512 tmp6036 = _mm512_shuffle_ps(tmp6018, tmp6020, 238);
__m512 tmp6037 = _mm512_shuffle_ps(tmp6021, tmp6023, 68);
__m512 tmp6038 = _mm512_shuffle_ps(tmp6021, tmp6023, 238);
__m512 tmp6039 = _mm512_shuffle_ps(tmp6022, tmp6024, 68);
__m512 tmp6040 = _mm512_shuffle_ps(tmp6022, tmp6024, 238);
__m512 tmp6041 = _mm512_shuffle_ps(tmp6025, tmp6027, 68);
__m512 tmp6042 = _mm512_shuffle_ps(tmp6025, tmp6027, 238);
__m512 tmp6043 = _mm512_shuffle_ps(tmp6026, tmp6028, 68);
__m512 tmp6044 = _mm512_shuffle_ps(tmp6026, tmp6028, 238);
__m512 tmp6045 = _mm512_shuffle_ps(tmp6029, tmp6031, 68);
__m512 tmp6046 = _mm512_shuffle_ps(tmp6029, tmp6031, 238);
__m512 tmp6047 = _mm512_shuffle_ps(tmp6030, tmp6032, 68);
__m512 tmp6048 = _mm512_shuffle_ps(tmp6030, tmp6032, 238);
__m512 tmp6049 = _mm512_shuffle_f32x4(tmp6033, tmp6037, 136);
__m512 tmp6050 = _mm512_shuffle_f32x4(tmp6033, tmp6037, 221);
__m512 tmp6051 = _mm512_shuffle_f32x4(tmp6034, tmp6038, 136);
__m512 tmp6052 = _mm512_shuffle_f32x4(tmp6034, tmp6038, 221);
__m512 tmp6053 = _mm512_shuffle_f32x4(tmp6035, tmp6039, 136);
__m512 tmp6054 = _mm512_shuffle_f32x4(tmp6035, tmp6039, 221);
__m512 tmp6055 = _mm512_shuffle_f32x4(tmp6036, tmp6040, 136);
__m512 tmp6056 = _mm512_shuffle_f32x4(tmp6036, tmp6040, 221);
__m512 tmp6057 = _mm512_shuffle_f32x4(tmp6041, tmp6045, 136);
__m512 tmp6058 = _mm512_shuffle_f32x4(tmp6041, tmp6045, 221);
__m512 tmp6059 = _mm512_shuffle_f32x4(tmp6042, tmp6046, 136);
__m512 tmp6060 = _mm512_shuffle_f32x4(tmp6042, tmp6046, 221);
__m512 tmp6061 = _mm512_shuffle_f32x4(tmp6043, tmp6047, 136);
__m512 tmp6062 = _mm512_shuffle_f32x4(tmp6043, tmp6047, 221);
__m512 tmp6063 = _mm512_shuffle_f32x4(tmp6044, tmp6048, 136);
__m512 tmp6064 = _mm512_shuffle_f32x4(tmp6044, tmp6048, 221);
in869 = _mm512_shuffle_f32x4(tmp6049, tmp6057, 136);
in877 = _mm512_shuffle_f32x4(tmp6049, tmp6057, 221);
tmp6003 = _mm512_shuffle_f32x4(tmp6051, tmp6059, 136);
tmp6007 = _mm512_shuffle_f32x4(tmp6051, tmp6059, 221);
tmp6004 = _mm512_shuffle_f32x4(tmp6053, tmp6061, 136);
tmp6008 = _mm512_shuffle_f32x4(tmp6053, tmp6061, 221);
in875 = _mm512_shuffle_f32x4(tmp6055, tmp6063, 136);
in883 = _mm512_shuffle_f32x4(tmp6055, tmp6063, 221);
tmp6002 = _mm512_shuffle_f32x4(tmp6050, tmp6058, 136);
tmp6006 = _mm512_shuffle_f32x4(tmp6050, tmp6058, 221);
in871 = _mm512_shuffle_f32x4(tmp6052, tmp6060, 136);
in879 = _mm512_shuffle_f32x4(tmp6052, tmp6060, 221);
in873 = _mm512_shuffle_f32x4(tmp6054, tmp6062, 136);
in881 = _mm512_shuffle_f32x4(tmp6054, tmp6062, 221);
in872 = _mm512_shuffle_f32x4(tmp6056, tmp6064, 136);
in880 = _mm512_shuffle_f32x4(tmp6056, tmp6064, 221);
__m512 tmp6009 = _mm512_add_ps(tmp6003, in871);
__m512 tmp6013 = _mm512_add_ps(tmp6007, in879);
__m512 tmp6010 = _mm512_sub_ps(tmp6002, tmp6004);
__m512 tmp6014 = _mm512_sub_ps(tmp6006, tmp6008);
__m512 tmp6011 = _mm512_add_ps(tmp6004, in873);
__m512 tmp6015 = _mm512_add_ps(tmp6008, in881);
in869 = _mm512_sub_ps(in869, in873);
in877 = _mm512_sub_ps(in877, in881);
tmp6009 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-4.25e+00f), tmp6009);
tmp6013 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-4.25e+00f), tmp6013);
tmp6011 = _mm512_fmadd_ps(tmp6002, _mm512_set1_ps(-4.25e+00f), tmp6011);
tmp6015 = _mm512_fmadd_ps(tmp6006, _mm512_set1_ps(-4.25e+00f), tmp6015);
in869 = _mm512_fmadd_ps(tmp6010, _mm512_set1_ps(5.25e+00f), in869);
in877 = _mm512_fmadd_ps(tmp6014, _mm512_set1_ps(5.25e+00f), in877);
tmp6010 = _mm512_fmadd_ps(tmp6004, _mm512_set1_ps(2.5e-01f), in873);
tmp6014 = _mm512_fmadd_ps(tmp6008, _mm512_set1_ps(2.5e-01f), in881);
tmp6004 = _mm512_fmadd_ps(tmp6004, _mm512_set1_ps(4e+00f), in873);
tmp6008 = _mm512_fmadd_ps(tmp6008, _mm512_set1_ps(4e+00f), in881);
__m512 tmp6012 = _mm512_sub_ps(tmp6011, tmp6009);
__m512 tmp6016 = _mm512_sub_ps(tmp6015, tmp6013);
tmp6011 = _mm512_add_ps(tmp6009, tmp6011);
tmp6015 = _mm512_add_ps(tmp6013, tmp6015);
tmp6009 = _mm512_fmadd_ps(tmp6003, _mm512_set1_ps(2.5e-01f), in871);
tmp6013 = _mm512_fmadd_ps(tmp6007, _mm512_set1_ps(2.5e-01f), in879);
tmp6010 = _mm512_fmadd_ps(tmp6002, _mm512_set1_ps(-1.25e+00f), tmp6010);
tmp6014 = _mm512_fmadd_ps(tmp6006, _mm512_set1_ps(-1.25e+00f), tmp6014);
tmp6002 = _mm512_fmadd_ps(tmp6002, _mm512_set1_ps(-5e+00f), tmp6004);
tmp6006 = _mm512_fmadd_ps(tmp6006, _mm512_set1_ps(-5e+00f), tmp6008);
tmp6009 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-1.25e+00f), tmp6009);
tmp6013 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-1.25e+00f), tmp6013);
in873 = _mm512_fmadd_ps(tmp6009, _mm512_set1_ps(2e+00f), tmp6010);
in881 = _mm512_fmadd_ps(tmp6013, _mm512_set1_ps(2e+00f), tmp6014);
tmp6010 = _mm512_fnmadd_ps(tmp6009, _mm512_set1_ps(2e+00f), tmp6010);
tmp6014 = _mm512_fnmadd_ps(tmp6013, _mm512_set1_ps(2e+00f), tmp6014);
tmp6009 = _mm512_fmadd_ps(in871, _mm512_set1_ps(2.5e-01f), tmp6003);
tmp6013 = _mm512_fmadd_ps(in879, _mm512_set1_ps(2.5e-01f), tmp6007);
tmp6003 = _mm512_sub_ps(in872, tmp6003);
tmp6007 = _mm512_sub_ps(in880, tmp6007);
tmp6009 = _mm512_fmadd_ps(in875, _mm512_set1_ps(-1.25e+00f), tmp6009);
tmp6013 = _mm512_fmadd_ps(in883, _mm512_set1_ps(-1.25e+00f), tmp6013);
in875 = _mm512_sub_ps(in875, in871);
in883 = _mm512_sub_ps(in883, in879);
in875 = _mm512_fmadd_ps(in875, _mm512_set1_ps(5.25e+00f), tmp6003);
in883 = _mm512_fmadd_ps(in883, _mm512_set1_ps(5.25e+00f), tmp6007);
tmp6004 = _mm512_fmadd_ps(tmp6009, _mm512_set1_ps(2e+00f), tmp6002);
tmp6008 = _mm512_fmadd_ps(tmp6013, _mm512_set1_ps(2e+00f), tmp6006);
tmp6002 = _mm512_fnmadd_ps(tmp6009, _mm512_set1_ps(2e+00f), tmp6002);
tmp6006 = _mm512_fnmadd_ps(tmp6013, _mm512_set1_ps(2e+00f), tmp6006);
__m512 out819 = _mm512_shuffle_f32x4(in869, tmp6011, 68);
__m512 out827 = _mm512_shuffle_f32x4(in869, tmp6011, 238);
__m512 out820 = _mm512_shuffle_f32x4(tmp6012, in873, 68);
__m512 out828 = _mm512_shuffle_f32x4(tmp6012, in873, 238);
__m512 out821 = _mm512_shuffle_f32x4(tmp6010, tmp6004, 68);
__m512 out829 = _mm512_shuffle_f32x4(tmp6010, tmp6004, 238);
__m512 out822 = _mm512_shuffle_f32x4(tmp6002, in875, 68);
__m512 out830 = _mm512_shuffle_f32x4(tmp6002, in875, 238);
__m512 out823 = _mm512_shuffle_f32x4(in877, tmp6015, 68);
__m512 out831 = _mm512_shuffle_f32x4(in877, tmp6015, 238);
__m512 out824 = _mm512_shuffle_f32x4(tmp6016, in881, 68);
__m512 out832 = _mm512_shuffle_f32x4(tmp6016, in881, 238);
__m512 out825 = _mm512_shuffle_f32x4(tmp6014, tmp6008, 68);
__m512 out833 = _mm512_shuffle_f32x4(tmp6014, tmp6008, 238);
__m512 out826 = _mm512_shuffle_f32x4(tmp6006, in883, 68);
__m512 out834 = _mm512_shuffle_f32x4(tmp6006, in883, 238);
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k81, out819);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k81, out827);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k81, out823);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k81, out831);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k81, out820);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k81, out828);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k81, out824);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k81, out832);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k81, out821);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k81, out829);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k81, out825);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k81, out833);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k81, out822);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k81, out830);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k81, out826);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k81, out834);
// Load phase for one tile position: eight pairs of masked loads, with the
// address constant advancing by 224 from one pair to the next (12656, 12880,
// ..., 14224) and the two loads of a pair 48 apart.
// Mask 16383 (0x3FFF) keeps lanes 0-13 and zeroes lanes 14-15 of every load.
// NOTE(review): 224 and 48 look like byte offsets (56 and 12 floats), which
// would make datPtr12 a byte pointer and each pair one source row -- confirm
// against the declaration of datPtr12, which is outside this excerpt.
__m512 dat1409 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1410 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
// _mm512_set_epi32 takes its arguments from lane 15 down to lane 0, so this
// index vector makes lanes 0-7 read source elements 0-7 and lanes 8-15 read
// source elements 6-13: two 8-element windows overlapping by two elements,
// placed side by side in one register.
__m512i pm123 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
// in885..in892 come from the first load of each pair, in893..in900 from the
// second load; the same permutation is applied to all sixteen.
__m512 in885 = _mm512_permutexvar_ps(pm123, dat1409);
__m512 in893 = _mm512_permutexvar_ps(pm123, dat1410);
__m512 dat1411 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1412 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in886 = _mm512_permutexvar_ps(pm123, dat1411);
__m512 in894 = _mm512_permutexvar_ps(pm123, dat1412);
__m512 dat1413 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1414 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in887 = _mm512_permutexvar_ps(pm123, dat1413);
__m512 in895 = _mm512_permutexvar_ps(pm123, dat1414);
__m512 dat1415 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1416 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in888 = _mm512_permutexvar_ps(pm123, dat1415);
__m512 in896 = _mm512_permutexvar_ps(pm123, dat1416);
__m512 dat1417 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1418 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in889 = _mm512_permutexvar_ps(pm123, dat1417);
__m512 in897 = _mm512_permutexvar_ps(pm123, dat1418);
__m512 dat1419 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1420 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in890 = _mm512_permutexvar_ps(pm123, dat1419);
__m512 in898 = _mm512_permutexvar_ps(pm123, dat1420);
__m512 dat1421 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1422 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in891 = _mm512_permutexvar_ps(pm123, dat1421);
__m512 in899 = _mm512_permutexvar_ps(pm123, dat1422);
__m512 dat1423 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 dat1424 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+50432*i27+224*h31+4*w38+50432*s19+25216*k81);
__m512 in892 = _mm512_permutexvar_ps(pm123, dat1423);
__m512 in900 = _mm512_permutexvar_ps(pm123, dat1424);
// First 8-point pass, applied lane-wise across eight registers. Two
// independent chains are interleaved statement by statement: one over
// in885..in892 (scratch tmp6065..tmp6068) and one over in893..in900 (scratch
// tmp6069..tmp6072). Writing d0..d7 for the incoming in885..in892, the first
// chain finishes with (up to FMA rounding):
//   in885   =  d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp6067 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp6068 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in891   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp6066 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   in887   =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   in889   = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in888   = -d1 + 5.25*d3 - 5.25*d5 + d7
// The second chain produces the same combinations in in893, tmp6071, tmp6072,
// in899, tmp6070, in895, in897, in896.
// Input registers are overwritten and reused as scratch along the way, so the
// statement order is significant and must not be rearranged.
// NOTE(review): the coefficients match the usual 8-point Winograd input
// transform for 3x3 filters -- confirm against the generator.
__m512 tmp6065 = _mm512_add_ps(in886, in890);
__m512 tmp6069 = _mm512_add_ps(in894, in898);
__m512 tmp6066 = _mm512_sub_ps(in889, in887);
__m512 tmp6070 = _mm512_sub_ps(in897, in895);
__m512 tmp6067 = _mm512_add_ps(in887, in891);
__m512 tmp6071 = _mm512_add_ps(in895, in899);
in885 = _mm512_sub_ps(in885, in891);
in893 = _mm512_sub_ps(in893, in899);
tmp6065 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-4.25e+00f), tmp6065);
tmp6069 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-4.25e+00f), tmp6069);
tmp6067 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-4.25e+00f), tmp6067);
tmp6071 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-4.25e+00f), tmp6071);
in885 = _mm512_fmadd_ps(tmp6066, _mm512_set1_ps(5.25e+00f), in885);
in893 = _mm512_fmadd_ps(tmp6070, _mm512_set1_ps(5.25e+00f), in893);
tmp6066 = _mm512_fmadd_ps(in887, _mm512_set1_ps(2.5e-01f), in891);
tmp6070 = _mm512_fmadd_ps(in895, _mm512_set1_ps(2.5e-01f), in899);
in887 = _mm512_fmadd_ps(in887, _mm512_set1_ps(4e+00f), in891);
in895 = _mm512_fmadd_ps(in895, _mm512_set1_ps(4e+00f), in899);
// Difference and sum of the two partial sums give the second and first
// odd/even pair of outputs.
__m512 tmp6068 = _mm512_sub_ps(tmp6067, tmp6065);
__m512 tmp6072 = _mm512_sub_ps(tmp6071, tmp6069);
tmp6067 = _mm512_add_ps(tmp6065, tmp6067);
tmp6071 = _mm512_add_ps(tmp6069, tmp6071);
tmp6065 = _mm512_fmadd_ps(in886, _mm512_set1_ps(2.5e-01f), in890);
tmp6069 = _mm512_fmadd_ps(in894, _mm512_set1_ps(2.5e-01f), in898);
tmp6066 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-1.25e+00f), tmp6066);
tmp6070 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-1.25e+00f), tmp6070);
in889 = _mm512_fmadd_ps(in889, _mm512_set1_ps(-5e+00f), in887);
in897 = _mm512_fmadd_ps(in897, _mm512_set1_ps(-5e+00f), in895);
tmp6065 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-1.25e+00f), tmp6065);
tmp6069 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-1.25e+00f), tmp6069);
// fmadd/fnmadd with the same operands yield the +2x and -2x variants.
in891 = _mm512_fmadd_ps(tmp6065, _mm512_set1_ps(2e+00f), tmp6066);
in899 = _mm512_fmadd_ps(tmp6069, _mm512_set1_ps(2e+00f), tmp6070);
tmp6066 = _mm512_fnmadd_ps(tmp6065, _mm512_set1_ps(2e+00f), tmp6066);
tmp6070 = _mm512_fnmadd_ps(tmp6069, _mm512_set1_ps(2e+00f), tmp6070);
tmp6065 = _mm512_fmadd_ps(in890, _mm512_set1_ps(2.5e-01f), in886);
tmp6069 = _mm512_fmadd_ps(in898, _mm512_set1_ps(2.5e-01f), in894);
in886 = _mm512_sub_ps(in892, in886);
in894 = _mm512_sub_ps(in900, in894);
tmp6065 = _mm512_fmadd_ps(in888, _mm512_set1_ps(-1.25e+00f), tmp6065);
tmp6069 = _mm512_fmadd_ps(in896, _mm512_set1_ps(-1.25e+00f), tmp6069);
in888 = _mm512_sub_ps(in888, in890);
in896 = _mm512_sub_ps(in896, in898);
in888 = _mm512_fmadd_ps(in888, _mm512_set1_ps(5.25e+00f), in886);
in896 = _mm512_fmadd_ps(in896, _mm512_set1_ps(5.25e+00f), in894);
in887 = _mm512_fmadd_ps(tmp6065, _mm512_set1_ps(2e+00f), in889);
in895 = _mm512_fmadd_ps(tmp6069, _mm512_set1_ps(2e+00f), in897);
in889 = _mm512_fnmadd_ps(tmp6065, _mm512_set1_ps(2e+00f), in889);
in897 = _mm512_fnmadd_ps(tmp6069, _mm512_set1_ps(2e+00f), in897);
// Transpose phase. Inputs are sixteen registers in two ordered groups:
//   group A: in885, tmp6067, tmp6068, in891, tmp6066, in887, in889, in888
//   group B: in893, tmp6071, tmp6072, in899, tmp6070, in895, in897, in896
// After this phase each destination register holds ONE lane index taken from
// all sixteen inputs: lanes 0-7 come from group A (in the order above) and
// lanes 8-15 from group B. Lane indices 0..7 land in
//   in885, tmp6067, tmp6068, in891, tmp6066, in887, in889, in888
// and lane indices 8..15 land in
//   in893, tmp6071, tmp6072, in899, tmp6070, in895, in897, in896.
//
// Step 1: unpacklo/unpackhi interleave pairs of registers inside every
// 128-bit lane.
__m512 tmp6081 = _mm512_unpacklo_ps(in885, tmp6067);
__m512 tmp6082 = _mm512_unpackhi_ps(in885, tmp6067);
__m512 tmp6083 = _mm512_unpacklo_ps(tmp6068, in891);
__m512 tmp6084 = _mm512_unpackhi_ps(tmp6068, in891);
__m512 tmp6085 = _mm512_unpacklo_ps(tmp6066, in887);
__m512 tmp6086 = _mm512_unpackhi_ps(tmp6066, in887);
__m512 tmp6087 = _mm512_unpacklo_ps(in889, in888);
__m512 tmp6088 = _mm512_unpackhi_ps(in889, in888);
__m512 tmp6089 = _mm512_unpacklo_ps(in893, tmp6071);
__m512 tmp6090 = _mm512_unpackhi_ps(in893, tmp6071);
__m512 tmp6091 = _mm512_unpacklo_ps(tmp6072, in899);
__m512 tmp6092 = _mm512_unpackhi_ps(tmp6072, in899);
__m512 tmp6093 = _mm512_unpacklo_ps(tmp6070, in895);
__m512 tmp6094 = _mm512_unpackhi_ps(tmp6070, in895);
__m512 tmp6095 = _mm512_unpacklo_ps(in897, in896);
__m512 tmp6096 = _mm512_unpackhi_ps(in897, in896);
// Step 2: shuffle_ps with 68 (0x44) takes elements 0,1 of each operand and
// 238 (0xEE) takes elements 2,3, per 128-bit lane. Each 128-bit lane now
// holds the same lane index from four consecutive inputs of a group.
__m512 tmp6097 = _mm512_shuffle_ps(tmp6081, tmp6083, 68);
__m512 tmp6098 = _mm512_shuffle_ps(tmp6081, tmp6083, 238);
__m512 tmp6099 = _mm512_shuffle_ps(tmp6082, tmp6084, 68);
__m512 tmp6100 = _mm512_shuffle_ps(tmp6082, tmp6084, 238);
__m512 tmp6101 = _mm512_shuffle_ps(tmp6085, tmp6087, 68);
__m512 tmp6102 = _mm512_shuffle_ps(tmp6085, tmp6087, 238);
__m512 tmp6103 = _mm512_shuffle_ps(tmp6086, tmp6088, 68);
__m512 tmp6104 = _mm512_shuffle_ps(tmp6086, tmp6088, 238);
__m512 tmp6105 = _mm512_shuffle_ps(tmp6089, tmp6091, 68);
__m512 tmp6106 = _mm512_shuffle_ps(tmp6089, tmp6091, 238);
__m512 tmp6107 = _mm512_shuffle_ps(tmp6090, tmp6092, 68);
__m512 tmp6108 = _mm512_shuffle_ps(tmp6090, tmp6092, 238);
__m512 tmp6109 = _mm512_shuffle_ps(tmp6093, tmp6095, 68);
__m512 tmp6110 = _mm512_shuffle_ps(tmp6093, tmp6095, 238);
__m512 tmp6111 = _mm512_shuffle_ps(tmp6094, tmp6096, 68);
__m512 tmp6112 = _mm512_shuffle_ps(tmp6094, tmp6096, 238);
// Step 3: shuffle_f32x4 with 136 (0x88) picks 128-bit lanes 0 and 2 of each
// operand, 221 (0xDD) picks lanes 1 and 3. This round joins the two halves of
// each group (inputs 0-3 with inputs 4-7).
__m512 tmp6113 = _mm512_shuffle_f32x4(tmp6097, tmp6101, 136);
__m512 tmp6114 = _mm512_shuffle_f32x4(tmp6097, tmp6101, 221);
__m512 tmp6115 = _mm512_shuffle_f32x4(tmp6098, tmp6102, 136);
__m512 tmp6116 = _mm512_shuffle_f32x4(tmp6098, tmp6102, 221);
__m512 tmp6117 = _mm512_shuffle_f32x4(tmp6099, tmp6103, 136);
__m512 tmp6118 = _mm512_shuffle_f32x4(tmp6099, tmp6103, 221);
__m512 tmp6119 = _mm512_shuffle_f32x4(tmp6100, tmp6104, 136);
__m512 tmp6120 = _mm512_shuffle_f32x4(tmp6100, tmp6104, 221);
__m512 tmp6121 = _mm512_shuffle_f32x4(tmp6105, tmp6109, 136);
__m512 tmp6122 = _mm512_shuffle_f32x4(tmp6105, tmp6109, 221);
__m512 tmp6123 = _mm512_shuffle_f32x4(tmp6106, tmp6110, 136);
__m512 tmp6124 = _mm512_shuffle_f32x4(tmp6106, tmp6110, 221);
__m512 tmp6125 = _mm512_shuffle_f32x4(tmp6107, tmp6111, 136);
__m512 tmp6126 = _mm512_shuffle_f32x4(tmp6107, tmp6111, 221);
__m512 tmp6127 = _mm512_shuffle_f32x4(tmp6108, tmp6112, 136);
__m512 tmp6128 = _mm512_shuffle_f32x4(tmp6108, tmp6112, 221);
// Step 4: the same lane selection once more, now pairing a group-A register
// (first operand) with the matching group-B register (second operand). The
// results overwrite the original input names.
in885 = _mm512_shuffle_f32x4(tmp6113, tmp6121, 136);
in893 = _mm512_shuffle_f32x4(tmp6113, tmp6121, 221);
tmp6067 = _mm512_shuffle_f32x4(tmp6115, tmp6123, 136);
tmp6071 = _mm512_shuffle_f32x4(tmp6115, tmp6123, 221);
tmp6068 = _mm512_shuffle_f32x4(tmp6117, tmp6125, 136);
tmp6072 = _mm512_shuffle_f32x4(tmp6117, tmp6125, 221);
in891 = _mm512_shuffle_f32x4(tmp6119, tmp6127, 136);
in899 = _mm512_shuffle_f32x4(tmp6119, tmp6127, 221);
tmp6066 = _mm512_shuffle_f32x4(tmp6114, tmp6122, 136);
tmp6070 = _mm512_shuffle_f32x4(tmp6114, tmp6122, 221);
in887 = _mm512_shuffle_f32x4(tmp6116, tmp6124, 136);
in895 = _mm512_shuffle_f32x4(tmp6116, tmp6124, 221);
in889 = _mm512_shuffle_f32x4(tmp6118, tmp6126, 136);
in897 = _mm512_shuffle_f32x4(tmp6118, tmp6126, 221);
in888 = _mm512_shuffle_f32x4(tmp6120, tmp6128, 136);
in896 = _mm512_shuffle_f32x4(tmp6120, tmp6128, 221);
// Second 8-point pass, applied lane-wise across eight registers. Two
// independent chains are interleaved statement by statement. Writing d0..d7
// for the incoming in885, tmp6067, tmp6068, in891, tmp6066, in887, in889,
// in888, the first chain finishes with (up to FMA rounding):
//   in885   =  d0 - 5.25*d2 + 5.25*d4 - d6
//   tmp6075 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   tmp6076 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   in889   =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   tmp6074 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   tmp6068 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   tmp6066 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   in891   = -d1 + 5.25*d3 - 5.25*d5 + d7
// The second chain does the same with in893, tmp6071, tmp6072, in899,
// tmp6070, in895, in897, in896 as inputs and leaves its results in in893,
// tmp6079, tmp6080, in897, tmp6078, tmp6072, tmp6070, in899.
// Inputs are overwritten and reused as scratch (for example tmp6068 and
// tmp6066 are both an input and an output), so the statement order is
// significant and must not be rearranged.
// NOTE(review): the coefficients match the usual 8-point Winograd input
// transform for 3x3 filters -- confirm against the generator.
__m512 tmp6073 = _mm512_add_ps(tmp6067, in887);
__m512 tmp6077 = _mm512_add_ps(tmp6071, in895);
__m512 tmp6074 = _mm512_sub_ps(tmp6066, tmp6068);
__m512 tmp6078 = _mm512_sub_ps(tmp6070, tmp6072);
__m512 tmp6075 = _mm512_add_ps(tmp6068, in889);
__m512 tmp6079 = _mm512_add_ps(tmp6072, in897);
in885 = _mm512_sub_ps(in885, in889);
in893 = _mm512_sub_ps(in893, in897);
tmp6073 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-4.25e+00f), tmp6073);
tmp6077 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-4.25e+00f), tmp6077);
tmp6075 = _mm512_fmadd_ps(tmp6066, _mm512_set1_ps(-4.25e+00f), tmp6075);
tmp6079 = _mm512_fmadd_ps(tmp6070, _mm512_set1_ps(-4.25e+00f), tmp6079);
in885 = _mm512_fmadd_ps(tmp6074, _mm512_set1_ps(5.25e+00f), in885);
in893 = _mm512_fmadd_ps(tmp6078, _mm512_set1_ps(5.25e+00f), in893);
tmp6074 = _mm512_fmadd_ps(tmp6068, _mm512_set1_ps(2.5e-01f), in889);
tmp6078 = _mm512_fmadd_ps(tmp6072, _mm512_set1_ps(2.5e-01f), in897);
tmp6068 = _mm512_fmadd_ps(tmp6068, _mm512_set1_ps(4e+00f), in889);
tmp6072 = _mm512_fmadd_ps(tmp6072, _mm512_set1_ps(4e+00f), in897);
// Difference and sum of the two partial sums give the second and first
// odd/even pair of outputs.
__m512 tmp6076 = _mm512_sub_ps(tmp6075, tmp6073);
__m512 tmp6080 = _mm512_sub_ps(tmp6079, tmp6077);
tmp6075 = _mm512_add_ps(tmp6073, tmp6075);
tmp6079 = _mm512_add_ps(tmp6077, tmp6079);
tmp6073 = _mm512_fmadd_ps(tmp6067, _mm512_set1_ps(2.5e-01f), in887);
tmp6077 = _mm512_fmadd_ps(tmp6071, _mm512_set1_ps(2.5e-01f), in895);
tmp6074 = _mm512_fmadd_ps(tmp6066, _mm512_set1_ps(-1.25e+00f), tmp6074);
tmp6078 = _mm512_fmadd_ps(tmp6070, _mm512_set1_ps(-1.25e+00f), tmp6078);
tmp6066 = _mm512_fmadd_ps(tmp6066, _mm512_set1_ps(-5e+00f), tmp6068);
tmp6070 = _mm512_fmadd_ps(tmp6070, _mm512_set1_ps(-5e+00f), tmp6072);
tmp6073 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-1.25e+00f), tmp6073);
tmp6077 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-1.25e+00f), tmp6077);
// fmadd/fnmadd with the same operands yield the +2x and -2x variants.
in889 = _mm512_fmadd_ps(tmp6073, _mm512_set1_ps(2e+00f), tmp6074);
in897 = _mm512_fmadd_ps(tmp6077, _mm512_set1_ps(2e+00f), tmp6078);
tmp6074 = _mm512_fnmadd_ps(tmp6073, _mm512_set1_ps(2e+00f), tmp6074);
tmp6078 = _mm512_fnmadd_ps(tmp6077, _mm512_set1_ps(2e+00f), tmp6078);
tmp6073 = _mm512_fmadd_ps(in887, _mm512_set1_ps(2.5e-01f), tmp6067);
tmp6077 = _mm512_fmadd_ps(in895, _mm512_set1_ps(2.5e-01f), tmp6071);
tmp6067 = _mm512_sub_ps(in888, tmp6067);
tmp6071 = _mm512_sub_ps(in896, tmp6071);
tmp6073 = _mm512_fmadd_ps(in891, _mm512_set1_ps(-1.25e+00f), tmp6073);
tmp6077 = _mm512_fmadd_ps(in899, _mm512_set1_ps(-1.25e+00f), tmp6077);
in891 = _mm512_sub_ps(in891, in887);
in899 = _mm512_sub_ps(in899, in895);
in891 = _mm512_fmadd_ps(in891, _mm512_set1_ps(5.25e+00f), tmp6067);
in899 = _mm512_fmadd_ps(in899, _mm512_set1_ps(5.25e+00f), tmp6071);
tmp6068 = _mm512_fmadd_ps(tmp6073, _mm512_set1_ps(2e+00f), tmp6066);
tmp6072 = _mm512_fmadd_ps(tmp6077, _mm512_set1_ps(2e+00f), tmp6070);
tmp6066 = _mm512_fnmadd_ps(tmp6073, _mm512_set1_ps(2e+00f), tmp6066);
tmp6070 = _mm512_fnmadd_ps(tmp6077, _mm512_set1_ps(2e+00f), tmp6070);
// Output phase: pair up the sixteen result registers and split each pair by
// 256-bit half. _mm512_shuffle_f32x4 with 68 (0x44) concatenates the low 256
// bits of the first operand with the low 256 bits of the second; 238 (0xEE)
// does the same with the high 256 bits. out835..out842 therefore carry low
// halves and out843..out850 the corresponding high halves.
__m512 out835 = _mm512_shuffle_f32x4(in885, tmp6075, 68);
__m512 out843 = _mm512_shuffle_f32x4(in885, tmp6075, 238);
__m512 out836 = _mm512_shuffle_f32x4(tmp6076, in889, 68);
__m512 out844 = _mm512_shuffle_f32x4(tmp6076, in889, 238);
__m512 out837 = _mm512_shuffle_f32x4(tmp6074, tmp6068, 68);
__m512 out845 = _mm512_shuffle_f32x4(tmp6074, tmp6068, 238);
__m512 out838 = _mm512_shuffle_f32x4(tmp6066, in891, 68);
__m512 out846 = _mm512_shuffle_f32x4(tmp6066, in891, 238);
__m512 out839 = _mm512_shuffle_f32x4(in893, tmp6079, 68);
__m512 out847 = _mm512_shuffle_f32x4(in893, tmp6079, 238);
__m512 out840 = _mm512_shuffle_f32x4(tmp6080, in897, 68);
__m512 out848 = _mm512_shuffle_f32x4(tmp6080, in897, 238);
__m512 out841 = _mm512_shuffle_f32x4(tmp6078, tmp6072, 68);
__m512 out849 = _mm512_shuffle_f32x4(tmp6078, tmp6072, 238);
__m512 out842 = _mm512_shuffle_f32x4(tmp6070, in899, 68);
__m512 out850 = _mm512_shuffle_f32x4(tmp6070, in899, 238);
// Sixteen unaligned stores into dfPtr6, arranged as four groups whose base
// constants are 25600 apart (512, 26112, 51712, 77312). Within a group the
// four vectors go to +0, +64, +128, +192; note the statements are issued in
// the order +0, +128, +64, +192.
// NOTE(review): a 64 spacing equals one 16-float vector only if dfPtr6 is a
// byte pointer -- confirm against its declaration, which is outside this
// excerpt.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k81, out835);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k81, out843);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k81, out839);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k81, out847);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k81, out836);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k81, out844);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k81, out840);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k81, out848);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k81, out837);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k81, out845);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k81, out841);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k81, out849);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k81, out838);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k81, out846);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k81, out842);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k81, out850);
}
++j21;
rel14 = 1;
}
ptrdiff_t h32 = base14+0;
ptrdiff_t w39 = 48;
ptrdiff_t k82 = 0;
for (; k82 != 2; ++k82) {
__m512 dat1425 = _mm512_maskz_loadu_ps(511, datPtr12+0+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1426 = _mm512_maskz_loadu_ps(8191, datPtr12+1156+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512i pm124 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in901 = _mm512_permutexvar_ps(pm124, dat1425);
__m512i pm125 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in909 = _mm512_permutexvar_ps(pm125, dat1426);
__m512 dat1427 = _mm512_maskz_loadu_ps(511, datPtr12+224+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1428 = _mm512_maskz_loadu_ps(8191, datPtr12+1380+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in902 = _mm512_permutexvar_ps(pm124, dat1427);
__m512 in910 = _mm512_permutexvar_ps(pm125, dat1428);
__m512 dat1429 = _mm512_maskz_loadu_ps(511, datPtr12+448+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1430 = _mm512_maskz_loadu_ps(8191, datPtr12+1604+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in903 = _mm512_permutexvar_ps(pm124, dat1429);
__m512 in911 = _mm512_permutexvar_ps(pm125, dat1430);
__m512 dat1431 = _mm512_maskz_loadu_ps(511, datPtr12+672+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1432 = _mm512_maskz_loadu_ps(8191, datPtr12+1828+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in904 = _mm512_permutexvar_ps(pm124, dat1431);
__m512 in912 = _mm512_permutexvar_ps(pm125, dat1432);
__m512 dat1433 = _mm512_maskz_loadu_ps(511, datPtr12+896+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1434 = _mm512_maskz_loadu_ps(8191, datPtr12+2052+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in905 = _mm512_permutexvar_ps(pm124, dat1433);
__m512 in913 = _mm512_permutexvar_ps(pm125, dat1434);
__m512 dat1435 = _mm512_maskz_loadu_ps(511, datPtr12+1120+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1436 = _mm512_maskz_loadu_ps(8191, datPtr12+2276+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in906 = _mm512_permutexvar_ps(pm124, dat1435);
__m512 in914 = _mm512_permutexvar_ps(pm125, dat1436);
__m512 dat1437 = _mm512_maskz_loadu_ps(511, datPtr12+1344+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1438 = _mm512_maskz_loadu_ps(8191, datPtr12+2500+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in907 = _mm512_permutexvar_ps(pm124, dat1437);
__m512 in915 = _mm512_permutexvar_ps(pm125, dat1438);
__m512 dat1439 = _mm512_maskz_loadu_ps(511, datPtr12+1568+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1440 = _mm512_maskz_loadu_ps(8191, datPtr12+2724+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in908 = _mm512_permutexvar_ps(pm124, dat1439);
__m512 in916 = _mm512_permutexvar_ps(pm125, dat1440);
__m512 tmp6129 = _mm512_add_ps(in902, in906);
__m512 tmp6133 = _mm512_add_ps(in910, in914);
__m512 tmp6130 = _mm512_sub_ps(in905, in903);
__m512 tmp6134 = _mm512_sub_ps(in913, in911);
__m512 tmp6131 = _mm512_add_ps(in903, in907);
__m512 tmp6135 = _mm512_add_ps(in911, in915);
in901 = _mm512_sub_ps(in901, in907);
in909 = _mm512_sub_ps(in909, in915);
tmp6129 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-4.25e+00f), tmp6129);
tmp6133 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-4.25e+00f), tmp6133);
tmp6131 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-4.25e+00f), tmp6131);
tmp6135 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-4.25e+00f), tmp6135);
in901 = _mm512_fmadd_ps(tmp6130, _mm512_set1_ps(5.25e+00f), in901);
in909 = _mm512_fmadd_ps(tmp6134, _mm512_set1_ps(5.25e+00f), in909);
tmp6130 = _mm512_fmadd_ps(in903, _mm512_set1_ps(2.5e-01f), in907);
tmp6134 = _mm512_fmadd_ps(in911, _mm512_set1_ps(2.5e-01f), in915);
in903 = _mm512_fmadd_ps(in903, _mm512_set1_ps(4e+00f), in907);
in911 = _mm512_fmadd_ps(in911, _mm512_set1_ps(4e+00f), in915);
__m512 tmp6132 = _mm512_sub_ps(tmp6131, tmp6129);
__m512 tmp6136 = _mm512_sub_ps(tmp6135, tmp6133);
tmp6131 = _mm512_add_ps(tmp6129, tmp6131);
tmp6135 = _mm512_add_ps(tmp6133, tmp6135);
tmp6129 = _mm512_fmadd_ps(in902, _mm512_set1_ps(2.5e-01f), in906);
tmp6133 = _mm512_fmadd_ps(in910, _mm512_set1_ps(2.5e-01f), in914);
tmp6130 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-1.25e+00f), tmp6130);
tmp6134 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-1.25e+00f), tmp6134);
in905 = _mm512_fmadd_ps(in905, _mm512_set1_ps(-5e+00f), in903);
in913 = _mm512_fmadd_ps(in913, _mm512_set1_ps(-5e+00f), in911);
tmp6129 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-1.25e+00f), tmp6129);
tmp6133 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-1.25e+00f), tmp6133);
in907 = _mm512_fmadd_ps(tmp6129, _mm512_set1_ps(2e+00f), tmp6130);
in915 = _mm512_fmadd_ps(tmp6133, _mm512_set1_ps(2e+00f), tmp6134);
tmp6130 = _mm512_fnmadd_ps(tmp6129, _mm512_set1_ps(2e+00f), tmp6130);
tmp6134 = _mm512_fnmadd_ps(tmp6133, _mm512_set1_ps(2e+00f), tmp6134);
tmp6129 = _mm512_fmadd_ps(in906, _mm512_set1_ps(2.5e-01f), in902);
tmp6133 = _mm512_fmadd_ps(in914, _mm512_set1_ps(2.5e-01f), in910);
in902 = _mm512_sub_ps(in908, in902);
in910 = _mm512_sub_ps(in916, in910);
tmp6129 = _mm512_fmadd_ps(in904, _mm512_set1_ps(-1.25e+00f), tmp6129);
tmp6133 = _mm512_fmadd_ps(in912, _mm512_set1_ps(-1.25e+00f), tmp6133);
in904 = _mm512_sub_ps(in904, in906);
in912 = _mm512_sub_ps(in912, in914);
in904 = _mm512_fmadd_ps(in904, _mm512_set1_ps(5.25e+00f), in902);
in912 = _mm512_fmadd_ps(in912, _mm512_set1_ps(5.25e+00f), in910);
in903 = _mm512_fmadd_ps(tmp6129, _mm512_set1_ps(2e+00f), in905);
in911 = _mm512_fmadd_ps(tmp6133, _mm512_set1_ps(2e+00f), in913);
in905 = _mm512_fnmadd_ps(tmp6129, _mm512_set1_ps(2e+00f), in905);
in913 = _mm512_fnmadd_ps(tmp6133, _mm512_set1_ps(2e+00f), in913);
__m512 tmp6145 = _mm512_unpacklo_ps(in901, tmp6131);
__m512 tmp6146 = _mm512_unpackhi_ps(in901, tmp6131);
__m512 tmp6147 = _mm512_unpacklo_ps(tmp6132, in907);
__m512 tmp6148 = _mm512_unpackhi_ps(tmp6132, in907);
__m512 tmp6149 = _mm512_unpacklo_ps(tmp6130, in903);
__m512 tmp6150 = _mm512_unpackhi_ps(tmp6130, in903);
__m512 tmp6151 = _mm512_unpacklo_ps(in905, in904);
__m512 tmp6152 = _mm512_unpackhi_ps(in905, in904);
__m512 tmp6153 = _mm512_unpacklo_ps(in909, tmp6135);
__m512 tmp6154 = _mm512_unpackhi_ps(in909, tmp6135);
__m512 tmp6155 = _mm512_unpacklo_ps(tmp6136, in915);
__m512 tmp6156 = _mm512_unpackhi_ps(tmp6136, in915);
__m512 tmp6157 = _mm512_unpacklo_ps(tmp6134, in911);
__m512 tmp6158 = _mm512_unpackhi_ps(tmp6134, in911);
__m512 tmp6159 = _mm512_unpacklo_ps(in913, in912);
__m512 tmp6160 = _mm512_unpackhi_ps(in913, in912);
__m512 tmp6161 = _mm512_shuffle_ps(tmp6145, tmp6147, 68);
__m512 tmp6162 = _mm512_shuffle_ps(tmp6145, tmp6147, 238);
__m512 tmp6163 = _mm512_shuffle_ps(tmp6146, tmp6148, 68);
__m512 tmp6164 = _mm512_shuffle_ps(tmp6146, tmp6148, 238);
__m512 tmp6165 = _mm512_shuffle_ps(tmp6149, tmp6151, 68);
__m512 tmp6166 = _mm512_shuffle_ps(tmp6149, tmp6151, 238);
__m512 tmp6167 = _mm512_shuffle_ps(tmp6150, tmp6152, 68);
__m512 tmp6168 = _mm512_shuffle_ps(tmp6150, tmp6152, 238);
__m512 tmp6169 = _mm512_shuffle_ps(tmp6153, tmp6155, 68);
__m512 tmp6170 = _mm512_shuffle_ps(tmp6153, tmp6155, 238);
__m512 tmp6171 = _mm512_shuffle_ps(tmp6154, tmp6156, 68);
__m512 tmp6172 = _mm512_shuffle_ps(tmp6154, tmp6156, 238);
__m512 tmp6173 = _mm512_shuffle_ps(tmp6157, tmp6159, 68);
__m512 tmp6174 = _mm512_shuffle_ps(tmp6157, tmp6159, 238);
__m512 tmp6175 = _mm512_shuffle_ps(tmp6158, tmp6160, 68);
__m512 tmp6176 = _mm512_shuffle_ps(tmp6158, tmp6160, 238);
__m512 tmp6177 = _mm512_shuffle_f32x4(tmp6161, tmp6165, 136);
__m512 tmp6178 = _mm512_shuffle_f32x4(tmp6161, tmp6165, 221);
__m512 tmp6179 = _mm512_shuffle_f32x4(tmp6162, tmp6166, 136);
__m512 tmp6180 = _mm512_shuffle_f32x4(tmp6162, tmp6166, 221);
__m512 tmp6181 = _mm512_shuffle_f32x4(tmp6163, tmp6167, 136);
__m512 tmp6182 = _mm512_shuffle_f32x4(tmp6163, tmp6167, 221);
__m512 tmp6183 = _mm512_shuffle_f32x4(tmp6164, tmp6168, 136);
__m512 tmp6184 = _mm512_shuffle_f32x4(tmp6164, tmp6168, 221);
__m512 tmp6185 = _mm512_shuffle_f32x4(tmp6169, tmp6173, 136);
__m512 tmp6186 = _mm512_shuffle_f32x4(tmp6169, tmp6173, 221);
__m512 tmp6187 = _mm512_shuffle_f32x4(tmp6170, tmp6174, 136);
__m512 tmp6188 = _mm512_shuffle_f32x4(tmp6170, tmp6174, 221);
__m512 tmp6189 = _mm512_shuffle_f32x4(tmp6171, tmp6175, 136);
__m512 tmp6190 = _mm512_shuffle_f32x4(tmp6171, tmp6175, 221);
__m512 tmp6191 = _mm512_shuffle_f32x4(tmp6172, tmp6176, 136);
__m512 tmp6192 = _mm512_shuffle_f32x4(tmp6172, tmp6176, 221);
in901 = _mm512_shuffle_f32x4(tmp6177, tmp6185, 136);
in909 = _mm512_shuffle_f32x4(tmp6177, tmp6185, 221);
tmp6131 = _mm512_shuffle_f32x4(tmp6179, tmp6187, 136);
tmp6135 = _mm512_shuffle_f32x4(tmp6179, tmp6187, 221);
tmp6132 = _mm512_shuffle_f32x4(tmp6181, tmp6189, 136);
tmp6136 = _mm512_shuffle_f32x4(tmp6181, tmp6189, 221);
in907 = _mm512_shuffle_f32x4(tmp6183, tmp6191, 136);
in915 = _mm512_shuffle_f32x4(tmp6183, tmp6191, 221);
tmp6130 = _mm512_shuffle_f32x4(tmp6178, tmp6186, 136);
tmp6134 = _mm512_shuffle_f32x4(tmp6178, tmp6186, 221);
in903 = _mm512_shuffle_f32x4(tmp6180, tmp6188, 136);
in911 = _mm512_shuffle_f32x4(tmp6180, tmp6188, 221);
in905 = _mm512_shuffle_f32x4(tmp6182, tmp6190, 136);
in913 = _mm512_shuffle_f32x4(tmp6182, tmp6190, 221);
in904 = _mm512_shuffle_f32x4(tmp6184, tmp6192, 136);
in912 = _mm512_shuffle_f32x4(tmp6184, tmp6192, 221);
__m512 tmp6137 = _mm512_add_ps(tmp6131, in903);
__m512 tmp6141 = _mm512_add_ps(tmp6135, in911);
__m512 tmp6138 = _mm512_sub_ps(tmp6130, tmp6132);
__m512 tmp6142 = _mm512_sub_ps(tmp6134, tmp6136);
__m512 tmp6139 = _mm512_add_ps(tmp6132, in905);
__m512 tmp6143 = _mm512_add_ps(tmp6136, in913);
in901 = _mm512_sub_ps(in901, in905);
in909 = _mm512_sub_ps(in909, in913);
tmp6137 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-4.25e+00f), tmp6137);
tmp6141 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-4.25e+00f), tmp6141);
tmp6139 = _mm512_fmadd_ps(tmp6130, _mm512_set1_ps(-4.25e+00f), tmp6139);
tmp6143 = _mm512_fmadd_ps(tmp6134, _mm512_set1_ps(-4.25e+00f), tmp6143);
in901 = _mm512_fmadd_ps(tmp6138, _mm512_set1_ps(5.25e+00f), in901);
in909 = _mm512_fmadd_ps(tmp6142, _mm512_set1_ps(5.25e+00f), in909);
tmp6138 = _mm512_fmadd_ps(tmp6132, _mm512_set1_ps(2.5e-01f), in905);
tmp6142 = _mm512_fmadd_ps(tmp6136, _mm512_set1_ps(2.5e-01f), in913);
tmp6132 = _mm512_fmadd_ps(tmp6132, _mm512_set1_ps(4e+00f), in905);
tmp6136 = _mm512_fmadd_ps(tmp6136, _mm512_set1_ps(4e+00f), in913);
__m512 tmp6140 = _mm512_sub_ps(tmp6139, tmp6137);
__m512 tmp6144 = _mm512_sub_ps(tmp6143, tmp6141);
tmp6139 = _mm512_add_ps(tmp6137, tmp6139);
tmp6143 = _mm512_add_ps(tmp6141, tmp6143);
tmp6137 = _mm512_fmadd_ps(tmp6131, _mm512_set1_ps(2.5e-01f), in903);
tmp6141 = _mm512_fmadd_ps(tmp6135, _mm512_set1_ps(2.5e-01f), in911);
tmp6138 = _mm512_fmadd_ps(tmp6130, _mm512_set1_ps(-1.25e+00f), tmp6138);
tmp6142 = _mm512_fmadd_ps(tmp6134, _mm512_set1_ps(-1.25e+00f), tmp6142);
tmp6130 = _mm512_fmadd_ps(tmp6130, _mm512_set1_ps(-5e+00f), tmp6132);
tmp6134 = _mm512_fmadd_ps(tmp6134, _mm512_set1_ps(-5e+00f), tmp6136);
tmp6137 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-1.25e+00f), tmp6137);
tmp6141 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-1.25e+00f), tmp6141);
in905 = _mm512_fmadd_ps(tmp6137, _mm512_set1_ps(2e+00f), tmp6138);
in913 = _mm512_fmadd_ps(tmp6141, _mm512_set1_ps(2e+00f), tmp6142);
tmp6138 = _mm512_fnmadd_ps(tmp6137, _mm512_set1_ps(2e+00f), tmp6138);
tmp6142 = _mm512_fnmadd_ps(tmp6141, _mm512_set1_ps(2e+00f), tmp6142);
tmp6137 = _mm512_fmadd_ps(in903, _mm512_set1_ps(2.5e-01f), tmp6131);
tmp6141 = _mm512_fmadd_ps(in911, _mm512_set1_ps(2.5e-01f), tmp6135);
tmp6131 = _mm512_sub_ps(in904, tmp6131);
tmp6135 = _mm512_sub_ps(in912, tmp6135);
tmp6137 = _mm512_fmadd_ps(in907, _mm512_set1_ps(-1.25e+00f), tmp6137);
tmp6141 = _mm512_fmadd_ps(in915, _mm512_set1_ps(-1.25e+00f), tmp6141);
in907 = _mm512_sub_ps(in907, in903);
in915 = _mm512_sub_ps(in915, in911);
in907 = _mm512_fmadd_ps(in907, _mm512_set1_ps(5.25e+00f), tmp6131);
in915 = _mm512_fmadd_ps(in915, _mm512_set1_ps(5.25e+00f), tmp6135);
tmp6132 = _mm512_fmadd_ps(tmp6137, _mm512_set1_ps(2e+00f), tmp6130);
tmp6136 = _mm512_fmadd_ps(tmp6141, _mm512_set1_ps(2e+00f), tmp6134);
tmp6130 = _mm512_fnmadd_ps(tmp6137, _mm512_set1_ps(2e+00f), tmp6130);
tmp6134 = _mm512_fnmadd_ps(tmp6141, _mm512_set1_ps(2e+00f), tmp6134);
// Pack and store the sixteen result vectors of the preceding arithmetic.
// _mm512_shuffle_f32x4(a, b, 68) yields [a.lane0, a.lane1, b.lane0, b.lane1]
// (the low 256 bits of a followed by the low 256 bits of b); immediate 238
// yields [a.lane2, a.lane3, b.lane2, b.lane3] (the two high halves).
// Results are paired as (in901,tmp6139), (tmp6140,in905), (tmp6138,tmp6132),
// (tmp6130,in907) for the first register set and (in909,tmp6143),
// (tmp6144,in913), (tmp6142,tmp6136), (tmp6134,in915) for the second.
__m512 out851 = _mm512_shuffle_f32x4(in901, tmp6139, 68);
__m512 out859 = _mm512_shuffle_f32x4(in901, tmp6139, 238);
__m512 out852 = _mm512_shuffle_f32x4(tmp6140, in905, 68);
__m512 out860 = _mm512_shuffle_f32x4(tmp6140, in905, 238);
__m512 out853 = _mm512_shuffle_f32x4(tmp6138, tmp6132, 68);
__m512 out861 = _mm512_shuffle_f32x4(tmp6138, tmp6132, 238);
__m512 out854 = _mm512_shuffle_f32x4(tmp6130, in907, 68);
__m512 out862 = _mm512_shuffle_f32x4(tmp6130, in907, 238);
__m512 out855 = _mm512_shuffle_f32x4(in909, tmp6143, 68);
__m512 out863 = _mm512_shuffle_f32x4(in909, tmp6143, 238);
__m512 out856 = _mm512_shuffle_f32x4(tmp6144, in913, 68);
__m512 out864 = _mm512_shuffle_f32x4(tmp6144, in913, 238);
__m512 out857 = _mm512_shuffle_f32x4(tmp6142, tmp6136, 68);
__m512 out865 = _mm512_shuffle_f32x4(tmp6142, tmp6136, 238);
__m512 out858 = _mm512_shuffle_f32x4(tmp6134, in915, 68);
__m512 out866 = _mm512_shuffle_f32x4(tmp6134, in915, 238);
// Each result pair gets its own group of four stores; the groups start at
// offsets 0, 25600, 51200 and 76800. Inside a group the order is: +0 low
// halves of set 1, +64 low halves of set 2, +128 high halves of set 1,
// +192 high halves of set 2. The i27/j21/s19/k82 terms come from the
// enclosing loops, which are outside this span.
// NOTE(review): dfPtr6's element type is not visible here; the 64-step
// spacing suggests byte offsets (one 512-bit vector each) -- confirm.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k82, out851);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k82, out859);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k82, out855);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k82, out863);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k82, out852);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k82, out860);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k82, out856);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k82, out864);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k82, out853);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k82, out861);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k82, out857);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k82, out865);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k82, out854);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k82, out862);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k82, out858);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k82, out866);
// Load stage: eight pairs of masked unaligned loads, each followed by a lane
// permutation. Successive pairs read at offsets 224 apart (1200, 1424, ...
// for the first load of a pair; 12608, 12832, ... for the second). The
// i27/h32/w39/s19/k82 address terms are set outside this span.
// NOTE(review): datPtr12's element type is not visible here, so the unit of
// these offsets (bytes vs. floats) cannot be confirmed from this span.
// Mask 16383 keeps lanes 0..13 and mask 511 keeps lanes 0..8; all other
// lanes are zero-filled by the maskz load.
__m512 dat1441 = _mm512_maskz_loadu_ps(16383, datPtr12+1200+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1442 = _mm512_maskz_loadu_ps(511, datPtr12+12608+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
// pm126 (arguments are listed from lane 15 down to lane 0): result lanes
// 0..7 take source lanes 0..7 and result lanes 8..15 take source lanes
// 6..13, i.e. two 8-lane windows that overlap by two lanes.
__m512i pm126 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in917 = _mm512_permutexvar_ps(pm126, dat1441);
// pm127: result lanes 0..7 take source lanes 0..7, lanes 8..10 take source
// lanes 6..8, and lanes 11..15 take source lane 15, which mask 511 left at
// zero, so the tail of the second window is zero-filled.
__m512i pm127 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in925 = _mm512_permutexvar_ps(pm127, dat1442);
// The remaining seven rows repeat the same load + permute pattern.
__m512 dat1443 = _mm512_maskz_loadu_ps(16383, datPtr12+1424+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1444 = _mm512_maskz_loadu_ps(511, datPtr12+12832+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in918 = _mm512_permutexvar_ps(pm126, dat1443);
__m512 in926 = _mm512_permutexvar_ps(pm127, dat1444);
__m512 dat1445 = _mm512_maskz_loadu_ps(16383, datPtr12+1648+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1446 = _mm512_maskz_loadu_ps(511, datPtr12+13056+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in919 = _mm512_permutexvar_ps(pm126, dat1445);
__m512 in927 = _mm512_permutexvar_ps(pm127, dat1446);
__m512 dat1447 = _mm512_maskz_loadu_ps(16383, datPtr12+1872+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1448 = _mm512_maskz_loadu_ps(511, datPtr12+13280+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in920 = _mm512_permutexvar_ps(pm126, dat1447);
__m512 in928 = _mm512_permutexvar_ps(pm127, dat1448);
__m512 dat1449 = _mm512_maskz_loadu_ps(16383, datPtr12+2096+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1450 = _mm512_maskz_loadu_ps(511, datPtr12+13504+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in921 = _mm512_permutexvar_ps(pm126, dat1449);
__m512 in929 = _mm512_permutexvar_ps(pm127, dat1450);
__m512 dat1451 = _mm512_maskz_loadu_ps(16383, datPtr12+2320+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1452 = _mm512_maskz_loadu_ps(511, datPtr12+13728+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in922 = _mm512_permutexvar_ps(pm126, dat1451);
__m512 in930 = _mm512_permutexvar_ps(pm127, dat1452);
__m512 dat1453 = _mm512_maskz_loadu_ps(16383, datPtr12+2544+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1454 = _mm512_maskz_loadu_ps(511, datPtr12+13952+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in923 = _mm512_permutexvar_ps(pm126, dat1453);
__m512 in931 = _mm512_permutexvar_ps(pm127, dat1454);
__m512 dat1455 = _mm512_maskz_loadu_ps(16383, datPtr12+2768+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1456 = _mm512_maskz_loadu_ps(511, datPtr12+14176+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in924 = _mm512_permutexvar_ps(pm126, dat1455);
__m512 in932 = _mm512_permutexvar_ps(pm127, dat1456);
// First arithmetic pass over eight row vectors, carried out in lock-step for
// two register sets (in917..in924 and in925..in932; statements alternate
// between the sets). Writing d0..d7 for in917..in924, the first set ends as
// (up to fused multiply-add rounding):
//   in917   = d0 - d6 + 5.25*(d4 - d2)
//   tmp6195 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp6196 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in923   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp6194 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in919   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in921   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in920   = d7 - d1 + 5.25*(d3 - d5)
// The second set (in925..in932, tmp6197..tmp6200) follows the same formulas.
// NOTE(review): these coefficients match the Winograd F(6x6, 3x3) input
// transform matrix; presumably that is what this computes -- confirm
// against the generator.
__m512 tmp6193 = _mm512_add_ps(in918, in922);
__m512 tmp6197 = _mm512_add_ps(in926, in930);
__m512 tmp6194 = _mm512_sub_ps(in921, in919);
__m512 tmp6198 = _mm512_sub_ps(in929, in927);
__m512 tmp6195 = _mm512_add_ps(in919, in923);
__m512 tmp6199 = _mm512_add_ps(in927, in931);
in917 = _mm512_sub_ps(in917, in923);
in925 = _mm512_sub_ps(in925, in931);
tmp6193 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-4.25e+00f), tmp6193);
tmp6197 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-4.25e+00f), tmp6197);
tmp6195 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-4.25e+00f), tmp6195);
tmp6199 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-4.25e+00f), tmp6199);
in917 = _mm512_fmadd_ps(tmp6194, _mm512_set1_ps(5.25e+00f), in917);
in925 = _mm512_fmadd_ps(tmp6198, _mm512_set1_ps(5.25e+00f), in925);
// tmp6194/tmp6198 are reused from here on as new accumulators.
tmp6194 = _mm512_fmadd_ps(in919, _mm512_set1_ps(2.5e-01f), in923);
tmp6198 = _mm512_fmadd_ps(in927, _mm512_set1_ps(2.5e-01f), in931);
in919 = _mm512_fmadd_ps(in919, _mm512_set1_ps(4e+00f), in923);
in927 = _mm512_fmadd_ps(in927, _mm512_set1_ps(4e+00f), in931);
// Difference must be taken before tmp6195/tmp6199 are overwritten by the sum.
__m512 tmp6196 = _mm512_sub_ps(tmp6195, tmp6193);
__m512 tmp6200 = _mm512_sub_ps(tmp6199, tmp6197);
tmp6195 = _mm512_add_ps(tmp6193, tmp6195);
tmp6199 = _mm512_add_ps(tmp6197, tmp6199);
tmp6193 = _mm512_fmadd_ps(in918, _mm512_set1_ps(2.5e-01f), in922);
tmp6197 = _mm512_fmadd_ps(in926, _mm512_set1_ps(2.5e-01f), in930);
tmp6194 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-1.25e+00f), tmp6194);
tmp6198 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-1.25e+00f), tmp6198);
in921 = _mm512_fmadd_ps(in921, _mm512_set1_ps(-5e+00f), in919);
in929 = _mm512_fmadd_ps(in929, _mm512_set1_ps(-5e+00f), in927);
tmp6193 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-1.25e+00f), tmp6193);
tmp6197 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-1.25e+00f), tmp6197);
// fmadd gives the "+2*x" result and fnmadd the matching "-2*x" result.
in923 = _mm512_fmadd_ps(tmp6193, _mm512_set1_ps(2e+00f), tmp6194);
in931 = _mm512_fmadd_ps(tmp6197, _mm512_set1_ps(2e+00f), tmp6198);
tmp6194 = _mm512_fnmadd_ps(tmp6193, _mm512_set1_ps(2e+00f), tmp6194);
tmp6198 = _mm512_fnmadd_ps(tmp6197, _mm512_set1_ps(2e+00f), tmp6198);
tmp6193 = _mm512_fmadd_ps(in922, _mm512_set1_ps(2.5e-01f), in918);
tmp6197 = _mm512_fmadd_ps(in930, _mm512_set1_ps(2.5e-01f), in926);
// in918/in926 are no longer needed as inputs and become scratch for d7 - d1.
in918 = _mm512_sub_ps(in924, in918);
in926 = _mm512_sub_ps(in932, in926);
tmp6193 = _mm512_fmadd_ps(in920, _mm512_set1_ps(-1.25e+00f), tmp6193);
tmp6197 = _mm512_fmadd_ps(in928, _mm512_set1_ps(-1.25e+00f), tmp6197);
in920 = _mm512_sub_ps(in920, in922);
in928 = _mm512_sub_ps(in928, in930);
in920 = _mm512_fmadd_ps(in920, _mm512_set1_ps(5.25e+00f), in918);
in928 = _mm512_fmadd_ps(in928, _mm512_set1_ps(5.25e+00f), in926);
in919 = _mm512_fmadd_ps(tmp6193, _mm512_set1_ps(2e+00f), in921);
in927 = _mm512_fmadd_ps(tmp6197, _mm512_set1_ps(2e+00f), in929);
in921 = _mm512_fnmadd_ps(tmp6193, _mm512_set1_ps(2e+00f), in921);
in929 = _mm512_fnmadd_ps(tmp6197, _mm512_set1_ps(2e+00f), in929);
// 8x8 transposes. Each 512-bit vector holds two 8-lane halves, so the two
// register sets together form four 8x8 tiles. Row order going in:
//   set 1: in917, tmp6195, tmp6196, in923, tmp6194, in919, in921, in920
//   set 2: in925, tmp6199, tmp6200, in931, tmp6198, in927, in929, in928
// Step 1: unpacklo/unpackhi interleave adjacent row pairs inside every
// 128-bit lane ([a0,b0,a1,b1] and [a2,b2,a3,b3]).
__m512 tmp6209 = _mm512_unpacklo_ps(in917, tmp6195);
__m512 tmp6210 = _mm512_unpackhi_ps(in917, tmp6195);
__m512 tmp6211 = _mm512_unpacklo_ps(tmp6196, in923);
__m512 tmp6212 = _mm512_unpackhi_ps(tmp6196, in923);
__m512 tmp6213 = _mm512_unpacklo_ps(tmp6194, in919);
__m512 tmp6214 = _mm512_unpackhi_ps(tmp6194, in919);
__m512 tmp6215 = _mm512_unpacklo_ps(in921, in920);
__m512 tmp6216 = _mm512_unpackhi_ps(in921, in920);
__m512 tmp6217 = _mm512_unpacklo_ps(in925, tmp6199);
__m512 tmp6218 = _mm512_unpackhi_ps(in925, tmp6199);
__m512 tmp6219 = _mm512_unpacklo_ps(tmp6200, in931);
__m512 tmp6220 = _mm512_unpackhi_ps(tmp6200, in931);
__m512 tmp6221 = _mm512_unpacklo_ps(tmp6198, in927);
__m512 tmp6222 = _mm512_unpackhi_ps(tmp6198, in927);
__m512 tmp6223 = _mm512_unpacklo_ps(in929, in928);
__m512 tmp6224 = _mm512_unpackhi_ps(in929, in928);
// Step 2: shuffle_ps with 68 takes elements {0,1} of each operand and 238
// takes elements {2,3}, completing a 4x4 transpose per 128-bit lane: lane L
// of each result holds one column (4L+0..4L+3) of four consecutive rows.
__m512 tmp6225 = _mm512_shuffle_ps(tmp6209, tmp6211, 68);
__m512 tmp6226 = _mm512_shuffle_ps(tmp6209, tmp6211, 238);
__m512 tmp6227 = _mm512_shuffle_ps(tmp6210, tmp6212, 68);
__m512 tmp6228 = _mm512_shuffle_ps(tmp6210, tmp6212, 238);
__m512 tmp6229 = _mm512_shuffle_ps(tmp6213, tmp6215, 68);
__m512 tmp6230 = _mm512_shuffle_ps(tmp6213, tmp6215, 238);
__m512 tmp6231 = _mm512_shuffle_ps(tmp6214, tmp6216, 68);
__m512 tmp6232 = _mm512_shuffle_ps(tmp6214, tmp6216, 238);
__m512 tmp6233 = _mm512_shuffle_ps(tmp6217, tmp6219, 68);
__m512 tmp6234 = _mm512_shuffle_ps(tmp6217, tmp6219, 238);
__m512 tmp6235 = _mm512_shuffle_ps(tmp6218, tmp6220, 68);
__m512 tmp6236 = _mm512_shuffle_ps(tmp6218, tmp6220, 238);
__m512 tmp6237 = _mm512_shuffle_ps(tmp6221, tmp6223, 68);
__m512 tmp6238 = _mm512_shuffle_ps(tmp6221, tmp6223, 238);
__m512 tmp6239 = _mm512_shuffle_ps(tmp6222, tmp6224, 68);
__m512 tmp6240 = _mm512_shuffle_ps(tmp6222, tmp6224, 238);
// Step 3: shuffle_f32x4 with 136 keeps 128-bit lanes {0,2} of each operand
// and 221 keeps lanes {1,3}. This joins rows 0..3 with rows 4..7 of the
// same set, separating columns 0..3 (136) from columns 4..7 (221) of both
// 8-lane halves.
__m512 tmp6241 = _mm512_shuffle_f32x4(tmp6225, tmp6229, 136);
__m512 tmp6242 = _mm512_shuffle_f32x4(tmp6225, tmp6229, 221);
__m512 tmp6243 = _mm512_shuffle_f32x4(tmp6226, tmp6230, 136);
__m512 tmp6244 = _mm512_shuffle_f32x4(tmp6226, tmp6230, 221);
__m512 tmp6245 = _mm512_shuffle_f32x4(tmp6227, tmp6231, 136);
__m512 tmp6246 = _mm512_shuffle_f32x4(tmp6227, tmp6231, 221);
__m512 tmp6247 = _mm512_shuffle_f32x4(tmp6228, tmp6232, 136);
__m512 tmp6248 = _mm512_shuffle_f32x4(tmp6228, tmp6232, 221);
__m512 tmp6249 = _mm512_shuffle_f32x4(tmp6233, tmp6237, 136);
__m512 tmp6250 = _mm512_shuffle_f32x4(tmp6233, tmp6237, 221);
__m512 tmp6251 = _mm512_shuffle_f32x4(tmp6234, tmp6238, 136);
__m512 tmp6252 = _mm512_shuffle_f32x4(tmp6234, tmp6238, 221);
__m512 tmp6253 = _mm512_shuffle_f32x4(tmp6235, tmp6239, 136);
__m512 tmp6254 = _mm512_shuffle_f32x4(tmp6235, tmp6239, 221);
__m512 tmp6255 = _mm512_shuffle_f32x4(tmp6236, tmp6240, 136);
__m512 tmp6256 = _mm512_shuffle_f32x4(tmp6236, tmp6240, 221);
// Step 4: a second 136/221 shuffle merges set 1 with set 2. Each vector
// written with 136 holds one column of the low halves as
// [set-1 rows 0..7 | set-2 rows 0..7]; the 221 partner holds the same column
// of the high halves. Column order 0..7 for the low halves is
// in917, tmp6195, tmp6196, in923, tmp6194, in919, in921, in920, and for the
// high halves in925, tmp6199, tmp6200, in931, tmp6198, in927, in929, in928.
in917 = _mm512_shuffle_f32x4(tmp6241, tmp6249, 136);
in925 = _mm512_shuffle_f32x4(tmp6241, tmp6249, 221);
tmp6195 = _mm512_shuffle_f32x4(tmp6243, tmp6251, 136);
tmp6199 = _mm512_shuffle_f32x4(tmp6243, tmp6251, 221);
tmp6196 = _mm512_shuffle_f32x4(tmp6245, tmp6253, 136);
tmp6200 = _mm512_shuffle_f32x4(tmp6245, tmp6253, 221);
in923 = _mm512_shuffle_f32x4(tmp6247, tmp6255, 136);
in931 = _mm512_shuffle_f32x4(tmp6247, tmp6255, 221);
tmp6194 = _mm512_shuffle_f32x4(tmp6242, tmp6250, 136);
tmp6198 = _mm512_shuffle_f32x4(tmp6242, tmp6250, 221);
in919 = _mm512_shuffle_f32x4(tmp6244, tmp6252, 136);
in927 = _mm512_shuffle_f32x4(tmp6244, tmp6252, 221);
in921 = _mm512_shuffle_f32x4(tmp6246, tmp6254, 136);
in929 = _mm512_shuffle_f32x4(tmp6246, tmp6254, 221);
in920 = _mm512_shuffle_f32x4(tmp6248, tmp6256, 136);
in928 = _mm512_shuffle_f32x4(tmp6248, tmp6256, 221);
// Second arithmetic pass, applied to the vectors produced by the preceding
// shuffles. Writing c0..c7 for in917, tmp6195, tmp6196, in923, tmp6194,
// in919, in921, in920 (first register set), the results are (up to fused
// multiply-add rounding):
//   in917   = c0 - c6 + 5.25*(c4 - c2)
//   tmp6203 = (c2 + c6 - 4.25*c4) + (c1 + c5 - 4.25*c3)
//   tmp6204 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   in921   = (0.25*c2 + c6 - 1.25*c4) + 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp6202 = (0.25*c2 + c6 - 1.25*c4) - 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp6196 = (4*c2 + c6 - 5*c4) + 2*(c1 + 0.25*c5 - 1.25*c3)
//   tmp6194 = (4*c2 + c6 - 5*c4) - 2*(c1 + 0.25*c5 - 1.25*c3)
//   in923   = c7 - c1 + 5.25*(c3 - c5)
// The second set (in925, tmp6199, tmp6200, in931, tmp6198, in927, in929,
// in928 -> in925, tmp6207, tmp6208, in929, tmp6206, tmp6200, tmp6198, in931)
// is processed by the interleaved statements with the same formulas.
__m512 tmp6201 = _mm512_add_ps(tmp6195, in919);
__m512 tmp6205 = _mm512_add_ps(tmp6199, in927);
__m512 tmp6202 = _mm512_sub_ps(tmp6194, tmp6196);
__m512 tmp6206 = _mm512_sub_ps(tmp6198, tmp6200);
__m512 tmp6203 = _mm512_add_ps(tmp6196, in921);
__m512 tmp6207 = _mm512_add_ps(tmp6200, in929);
in917 = _mm512_sub_ps(in917, in921);
in925 = _mm512_sub_ps(in925, in929);
tmp6201 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-4.25e+00f), tmp6201);
tmp6205 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-4.25e+00f), tmp6205);
tmp6203 = _mm512_fmadd_ps(tmp6194, _mm512_set1_ps(-4.25e+00f), tmp6203);
tmp6207 = _mm512_fmadd_ps(tmp6198, _mm512_set1_ps(-4.25e+00f), tmp6207);
in917 = _mm512_fmadd_ps(tmp6202, _mm512_set1_ps(5.25e+00f), in917);
in925 = _mm512_fmadd_ps(tmp6206, _mm512_set1_ps(5.25e+00f), in925);
// tmp6202/tmp6206 are reused from here on as new accumulators.
tmp6202 = _mm512_fmadd_ps(tmp6196, _mm512_set1_ps(2.5e-01f), in921);
tmp6206 = _mm512_fmadd_ps(tmp6200, _mm512_set1_ps(2.5e-01f), in929);
tmp6196 = _mm512_fmadd_ps(tmp6196, _mm512_set1_ps(4e+00f), in921);
tmp6200 = _mm512_fmadd_ps(tmp6200, _mm512_set1_ps(4e+00f), in929);
// Difference must be taken before tmp6203/tmp6207 are overwritten by the sum.
__m512 tmp6204 = _mm512_sub_ps(tmp6203, tmp6201);
__m512 tmp6208 = _mm512_sub_ps(tmp6207, tmp6205);
tmp6203 = _mm512_add_ps(tmp6201, tmp6203);
tmp6207 = _mm512_add_ps(tmp6205, tmp6207);
tmp6201 = _mm512_fmadd_ps(tmp6195, _mm512_set1_ps(2.5e-01f), in919);
tmp6205 = _mm512_fmadd_ps(tmp6199, _mm512_set1_ps(2.5e-01f), in927);
tmp6202 = _mm512_fmadd_ps(tmp6194, _mm512_set1_ps(-1.25e+00f), tmp6202);
tmp6206 = _mm512_fmadd_ps(tmp6198, _mm512_set1_ps(-1.25e+00f), tmp6206);
tmp6194 = _mm512_fmadd_ps(tmp6194, _mm512_set1_ps(-5e+00f), tmp6196);
tmp6198 = _mm512_fmadd_ps(tmp6198, _mm512_set1_ps(-5e+00f), tmp6200);
tmp6201 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-1.25e+00f), tmp6201);
tmp6205 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-1.25e+00f), tmp6205);
// fmadd gives the "+2*x" result and fnmadd the matching "-2*x" result.
in921 = _mm512_fmadd_ps(tmp6201, _mm512_set1_ps(2e+00f), tmp6202);
in929 = _mm512_fmadd_ps(tmp6205, _mm512_set1_ps(2e+00f), tmp6206);
tmp6202 = _mm512_fnmadd_ps(tmp6201, _mm512_set1_ps(2e+00f), tmp6202);
tmp6206 = _mm512_fnmadd_ps(tmp6205, _mm512_set1_ps(2e+00f), tmp6206);
tmp6201 = _mm512_fmadd_ps(in919, _mm512_set1_ps(2.5e-01f), tmp6195);
tmp6205 = _mm512_fmadd_ps(in927, _mm512_set1_ps(2.5e-01f), tmp6199);
// tmp6195/tmp6199 are no longer needed as inputs and become scratch for c7 - c1.
tmp6195 = _mm512_sub_ps(in920, tmp6195);
tmp6199 = _mm512_sub_ps(in928, tmp6199);
tmp6201 = _mm512_fmadd_ps(in923, _mm512_set1_ps(-1.25e+00f), tmp6201);
tmp6205 = _mm512_fmadd_ps(in931, _mm512_set1_ps(-1.25e+00f), tmp6205);
in923 = _mm512_sub_ps(in923, in919);
in931 = _mm512_sub_ps(in931, in927);
in923 = _mm512_fmadd_ps(in923, _mm512_set1_ps(5.25e+00f), tmp6195);
in931 = _mm512_fmadd_ps(in931, _mm512_set1_ps(5.25e+00f), tmp6199);
tmp6196 = _mm512_fmadd_ps(tmp6201, _mm512_set1_ps(2e+00f), tmp6194);
tmp6200 = _mm512_fmadd_ps(tmp6205, _mm512_set1_ps(2e+00f), tmp6198);
tmp6194 = _mm512_fnmadd_ps(tmp6201, _mm512_set1_ps(2e+00f), tmp6194);
tmp6198 = _mm512_fnmadd_ps(tmp6205, _mm512_set1_ps(2e+00f), tmp6198);
// Pack and store the sixteen result vectors of the preceding arithmetic.
// _mm512_shuffle_f32x4(a, b, 68) yields [a.lane0, a.lane1, b.lane0, b.lane1]
// (the low 256 bits of a followed by the low 256 bits of b); immediate 238
// yields [a.lane2, a.lane3, b.lane2, b.lane3] (the two high halves).
// Results are paired as (in917,tmp6203), (tmp6204,in921), (tmp6202,tmp6196),
// (tmp6194,in923) for the first register set and (in925,tmp6207),
// (tmp6208,in929), (tmp6206,tmp6200), (tmp6198,in931) for the second.
__m512 out867 = _mm512_shuffle_f32x4(in917, tmp6203, 68);
__m512 out875 = _mm512_shuffle_f32x4(in917, tmp6203, 238);
__m512 out868 = _mm512_shuffle_f32x4(tmp6204, in921, 68);
__m512 out876 = _mm512_shuffle_f32x4(tmp6204, in921, 238);
__m512 out869 = _mm512_shuffle_f32x4(tmp6202, tmp6196, 68);
__m512 out877 = _mm512_shuffle_f32x4(tmp6202, tmp6196, 238);
__m512 out870 = _mm512_shuffle_f32x4(tmp6194, in923, 68);
__m512 out878 = _mm512_shuffle_f32x4(tmp6194, in923, 238);
__m512 out871 = _mm512_shuffle_f32x4(in925, tmp6207, 68);
__m512 out879 = _mm512_shuffle_f32x4(in925, tmp6207, 238);
__m512 out872 = _mm512_shuffle_f32x4(tmp6208, in929, 68);
__m512 out880 = _mm512_shuffle_f32x4(tmp6208, in929, 238);
__m512 out873 = _mm512_shuffle_f32x4(tmp6206, tmp6200, 68);
__m512 out881 = _mm512_shuffle_f32x4(tmp6206, tmp6200, 238);
__m512 out874 = _mm512_shuffle_f32x4(tmp6198, in931, 68);
__m512 out882 = _mm512_shuffle_f32x4(tmp6198, in931, 238);
// Each result pair gets its own group of four stores; the groups start at
// offsets 256, 25856, 51456 and 77056 (i.e. 256 + 25600*n). Inside a group
// the order is: +0 low halves of set 1, +64 low halves of set 2, +128 high
// halves of set 1, +192 high halves of set 2. The i27/j21/s19/k82 terms
// come from the enclosing loops, which are outside this span.
// NOTE(review): dfPtr6's element type is not visible here; the 64-step
// spacing suggests byte offsets (one 512-bit vector each) -- confirm.
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k82, out867);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k82, out875);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k82, out871);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k82, out879);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k82, out868);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k82, out876);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k82, out872);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k82, out880);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k82, out869);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k82, out877);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k82, out873);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k82, out881);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k82, out870);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k82, out878);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k82, out874);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k82, out882);
// Load stage: eight pairs of masked unaligned loads, each followed by a lane
// permutation. Successive pairs read at offsets 224 apart (13764, 13988, ...
// for the first load of a pair; 13808, 14032, ... for the second). The
// i27/h32/w39/s19/k82 address terms are set outside this span.
// NOTE(review): datPtr12's element type is not visible here, so the unit of
// these offsets (bytes vs. floats) cannot be confirmed from this span.
// Mask 8191 keeps lanes 0..12 and mask 16383 keeps lanes 0..13; all other
// lanes are zero-filled by the maskz load.
__m512 dat1457 = _mm512_maskz_loadu_ps(8191, datPtr12+13764+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1458 = _mm512_maskz_loadu_ps(16383, datPtr12+13808+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
// pm128 (arguments are listed from lane 15 down to lane 0): result lane 0
// takes source lane 15, which mask 8191 left at zero; result lanes 1..7 take
// source lanes 0..6; result lanes 8..15 take source lanes 5..12. The first
// 8-lane window therefore begins with one zero element.
__m512i pm128 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in933 = _mm512_permutexvar_ps(pm128, dat1457);
// pm129: result lanes 0..7 take source lanes 0..7 and result lanes 8..15
// take source lanes 6..13 (two 8-lane windows overlapping by two lanes).
__m512i pm129 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in941 = _mm512_permutexvar_ps(pm129, dat1458);
// The remaining seven rows repeat the same load + permute pattern.
__m512 dat1459 = _mm512_maskz_loadu_ps(8191, datPtr12+13988+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1460 = _mm512_maskz_loadu_ps(16383, datPtr12+14032+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in934 = _mm512_permutexvar_ps(pm128, dat1459);
__m512 in942 = _mm512_permutexvar_ps(pm129, dat1460);
__m512 dat1461 = _mm512_maskz_loadu_ps(8191, datPtr12+14212+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1462 = _mm512_maskz_loadu_ps(16383, datPtr12+14256+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in935 = _mm512_permutexvar_ps(pm128, dat1461);
__m512 in943 = _mm512_permutexvar_ps(pm129, dat1462);
__m512 dat1463 = _mm512_maskz_loadu_ps(8191, datPtr12+14436+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1464 = _mm512_maskz_loadu_ps(16383, datPtr12+14480+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in936 = _mm512_permutexvar_ps(pm128, dat1463);
__m512 in944 = _mm512_permutexvar_ps(pm129, dat1464);
__m512 dat1465 = _mm512_maskz_loadu_ps(8191, datPtr12+14660+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1466 = _mm512_maskz_loadu_ps(16383, datPtr12+14704+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in937 = _mm512_permutexvar_ps(pm128, dat1465);
__m512 in945 = _mm512_permutexvar_ps(pm129, dat1466);
__m512 dat1467 = _mm512_maskz_loadu_ps(8191, datPtr12+14884+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1468 = _mm512_maskz_loadu_ps(16383, datPtr12+14928+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in938 = _mm512_permutexvar_ps(pm128, dat1467);
__m512 in946 = _mm512_permutexvar_ps(pm129, dat1468);
__m512 dat1469 = _mm512_maskz_loadu_ps(8191, datPtr12+15108+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1470 = _mm512_maskz_loadu_ps(16383, datPtr12+15152+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in939 = _mm512_permutexvar_ps(pm128, dat1469);
__m512 in947 = _mm512_permutexvar_ps(pm129, dat1470);
__m512 dat1471 = _mm512_maskz_loadu_ps(8191, datPtr12+15332+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 dat1472 = _mm512_maskz_loadu_ps(16383, datPtr12+15376+50432*i27+224*h32+4*w39+50432*s19+25216*k82);
__m512 in940 = _mm512_permutexvar_ps(pm128, dat1471);
__m512 in948 = _mm512_permutexvar_ps(pm129, dat1472);
// First arithmetic pass over eight row vectors, carried out in lock-step for
// two register sets (in933..in940 and in941..in948; statements alternate
// between the sets). Writing d0..d7 for in933..in940, the first set ends as
// (up to fused multiply-add rounding):
//   in933   = d0 - d6 + 5.25*(d4 - d2)
//   tmp6259 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   tmp6260 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   in939   = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   tmp6258 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   in935   = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   in937   = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   in936   = d7 - d1 + 5.25*(d3 - d5)
// The second set (in941..in948, tmp6261..tmp6264) follows the same formulas.
// NOTE(review): these coefficients match the Winograd F(6x6, 3x3) input
// transform matrix; presumably that is what this computes -- confirm
// against the generator.
__m512 tmp6257 = _mm512_add_ps(in934, in938);
__m512 tmp6261 = _mm512_add_ps(in942, in946);
__m512 tmp6258 = _mm512_sub_ps(in937, in935);
__m512 tmp6262 = _mm512_sub_ps(in945, in943);
__m512 tmp6259 = _mm512_add_ps(in935, in939);
__m512 tmp6263 = _mm512_add_ps(in943, in947);
in933 = _mm512_sub_ps(in933, in939);
in941 = _mm512_sub_ps(in941, in947);
tmp6257 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-4.25e+00f), tmp6257);
tmp6261 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-4.25e+00f), tmp6261);
tmp6259 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-4.25e+00f), tmp6259);
tmp6263 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-4.25e+00f), tmp6263);
in933 = _mm512_fmadd_ps(tmp6258, _mm512_set1_ps(5.25e+00f), in933);
in941 = _mm512_fmadd_ps(tmp6262, _mm512_set1_ps(5.25e+00f), in941);
// tmp6258/tmp6262 are reused from here on as new accumulators.
tmp6258 = _mm512_fmadd_ps(in935, _mm512_set1_ps(2.5e-01f), in939);
tmp6262 = _mm512_fmadd_ps(in943, _mm512_set1_ps(2.5e-01f), in947);
in935 = _mm512_fmadd_ps(in935, _mm512_set1_ps(4e+00f), in939);
in943 = _mm512_fmadd_ps(in943, _mm512_set1_ps(4e+00f), in947);
// Difference must be taken before tmp6259/tmp6263 are overwritten by the sum.
__m512 tmp6260 = _mm512_sub_ps(tmp6259, tmp6257);
__m512 tmp6264 = _mm512_sub_ps(tmp6263, tmp6261);
tmp6259 = _mm512_add_ps(tmp6257, tmp6259);
tmp6263 = _mm512_add_ps(tmp6261, tmp6263);
tmp6257 = _mm512_fmadd_ps(in934, _mm512_set1_ps(2.5e-01f), in938);
tmp6261 = _mm512_fmadd_ps(in942, _mm512_set1_ps(2.5e-01f), in946);
tmp6258 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-1.25e+00f), tmp6258);
tmp6262 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-1.25e+00f), tmp6262);
in937 = _mm512_fmadd_ps(in937, _mm512_set1_ps(-5e+00f), in935);
in945 = _mm512_fmadd_ps(in945, _mm512_set1_ps(-5e+00f), in943);
tmp6257 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-1.25e+00f), tmp6257);
tmp6261 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-1.25e+00f), tmp6261);
// fmadd gives the "+2*x" result and fnmadd the matching "-2*x" result.
in939 = _mm512_fmadd_ps(tmp6257, _mm512_set1_ps(2e+00f), tmp6258);
in947 = _mm512_fmadd_ps(tmp6261, _mm512_set1_ps(2e+00f), tmp6262);
tmp6258 = _mm512_fnmadd_ps(tmp6257, _mm512_set1_ps(2e+00f), tmp6258);
tmp6262 = _mm512_fnmadd_ps(tmp6261, _mm512_set1_ps(2e+00f), tmp6262);
tmp6257 = _mm512_fmadd_ps(in938, _mm512_set1_ps(2.5e-01f), in934);
tmp6261 = _mm512_fmadd_ps(in946, _mm512_set1_ps(2.5e-01f), in942);
// in934/in942 are no longer needed as inputs and become scratch for d7 - d1.
in934 = _mm512_sub_ps(in940, in934);
in942 = _mm512_sub_ps(in948, in942);
tmp6257 = _mm512_fmadd_ps(in936, _mm512_set1_ps(-1.25e+00f), tmp6257);
tmp6261 = _mm512_fmadd_ps(in944, _mm512_set1_ps(-1.25e+00f), tmp6261);
in936 = _mm512_sub_ps(in936, in938);
in944 = _mm512_sub_ps(in944, in946);
in936 = _mm512_fmadd_ps(in936, _mm512_set1_ps(5.25e+00f), in934);
in944 = _mm512_fmadd_ps(in944, _mm512_set1_ps(5.25e+00f), in942);
in935 = _mm512_fmadd_ps(tmp6257, _mm512_set1_ps(2e+00f), in937);
in943 = _mm512_fmadd_ps(tmp6261, _mm512_set1_ps(2e+00f), in945);
in937 = _mm512_fnmadd_ps(tmp6257, _mm512_set1_ps(2e+00f), in937);
in945 = _mm512_fnmadd_ps(tmp6261, _mm512_set1_ps(2e+00f), in945);
// 8x8 transposes. Each 512-bit vector holds two 8-lane halves, so the two
// register sets together form four 8x8 tiles. Row order going in:
//   set 1: in933, tmp6259, tmp6260, in939, tmp6258, in935, in937, in936
//   set 2: in941, tmp6263, tmp6264, in947, tmp6262, in943, in945, in944
// Step 1: unpacklo/unpackhi interleave adjacent row pairs inside every
// 128-bit lane ([a0,b0,a1,b1] and [a2,b2,a3,b3]).
__m512 tmp6273 = _mm512_unpacklo_ps(in933, tmp6259);
__m512 tmp6274 = _mm512_unpackhi_ps(in933, tmp6259);
__m512 tmp6275 = _mm512_unpacklo_ps(tmp6260, in939);
__m512 tmp6276 = _mm512_unpackhi_ps(tmp6260, in939);
__m512 tmp6277 = _mm512_unpacklo_ps(tmp6258, in935);
__m512 tmp6278 = _mm512_unpackhi_ps(tmp6258, in935);
__m512 tmp6279 = _mm512_unpacklo_ps(in937, in936);
__m512 tmp6280 = _mm512_unpackhi_ps(in937, in936);
__m512 tmp6281 = _mm512_unpacklo_ps(in941, tmp6263);
__m512 tmp6282 = _mm512_unpackhi_ps(in941, tmp6263);
__m512 tmp6283 = _mm512_unpacklo_ps(tmp6264, in947);
__m512 tmp6284 = _mm512_unpackhi_ps(tmp6264, in947);
__m512 tmp6285 = _mm512_unpacklo_ps(tmp6262, in943);
__m512 tmp6286 = _mm512_unpackhi_ps(tmp6262, in943);
__m512 tmp6287 = _mm512_unpacklo_ps(in945, in944);
__m512 tmp6288 = _mm512_unpackhi_ps(in945, in944);
// Step 2: shuffle_ps with 68 takes elements {0,1} of each operand and 238
// takes elements {2,3}, completing a 4x4 transpose per 128-bit lane: lane L
// of each result holds one column (4L+0..4L+3) of four consecutive rows.
__m512 tmp6289 = _mm512_shuffle_ps(tmp6273, tmp6275, 68);
__m512 tmp6290 = _mm512_shuffle_ps(tmp6273, tmp6275, 238);
__m512 tmp6291 = _mm512_shuffle_ps(tmp6274, tmp6276, 68);
__m512 tmp6292 = _mm512_shuffle_ps(tmp6274, tmp6276, 238);
__m512 tmp6293 = _mm512_shuffle_ps(tmp6277, tmp6279, 68);
__m512 tmp6294 = _mm512_shuffle_ps(tmp6277, tmp6279, 238);
__m512 tmp6295 = _mm512_shuffle_ps(tmp6278, tmp6280, 68);
__m512 tmp6296 = _mm512_shuffle_ps(tmp6278, tmp6280, 238);
__m512 tmp6297 = _mm512_shuffle_ps(tmp6281, tmp6283, 68);
__m512 tmp6298 = _mm512_shuffle_ps(tmp6281, tmp6283, 238);
__m512 tmp6299 = _mm512_shuffle_ps(tmp6282, tmp6284, 68);
__m512 tmp6300 = _mm512_shuffle_ps(tmp6282, tmp6284, 238);
__m512 tmp6301 = _mm512_shuffle_ps(tmp6285, tmp6287, 68);
__m512 tmp6302 = _mm512_shuffle_ps(tmp6285, tmp6287, 238);
__m512 tmp6303 = _mm512_shuffle_ps(tmp6286, tmp6288, 68);
__m512 tmp6304 = _mm512_shuffle_ps(tmp6286, tmp6288, 238);
// Step 3: shuffle_f32x4 with 136 keeps 128-bit lanes {0,2} of each operand
// and 221 keeps lanes {1,3}. This joins rows 0..3 with rows 4..7 of the
// same set, separating columns 0..3 (136) from columns 4..7 (221) of both
// 8-lane halves.
__m512 tmp6305 = _mm512_shuffle_f32x4(tmp6289, tmp6293, 136);
__m512 tmp6306 = _mm512_shuffle_f32x4(tmp6289, tmp6293, 221);
__m512 tmp6307 = _mm512_shuffle_f32x4(tmp6290, tmp6294, 136);
__m512 tmp6308 = _mm512_shuffle_f32x4(tmp6290, tmp6294, 221);
__m512 tmp6309 = _mm512_shuffle_f32x4(tmp6291, tmp6295, 136);
__m512 tmp6310 = _mm512_shuffle_f32x4(tmp6291, tmp6295, 221);
__m512 tmp6311 = _mm512_shuffle_f32x4(tmp6292, tmp6296, 136);
__m512 tmp6312 = _mm512_shuffle_f32x4(tmp6292, tmp6296, 221);
__m512 tmp6313 = _mm512_shuffle_f32x4(tmp6297, tmp6301, 136);
__m512 tmp6314 = _mm512_shuffle_f32x4(tmp6297, tmp6301, 221);
__m512 tmp6315 = _mm512_shuffle_f32x4(tmp6298, tmp6302, 136);
__m512 tmp6316 = _mm512_shuffle_f32x4(tmp6298, tmp6302, 221);
__m512 tmp6317 = _mm512_shuffle_f32x4(tmp6299, tmp6303, 136);
__m512 tmp6318 = _mm512_shuffle_f32x4(tmp6299, tmp6303, 221);
__m512 tmp6319 = _mm512_shuffle_f32x4(tmp6300, tmp6304, 136);
__m512 tmp6320 = _mm512_shuffle_f32x4(tmp6300, tmp6304, 221);
// Step 4: a second 136/221 shuffle merges set 1 with set 2. Each vector
// written with 136 holds one column of the low halves as
// [set-1 rows 0..7 | set-2 rows 0..7]; the 221 partner holds the same column
// of the high halves. Column order 0..7 for the low halves is
// in933, tmp6259, tmp6260, in939, tmp6258, in935, in937, in936, and for the
// high halves in941, tmp6263, tmp6264, in947, tmp6262, in943, in945, in944.
in933 = _mm512_shuffle_f32x4(tmp6305, tmp6313, 136);
in941 = _mm512_shuffle_f32x4(tmp6305, tmp6313, 221);
tmp6259 = _mm512_shuffle_f32x4(tmp6307, tmp6315, 136);
tmp6263 = _mm512_shuffle_f32x4(tmp6307, tmp6315, 221);
tmp6260 = _mm512_shuffle_f32x4(tmp6309, tmp6317, 136);
tmp6264 = _mm512_shuffle_f32x4(tmp6309, tmp6317, 221);
in939 = _mm512_shuffle_f32x4(tmp6311, tmp6319, 136);
in947 = _mm512_shuffle_f32x4(tmp6311, tmp6319, 221);
tmp6258 = _mm512_shuffle_f32x4(tmp6306, tmp6314, 136);
tmp6262 = _mm512_shuffle_f32x4(tmp6306, tmp6314, 221);
in935 = _mm512_shuffle_f32x4(tmp6308, tmp6316, 136);
in943 = _mm512_shuffle_f32x4(tmp6308, tmp6316, 221);
in937 = _mm512_shuffle_f32x4(tmp6310, tmp6318, 136);
in945 = _mm512_shuffle_f32x4(tmp6310, tmp6318, 221);
in936 = _mm512_shuffle_f32x4(tmp6312, tmp6320, 136);
in944 = _mm512_shuffle_f32x4(tmp6312, tmp6320, 221);
// Second arithmetic pass, applied to the vectors produced by the preceding
// shuffles. Writing c0..c7 for in933, tmp6259, tmp6260, in939, tmp6258,
// in935, in937, in936 (first register set), the results are (up to fused
// multiply-add rounding):
//   in933   = c0 - c6 + 5.25*(c4 - c2)
//   tmp6267 = (c2 + c6 - 4.25*c4) + (c1 + c5 - 4.25*c3)
//   tmp6268 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   in937   = (0.25*c2 + c6 - 1.25*c4) + 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp6266 = (0.25*c2 + c6 - 1.25*c4) - 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp6260 = (4*c2 + c6 - 5*c4) + 2*(c1 + 0.25*c5 - 1.25*c3)
//   tmp6258 = (4*c2 + c6 - 5*c4) - 2*(c1 + 0.25*c5 - 1.25*c3)
//   in939   = c7 - c1 + 5.25*(c3 - c5)
// The second set (in941, tmp6263, tmp6264, in947, tmp6262, in943, in945,
// in944 -> in941, tmp6271, tmp6272, in945, tmp6270, tmp6264, tmp6262, in947)
// is processed by the interleaved statements with the same formulas.
__m512 tmp6265 = _mm512_add_ps(tmp6259, in935);
__m512 tmp6269 = _mm512_add_ps(tmp6263, in943);
__m512 tmp6266 = _mm512_sub_ps(tmp6258, tmp6260);
__m512 tmp6270 = _mm512_sub_ps(tmp6262, tmp6264);
__m512 tmp6267 = _mm512_add_ps(tmp6260, in937);
__m512 tmp6271 = _mm512_add_ps(tmp6264, in945);
in933 = _mm512_sub_ps(in933, in937);
in941 = _mm512_sub_ps(in941, in945);
tmp6265 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-4.25e+00f), tmp6265);
tmp6269 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-4.25e+00f), tmp6269);
tmp6267 = _mm512_fmadd_ps(tmp6258, _mm512_set1_ps(-4.25e+00f), tmp6267);
tmp6271 = _mm512_fmadd_ps(tmp6262, _mm512_set1_ps(-4.25e+00f), tmp6271);
in933 = _mm512_fmadd_ps(tmp6266, _mm512_set1_ps(5.25e+00f), in933);
in941 = _mm512_fmadd_ps(tmp6270, _mm512_set1_ps(5.25e+00f), in941);
// tmp6266/tmp6270 are reused from here on as new accumulators.
tmp6266 = _mm512_fmadd_ps(tmp6260, _mm512_set1_ps(2.5e-01f), in937);
tmp6270 = _mm512_fmadd_ps(tmp6264, _mm512_set1_ps(2.5e-01f), in945);
tmp6260 = _mm512_fmadd_ps(tmp6260, _mm512_set1_ps(4e+00f), in937);
tmp6264 = _mm512_fmadd_ps(tmp6264, _mm512_set1_ps(4e+00f), in945);
// Difference must be taken before tmp6267/tmp6271 are overwritten by the sum.
__m512 tmp6268 = _mm512_sub_ps(tmp6267, tmp6265);
__m512 tmp6272 = _mm512_sub_ps(tmp6271, tmp6269);
tmp6267 = _mm512_add_ps(tmp6265, tmp6267);
tmp6271 = _mm512_add_ps(tmp6269, tmp6271);
tmp6265 = _mm512_fmadd_ps(tmp6259, _mm512_set1_ps(2.5e-01f), in935);
tmp6269 = _mm512_fmadd_ps(tmp6263, _mm512_set1_ps(2.5e-01f), in943);
tmp6266 = _mm512_fmadd_ps(tmp6258, _mm512_set1_ps(-1.25e+00f), tmp6266);
tmp6270 = _mm512_fmadd_ps(tmp6262, _mm512_set1_ps(-1.25e+00f), tmp6270);
tmp6258 = _mm512_fmadd_ps(tmp6258, _mm512_set1_ps(-5e+00f), tmp6260);
tmp6262 = _mm512_fmadd_ps(tmp6262, _mm512_set1_ps(-5e+00f), tmp6264);
tmp6265 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-1.25e+00f), tmp6265);
tmp6269 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-1.25e+00f), tmp6269);
// fmadd gives the "+2*x" result and fnmadd the matching "-2*x" result.
in937 = _mm512_fmadd_ps(tmp6265, _mm512_set1_ps(2e+00f), tmp6266);
in945 = _mm512_fmadd_ps(tmp6269, _mm512_set1_ps(2e+00f), tmp6270);
tmp6266 = _mm512_fnmadd_ps(tmp6265, _mm512_set1_ps(2e+00f), tmp6266);
tmp6270 = _mm512_fnmadd_ps(tmp6269, _mm512_set1_ps(2e+00f), tmp6270);
tmp6265 = _mm512_fmadd_ps(in935, _mm512_set1_ps(2.5e-01f), tmp6259);
tmp6269 = _mm512_fmadd_ps(in943, _mm512_set1_ps(2.5e-01f), tmp6263);
// tmp6259/tmp6263 are no longer needed as inputs and become scratch for c7 - c1.
tmp6259 = _mm512_sub_ps(in936, tmp6259);
tmp6263 = _mm512_sub_ps(in944, tmp6263);
tmp6265 = _mm512_fmadd_ps(in939, _mm512_set1_ps(-1.25e+00f), tmp6265);
tmp6269 = _mm512_fmadd_ps(in947, _mm512_set1_ps(-1.25e+00f), tmp6269);
in939 = _mm512_sub_ps(in939, in935);
in947 = _mm512_sub_ps(in947, in943);
in939 = _mm512_fmadd_ps(in939, _mm512_set1_ps(5.25e+00f), tmp6259);
in947 = _mm512_fmadd_ps(in947, _mm512_set1_ps(5.25e+00f), tmp6263);
tmp6260 = _mm512_fmadd_ps(tmp6265, _mm512_set1_ps(2e+00f), tmp6258);
tmp6264 = _mm512_fmadd_ps(tmp6269, _mm512_set1_ps(2e+00f), tmp6262);
tmp6258 = _mm512_fnmadd_ps(tmp6265, _mm512_set1_ps(2e+00f), tmp6258);
tmp6262 = _mm512_fnmadd_ps(tmp6269, _mm512_set1_ps(2e+00f), tmp6262);
// Pack and store the sixteen result vectors of the preceding arithmetic.
// _mm512_shuffle_f32x4(a, b, 68) yields [a.lane0, a.lane1, b.lane0, b.lane1]
// (the low 256 bits of a followed by the low 256 bits of b); immediate 238
// yields [a.lane2, a.lane3, b.lane2, b.lane3] (the two high halves).
// Results are paired as (in933,tmp6267), (tmp6268,in937), (tmp6266,tmp6260),
// (tmp6258,in939) for the first register set and (in941,tmp6271),
// (tmp6272,in945), (tmp6270,tmp6264), (tmp6262,in947) for the second.
__m512 out883 = _mm512_shuffle_f32x4(in933, tmp6267, 68);
__m512 out891 = _mm512_shuffle_f32x4(in933, tmp6267, 238);
__m512 out884 = _mm512_shuffle_f32x4(tmp6268, in937, 68);
__m512 out892 = _mm512_shuffle_f32x4(tmp6268, in937, 238);
__m512 out885 = _mm512_shuffle_f32x4(tmp6266, tmp6260, 68);
__m512 out893 = _mm512_shuffle_f32x4(tmp6266, tmp6260, 238);
__m512 out886 = _mm512_shuffle_f32x4(tmp6258, in939, 68);
__m512 out894 = _mm512_shuffle_f32x4(tmp6258, in939, 238);
__m512 out887 = _mm512_shuffle_f32x4(in941, tmp6271, 68);
__m512 out895 = _mm512_shuffle_f32x4(in941, tmp6271, 238);
__m512 out888 = _mm512_shuffle_f32x4(tmp6272, in945, 68);
__m512 out896 = _mm512_shuffle_f32x4(tmp6272, in945, 238);
__m512 out889 = _mm512_shuffle_f32x4(tmp6270, tmp6264, 68);
__m512 out897 = _mm512_shuffle_f32x4(tmp6270, tmp6264, 238);
__m512 out890 = _mm512_shuffle_f32x4(tmp6262, in947, 68);
__m512 out898 = _mm512_shuffle_f32x4(tmp6262, in947, 238);
// Each result pair gets its own group of four stores; the groups start at
// offsets 512, 26112, 51712 and 77312 (i.e. 512 + 25600*n). Inside a group
// the order is: +0 low halves of set 1, +64 low halves of set 2, +128 high
// halves of set 1, +192 high halves of set 2. The i27/j21/s19/k82 terms
// come from the enclosing loops, which are outside this span.
// NOTE(review): dfPtr6's element type is not visible here; the 64-step
// spacing suggests byte offsets (one 512-bit vector each) -- confirm.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k82, out883);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k82, out891);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k82, out887);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k82, out895);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k82, out884);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k82, out892);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k82, out888);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k82, out896);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k82, out885);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k82, out893);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k82, out889);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k82, out897);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k82, out886);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k82, out894);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k82, out890);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k82, out898);
}
++j21;
rel14 = 2;
}
if (rel14 < 3) {
ptrdiff_t h33 = base14+6;
ptrdiff_t w40 = 24;
ptrdiff_t k83 = 0;
for (; k83 != 2; ++k83) {
// ---- Tile sequence 1 of this k83 iteration: load, two transform passes, store. ----
// Load phase: eight pairs of masked loads from datPtr12 at offsets 0,224,...,1568
// (first of each pair) and the same offsets +48 (second of each pair).
// Mask 16383 (0x3FFF) keeps the low 14 lanes; the two top lanes are zeroed.
// NOTE(review): the 224*h33 and 4*w40 terms look like byte strides (row / float
// element) — assumes datPtr12 is a byte pointer; confirm against its declaration.
__m512 dat1473 = _mm512_maskz_loadu_ps(16383, datPtr12+0+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1474 = _mm512_maskz_loadu_ps(16383, datPtr12+48+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
// Index vector (arguments are listed from lane 15 down to lane 0):
// result lanes 0-7 take source lanes 0-7, result lanes 8-15 take source lanes 6-13,
// i.e. two 8-lane windows that overlap by two lanes.
__m512i pm130 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in949 = _mm512_permutexvar_ps(pm130, dat1473);
__m512 in957 = _mm512_permutexvar_ps(pm130, dat1474);
__m512 dat1475 = _mm512_maskz_loadu_ps(16383, datPtr12+224+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1476 = _mm512_maskz_loadu_ps(16383, datPtr12+272+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in950 = _mm512_permutexvar_ps(pm130, dat1475);
__m512 in958 = _mm512_permutexvar_ps(pm130, dat1476);
__m512 dat1477 = _mm512_maskz_loadu_ps(16383, datPtr12+448+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1478 = _mm512_maskz_loadu_ps(16383, datPtr12+496+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in951 = _mm512_permutexvar_ps(pm130, dat1477);
__m512 in959 = _mm512_permutexvar_ps(pm130, dat1478);
__m512 dat1479 = _mm512_maskz_loadu_ps(16383, datPtr12+672+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1480 = _mm512_maskz_loadu_ps(16383, datPtr12+720+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in952 = _mm512_permutexvar_ps(pm130, dat1479);
__m512 in960 = _mm512_permutexvar_ps(pm130, dat1480);
__m512 dat1481 = _mm512_maskz_loadu_ps(16383, datPtr12+896+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1482 = _mm512_maskz_loadu_ps(16383, datPtr12+944+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in953 = _mm512_permutexvar_ps(pm130, dat1481);
__m512 in961 = _mm512_permutexvar_ps(pm130, dat1482);
__m512 dat1483 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1484 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in954 = _mm512_permutexvar_ps(pm130, dat1483);
__m512 in962 = _mm512_permutexvar_ps(pm130, dat1484);
__m512 dat1485 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1486 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in955 = _mm512_permutexvar_ps(pm130, dat1485);
__m512 in963 = _mm512_permutexvar_ps(pm130, dat1486);
__m512 dat1487 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1488 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in956 = _mm512_permutexvar_ps(pm130, dat1487);
__m512 in964 = _mm512_permutexvar_ps(pm130, dat1488);
// First pass: a fixed linear combination across the eight vectors of each group
// (in949..in956, and independently in957..in964), interleaved statement by statement.
// Constants used: 5.25, -4.25, 0.25, 4, -1.25, -5, 2.
// Several inputs are overwritten in place (in949, in951, in953, in955, in950, in952, ...),
// so the statement order below is significant.
// NOTE(review): the coefficients look like a Winograd-style 8-point input transform
// (presumably F(6,3)) — confirm against the generator.
__m512 tmp6321 = _mm512_add_ps(in950, in954);
__m512 tmp6325 = _mm512_add_ps(in958, in962);
__m512 tmp6322 = _mm512_sub_ps(in953, in951);
__m512 tmp6326 = _mm512_sub_ps(in961, in959);
__m512 tmp6323 = _mm512_add_ps(in951, in955);
__m512 tmp6327 = _mm512_add_ps(in959, in963);
in949 = _mm512_sub_ps(in949, in955);
in957 = _mm512_sub_ps(in957, in963);
tmp6321 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-4.25e+00f), tmp6321);
tmp6325 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-4.25e+00f), tmp6325);
tmp6323 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-4.25e+00f), tmp6323);
tmp6327 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-4.25e+00f), tmp6327);
in949 = _mm512_fmadd_ps(tmp6322, _mm512_set1_ps(5.25e+00f), in949);
in957 = _mm512_fmadd_ps(tmp6326, _mm512_set1_ps(5.25e+00f), in957);
tmp6322 = _mm512_fmadd_ps(in951, _mm512_set1_ps(2.5e-01f), in955);
tmp6326 = _mm512_fmadd_ps(in959, _mm512_set1_ps(2.5e-01f), in963);
in951 = _mm512_fmadd_ps(in951, _mm512_set1_ps(4e+00f), in955);
in959 = _mm512_fmadd_ps(in959, _mm512_set1_ps(4e+00f), in963);
__m512 tmp6324 = _mm512_sub_ps(tmp6323, tmp6321);
__m512 tmp6328 = _mm512_sub_ps(tmp6327, tmp6325);
tmp6323 = _mm512_add_ps(tmp6321, tmp6323);
tmp6327 = _mm512_add_ps(tmp6325, tmp6327);
tmp6321 = _mm512_fmadd_ps(in950, _mm512_set1_ps(2.5e-01f), in954);
tmp6325 = _mm512_fmadd_ps(in958, _mm512_set1_ps(2.5e-01f), in962);
tmp6322 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-1.25e+00f), tmp6322);
tmp6326 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-1.25e+00f), tmp6326);
in953 = _mm512_fmadd_ps(in953, _mm512_set1_ps(-5e+00f), in951);
in961 = _mm512_fmadd_ps(in961, _mm512_set1_ps(-5e+00f), in959);
tmp6321 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-1.25e+00f), tmp6321);
tmp6325 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-1.25e+00f), tmp6325);
in955 = _mm512_fmadd_ps(tmp6321, _mm512_set1_ps(2e+00f), tmp6322);
in963 = _mm512_fmadd_ps(tmp6325, _mm512_set1_ps(2e+00f), tmp6326);
tmp6322 = _mm512_fnmadd_ps(tmp6321, _mm512_set1_ps(2e+00f), tmp6322);
tmp6326 = _mm512_fnmadd_ps(tmp6325, _mm512_set1_ps(2e+00f), tmp6326);
tmp6321 = _mm512_fmadd_ps(in954, _mm512_set1_ps(2.5e-01f), in950);
tmp6325 = _mm512_fmadd_ps(in962, _mm512_set1_ps(2.5e-01f), in958);
in950 = _mm512_sub_ps(in956, in950);
in958 = _mm512_sub_ps(in964, in958);
tmp6321 = _mm512_fmadd_ps(in952, _mm512_set1_ps(-1.25e+00f), tmp6321);
tmp6325 = _mm512_fmadd_ps(in960, _mm512_set1_ps(-1.25e+00f), tmp6325);
in952 = _mm512_sub_ps(in952, in954);
in960 = _mm512_sub_ps(in960, in962);
in952 = _mm512_fmadd_ps(in952, _mm512_set1_ps(5.25e+00f), in950);
in960 = _mm512_fmadd_ps(in960, _mm512_set1_ps(5.25e+00f), in958);
in951 = _mm512_fmadd_ps(tmp6321, _mm512_set1_ps(2e+00f), in953);
in959 = _mm512_fmadd_ps(tmp6325, _mm512_set1_ps(2e+00f), in961);
in953 = _mm512_fnmadd_ps(tmp6321, _mm512_set1_ps(2e+00f), in953);
in961 = _mm512_fnmadd_ps(tmp6325, _mm512_set1_ps(2e+00f), in961);
// Rearrangement network over the sixteen first-pass results
// (in949, tmp6323, tmp6324, in955, tmp6322, in951, in953, in952 and
//  in957, tmp6327, tmp6328, in963, tmp6326, in959, in961, in960):
// stage 1 interleaves 32-bit elements (unpacklo/unpackhi),
// stage 2 selects 64-bit pairs (shuffle_ps 68 / 238),
// stages 3-4 select 128-bit lanes (shuffle_f32x4 136 / 221).
// NOTE(review): this looks like a transpose of the 8x8 blocks held in these
// vectors, so the second pass runs along the other axis — not verified lane by lane.
__m512 tmp6337 = _mm512_unpacklo_ps(in949, tmp6323);
__m512 tmp6338 = _mm512_unpackhi_ps(in949, tmp6323);
__m512 tmp6339 = _mm512_unpacklo_ps(tmp6324, in955);
__m512 tmp6340 = _mm512_unpackhi_ps(tmp6324, in955);
__m512 tmp6341 = _mm512_unpacklo_ps(tmp6322, in951);
__m512 tmp6342 = _mm512_unpackhi_ps(tmp6322, in951);
__m512 tmp6343 = _mm512_unpacklo_ps(in953, in952);
__m512 tmp6344 = _mm512_unpackhi_ps(in953, in952);
__m512 tmp6345 = _mm512_unpacklo_ps(in957, tmp6327);
__m512 tmp6346 = _mm512_unpackhi_ps(in957, tmp6327);
__m512 tmp6347 = _mm512_unpacklo_ps(tmp6328, in963);
__m512 tmp6348 = _mm512_unpackhi_ps(tmp6328, in963);
__m512 tmp6349 = _mm512_unpacklo_ps(tmp6326, in959);
__m512 tmp6350 = _mm512_unpackhi_ps(tmp6326, in959);
__m512 tmp6351 = _mm512_unpacklo_ps(in961, in960);
__m512 tmp6352 = _mm512_unpackhi_ps(in961, in960);
__m512 tmp6353 = _mm512_shuffle_ps(tmp6337, tmp6339, 68);
__m512 tmp6354 = _mm512_shuffle_ps(tmp6337, tmp6339, 238);
__m512 tmp6355 = _mm512_shuffle_ps(tmp6338, tmp6340, 68);
__m512 tmp6356 = _mm512_shuffle_ps(tmp6338, tmp6340, 238);
__m512 tmp6357 = _mm512_shuffle_ps(tmp6341, tmp6343, 68);
__m512 tmp6358 = _mm512_shuffle_ps(tmp6341, tmp6343, 238);
__m512 tmp6359 = _mm512_shuffle_ps(tmp6342, tmp6344, 68);
__m512 tmp6360 = _mm512_shuffle_ps(tmp6342, tmp6344, 238);
__m512 tmp6361 = _mm512_shuffle_ps(tmp6345, tmp6347, 68);
__m512 tmp6362 = _mm512_shuffle_ps(tmp6345, tmp6347, 238);
__m512 tmp6363 = _mm512_shuffle_ps(tmp6346, tmp6348, 68);
__m512 tmp6364 = _mm512_shuffle_ps(tmp6346, tmp6348, 238);
__m512 tmp6365 = _mm512_shuffle_ps(tmp6349, tmp6351, 68);
__m512 tmp6366 = _mm512_shuffle_ps(tmp6349, tmp6351, 238);
__m512 tmp6367 = _mm512_shuffle_ps(tmp6350, tmp6352, 68);
__m512 tmp6368 = _mm512_shuffle_ps(tmp6350, tmp6352, 238);
__m512 tmp6369 = _mm512_shuffle_f32x4(tmp6353, tmp6357, 136);
__m512 tmp6370 = _mm512_shuffle_f32x4(tmp6353, tmp6357, 221);
__m512 tmp6371 = _mm512_shuffle_f32x4(tmp6354, tmp6358, 136);
__m512 tmp6372 = _mm512_shuffle_f32x4(tmp6354, tmp6358, 221);
__m512 tmp6373 = _mm512_shuffle_f32x4(tmp6355, tmp6359, 136);
__m512 tmp6374 = _mm512_shuffle_f32x4(tmp6355, tmp6359, 221);
__m512 tmp6375 = _mm512_shuffle_f32x4(tmp6356, tmp6360, 136);
__m512 tmp6376 = _mm512_shuffle_f32x4(tmp6356, tmp6360, 221);
__m512 tmp6377 = _mm512_shuffle_f32x4(tmp6361, tmp6365, 136);
__m512 tmp6378 = _mm512_shuffle_f32x4(tmp6361, tmp6365, 221);
__m512 tmp6379 = _mm512_shuffle_f32x4(tmp6362, tmp6366, 136);
__m512 tmp6380 = _mm512_shuffle_f32x4(tmp6362, tmp6366, 221);
__m512 tmp6381 = _mm512_shuffle_f32x4(tmp6363, tmp6367, 136);
__m512 tmp6382 = _mm512_shuffle_f32x4(tmp6363, tmp6367, 221);
__m512 tmp6383 = _mm512_shuffle_f32x4(tmp6364, tmp6368, 136);
__m512 tmp6384 = _mm512_shuffle_f32x4(tmp6364, tmp6368, 221);
// Final lane-select stage writes the rearranged data back into the same
// variable names that fed the network.
in949 = _mm512_shuffle_f32x4(tmp6369, tmp6377, 136);
in957 = _mm512_shuffle_f32x4(tmp6369, tmp6377, 221);
tmp6323 = _mm512_shuffle_f32x4(tmp6371, tmp6379, 136);
tmp6327 = _mm512_shuffle_f32x4(tmp6371, tmp6379, 221);
tmp6324 = _mm512_shuffle_f32x4(tmp6373, tmp6381, 136);
tmp6328 = _mm512_shuffle_f32x4(tmp6373, tmp6381, 221);
in955 = _mm512_shuffle_f32x4(tmp6375, tmp6383, 136);
in963 = _mm512_shuffle_f32x4(tmp6375, tmp6383, 221);
tmp6322 = _mm512_shuffle_f32x4(tmp6370, tmp6378, 136);
tmp6326 = _mm512_shuffle_f32x4(tmp6370, tmp6378, 221);
in951 = _mm512_shuffle_f32x4(tmp6372, tmp6380, 136);
in959 = _mm512_shuffle_f32x4(tmp6372, tmp6380, 221);
in953 = _mm512_shuffle_f32x4(tmp6374, tmp6382, 136);
in961 = _mm512_shuffle_f32x4(tmp6374, tmp6382, 221);
in952 = _mm512_shuffle_f32x4(tmp6376, tmp6384, 136);
in960 = _mm512_shuffle_f32x4(tmp6376, tmp6384, 221);
// Second pass: the same linear combination and constants as the first pass, now
// with in949, tmp6323, tmp6324, in955, tmp6322, in951, in953, in952 in the roles
// of the eight inputs (and the in957/tmp6327/... group likewise).
__m512 tmp6329 = _mm512_add_ps(tmp6323, in951);
__m512 tmp6333 = _mm512_add_ps(tmp6327, in959);
__m512 tmp6330 = _mm512_sub_ps(tmp6322, tmp6324);
__m512 tmp6334 = _mm512_sub_ps(tmp6326, tmp6328);
__m512 tmp6331 = _mm512_add_ps(tmp6324, in953);
__m512 tmp6335 = _mm512_add_ps(tmp6328, in961);
in949 = _mm512_sub_ps(in949, in953);
in957 = _mm512_sub_ps(in957, in961);
tmp6329 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-4.25e+00f), tmp6329);
tmp6333 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-4.25e+00f), tmp6333);
tmp6331 = _mm512_fmadd_ps(tmp6322, _mm512_set1_ps(-4.25e+00f), tmp6331);
tmp6335 = _mm512_fmadd_ps(tmp6326, _mm512_set1_ps(-4.25e+00f), tmp6335);
in949 = _mm512_fmadd_ps(tmp6330, _mm512_set1_ps(5.25e+00f), in949);
in957 = _mm512_fmadd_ps(tmp6334, _mm512_set1_ps(5.25e+00f), in957);
tmp6330 = _mm512_fmadd_ps(tmp6324, _mm512_set1_ps(2.5e-01f), in953);
tmp6334 = _mm512_fmadd_ps(tmp6328, _mm512_set1_ps(2.5e-01f), in961);
tmp6324 = _mm512_fmadd_ps(tmp6324, _mm512_set1_ps(4e+00f), in953);
tmp6328 = _mm512_fmadd_ps(tmp6328, _mm512_set1_ps(4e+00f), in961);
__m512 tmp6332 = _mm512_sub_ps(tmp6331, tmp6329);
__m512 tmp6336 = _mm512_sub_ps(tmp6335, tmp6333);
tmp6331 = _mm512_add_ps(tmp6329, tmp6331);
tmp6335 = _mm512_add_ps(tmp6333, tmp6335);
tmp6329 = _mm512_fmadd_ps(tmp6323, _mm512_set1_ps(2.5e-01f), in951);
tmp6333 = _mm512_fmadd_ps(tmp6327, _mm512_set1_ps(2.5e-01f), in959);
tmp6330 = _mm512_fmadd_ps(tmp6322, _mm512_set1_ps(-1.25e+00f), tmp6330);
tmp6334 = _mm512_fmadd_ps(tmp6326, _mm512_set1_ps(-1.25e+00f), tmp6334);
tmp6322 = _mm512_fmadd_ps(tmp6322, _mm512_set1_ps(-5e+00f), tmp6324);
tmp6326 = _mm512_fmadd_ps(tmp6326, _mm512_set1_ps(-5e+00f), tmp6328);
tmp6329 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-1.25e+00f), tmp6329);
tmp6333 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-1.25e+00f), tmp6333);
in953 = _mm512_fmadd_ps(tmp6329, _mm512_set1_ps(2e+00f), tmp6330);
in961 = _mm512_fmadd_ps(tmp6333, _mm512_set1_ps(2e+00f), tmp6334);
tmp6330 = _mm512_fnmadd_ps(tmp6329, _mm512_set1_ps(2e+00f), tmp6330);
tmp6334 = _mm512_fnmadd_ps(tmp6333, _mm512_set1_ps(2e+00f), tmp6334);
tmp6329 = _mm512_fmadd_ps(in951, _mm512_set1_ps(2.5e-01f), tmp6323);
tmp6333 = _mm512_fmadd_ps(in959, _mm512_set1_ps(2.5e-01f), tmp6327);
tmp6323 = _mm512_sub_ps(in952, tmp6323);
tmp6327 = _mm512_sub_ps(in960, tmp6327);
tmp6329 = _mm512_fmadd_ps(in955, _mm512_set1_ps(-1.25e+00f), tmp6329);
tmp6333 = _mm512_fmadd_ps(in963, _mm512_set1_ps(-1.25e+00f), tmp6333);
in955 = _mm512_sub_ps(in955, in951);
in963 = _mm512_sub_ps(in963, in959);
in955 = _mm512_fmadd_ps(in955, _mm512_set1_ps(5.25e+00f), tmp6323);
in963 = _mm512_fmadd_ps(in963, _mm512_set1_ps(5.25e+00f), tmp6327);
tmp6324 = _mm512_fmadd_ps(tmp6329, _mm512_set1_ps(2e+00f), tmp6322);
tmp6328 = _mm512_fmadd_ps(tmp6333, _mm512_set1_ps(2e+00f), tmp6326);
tmp6322 = _mm512_fnmadd_ps(tmp6329, _mm512_set1_ps(2e+00f), tmp6322);
tmp6326 = _mm512_fnmadd_ps(tmp6333, _mm512_set1_ps(2e+00f), tmp6326);
// Pack results pairwise for storing: immediate 68 concatenates the low 256 bits of
// both operands, immediate 238 concatenates their high 256 bits.
__m512 out899 = _mm512_shuffle_f32x4(in949, tmp6331, 68);
__m512 out907 = _mm512_shuffle_f32x4(in949, tmp6331, 238);
__m512 out900 = _mm512_shuffle_f32x4(tmp6332, in953, 68);
__m512 out908 = _mm512_shuffle_f32x4(tmp6332, in953, 238);
__m512 out901 = _mm512_shuffle_f32x4(tmp6330, tmp6324, 68);
__m512 out909 = _mm512_shuffle_f32x4(tmp6330, tmp6324, 238);
__m512 out902 = _mm512_shuffle_f32x4(tmp6322, in955, 68);
__m512 out910 = _mm512_shuffle_f32x4(tmp6322, in955, 238);
__m512 out903 = _mm512_shuffle_f32x4(in957, tmp6335, 68);
__m512 out911 = _mm512_shuffle_f32x4(in957, tmp6335, 238);
__m512 out904 = _mm512_shuffle_f32x4(tmp6336, in961, 68);
__m512 out912 = _mm512_shuffle_f32x4(tmp6336, in961, 238);
__m512 out905 = _mm512_shuffle_f32x4(tmp6334, tmp6328, 68);
__m512 out913 = _mm512_shuffle_f32x4(tmp6334, tmp6328, 238);
__m512 out906 = _mm512_shuffle_f32x4(tmp6326, in963, 68);
__m512 out914 = _mm512_shuffle_f32x4(tmp6326, in963, 238);
// Store the sixteen packed vectors to dfPtr6: four groups based at 0, 25600, 51200
// and 76800, each written at sub-offsets +0, +128, +64, +192 (in that statement order).
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k83, out899);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k83, out907);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k83, out903);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k83, out911);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k83, out900);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k83, out908);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k83, out904);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k83, out912);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k83, out901);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k83, out909);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k83, out905);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k83, out913);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k83, out902);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k83, out910);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k83, out906);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k83, out914);
// ---- Tile sequence 2 of this k83 iteration: load, two transform passes, store. ----
// Load phase: eight pairs of masked loads from datPtr12. The first of each pair reads
// at offsets 96,320,...,1664 with mask 511 (0x1FF: low 9 lanes kept, lanes 9-15 zeroed);
// the second reads at offsets 12608,12832,...,14176 with mask 16383 (0x3FFF: low 14 lanes).
// NOTE(review): 12608 appears to be the step to the next channel plane and 96 a
// 24-float column shift within the row — assumes byte offsets into float data; confirm.
__m512 dat1489 = _mm512_maskz_loadu_ps(511, datPtr12+96+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1490 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
// Index vector for the 9-lane loads (arguments listed from lane 15 down to lane 0):
// result lanes 0-7 take source lanes 0-7, lanes 8-10 take source lanes 6-8, and
// lanes 11-15 take source lane 15, which the 511 mask has already zeroed.
__m512i pm131 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in965 = _mm512_permutexvar_ps(pm131, dat1489);
// Index vector for the 14-lane loads: result lanes 0-7 take source lanes 0-7,
// result lanes 8-15 take source lanes 6-13 (two 8-lane windows overlapping by two).
__m512i pm132 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in973 = _mm512_permutexvar_ps(pm132, dat1490);
__m512 dat1491 = _mm512_maskz_loadu_ps(511, datPtr12+320+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1492 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in966 = _mm512_permutexvar_ps(pm131, dat1491);
__m512 in974 = _mm512_permutexvar_ps(pm132, dat1492);
__m512 dat1493 = _mm512_maskz_loadu_ps(511, datPtr12+544+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1494 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in967 = _mm512_permutexvar_ps(pm131, dat1493);
__m512 in975 = _mm512_permutexvar_ps(pm132, dat1494);
__m512 dat1495 = _mm512_maskz_loadu_ps(511, datPtr12+768+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1496 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in968 = _mm512_permutexvar_ps(pm131, dat1495);
__m512 in976 = _mm512_permutexvar_ps(pm132, dat1496);
__m512 dat1497 = _mm512_maskz_loadu_ps(511, datPtr12+992+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1498 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in969 = _mm512_permutexvar_ps(pm131, dat1497);
__m512 in977 = _mm512_permutexvar_ps(pm132, dat1498);
__m512 dat1499 = _mm512_maskz_loadu_ps(511, datPtr12+1216+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1500 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in970 = _mm512_permutexvar_ps(pm131, dat1499);
__m512 in978 = _mm512_permutexvar_ps(pm132, dat1500);
__m512 dat1501 = _mm512_maskz_loadu_ps(511, datPtr12+1440+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1502 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in971 = _mm512_permutexvar_ps(pm131, dat1501);
__m512 in979 = _mm512_permutexvar_ps(pm132, dat1502);
__m512 dat1503 = _mm512_maskz_loadu_ps(511, datPtr12+1664+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1504 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in972 = _mm512_permutexvar_ps(pm131, dat1503);
__m512 in980 = _mm512_permutexvar_ps(pm132, dat1504);
// First pass: a fixed linear combination across the eight vectors of each group
// (in965..in972, and independently in973..in980), interleaved statement by statement.
// Constants used: 5.25, -4.25, 0.25, 4, -1.25, -5, 2.
// Inputs are overwritten in place, so the statement order below is significant.
// NOTE(review): the coefficients look like a Winograd-style 8-point input transform
// (presumably F(6,3)) — confirm against the generator.
__m512 tmp6385 = _mm512_add_ps(in966, in970);
__m512 tmp6389 = _mm512_add_ps(in974, in978);
__m512 tmp6386 = _mm512_sub_ps(in969, in967);
__m512 tmp6390 = _mm512_sub_ps(in977, in975);
__m512 tmp6387 = _mm512_add_ps(in967, in971);
__m512 tmp6391 = _mm512_add_ps(in975, in979);
in965 = _mm512_sub_ps(in965, in971);
in973 = _mm512_sub_ps(in973, in979);
tmp6385 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-4.25e+00f), tmp6385);
tmp6389 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-4.25e+00f), tmp6389);
tmp6387 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-4.25e+00f), tmp6387);
tmp6391 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-4.25e+00f), tmp6391);
in965 = _mm512_fmadd_ps(tmp6386, _mm512_set1_ps(5.25e+00f), in965);
in973 = _mm512_fmadd_ps(tmp6390, _mm512_set1_ps(5.25e+00f), in973);
tmp6386 = _mm512_fmadd_ps(in967, _mm512_set1_ps(2.5e-01f), in971);
tmp6390 = _mm512_fmadd_ps(in975, _mm512_set1_ps(2.5e-01f), in979);
in967 = _mm512_fmadd_ps(in967, _mm512_set1_ps(4e+00f), in971);
in975 = _mm512_fmadd_ps(in975, _mm512_set1_ps(4e+00f), in979);
__m512 tmp6388 = _mm512_sub_ps(tmp6387, tmp6385);
__m512 tmp6392 = _mm512_sub_ps(tmp6391, tmp6389);
tmp6387 = _mm512_add_ps(tmp6385, tmp6387);
tmp6391 = _mm512_add_ps(tmp6389, tmp6391);
tmp6385 = _mm512_fmadd_ps(in966, _mm512_set1_ps(2.5e-01f), in970);
tmp6389 = _mm512_fmadd_ps(in974, _mm512_set1_ps(2.5e-01f), in978);
tmp6386 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-1.25e+00f), tmp6386);
tmp6390 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-1.25e+00f), tmp6390);
in969 = _mm512_fmadd_ps(in969, _mm512_set1_ps(-5e+00f), in967);
in977 = _mm512_fmadd_ps(in977, _mm512_set1_ps(-5e+00f), in975);
tmp6385 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-1.25e+00f), tmp6385);
tmp6389 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-1.25e+00f), tmp6389);
in971 = _mm512_fmadd_ps(tmp6385, _mm512_set1_ps(2e+00f), tmp6386);
in979 = _mm512_fmadd_ps(tmp6389, _mm512_set1_ps(2e+00f), tmp6390);
tmp6386 = _mm512_fnmadd_ps(tmp6385, _mm512_set1_ps(2e+00f), tmp6386);
tmp6390 = _mm512_fnmadd_ps(tmp6389, _mm512_set1_ps(2e+00f), tmp6390);
tmp6385 = _mm512_fmadd_ps(in970, _mm512_set1_ps(2.5e-01f), in966);
tmp6389 = _mm512_fmadd_ps(in978, _mm512_set1_ps(2.5e-01f), in974);
in966 = _mm512_sub_ps(in972, in966);
in974 = _mm512_sub_ps(in980, in974);
tmp6385 = _mm512_fmadd_ps(in968, _mm512_set1_ps(-1.25e+00f), tmp6385);
tmp6389 = _mm512_fmadd_ps(in976, _mm512_set1_ps(-1.25e+00f), tmp6389);
in968 = _mm512_sub_ps(in968, in970);
in976 = _mm512_sub_ps(in976, in978);
in968 = _mm512_fmadd_ps(in968, _mm512_set1_ps(5.25e+00f), in966);
in976 = _mm512_fmadd_ps(in976, _mm512_set1_ps(5.25e+00f), in974);
in967 = _mm512_fmadd_ps(tmp6385, _mm512_set1_ps(2e+00f), in969);
in975 = _mm512_fmadd_ps(tmp6389, _mm512_set1_ps(2e+00f), in977);
in969 = _mm512_fnmadd_ps(tmp6385, _mm512_set1_ps(2e+00f), in969);
in977 = _mm512_fnmadd_ps(tmp6389, _mm512_set1_ps(2e+00f), in977);
// Rearrangement network over the sixteen first-pass results
// (in965, tmp6387, tmp6388, in971, tmp6386, in967, in969, in968 and
//  in973, tmp6391, tmp6392, in979, tmp6390, in975, in977, in976):
// stage 1 interleaves 32-bit elements (unpacklo/unpackhi),
// stage 2 selects 64-bit pairs (shuffle_ps 68 / 238),
// stages 3-4 select 128-bit lanes (shuffle_f32x4 136 / 221).
// NOTE(review): this looks like a transpose of the 8x8 blocks held in these
// vectors, so the second pass runs along the other axis — not verified lane by lane.
__m512 tmp6401 = _mm512_unpacklo_ps(in965, tmp6387);
__m512 tmp6402 = _mm512_unpackhi_ps(in965, tmp6387);
__m512 tmp6403 = _mm512_unpacklo_ps(tmp6388, in971);
__m512 tmp6404 = _mm512_unpackhi_ps(tmp6388, in971);
__m512 tmp6405 = _mm512_unpacklo_ps(tmp6386, in967);
__m512 tmp6406 = _mm512_unpackhi_ps(tmp6386, in967);
__m512 tmp6407 = _mm512_unpacklo_ps(in969, in968);
__m512 tmp6408 = _mm512_unpackhi_ps(in969, in968);
__m512 tmp6409 = _mm512_unpacklo_ps(in973, tmp6391);
__m512 tmp6410 = _mm512_unpackhi_ps(in973, tmp6391);
__m512 tmp6411 = _mm512_unpacklo_ps(tmp6392, in979);
__m512 tmp6412 = _mm512_unpackhi_ps(tmp6392, in979);
__m512 tmp6413 = _mm512_unpacklo_ps(tmp6390, in975);
__m512 tmp6414 = _mm512_unpackhi_ps(tmp6390, in975);
__m512 tmp6415 = _mm512_unpacklo_ps(in977, in976);
__m512 tmp6416 = _mm512_unpackhi_ps(in977, in976);
__m512 tmp6417 = _mm512_shuffle_ps(tmp6401, tmp6403, 68);
__m512 tmp6418 = _mm512_shuffle_ps(tmp6401, tmp6403, 238);
__m512 tmp6419 = _mm512_shuffle_ps(tmp6402, tmp6404, 68);
__m512 tmp6420 = _mm512_shuffle_ps(tmp6402, tmp6404, 238);
__m512 tmp6421 = _mm512_shuffle_ps(tmp6405, tmp6407, 68);
__m512 tmp6422 = _mm512_shuffle_ps(tmp6405, tmp6407, 238);
__m512 tmp6423 = _mm512_shuffle_ps(tmp6406, tmp6408, 68);
__m512 tmp6424 = _mm512_shuffle_ps(tmp6406, tmp6408, 238);
__m512 tmp6425 = _mm512_shuffle_ps(tmp6409, tmp6411, 68);
__m512 tmp6426 = _mm512_shuffle_ps(tmp6409, tmp6411, 238);
__m512 tmp6427 = _mm512_shuffle_ps(tmp6410, tmp6412, 68);
__m512 tmp6428 = _mm512_shuffle_ps(tmp6410, tmp6412, 238);
__m512 tmp6429 = _mm512_shuffle_ps(tmp6413, tmp6415, 68);
__m512 tmp6430 = _mm512_shuffle_ps(tmp6413, tmp6415, 238);
__m512 tmp6431 = _mm512_shuffle_ps(tmp6414, tmp6416, 68);
__m512 tmp6432 = _mm512_shuffle_ps(tmp6414, tmp6416, 238);
__m512 tmp6433 = _mm512_shuffle_f32x4(tmp6417, tmp6421, 136);
__m512 tmp6434 = _mm512_shuffle_f32x4(tmp6417, tmp6421, 221);
__m512 tmp6435 = _mm512_shuffle_f32x4(tmp6418, tmp6422, 136);
__m512 tmp6436 = _mm512_shuffle_f32x4(tmp6418, tmp6422, 221);
__m512 tmp6437 = _mm512_shuffle_f32x4(tmp6419, tmp6423, 136);
__m512 tmp6438 = _mm512_shuffle_f32x4(tmp6419, tmp6423, 221);
__m512 tmp6439 = _mm512_shuffle_f32x4(tmp6420, tmp6424, 136);
__m512 tmp6440 = _mm512_shuffle_f32x4(tmp6420, tmp6424, 221);
__m512 tmp6441 = _mm512_shuffle_f32x4(tmp6425, tmp6429, 136);
__m512 tmp6442 = _mm512_shuffle_f32x4(tmp6425, tmp6429, 221);
__m512 tmp6443 = _mm512_shuffle_f32x4(tmp6426, tmp6430, 136);
__m512 tmp6444 = _mm512_shuffle_f32x4(tmp6426, tmp6430, 221);
__m512 tmp6445 = _mm512_shuffle_f32x4(tmp6427, tmp6431, 136);
__m512 tmp6446 = _mm512_shuffle_f32x4(tmp6427, tmp6431, 221);
__m512 tmp6447 = _mm512_shuffle_f32x4(tmp6428, tmp6432, 136);
__m512 tmp6448 = _mm512_shuffle_f32x4(tmp6428, tmp6432, 221);
// Final lane-select stage writes the rearranged data back into the same
// variable names that fed the network.
in965 = _mm512_shuffle_f32x4(tmp6433, tmp6441, 136);
in973 = _mm512_shuffle_f32x4(tmp6433, tmp6441, 221);
tmp6387 = _mm512_shuffle_f32x4(tmp6435, tmp6443, 136);
tmp6391 = _mm512_shuffle_f32x4(tmp6435, tmp6443, 221);
tmp6388 = _mm512_shuffle_f32x4(tmp6437, tmp6445, 136);
tmp6392 = _mm512_shuffle_f32x4(tmp6437, tmp6445, 221);
in971 = _mm512_shuffle_f32x4(tmp6439, tmp6447, 136);
in979 = _mm512_shuffle_f32x4(tmp6439, tmp6447, 221);
tmp6386 = _mm512_shuffle_f32x4(tmp6434, tmp6442, 136);
tmp6390 = _mm512_shuffle_f32x4(tmp6434, tmp6442, 221);
in967 = _mm512_shuffle_f32x4(tmp6436, tmp6444, 136);
in975 = _mm512_shuffle_f32x4(tmp6436, tmp6444, 221);
in969 = _mm512_shuffle_f32x4(tmp6438, tmp6446, 136);
in977 = _mm512_shuffle_f32x4(tmp6438, tmp6446, 221);
in968 = _mm512_shuffle_f32x4(tmp6440, tmp6448, 136);
in976 = _mm512_shuffle_f32x4(tmp6440, tmp6448, 221);
// Second pass: the same linear combination and constants as the first pass, now
// with in965, tmp6387, tmp6388, in971, tmp6386, in967, in969, in968 in the roles
// of the eight inputs (and the in973/tmp6391/... group likewise).
__m512 tmp6393 = _mm512_add_ps(tmp6387, in967);
__m512 tmp6397 = _mm512_add_ps(tmp6391, in975);
__m512 tmp6394 = _mm512_sub_ps(tmp6386, tmp6388);
__m512 tmp6398 = _mm512_sub_ps(tmp6390, tmp6392);
__m512 tmp6395 = _mm512_add_ps(tmp6388, in969);
__m512 tmp6399 = _mm512_add_ps(tmp6392, in977);
in965 = _mm512_sub_ps(in965, in969);
in973 = _mm512_sub_ps(in973, in977);
tmp6393 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-4.25e+00f), tmp6393);
tmp6397 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-4.25e+00f), tmp6397);
tmp6395 = _mm512_fmadd_ps(tmp6386, _mm512_set1_ps(-4.25e+00f), tmp6395);
tmp6399 = _mm512_fmadd_ps(tmp6390, _mm512_set1_ps(-4.25e+00f), tmp6399);
in965 = _mm512_fmadd_ps(tmp6394, _mm512_set1_ps(5.25e+00f), in965);
in973 = _mm512_fmadd_ps(tmp6398, _mm512_set1_ps(5.25e+00f), in973);
tmp6394 = _mm512_fmadd_ps(tmp6388, _mm512_set1_ps(2.5e-01f), in969);
tmp6398 = _mm512_fmadd_ps(tmp6392, _mm512_set1_ps(2.5e-01f), in977);
tmp6388 = _mm512_fmadd_ps(tmp6388, _mm512_set1_ps(4e+00f), in969);
tmp6392 = _mm512_fmadd_ps(tmp6392, _mm512_set1_ps(4e+00f), in977);
__m512 tmp6396 = _mm512_sub_ps(tmp6395, tmp6393);
__m512 tmp6400 = _mm512_sub_ps(tmp6399, tmp6397);
tmp6395 = _mm512_add_ps(tmp6393, tmp6395);
tmp6399 = _mm512_add_ps(tmp6397, tmp6399);
tmp6393 = _mm512_fmadd_ps(tmp6387, _mm512_set1_ps(2.5e-01f), in967);
tmp6397 = _mm512_fmadd_ps(tmp6391, _mm512_set1_ps(2.5e-01f), in975);
tmp6394 = _mm512_fmadd_ps(tmp6386, _mm512_set1_ps(-1.25e+00f), tmp6394);
tmp6398 = _mm512_fmadd_ps(tmp6390, _mm512_set1_ps(-1.25e+00f), tmp6398);
tmp6386 = _mm512_fmadd_ps(tmp6386, _mm512_set1_ps(-5e+00f), tmp6388);
tmp6390 = _mm512_fmadd_ps(tmp6390, _mm512_set1_ps(-5e+00f), tmp6392);
tmp6393 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-1.25e+00f), tmp6393);
tmp6397 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-1.25e+00f), tmp6397);
in969 = _mm512_fmadd_ps(tmp6393, _mm512_set1_ps(2e+00f), tmp6394);
in977 = _mm512_fmadd_ps(tmp6397, _mm512_set1_ps(2e+00f), tmp6398);
tmp6394 = _mm512_fnmadd_ps(tmp6393, _mm512_set1_ps(2e+00f), tmp6394);
tmp6398 = _mm512_fnmadd_ps(tmp6397, _mm512_set1_ps(2e+00f), tmp6398);
tmp6393 = _mm512_fmadd_ps(in967, _mm512_set1_ps(2.5e-01f), tmp6387);
tmp6397 = _mm512_fmadd_ps(in975, _mm512_set1_ps(2.5e-01f), tmp6391);
tmp6387 = _mm512_sub_ps(in968, tmp6387);
tmp6391 = _mm512_sub_ps(in976, tmp6391);
tmp6393 = _mm512_fmadd_ps(in971, _mm512_set1_ps(-1.25e+00f), tmp6393);
tmp6397 = _mm512_fmadd_ps(in979, _mm512_set1_ps(-1.25e+00f), tmp6397);
in971 = _mm512_sub_ps(in971, in967);
in979 = _mm512_sub_ps(in979, in975);
in971 = _mm512_fmadd_ps(in971, _mm512_set1_ps(5.25e+00f), tmp6387);
in979 = _mm512_fmadd_ps(in979, _mm512_set1_ps(5.25e+00f), tmp6391);
tmp6388 = _mm512_fmadd_ps(tmp6393, _mm512_set1_ps(2e+00f), tmp6386);
tmp6392 = _mm512_fmadd_ps(tmp6397, _mm512_set1_ps(2e+00f), tmp6390);
tmp6386 = _mm512_fnmadd_ps(tmp6393, _mm512_set1_ps(2e+00f), tmp6386);
tmp6390 = _mm512_fnmadd_ps(tmp6397, _mm512_set1_ps(2e+00f), tmp6390);
// Pack results pairwise for storing: immediate 68 concatenates the low 256 bits of
// both operands, immediate 238 concatenates their high 256 bits.
__m512 out915 = _mm512_shuffle_f32x4(in965, tmp6395, 68);
__m512 out923 = _mm512_shuffle_f32x4(in965, tmp6395, 238);
__m512 out916 = _mm512_shuffle_f32x4(tmp6396, in969, 68);
__m512 out924 = _mm512_shuffle_f32x4(tmp6396, in969, 238);
__m512 out917 = _mm512_shuffle_f32x4(tmp6394, tmp6388, 68);
__m512 out925 = _mm512_shuffle_f32x4(tmp6394, tmp6388, 238);
__m512 out918 = _mm512_shuffle_f32x4(tmp6386, in971, 68);
__m512 out926 = _mm512_shuffle_f32x4(tmp6386, in971, 238);
__m512 out919 = _mm512_shuffle_f32x4(in973, tmp6399, 68);
__m512 out927 = _mm512_shuffle_f32x4(in973, tmp6399, 238);
__m512 out920 = _mm512_shuffle_f32x4(tmp6400, in977, 68);
__m512 out928 = _mm512_shuffle_f32x4(tmp6400, in977, 238);
__m512 out921 = _mm512_shuffle_f32x4(tmp6398, tmp6392, 68);
__m512 out929 = _mm512_shuffle_f32x4(tmp6398, tmp6392, 238);
__m512 out922 = _mm512_shuffle_f32x4(tmp6390, in979, 68);
__m512 out930 = _mm512_shuffle_f32x4(tmp6390, in979, 238);
// Store the sixteen packed vectors to dfPtr6: four groups based at 256, 25856, 51456
// and 77056, each written at sub-offsets +0, +128, +64, +192 (in that statement order).
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k83, out915);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k83, out923);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k83, out919);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k83, out927);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k83, out916);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k83, out924);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k83, out920);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k83, out928);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k83, out917);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k83, out925);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k83, out921);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k83, out929);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k83, out918);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k83, out926);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k83, out922);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k83, out930);
__m512 dat1505 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1506 = _mm512_maskz_loadu_ps(511, datPtr12+12704+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512i pm133 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in981 = _mm512_permutexvar_ps(pm133, dat1505);
__m512i pm134 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in989 = _mm512_permutexvar_ps(pm134, dat1506);
__m512 dat1507 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1508 = _mm512_maskz_loadu_ps(511, datPtr12+12928+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in982 = _mm512_permutexvar_ps(pm133, dat1507);
__m512 in990 = _mm512_permutexvar_ps(pm134, dat1508);
__m512 dat1509 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1510 = _mm512_maskz_loadu_ps(511, datPtr12+13152+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in983 = _mm512_permutexvar_ps(pm133, dat1509);
__m512 in991 = _mm512_permutexvar_ps(pm134, dat1510);
__m512 dat1511 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1512 = _mm512_maskz_loadu_ps(511, datPtr12+13376+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in984 = _mm512_permutexvar_ps(pm133, dat1511);
__m512 in992 = _mm512_permutexvar_ps(pm134, dat1512);
__m512 dat1513 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1514 = _mm512_maskz_loadu_ps(511, datPtr12+13600+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in985 = _mm512_permutexvar_ps(pm133, dat1513);
__m512 in993 = _mm512_permutexvar_ps(pm134, dat1514);
__m512 dat1515 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1516 = _mm512_maskz_loadu_ps(511, datPtr12+13824+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in986 = _mm512_permutexvar_ps(pm133, dat1515);
__m512 in994 = _mm512_permutexvar_ps(pm134, dat1516);
__m512 dat1517 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1518 = _mm512_maskz_loadu_ps(511, datPtr12+14048+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in987 = _mm512_permutexvar_ps(pm133, dat1517);
__m512 in995 = _mm512_permutexvar_ps(pm134, dat1518);
__m512 dat1519 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 dat1520 = _mm512_maskz_loadu_ps(511, datPtr12+14272+50432*i27+224*h33+4*w40+50432*s19+25216*k83);
__m512 in988 = _mm512_permutexvar_ps(pm133, dat1519);
__m512 in996 = _mm512_permutexvar_ps(pm134, dat1520);
// First 8-point pass. Two independent register sets are processed in
// lock-step (statements alternate between them):
//   set A: d0..d7 = in981..in988, set B: d0..d7 = in989..in996.
// Ignoring FMA rounding, each set is reduced to eight combinations:
//   y0 = (d0 - d6) + 5.25*(d4 - d2)
//   y1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   y2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   y3 = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   y4 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   y5 = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   y6 = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   y7 = (d7 - d1) + 5.25*(d3 - d5)
// Registers are reused, so on exit y0..y7 live in
//   set A: in981, tmp6451, tmp6452, in987, tmp6450, in983, in985, in984
//   set B: in989, tmp6455, tmp6456, in995, tmp6454, in991, in993, in992.
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform -- confirm against the generator.
__m512 tmp6449 = _mm512_add_ps(in982, in986);
__m512 tmp6453 = _mm512_add_ps(in990, in994);
__m512 tmp6450 = _mm512_sub_ps(in985, in983);
__m512 tmp6454 = _mm512_sub_ps(in993, in991);
__m512 tmp6451 = _mm512_add_ps(in983, in987);
__m512 tmp6455 = _mm512_add_ps(in991, in995);
in981 = _mm512_sub_ps(in981, in987);
in989 = _mm512_sub_ps(in989, in995);
tmp6449 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-4.25e+00f), tmp6449);
tmp6453 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-4.25e+00f), tmp6453);
tmp6451 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-4.25e+00f), tmp6451);
tmp6455 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-4.25e+00f), tmp6455);
// y0 is complete here.
in981 = _mm512_fmadd_ps(tmp6450, _mm512_set1_ps(5.25e+00f), in981);
in989 = _mm512_fmadd_ps(tmp6454, _mm512_set1_ps(5.25e+00f), in989);
tmp6450 = _mm512_fmadd_ps(in983, _mm512_set1_ps(2.5e-01f), in987);
tmp6454 = _mm512_fmadd_ps(in991, _mm512_set1_ps(2.5e-01f), in995);
in983 = _mm512_fmadd_ps(in983, _mm512_set1_ps(4e+00f), in987);
in991 = _mm512_fmadd_ps(in991, _mm512_set1_ps(4e+00f), in995);
// y2 (difference) and y1 (sum) of the two -4.25 partial sums.
__m512 tmp6452 = _mm512_sub_ps(tmp6451, tmp6449);
__m512 tmp6456 = _mm512_sub_ps(tmp6455, tmp6453);
tmp6451 = _mm512_add_ps(tmp6449, tmp6451);
tmp6455 = _mm512_add_ps(tmp6453, tmp6455);
tmp6449 = _mm512_fmadd_ps(in982, _mm512_set1_ps(2.5e-01f), in986);
tmp6453 = _mm512_fmadd_ps(in990, _mm512_set1_ps(2.5e-01f), in994);
tmp6450 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-1.25e+00f), tmp6450);
tmp6454 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-1.25e+00f), tmp6454);
in985 = _mm512_fmadd_ps(in985, _mm512_set1_ps(-5e+00f), in983);
in993 = _mm512_fmadd_ps(in993, _mm512_set1_ps(-5e+00f), in991);
tmp6449 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-1.25e+00f), tmp6449);
tmp6453 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-1.25e+00f), tmp6453);
// y3 (fmadd) and y4 (fnmadd) share the same two partial sums.
in987 = _mm512_fmadd_ps(tmp6449, _mm512_set1_ps(2e+00f), tmp6450);
in995 = _mm512_fmadd_ps(tmp6453, _mm512_set1_ps(2e+00f), tmp6454);
tmp6450 = _mm512_fnmadd_ps(tmp6449, _mm512_set1_ps(2e+00f), tmp6450);
tmp6454 = _mm512_fnmadd_ps(tmp6453, _mm512_set1_ps(2e+00f), tmp6454);
tmp6449 = _mm512_fmadd_ps(in986, _mm512_set1_ps(2.5e-01f), in982);
tmp6453 = _mm512_fmadd_ps(in994, _mm512_set1_ps(2.5e-01f), in990);
// d1 is no longer needed after the line above, so its register now holds d7 - d1.
in982 = _mm512_sub_ps(in988, in982);
in990 = _mm512_sub_ps(in996, in990);
tmp6449 = _mm512_fmadd_ps(in984, _mm512_set1_ps(-1.25e+00f), tmp6449);
tmp6453 = _mm512_fmadd_ps(in992, _mm512_set1_ps(-1.25e+00f), tmp6453);
// y7 is built in the d3 register.
in984 = _mm512_sub_ps(in984, in986);
in992 = _mm512_sub_ps(in992, in994);
in984 = _mm512_fmadd_ps(in984, _mm512_set1_ps(5.25e+00f), in982);
in992 = _mm512_fmadd_ps(in992, _mm512_set1_ps(5.25e+00f), in990);
// y5 (fmadd) and y6 (fnmadd).
in983 = _mm512_fmadd_ps(tmp6449, _mm512_set1_ps(2e+00f), in985);
in991 = _mm512_fmadd_ps(tmp6453, _mm512_set1_ps(2e+00f), in993);
in985 = _mm512_fnmadd_ps(tmp6449, _mm512_set1_ps(2e+00f), in985);
in993 = _mm512_fnmadd_ps(tmp6453, _mm512_set1_ps(2e+00f), in993);
// Regroup the sixteen first-pass results across registers in four stages.
// Presumably this is the transpose that lets the second pass run along the
// other axis of each tile -- confirm against the generator.
// Stage 1: interleave 32-bit elements of result pairs (y0,y1), (y2,y3),
// (y4,y5), (y6,y7) for set A, then the same for set B.
__m512 tmp6465 = _mm512_unpacklo_ps(in981, tmp6451);
__m512 tmp6466 = _mm512_unpackhi_ps(in981, tmp6451);
__m512 tmp6467 = _mm512_unpacklo_ps(tmp6452, in987);
__m512 tmp6468 = _mm512_unpackhi_ps(tmp6452, in987);
__m512 tmp6469 = _mm512_unpacklo_ps(tmp6450, in983);
__m512 tmp6470 = _mm512_unpackhi_ps(tmp6450, in983);
__m512 tmp6471 = _mm512_unpacklo_ps(in985, in984);
__m512 tmp6472 = _mm512_unpackhi_ps(in985, in984);
__m512 tmp6473 = _mm512_unpacklo_ps(in989, tmp6455);
__m512 tmp6474 = _mm512_unpackhi_ps(in989, tmp6455);
__m512 tmp6475 = _mm512_unpacklo_ps(tmp6456, in995);
__m512 tmp6476 = _mm512_unpackhi_ps(tmp6456, in995);
__m512 tmp6477 = _mm512_unpacklo_ps(tmp6454, in991);
__m512 tmp6478 = _mm512_unpackhi_ps(tmp6454, in991);
__m512 tmp6479 = _mm512_unpacklo_ps(in993, in992);
__m512 tmp6480 = _mm512_unpackhi_ps(in993, in992);
// Stage 2: within each 128-bit lane, imm 68 takes elements 0,1 of each
// operand and imm 238 takes elements 2,3 of each operand.
__m512 tmp6481 = _mm512_shuffle_ps(tmp6465, tmp6467, 68);
__m512 tmp6482 = _mm512_shuffle_ps(tmp6465, tmp6467, 238);
__m512 tmp6483 = _mm512_shuffle_ps(tmp6466, tmp6468, 68);
__m512 tmp6484 = _mm512_shuffle_ps(tmp6466, tmp6468, 238);
__m512 tmp6485 = _mm512_shuffle_ps(tmp6469, tmp6471, 68);
__m512 tmp6486 = _mm512_shuffle_ps(tmp6469, tmp6471, 238);
__m512 tmp6487 = _mm512_shuffle_ps(tmp6470, tmp6472, 68);
__m512 tmp6488 = _mm512_shuffle_ps(tmp6470, tmp6472, 238);
__m512 tmp6489 = _mm512_shuffle_ps(tmp6473, tmp6475, 68);
__m512 tmp6490 = _mm512_shuffle_ps(tmp6473, tmp6475, 238);
__m512 tmp6491 = _mm512_shuffle_ps(tmp6474, tmp6476, 68);
__m512 tmp6492 = _mm512_shuffle_ps(tmp6474, tmp6476, 238);
__m512 tmp6493 = _mm512_shuffle_ps(tmp6477, tmp6479, 68);
__m512 tmp6494 = _mm512_shuffle_ps(tmp6477, tmp6479, 238);
__m512 tmp6495 = _mm512_shuffle_ps(tmp6478, tmp6480, 68);
__m512 tmp6496 = _mm512_shuffle_ps(tmp6478, tmp6480, 238);
// Stage 3: 128-bit lane shuffle within each set. imm 136 takes lanes 0,2 of
// each operand; imm 221 takes lanes 1,3 of each operand.
__m512 tmp6497 = _mm512_shuffle_f32x4(tmp6481, tmp6485, 136);
__m512 tmp6498 = _mm512_shuffle_f32x4(tmp6481, tmp6485, 221);
__m512 tmp6499 = _mm512_shuffle_f32x4(tmp6482, tmp6486, 136);
__m512 tmp6500 = _mm512_shuffle_f32x4(tmp6482, tmp6486, 221);
__m512 tmp6501 = _mm512_shuffle_f32x4(tmp6483, tmp6487, 136);
__m512 tmp6502 = _mm512_shuffle_f32x4(tmp6483, tmp6487, 221);
__m512 tmp6503 = _mm512_shuffle_f32x4(tmp6484, tmp6488, 136);
__m512 tmp6504 = _mm512_shuffle_f32x4(tmp6484, tmp6488, 221);
__m512 tmp6505 = _mm512_shuffle_f32x4(tmp6489, tmp6493, 136);
__m512 tmp6506 = _mm512_shuffle_f32x4(tmp6489, tmp6493, 221);
__m512 tmp6507 = _mm512_shuffle_f32x4(tmp6490, tmp6494, 136);
__m512 tmp6508 = _mm512_shuffle_f32x4(tmp6490, tmp6494, 221);
__m512 tmp6509 = _mm512_shuffle_f32x4(tmp6491, tmp6495, 136);
__m512 tmp6510 = _mm512_shuffle_f32x4(tmp6491, tmp6495, 221);
__m512 tmp6511 = _mm512_shuffle_f32x4(tmp6492, tmp6496, 136);
__m512 tmp6512 = _mm512_shuffle_f32x4(tmp6492, tmp6496, 221);
// Stage 4: the same lane shuffle, now mixing set A (first operand) with
// set B (second operand). Results overwrite the first-pass registers, which
// become the inputs d0..d7 of the second pass:
//   imm 136 -> in981, tmp6451, tmp6452, in987, tmp6450, in983, in985, in984
//   imm 221 -> in989, tmp6455, tmp6456, in995, tmp6454, in991, in993, in992
in981 = _mm512_shuffle_f32x4(tmp6497, tmp6505, 136);
in989 = _mm512_shuffle_f32x4(tmp6497, tmp6505, 221);
tmp6451 = _mm512_shuffle_f32x4(tmp6499, tmp6507, 136);
tmp6455 = _mm512_shuffle_f32x4(tmp6499, tmp6507, 221);
tmp6452 = _mm512_shuffle_f32x4(tmp6501, tmp6509, 136);
tmp6456 = _mm512_shuffle_f32x4(tmp6501, tmp6509, 221);
in987 = _mm512_shuffle_f32x4(tmp6503, tmp6511, 136);
in995 = _mm512_shuffle_f32x4(tmp6503, tmp6511, 221);
tmp6450 = _mm512_shuffle_f32x4(tmp6498, tmp6506, 136);
tmp6454 = _mm512_shuffle_f32x4(tmp6498, tmp6506, 221);
in983 = _mm512_shuffle_f32x4(tmp6500, tmp6508, 136);
in991 = _mm512_shuffle_f32x4(tmp6500, tmp6508, 221);
in985 = _mm512_shuffle_f32x4(tmp6502, tmp6510, 136);
in993 = _mm512_shuffle_f32x4(tmp6502, tmp6510, 221);
in984 = _mm512_shuffle_f32x4(tmp6504, tmp6512, 136);
in992 = _mm512_shuffle_f32x4(tmp6504, tmp6512, 221);
// Second 8-point pass: the same eight linear combinations as the first pass
// (coefficients 5.25, -4.25, 0.25, 4, -1.25, -5, +/-2), applied to the
// regrouped registers. Inputs d0..d7 are
//   set A: in981, tmp6451, tmp6452, in987, tmp6450, in983, in985, in984
//   set B: in989, tmp6455, tmp6456, in995, tmp6454, in991, in993, in992
// and on exit the results y0..y7 live in
//   set A: in981, tmp6459, tmp6460, in985, tmp6458, tmp6452, tmp6450, in987
//   set B: in989, tmp6463, tmp6464, in993, tmp6462, tmp6456, tmp6454, in995.
__m512 tmp6457 = _mm512_add_ps(tmp6451, in983);
__m512 tmp6461 = _mm512_add_ps(tmp6455, in991);
__m512 tmp6458 = _mm512_sub_ps(tmp6450, tmp6452);
__m512 tmp6462 = _mm512_sub_ps(tmp6454, tmp6456);
__m512 tmp6459 = _mm512_add_ps(tmp6452, in985);
__m512 tmp6463 = _mm512_add_ps(tmp6456, in993);
in981 = _mm512_sub_ps(in981, in985);
in989 = _mm512_sub_ps(in989, in993);
tmp6457 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-4.25e+00f), tmp6457);
tmp6461 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-4.25e+00f), tmp6461);
tmp6459 = _mm512_fmadd_ps(tmp6450, _mm512_set1_ps(-4.25e+00f), tmp6459);
tmp6463 = _mm512_fmadd_ps(tmp6454, _mm512_set1_ps(-4.25e+00f), tmp6463);
// y0 = (d0 - d6) + 5.25*(d4 - d2)
in981 = _mm512_fmadd_ps(tmp6458, _mm512_set1_ps(5.25e+00f), in981);
in989 = _mm512_fmadd_ps(tmp6462, _mm512_set1_ps(5.25e+00f), in989);
tmp6458 = _mm512_fmadd_ps(tmp6452, _mm512_set1_ps(2.5e-01f), in985);
tmp6462 = _mm512_fmadd_ps(tmp6456, _mm512_set1_ps(2.5e-01f), in993);
tmp6452 = _mm512_fmadd_ps(tmp6452, _mm512_set1_ps(4e+00f), in985);
tmp6456 = _mm512_fmadd_ps(tmp6456, _mm512_set1_ps(4e+00f), in993);
// y2 (difference) and y1 (sum) of the two -4.25 partial sums.
__m512 tmp6460 = _mm512_sub_ps(tmp6459, tmp6457);
__m512 tmp6464 = _mm512_sub_ps(tmp6463, tmp6461);
tmp6459 = _mm512_add_ps(tmp6457, tmp6459);
tmp6463 = _mm512_add_ps(tmp6461, tmp6463);
tmp6457 = _mm512_fmadd_ps(tmp6451, _mm512_set1_ps(2.5e-01f), in983);
tmp6461 = _mm512_fmadd_ps(tmp6455, _mm512_set1_ps(2.5e-01f), in991);
tmp6458 = _mm512_fmadd_ps(tmp6450, _mm512_set1_ps(-1.25e+00f), tmp6458);
tmp6462 = _mm512_fmadd_ps(tmp6454, _mm512_set1_ps(-1.25e+00f), tmp6462);
tmp6450 = _mm512_fmadd_ps(tmp6450, _mm512_set1_ps(-5e+00f), tmp6452);
tmp6454 = _mm512_fmadd_ps(tmp6454, _mm512_set1_ps(-5e+00f), tmp6456);
tmp6457 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-1.25e+00f), tmp6457);
tmp6461 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-1.25e+00f), tmp6461);
// y3 (fmadd) and y4 (fnmadd).
in985 = _mm512_fmadd_ps(tmp6457, _mm512_set1_ps(2e+00f), tmp6458);
in993 = _mm512_fmadd_ps(tmp6461, _mm512_set1_ps(2e+00f), tmp6462);
tmp6458 = _mm512_fnmadd_ps(tmp6457, _mm512_set1_ps(2e+00f), tmp6458);
tmp6462 = _mm512_fnmadd_ps(tmp6461, _mm512_set1_ps(2e+00f), tmp6462);
tmp6457 = _mm512_fmadd_ps(in983, _mm512_set1_ps(2.5e-01f), tmp6451);
tmp6461 = _mm512_fmadd_ps(in991, _mm512_set1_ps(2.5e-01f), tmp6455);
// d1's register is reused for d7 - d1.
tmp6451 = _mm512_sub_ps(in984, tmp6451);
tmp6455 = _mm512_sub_ps(in992, tmp6455);
tmp6457 = _mm512_fmadd_ps(in987, _mm512_set1_ps(-1.25e+00f), tmp6457);
tmp6461 = _mm512_fmadd_ps(in995, _mm512_set1_ps(-1.25e+00f), tmp6461);
// y7 = (d7 - d1) + 5.25*(d3 - d5), built in the d3 register.
in987 = _mm512_sub_ps(in987, in983);
in995 = _mm512_sub_ps(in995, in991);
in987 = _mm512_fmadd_ps(in987, _mm512_set1_ps(5.25e+00f), tmp6451);
in995 = _mm512_fmadd_ps(in995, _mm512_set1_ps(5.25e+00f), tmp6455);
// y5 (fmadd) and y6 (fnmadd).
tmp6452 = _mm512_fmadd_ps(tmp6457, _mm512_set1_ps(2e+00f), tmp6450);
tmp6456 = _mm512_fmadd_ps(tmp6461, _mm512_set1_ps(2e+00f), tmp6454);
tmp6450 = _mm512_fnmadd_ps(tmp6457, _mm512_set1_ps(2e+00f), tmp6450);
tmp6454 = _mm512_fnmadd_ps(tmp6461, _mm512_set1_ps(2e+00f), tmp6454);
// Pack the sixteen second-pass results pairwise: (y0,y1), (y2,y3), (y4,y5),
// (y6,y7) for set A, then set B. imm 68 concatenates 128-bit lanes 0,1 of
// both operands (their low 256 bits); imm 238 concatenates lanes 2,3 (their
// high 256 bits).
__m512 out931 = _mm512_shuffle_f32x4(in981, tmp6459, 68);
__m512 out939 = _mm512_shuffle_f32x4(in981, tmp6459, 238);
__m512 out932 = _mm512_shuffle_f32x4(tmp6460, in985, 68);
__m512 out940 = _mm512_shuffle_f32x4(tmp6460, in985, 238);
__m512 out933 = _mm512_shuffle_f32x4(tmp6458, tmp6452, 68);
__m512 out941 = _mm512_shuffle_f32x4(tmp6458, tmp6452, 238);
__m512 out934 = _mm512_shuffle_f32x4(tmp6450, in987, 68);
__m512 out942 = _mm512_shuffle_f32x4(tmp6450, in987, 238);
__m512 out935 = _mm512_shuffle_f32x4(in989, tmp6463, 68);
__m512 out943 = _mm512_shuffle_f32x4(in989, tmp6463, 238);
__m512 out936 = _mm512_shuffle_f32x4(tmp6464, in993, 68);
__m512 out944 = _mm512_shuffle_f32x4(tmp6464, in993, 238);
__m512 out937 = _mm512_shuffle_f32x4(tmp6462, tmp6456, 68);
__m512 out945 = _mm512_shuffle_f32x4(tmp6462, tmp6456, 238);
__m512 out938 = _mm512_shuffle_f32x4(tmp6454, in995, 68);
__m512 out946 = _mm512_shuffle_f32x4(tmp6454, in995, 238);
// Store the sixteen packed vectors to dfPtr6 in four groups whose constant
// offsets start at 512, 26112, 51712 and 77312 (25600 apart), one group per
// result pair. Within a group: +0 = set A low halves, +64 = set B low
// halves, +128 = set A high halves, +192 = set B high halves.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k83, out931);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k83, out939);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k83, out935);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k83, out943);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k83, out932);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k83, out940);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k83, out936);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k83, out944);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k83, out933);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k83, out941);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k83, out937);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k83, out945);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k83, out934);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k83, out942);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k83, out938);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k83, out946);
// Closes a block opened before this excerpt -- presumably the loop over k83.
}
// Advance j21, which selects the 1536-unit slot in the dfPtr6 store offsets.
++j21;
// Stop once j21 reaches 15. The loop or switch this leaves is not visible here.
if (j21 >= 15) break;
// 3 satisfies the `rel14 < 4` guard that appears next in the file.
rel14 = 3;
// Closes another block opened before this excerpt.
}
// Branch taken when rel14 < 4. It starts at row index base14+12 and column
// index 0, and loops k84 over 0 and 1; k84 scales the 25216 term of the load
// offset and the 768 term of the store offset. The braces opened here close
// after the end of this excerpt.
if (rel14 < 4) {
ptrdiff_t h34 = base14+12;
ptrdiff_t w41 = 0;
ptrdiff_t k84 = 0;
for (; k84 != 2; ++k84) {
// Load eight row pairs, 224 offset units apart (constants 4, 228, ..., 1572
// and 48, 272, ..., 1616). Mask 8191 keeps lanes 0..12 and mask 16383 keeps
// lanes 0..13; maskz zero-fills the rest.
__m512 dat1521 = _mm512_maskz_loadu_ps(8191, datPtr12+4+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1522 = _mm512_maskz_loadu_ps(16383, datPtr12+48+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
// _mm512_set_epi32 lists lane 15 first and lane 0 last. pm135 therefore puts
// source lane 15 (zeroed by mask 8191) in output lane 0, source lanes 0..6 in
// output lanes 1..7, and source lanes 5..12 in output lanes 8..15.
// Output lane 0 is always zero -- presumably left-edge padding; confirm.
__m512i pm135 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in997 = _mm512_permutexvar_ps(pm135, dat1521);
// pm136 puts source lanes 0..7 in the low half and source lanes 6..13 in the
// high half, so the two halves share two source lanes.
__m512i pm136 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1005 = _mm512_permutexvar_ps(pm136, dat1522);
__m512 dat1523 = _mm512_maskz_loadu_ps(8191, datPtr12+228+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1524 = _mm512_maskz_loadu_ps(16383, datPtr12+272+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in998 = _mm512_permutexvar_ps(pm135, dat1523);
__m512 in1006 = _mm512_permutexvar_ps(pm136, dat1524);
__m512 dat1525 = _mm512_maskz_loadu_ps(8191, datPtr12+452+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1526 = _mm512_maskz_loadu_ps(16383, datPtr12+496+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in999 = _mm512_permutexvar_ps(pm135, dat1525);
__m512 in1007 = _mm512_permutexvar_ps(pm136, dat1526);
__m512 dat1527 = _mm512_maskz_loadu_ps(8191, datPtr12+676+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1528 = _mm512_maskz_loadu_ps(16383, datPtr12+720+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1000 = _mm512_permutexvar_ps(pm135, dat1527);
__m512 in1008 = _mm512_permutexvar_ps(pm136, dat1528);
__m512 dat1529 = _mm512_maskz_loadu_ps(8191, datPtr12+900+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1530 = _mm512_maskz_loadu_ps(16383, datPtr12+944+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1001 = _mm512_permutexvar_ps(pm135, dat1529);
__m512 in1009 = _mm512_permutexvar_ps(pm136, dat1530);
__m512 dat1531 = _mm512_maskz_loadu_ps(8191, datPtr12+1124+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1532 = _mm512_maskz_loadu_ps(16383, datPtr12+1168+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1002 = _mm512_permutexvar_ps(pm135, dat1531);
__m512 in1010 = _mm512_permutexvar_ps(pm136, dat1532);
__m512 dat1533 = _mm512_maskz_loadu_ps(8191, datPtr12+1348+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1534 = _mm512_maskz_loadu_ps(16383, datPtr12+1392+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1003 = _mm512_permutexvar_ps(pm135, dat1533);
__m512 in1011 = _mm512_permutexvar_ps(pm136, dat1534);
__m512 dat1535 = _mm512_maskz_loadu_ps(8191, datPtr12+1572+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1536 = _mm512_maskz_loadu_ps(16383, datPtr12+1616+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1004 = _mm512_permutexvar_ps(pm135, dat1535);
__m512 in1012 = _mm512_permutexvar_ps(pm136, dat1536);
// First 8-point pass for this iteration. Two register sets are processed in
// lock-step: set A with d0..d7 = in997..in1004, set B with d0..d7 =
// in1005..in1012. Ignoring FMA rounding, each set yields:
//   y0 = (d0 - d6) + 5.25*(d4 - d2)
//   y1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   y2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   y3 = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   y4 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   y5 = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   y6 = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   y7 = (d7 - d1) + 5.25*(d3 - d5)
// On exit y0..y7 live in
//   set A: in997, tmp6515, tmp6516, in1003, tmp6514, in999, in1001, in1000
//   set B: in1005, tmp6519, tmp6520, in1011, tmp6518, in1007, in1009, in1008.
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform -- confirm against the generator.
__m512 tmp6513 = _mm512_add_ps(in998, in1002);
__m512 tmp6517 = _mm512_add_ps(in1006, in1010);
__m512 tmp6514 = _mm512_sub_ps(in1001, in999);
__m512 tmp6518 = _mm512_sub_ps(in1009, in1007);
__m512 tmp6515 = _mm512_add_ps(in999, in1003);
__m512 tmp6519 = _mm512_add_ps(in1007, in1011);
in997 = _mm512_sub_ps(in997, in1003);
in1005 = _mm512_sub_ps(in1005, in1011);
tmp6513 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-4.25e+00f), tmp6513);
tmp6517 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-4.25e+00f), tmp6517);
tmp6515 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-4.25e+00f), tmp6515);
tmp6519 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-4.25e+00f), tmp6519);
// y0 is complete here.
in997 = _mm512_fmadd_ps(tmp6514, _mm512_set1_ps(5.25e+00f), in997);
in1005 = _mm512_fmadd_ps(tmp6518, _mm512_set1_ps(5.25e+00f), in1005);
tmp6514 = _mm512_fmadd_ps(in999, _mm512_set1_ps(2.5e-01f), in1003);
tmp6518 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(2.5e-01f), in1011);
in999 = _mm512_fmadd_ps(in999, _mm512_set1_ps(4e+00f), in1003);
in1007 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(4e+00f), in1011);
// y2 (difference) and y1 (sum) of the two -4.25 partial sums.
__m512 tmp6516 = _mm512_sub_ps(tmp6515, tmp6513);
__m512 tmp6520 = _mm512_sub_ps(tmp6519, tmp6517);
tmp6515 = _mm512_add_ps(tmp6513, tmp6515);
tmp6519 = _mm512_add_ps(tmp6517, tmp6519);
tmp6513 = _mm512_fmadd_ps(in998, _mm512_set1_ps(2.5e-01f), in1002);
tmp6517 = _mm512_fmadd_ps(in1006, _mm512_set1_ps(2.5e-01f), in1010);
tmp6514 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-1.25e+00f), tmp6514);
tmp6518 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-1.25e+00f), tmp6518);
in1001 = _mm512_fmadd_ps(in1001, _mm512_set1_ps(-5e+00f), in999);
in1009 = _mm512_fmadd_ps(in1009, _mm512_set1_ps(-5e+00f), in1007);
tmp6513 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-1.25e+00f), tmp6513);
tmp6517 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-1.25e+00f), tmp6517);
// y3 (fmadd) and y4 (fnmadd) share the same two partial sums.
in1003 = _mm512_fmadd_ps(tmp6513, _mm512_set1_ps(2e+00f), tmp6514);
in1011 = _mm512_fmadd_ps(tmp6517, _mm512_set1_ps(2e+00f), tmp6518);
tmp6514 = _mm512_fnmadd_ps(tmp6513, _mm512_set1_ps(2e+00f), tmp6514);
tmp6518 = _mm512_fnmadd_ps(tmp6517, _mm512_set1_ps(2e+00f), tmp6518);
tmp6513 = _mm512_fmadd_ps(in1002, _mm512_set1_ps(2.5e-01f), in998);
tmp6517 = _mm512_fmadd_ps(in1010, _mm512_set1_ps(2.5e-01f), in1006);
// d1 is no longer needed after the line above, so its register now holds d7 - d1.
in998 = _mm512_sub_ps(in1004, in998);
in1006 = _mm512_sub_ps(in1012, in1006);
tmp6513 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(-1.25e+00f), tmp6513);
tmp6517 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(-1.25e+00f), tmp6517);
// y7 is built in the d3 register.
in1000 = _mm512_sub_ps(in1000, in1002);
in1008 = _mm512_sub_ps(in1008, in1010);
in1000 = _mm512_fmadd_ps(in1000, _mm512_set1_ps(5.25e+00f), in998);
in1008 = _mm512_fmadd_ps(in1008, _mm512_set1_ps(5.25e+00f), in1006);
// y5 (fmadd) and y6 (fnmadd).
in999 = _mm512_fmadd_ps(tmp6513, _mm512_set1_ps(2e+00f), in1001);
in1007 = _mm512_fmadd_ps(tmp6517, _mm512_set1_ps(2e+00f), in1009);
in1001 = _mm512_fnmadd_ps(tmp6513, _mm512_set1_ps(2e+00f), in1001);
in1009 = _mm512_fnmadd_ps(tmp6517, _mm512_set1_ps(2e+00f), in1009);
// Regroup the sixteen first-pass results across registers in four stages.
// Presumably this is the transpose that lets the second pass run along the
// other axis of each tile -- confirm against the generator.
// Stage 1: interleave 32-bit elements of result pairs (y0,y1), (y2,y3),
// (y4,y5), (y6,y7) for set A, then the same for set B.
__m512 tmp6529 = _mm512_unpacklo_ps(in997, tmp6515);
__m512 tmp6530 = _mm512_unpackhi_ps(in997, tmp6515);
__m512 tmp6531 = _mm512_unpacklo_ps(tmp6516, in1003);
__m512 tmp6532 = _mm512_unpackhi_ps(tmp6516, in1003);
__m512 tmp6533 = _mm512_unpacklo_ps(tmp6514, in999);
__m512 tmp6534 = _mm512_unpackhi_ps(tmp6514, in999);
__m512 tmp6535 = _mm512_unpacklo_ps(in1001, in1000);
__m512 tmp6536 = _mm512_unpackhi_ps(in1001, in1000);
__m512 tmp6537 = _mm512_unpacklo_ps(in1005, tmp6519);
__m512 tmp6538 = _mm512_unpackhi_ps(in1005, tmp6519);
__m512 tmp6539 = _mm512_unpacklo_ps(tmp6520, in1011);
__m512 tmp6540 = _mm512_unpackhi_ps(tmp6520, in1011);
__m512 tmp6541 = _mm512_unpacklo_ps(tmp6518, in1007);
__m512 tmp6542 = _mm512_unpackhi_ps(tmp6518, in1007);
__m512 tmp6543 = _mm512_unpacklo_ps(in1009, in1008);
__m512 tmp6544 = _mm512_unpackhi_ps(in1009, in1008);
// Stage 2: within each 128-bit lane, imm 68 takes elements 0,1 of each
// operand and imm 238 takes elements 2,3 of each operand.
__m512 tmp6545 = _mm512_shuffle_ps(tmp6529, tmp6531, 68);
__m512 tmp6546 = _mm512_shuffle_ps(tmp6529, tmp6531, 238);
__m512 tmp6547 = _mm512_shuffle_ps(tmp6530, tmp6532, 68);
__m512 tmp6548 = _mm512_shuffle_ps(tmp6530, tmp6532, 238);
__m512 tmp6549 = _mm512_shuffle_ps(tmp6533, tmp6535, 68);
__m512 tmp6550 = _mm512_shuffle_ps(tmp6533, tmp6535, 238);
__m512 tmp6551 = _mm512_shuffle_ps(tmp6534, tmp6536, 68);
__m512 tmp6552 = _mm512_shuffle_ps(tmp6534, tmp6536, 238);
__m512 tmp6553 = _mm512_shuffle_ps(tmp6537, tmp6539, 68);
__m512 tmp6554 = _mm512_shuffle_ps(tmp6537, tmp6539, 238);
__m512 tmp6555 = _mm512_shuffle_ps(tmp6538, tmp6540, 68);
__m512 tmp6556 = _mm512_shuffle_ps(tmp6538, tmp6540, 238);
__m512 tmp6557 = _mm512_shuffle_ps(tmp6541, tmp6543, 68);
__m512 tmp6558 = _mm512_shuffle_ps(tmp6541, tmp6543, 238);
__m512 tmp6559 = _mm512_shuffle_ps(tmp6542, tmp6544, 68);
__m512 tmp6560 = _mm512_shuffle_ps(tmp6542, tmp6544, 238);
// Stage 3: 128-bit lane shuffle within each set. imm 136 takes lanes 0,2 of
// each operand; imm 221 takes lanes 1,3 of each operand.
__m512 tmp6561 = _mm512_shuffle_f32x4(tmp6545, tmp6549, 136);
__m512 tmp6562 = _mm512_shuffle_f32x4(tmp6545, tmp6549, 221);
__m512 tmp6563 = _mm512_shuffle_f32x4(tmp6546, tmp6550, 136);
__m512 tmp6564 = _mm512_shuffle_f32x4(tmp6546, tmp6550, 221);
__m512 tmp6565 = _mm512_shuffle_f32x4(tmp6547, tmp6551, 136);
__m512 tmp6566 = _mm512_shuffle_f32x4(tmp6547, tmp6551, 221);
__m512 tmp6567 = _mm512_shuffle_f32x4(tmp6548, tmp6552, 136);
__m512 tmp6568 = _mm512_shuffle_f32x4(tmp6548, tmp6552, 221);
__m512 tmp6569 = _mm512_shuffle_f32x4(tmp6553, tmp6557, 136);
__m512 tmp6570 = _mm512_shuffle_f32x4(tmp6553, tmp6557, 221);
__m512 tmp6571 = _mm512_shuffle_f32x4(tmp6554, tmp6558, 136);
__m512 tmp6572 = _mm512_shuffle_f32x4(tmp6554, tmp6558, 221);
__m512 tmp6573 = _mm512_shuffle_f32x4(tmp6555, tmp6559, 136);
__m512 tmp6574 = _mm512_shuffle_f32x4(tmp6555, tmp6559, 221);
__m512 tmp6575 = _mm512_shuffle_f32x4(tmp6556, tmp6560, 136);
__m512 tmp6576 = _mm512_shuffle_f32x4(tmp6556, tmp6560, 221);
// Stage 4: the same lane shuffle, now mixing set A (first operand) with
// set B (second operand). Results overwrite the first-pass registers, which
// become the inputs d0..d7 of the second pass:
//   imm 136 -> in997, tmp6515, tmp6516, in1003, tmp6514, in999, in1001, in1000
//   imm 221 -> in1005, tmp6519, tmp6520, in1011, tmp6518, in1007, in1009, in1008
in997 = _mm512_shuffle_f32x4(tmp6561, tmp6569, 136);
in1005 = _mm512_shuffle_f32x4(tmp6561, tmp6569, 221);
tmp6515 = _mm512_shuffle_f32x4(tmp6563, tmp6571, 136);
tmp6519 = _mm512_shuffle_f32x4(tmp6563, tmp6571, 221);
tmp6516 = _mm512_shuffle_f32x4(tmp6565, tmp6573, 136);
tmp6520 = _mm512_shuffle_f32x4(tmp6565, tmp6573, 221);
in1003 = _mm512_shuffle_f32x4(tmp6567, tmp6575, 136);
in1011 = _mm512_shuffle_f32x4(tmp6567, tmp6575, 221);
tmp6514 = _mm512_shuffle_f32x4(tmp6562, tmp6570, 136);
tmp6518 = _mm512_shuffle_f32x4(tmp6562, tmp6570, 221);
in999 = _mm512_shuffle_f32x4(tmp6564, tmp6572, 136);
in1007 = _mm512_shuffle_f32x4(tmp6564, tmp6572, 221);
in1001 = _mm512_shuffle_f32x4(tmp6566, tmp6574, 136);
in1009 = _mm512_shuffle_f32x4(tmp6566, tmp6574, 221);
in1000 = _mm512_shuffle_f32x4(tmp6568, tmp6576, 136);
in1008 = _mm512_shuffle_f32x4(tmp6568, tmp6576, 221);
// Second 8-point pass: the same eight linear combinations as the first pass
// (coefficients 5.25, -4.25, 0.25, 4, -1.25, -5, +/-2), applied to the
// regrouped registers. Inputs d0..d7 are
//   set A: in997, tmp6515, tmp6516, in1003, tmp6514, in999, in1001, in1000
//   set B: in1005, tmp6519, tmp6520, in1011, tmp6518, in1007, in1009, in1008
// and on exit the results y0..y7 live in
//   set A: in997, tmp6523, tmp6524, in1001, tmp6522, tmp6516, tmp6514, in1003
//   set B: in1005, tmp6527, tmp6528, in1009, tmp6526, tmp6520, tmp6518, in1011.
__m512 tmp6521 = _mm512_add_ps(tmp6515, in999);
__m512 tmp6525 = _mm512_add_ps(tmp6519, in1007);
__m512 tmp6522 = _mm512_sub_ps(tmp6514, tmp6516);
__m512 tmp6526 = _mm512_sub_ps(tmp6518, tmp6520);
__m512 tmp6523 = _mm512_add_ps(tmp6516, in1001);
__m512 tmp6527 = _mm512_add_ps(tmp6520, in1009);
in997 = _mm512_sub_ps(in997, in1001);
in1005 = _mm512_sub_ps(in1005, in1009);
tmp6521 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-4.25e+00f), tmp6521);
tmp6525 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-4.25e+00f), tmp6525);
tmp6523 = _mm512_fmadd_ps(tmp6514, _mm512_set1_ps(-4.25e+00f), tmp6523);
tmp6527 = _mm512_fmadd_ps(tmp6518, _mm512_set1_ps(-4.25e+00f), tmp6527);
// y0 = (d0 - d6) + 5.25*(d4 - d2)
in997 = _mm512_fmadd_ps(tmp6522, _mm512_set1_ps(5.25e+00f), in997);
in1005 = _mm512_fmadd_ps(tmp6526, _mm512_set1_ps(5.25e+00f), in1005);
tmp6522 = _mm512_fmadd_ps(tmp6516, _mm512_set1_ps(2.5e-01f), in1001);
tmp6526 = _mm512_fmadd_ps(tmp6520, _mm512_set1_ps(2.5e-01f), in1009);
tmp6516 = _mm512_fmadd_ps(tmp6516, _mm512_set1_ps(4e+00f), in1001);
tmp6520 = _mm512_fmadd_ps(tmp6520, _mm512_set1_ps(4e+00f), in1009);
// y2 (difference) and y1 (sum) of the two -4.25 partial sums.
__m512 tmp6524 = _mm512_sub_ps(tmp6523, tmp6521);
__m512 tmp6528 = _mm512_sub_ps(tmp6527, tmp6525);
tmp6523 = _mm512_add_ps(tmp6521, tmp6523);
tmp6527 = _mm512_add_ps(tmp6525, tmp6527);
tmp6521 = _mm512_fmadd_ps(tmp6515, _mm512_set1_ps(2.5e-01f), in999);
tmp6525 = _mm512_fmadd_ps(tmp6519, _mm512_set1_ps(2.5e-01f), in1007);
tmp6522 = _mm512_fmadd_ps(tmp6514, _mm512_set1_ps(-1.25e+00f), tmp6522);
tmp6526 = _mm512_fmadd_ps(tmp6518, _mm512_set1_ps(-1.25e+00f), tmp6526);
tmp6514 = _mm512_fmadd_ps(tmp6514, _mm512_set1_ps(-5e+00f), tmp6516);
tmp6518 = _mm512_fmadd_ps(tmp6518, _mm512_set1_ps(-5e+00f), tmp6520);
tmp6521 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-1.25e+00f), tmp6521);
tmp6525 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-1.25e+00f), tmp6525);
// y3 (fmadd) and y4 (fnmadd).
in1001 = _mm512_fmadd_ps(tmp6521, _mm512_set1_ps(2e+00f), tmp6522);
in1009 = _mm512_fmadd_ps(tmp6525, _mm512_set1_ps(2e+00f), tmp6526);
tmp6522 = _mm512_fnmadd_ps(tmp6521, _mm512_set1_ps(2e+00f), tmp6522);
tmp6526 = _mm512_fnmadd_ps(tmp6525, _mm512_set1_ps(2e+00f), tmp6526);
tmp6521 = _mm512_fmadd_ps(in999, _mm512_set1_ps(2.5e-01f), tmp6515);
tmp6525 = _mm512_fmadd_ps(in1007, _mm512_set1_ps(2.5e-01f), tmp6519);
// d1's register is reused for d7 - d1.
tmp6515 = _mm512_sub_ps(in1000, tmp6515);
tmp6519 = _mm512_sub_ps(in1008, tmp6519);
tmp6521 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(-1.25e+00f), tmp6521);
tmp6525 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(-1.25e+00f), tmp6525);
// y7 = (d7 - d1) + 5.25*(d3 - d5), built in the d3 register.
in1003 = _mm512_sub_ps(in1003, in999);
in1011 = _mm512_sub_ps(in1011, in1007);
in1003 = _mm512_fmadd_ps(in1003, _mm512_set1_ps(5.25e+00f), tmp6515);
in1011 = _mm512_fmadd_ps(in1011, _mm512_set1_ps(5.25e+00f), tmp6519);
// y5 (fmadd) and y6 (fnmadd).
tmp6516 = _mm512_fmadd_ps(tmp6521, _mm512_set1_ps(2e+00f), tmp6514);
tmp6520 = _mm512_fmadd_ps(tmp6525, _mm512_set1_ps(2e+00f), tmp6518);
tmp6514 = _mm512_fnmadd_ps(tmp6521, _mm512_set1_ps(2e+00f), tmp6514);
tmp6518 = _mm512_fnmadd_ps(tmp6525, _mm512_set1_ps(2e+00f), tmp6518);
// Pack the sixteen second-pass results pairwise: (y0,y1), (y2,y3), (y4,y5),
// (y6,y7) for set A, then set B. imm 68 concatenates 128-bit lanes 0,1 of
// both operands (their low 256 bits); imm 238 concatenates lanes 2,3 (their
// high 256 bits).
__m512 out947 = _mm512_shuffle_f32x4(in997, tmp6523, 68);
__m512 out955 = _mm512_shuffle_f32x4(in997, tmp6523, 238);
__m512 out948 = _mm512_shuffle_f32x4(tmp6524, in1001, 68);
__m512 out956 = _mm512_shuffle_f32x4(tmp6524, in1001, 238);
__m512 out949 = _mm512_shuffle_f32x4(tmp6522, tmp6516, 68);
__m512 out957 = _mm512_shuffle_f32x4(tmp6522, tmp6516, 238);
__m512 out950 = _mm512_shuffle_f32x4(tmp6514, in1003, 68);
__m512 out958 = _mm512_shuffle_f32x4(tmp6514, in1003, 238);
__m512 out951 = _mm512_shuffle_f32x4(in1005, tmp6527, 68);
__m512 out959 = _mm512_shuffle_f32x4(in1005, tmp6527, 238);
__m512 out952 = _mm512_shuffle_f32x4(tmp6528, in1009, 68);
__m512 out960 = _mm512_shuffle_f32x4(tmp6528, in1009, 238);
__m512 out953 = _mm512_shuffle_f32x4(tmp6526, tmp6520, 68);
__m512 out961 = _mm512_shuffle_f32x4(tmp6526, tmp6520, 238);
__m512 out954 = _mm512_shuffle_f32x4(tmp6518, in1011, 68);
__m512 out962 = _mm512_shuffle_f32x4(tmp6518, in1011, 238);
// Store the sixteen packed vectors to dfPtr6 in four groups whose constant
// offsets start at 0, 25600, 51200 and 76800, one group per result pair.
// Within a group: +0 = set A low halves, +64 = set B low halves,
// +128 = set A high halves, +192 = set B high halves.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k84, out947);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k84, out955);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k84, out951);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k84, out959);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k84, out948);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k84, out956);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k84, out952);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k84, out960);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k84, out949);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k84, out957);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k84, out953);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k84, out961);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k84, out950);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k84, out958);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k84, out954);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k84, out962);
// Second set of eight row pairs in this k84 iteration. The first load of each
// pair uses constants 96, 320, ..., 1664 with mask 16383 (lanes 0..13). The
// second uses constants 12612, 12836, ..., 14180 with mask 8191 (lanes
// 0..12). Both sequences step by 224 per row.
__m512 dat1537 = _mm512_maskz_loadu_ps(16383, datPtr12+96+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1538 = _mm512_maskz_loadu_ps(8191, datPtr12+12612+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
// pm137 puts source lanes 0..7 in the low half and source lanes 6..13 in the
// high half.
__m512i pm137 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1013 = _mm512_permutexvar_ps(pm137, dat1537);
// pm138 puts source lane 15 (zeroed by mask 8191) in output lane 0, source
// lanes 0..6 in output lanes 1..7, and source lanes 5..12 in output lanes
// 8..15. Output lane 0 is always zero -- presumably left-edge padding; confirm.
__m512i pm138 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1021 = _mm512_permutexvar_ps(pm138, dat1538);
__m512 dat1539 = _mm512_maskz_loadu_ps(16383, datPtr12+320+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1540 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1014 = _mm512_permutexvar_ps(pm137, dat1539);
__m512 in1022 = _mm512_permutexvar_ps(pm138, dat1540);
__m512 dat1541 = _mm512_maskz_loadu_ps(16383, datPtr12+544+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1542 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1015 = _mm512_permutexvar_ps(pm137, dat1541);
__m512 in1023 = _mm512_permutexvar_ps(pm138, dat1542);
__m512 dat1543 = _mm512_maskz_loadu_ps(16383, datPtr12+768+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1544 = _mm512_maskz_loadu_ps(8191, datPtr12+13284+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1016 = _mm512_permutexvar_ps(pm137, dat1543);
__m512 in1024 = _mm512_permutexvar_ps(pm138, dat1544);
__m512 dat1545 = _mm512_maskz_loadu_ps(16383, datPtr12+992+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1546 = _mm512_maskz_loadu_ps(8191, datPtr12+13508+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1017 = _mm512_permutexvar_ps(pm137, dat1545);
__m512 in1025 = _mm512_permutexvar_ps(pm138, dat1546);
__m512 dat1547 = _mm512_maskz_loadu_ps(16383, datPtr12+1216+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1548 = _mm512_maskz_loadu_ps(8191, datPtr12+13732+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1018 = _mm512_permutexvar_ps(pm137, dat1547);
__m512 in1026 = _mm512_permutexvar_ps(pm138, dat1548);
__m512 dat1549 = _mm512_maskz_loadu_ps(16383, datPtr12+1440+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1550 = _mm512_maskz_loadu_ps(8191, datPtr12+13956+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1019 = _mm512_permutexvar_ps(pm137, dat1549);
__m512 in1027 = _mm512_permutexvar_ps(pm138, dat1550);
__m512 dat1551 = _mm512_maskz_loadu_ps(16383, datPtr12+1664+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1552 = _mm512_maskz_loadu_ps(8191, datPtr12+14180+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1020 = _mm512_permutexvar_ps(pm137, dat1551);
__m512 in1028 = _mm512_permutexvar_ps(pm138, dat1552);
// First 8-point pass for the second set of rows. Two register sets are
// processed in lock-step: set A with d0..d7 = in1013..in1020, set B with
// d0..d7 = in1021..in1028. Ignoring FMA rounding, each set yields:
//   y0 = (d0 - d6) + 5.25*(d4 - d2)
//   y1 = (d2 + d6 - 4.25*d4) + (d1 + d5 - 4.25*d3)
//   y2 = (d2 + d6 - 4.25*d4) - (d1 + d5 - 4.25*d3)
//   y3 = (0.25*d2 + d6 - 1.25*d4) + 2*(0.25*d1 + d5 - 1.25*d3)
//   y4 = (0.25*d2 + d6 - 1.25*d4) - 2*(0.25*d1 + d5 - 1.25*d3)
//   y5 = (4*d2 + d6 - 5*d4) + 2*(d1 + 0.25*d5 - 1.25*d3)
//   y6 = (4*d2 + d6 - 5*d4) - 2*(d1 + 0.25*d5 - 1.25*d3)
//   y7 = (d7 - d1) + 5.25*(d3 - d5)
// On exit y0..y7 live in
//   set A: in1013, tmp6579, tmp6580, in1019, tmp6578, in1015, in1017, in1016
//   set B: in1021, tmp6583, tmp6584, in1027, tmp6582, in1023, in1025, in1024.
// NOTE(review): these coefficients look like the 8-point Winograd F(6,3)
// input transform -- confirm against the generator.
__m512 tmp6577 = _mm512_add_ps(in1014, in1018);
__m512 tmp6581 = _mm512_add_ps(in1022, in1026);
__m512 tmp6578 = _mm512_sub_ps(in1017, in1015);
__m512 tmp6582 = _mm512_sub_ps(in1025, in1023);
__m512 tmp6579 = _mm512_add_ps(in1015, in1019);
__m512 tmp6583 = _mm512_add_ps(in1023, in1027);
in1013 = _mm512_sub_ps(in1013, in1019);
in1021 = _mm512_sub_ps(in1021, in1027);
tmp6577 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-4.25e+00f), tmp6577);
tmp6581 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-4.25e+00f), tmp6581);
tmp6579 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-4.25e+00f), tmp6579);
tmp6583 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-4.25e+00f), tmp6583);
// y0 is complete here.
in1013 = _mm512_fmadd_ps(tmp6578, _mm512_set1_ps(5.25e+00f), in1013);
in1021 = _mm512_fmadd_ps(tmp6582, _mm512_set1_ps(5.25e+00f), in1021);
tmp6578 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(2.5e-01f), in1019);
tmp6582 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(2.5e-01f), in1027);
in1015 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(4e+00f), in1019);
in1023 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(4e+00f), in1027);
// y2 (difference) and y1 (sum) of the two -4.25 partial sums.
__m512 tmp6580 = _mm512_sub_ps(tmp6579, tmp6577);
__m512 tmp6584 = _mm512_sub_ps(tmp6583, tmp6581);
tmp6579 = _mm512_add_ps(tmp6577, tmp6579);
tmp6583 = _mm512_add_ps(tmp6581, tmp6583);
tmp6577 = _mm512_fmadd_ps(in1014, _mm512_set1_ps(2.5e-01f), in1018);
tmp6581 = _mm512_fmadd_ps(in1022, _mm512_set1_ps(2.5e-01f), in1026);
tmp6578 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-1.25e+00f), tmp6578);
tmp6582 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-1.25e+00f), tmp6582);
in1017 = _mm512_fmadd_ps(in1017, _mm512_set1_ps(-5e+00f), in1015);
in1025 = _mm512_fmadd_ps(in1025, _mm512_set1_ps(-5e+00f), in1023);
tmp6577 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-1.25e+00f), tmp6577);
tmp6581 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-1.25e+00f), tmp6581);
// y3 (fmadd) and y4 (fnmadd) share the same two partial sums.
in1019 = _mm512_fmadd_ps(tmp6577, _mm512_set1_ps(2e+00f), tmp6578);
in1027 = _mm512_fmadd_ps(tmp6581, _mm512_set1_ps(2e+00f), tmp6582);
tmp6578 = _mm512_fnmadd_ps(tmp6577, _mm512_set1_ps(2e+00f), tmp6578);
tmp6582 = _mm512_fnmadd_ps(tmp6581, _mm512_set1_ps(2e+00f), tmp6582);
tmp6577 = _mm512_fmadd_ps(in1018, _mm512_set1_ps(2.5e-01f), in1014);
tmp6581 = _mm512_fmadd_ps(in1026, _mm512_set1_ps(2.5e-01f), in1022);
// d1 is no longer needed after the line above, so its register now holds d7 - d1.
in1014 = _mm512_sub_ps(in1020, in1014);
in1022 = _mm512_sub_ps(in1028, in1022);
tmp6577 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(-1.25e+00f), tmp6577);
tmp6581 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(-1.25e+00f), tmp6581);
// y7 is built in the d3 register.
in1016 = _mm512_sub_ps(in1016, in1018);
in1024 = _mm512_sub_ps(in1024, in1026);
in1016 = _mm512_fmadd_ps(in1016, _mm512_set1_ps(5.25e+00f), in1014);
in1024 = _mm512_fmadd_ps(in1024, _mm512_set1_ps(5.25e+00f), in1022);
// y5 (fmadd) and y6 (fnmadd).
in1015 = _mm512_fmadd_ps(tmp6577, _mm512_set1_ps(2e+00f), in1017);
in1023 = _mm512_fmadd_ps(tmp6581, _mm512_set1_ps(2e+00f), in1025);
in1017 = _mm512_fnmadd_ps(tmp6577, _mm512_set1_ps(2e+00f), in1017);
in1025 = _mm512_fnmadd_ps(tmp6581, _mm512_set1_ps(2e+00f), in1025);
__m512 tmp6593 = _mm512_unpacklo_ps(in1013, tmp6579);
__m512 tmp6594 = _mm512_unpackhi_ps(in1013, tmp6579);
__m512 tmp6595 = _mm512_unpacklo_ps(tmp6580, in1019);
__m512 tmp6596 = _mm512_unpackhi_ps(tmp6580, in1019);
__m512 tmp6597 = _mm512_unpacklo_ps(tmp6578, in1015);
__m512 tmp6598 = _mm512_unpackhi_ps(tmp6578, in1015);
__m512 tmp6599 = _mm512_unpacklo_ps(in1017, in1016);
__m512 tmp6600 = _mm512_unpackhi_ps(in1017, in1016);
__m512 tmp6601 = _mm512_unpacklo_ps(in1021, tmp6583);
__m512 tmp6602 = _mm512_unpackhi_ps(in1021, tmp6583);
__m512 tmp6603 = _mm512_unpacklo_ps(tmp6584, in1027);
__m512 tmp6604 = _mm512_unpackhi_ps(tmp6584, in1027);
__m512 tmp6605 = _mm512_unpacklo_ps(tmp6582, in1023);
__m512 tmp6606 = _mm512_unpackhi_ps(tmp6582, in1023);
__m512 tmp6607 = _mm512_unpacklo_ps(in1025, in1024);
__m512 tmp6608 = _mm512_unpackhi_ps(in1025, in1024);
__m512 tmp6609 = _mm512_shuffle_ps(tmp6593, tmp6595, 68);
__m512 tmp6610 = _mm512_shuffle_ps(tmp6593, tmp6595, 238);
__m512 tmp6611 = _mm512_shuffle_ps(tmp6594, tmp6596, 68);
__m512 tmp6612 = _mm512_shuffle_ps(tmp6594, tmp6596, 238);
__m512 tmp6613 = _mm512_shuffle_ps(tmp6597, tmp6599, 68);
__m512 tmp6614 = _mm512_shuffle_ps(tmp6597, tmp6599, 238);
__m512 tmp6615 = _mm512_shuffle_ps(tmp6598, tmp6600, 68);
__m512 tmp6616 = _mm512_shuffle_ps(tmp6598, tmp6600, 238);
__m512 tmp6617 = _mm512_shuffle_ps(tmp6601, tmp6603, 68);
__m512 tmp6618 = _mm512_shuffle_ps(tmp6601, tmp6603, 238);
__m512 tmp6619 = _mm512_shuffle_ps(tmp6602, tmp6604, 68);
__m512 tmp6620 = _mm512_shuffle_ps(tmp6602, tmp6604, 238);
__m512 tmp6621 = _mm512_shuffle_ps(tmp6605, tmp6607, 68);
__m512 tmp6622 = _mm512_shuffle_ps(tmp6605, tmp6607, 238);
__m512 tmp6623 = _mm512_shuffle_ps(tmp6606, tmp6608, 68);
__m512 tmp6624 = _mm512_shuffle_ps(tmp6606, tmp6608, 238);
__m512 tmp6625 = _mm512_shuffle_f32x4(tmp6609, tmp6613, 136);
__m512 tmp6626 = _mm512_shuffle_f32x4(tmp6609, tmp6613, 221);
__m512 tmp6627 = _mm512_shuffle_f32x4(tmp6610, tmp6614, 136);
__m512 tmp6628 = _mm512_shuffle_f32x4(tmp6610, tmp6614, 221);
__m512 tmp6629 = _mm512_shuffle_f32x4(tmp6611, tmp6615, 136);
__m512 tmp6630 = _mm512_shuffle_f32x4(tmp6611, tmp6615, 221);
__m512 tmp6631 = _mm512_shuffle_f32x4(tmp6612, tmp6616, 136);
__m512 tmp6632 = _mm512_shuffle_f32x4(tmp6612, tmp6616, 221);
__m512 tmp6633 = _mm512_shuffle_f32x4(tmp6617, tmp6621, 136);
__m512 tmp6634 = _mm512_shuffle_f32x4(tmp6617, tmp6621, 221);
__m512 tmp6635 = _mm512_shuffle_f32x4(tmp6618, tmp6622, 136);
__m512 tmp6636 = _mm512_shuffle_f32x4(tmp6618, tmp6622, 221);
__m512 tmp6637 = _mm512_shuffle_f32x4(tmp6619, tmp6623, 136);
__m512 tmp6638 = _mm512_shuffle_f32x4(tmp6619, tmp6623, 221);
__m512 tmp6639 = _mm512_shuffle_f32x4(tmp6620, tmp6624, 136);
__m512 tmp6640 = _mm512_shuffle_f32x4(tmp6620, tmp6624, 221);
in1013 = _mm512_shuffle_f32x4(tmp6625, tmp6633, 136);
in1021 = _mm512_shuffle_f32x4(tmp6625, tmp6633, 221);
tmp6579 = _mm512_shuffle_f32x4(tmp6627, tmp6635, 136);
tmp6583 = _mm512_shuffle_f32x4(tmp6627, tmp6635, 221);
tmp6580 = _mm512_shuffle_f32x4(tmp6629, tmp6637, 136);
tmp6584 = _mm512_shuffle_f32x4(tmp6629, tmp6637, 221);
in1019 = _mm512_shuffle_f32x4(tmp6631, tmp6639, 136);
in1027 = _mm512_shuffle_f32x4(tmp6631, tmp6639, 221);
tmp6578 = _mm512_shuffle_f32x4(tmp6626, tmp6634, 136);
tmp6582 = _mm512_shuffle_f32x4(tmp6626, tmp6634, 221);
in1015 = _mm512_shuffle_f32x4(tmp6628, tmp6636, 136);
in1023 = _mm512_shuffle_f32x4(tmp6628, tmp6636, 221);
in1017 = _mm512_shuffle_f32x4(tmp6630, tmp6638, 136);
in1025 = _mm512_shuffle_f32x4(tmp6630, tmp6638, 221);
in1016 = _mm512_shuffle_f32x4(tmp6632, tmp6640, 136);
in1024 = _mm512_shuffle_f32x4(tmp6632, tmp6640, 221);
__m512 tmp6585 = _mm512_add_ps(tmp6579, in1015);
__m512 tmp6589 = _mm512_add_ps(tmp6583, in1023);
__m512 tmp6586 = _mm512_sub_ps(tmp6578, tmp6580);
__m512 tmp6590 = _mm512_sub_ps(tmp6582, tmp6584);
__m512 tmp6587 = _mm512_add_ps(tmp6580, in1017);
__m512 tmp6591 = _mm512_add_ps(tmp6584, in1025);
in1013 = _mm512_sub_ps(in1013, in1017);
in1021 = _mm512_sub_ps(in1021, in1025);
tmp6585 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-4.25e+00f), tmp6585);
tmp6589 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-4.25e+00f), tmp6589);
tmp6587 = _mm512_fmadd_ps(tmp6578, _mm512_set1_ps(-4.25e+00f), tmp6587);
tmp6591 = _mm512_fmadd_ps(tmp6582, _mm512_set1_ps(-4.25e+00f), tmp6591);
in1013 = _mm512_fmadd_ps(tmp6586, _mm512_set1_ps(5.25e+00f), in1013);
in1021 = _mm512_fmadd_ps(tmp6590, _mm512_set1_ps(5.25e+00f), in1021);
tmp6586 = _mm512_fmadd_ps(tmp6580, _mm512_set1_ps(2.5e-01f), in1017);
tmp6590 = _mm512_fmadd_ps(tmp6584, _mm512_set1_ps(2.5e-01f), in1025);
tmp6580 = _mm512_fmadd_ps(tmp6580, _mm512_set1_ps(4e+00f), in1017);
tmp6584 = _mm512_fmadd_ps(tmp6584, _mm512_set1_ps(4e+00f), in1025);
__m512 tmp6588 = _mm512_sub_ps(tmp6587, tmp6585);
__m512 tmp6592 = _mm512_sub_ps(tmp6591, tmp6589);
tmp6587 = _mm512_add_ps(tmp6585, tmp6587);
tmp6591 = _mm512_add_ps(tmp6589, tmp6591);
tmp6585 = _mm512_fmadd_ps(tmp6579, _mm512_set1_ps(2.5e-01f), in1015);
tmp6589 = _mm512_fmadd_ps(tmp6583, _mm512_set1_ps(2.5e-01f), in1023);
tmp6586 = _mm512_fmadd_ps(tmp6578, _mm512_set1_ps(-1.25e+00f), tmp6586);
tmp6590 = _mm512_fmadd_ps(tmp6582, _mm512_set1_ps(-1.25e+00f), tmp6590);
tmp6578 = _mm512_fmadd_ps(tmp6578, _mm512_set1_ps(-5e+00f), tmp6580);
tmp6582 = _mm512_fmadd_ps(tmp6582, _mm512_set1_ps(-5e+00f), tmp6584);
tmp6585 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-1.25e+00f), tmp6585);
tmp6589 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-1.25e+00f), tmp6589);
in1017 = _mm512_fmadd_ps(tmp6585, _mm512_set1_ps(2e+00f), tmp6586);
in1025 = _mm512_fmadd_ps(tmp6589, _mm512_set1_ps(2e+00f), tmp6590);
tmp6586 = _mm512_fnmadd_ps(tmp6585, _mm512_set1_ps(2e+00f), tmp6586);
tmp6590 = _mm512_fnmadd_ps(tmp6589, _mm512_set1_ps(2e+00f), tmp6590);
tmp6585 = _mm512_fmadd_ps(in1015, _mm512_set1_ps(2.5e-01f), tmp6579);
tmp6589 = _mm512_fmadd_ps(in1023, _mm512_set1_ps(2.5e-01f), tmp6583);
tmp6579 = _mm512_sub_ps(in1016, tmp6579);
tmp6583 = _mm512_sub_ps(in1024, tmp6583);
tmp6585 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(-1.25e+00f), tmp6585);
tmp6589 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(-1.25e+00f), tmp6589);
in1019 = _mm512_sub_ps(in1019, in1015);
in1027 = _mm512_sub_ps(in1027, in1023);
in1019 = _mm512_fmadd_ps(in1019, _mm512_set1_ps(5.25e+00f), tmp6579);
in1027 = _mm512_fmadd_ps(in1027, _mm512_set1_ps(5.25e+00f), tmp6583);
tmp6580 = _mm512_fmadd_ps(tmp6585, _mm512_set1_ps(2e+00f), tmp6578);
tmp6584 = _mm512_fmadd_ps(tmp6589, _mm512_set1_ps(2e+00f), tmp6582);
tmp6578 = _mm512_fnmadd_ps(tmp6585, _mm512_set1_ps(2e+00f), tmp6578);
tmp6582 = _mm512_fnmadd_ps(tmp6589, _mm512_set1_ps(2e+00f), tmp6582);
__m512 out963 = _mm512_shuffle_f32x4(in1013, tmp6587, 68);
__m512 out971 = _mm512_shuffle_f32x4(in1013, tmp6587, 238);
__m512 out964 = _mm512_shuffle_f32x4(tmp6588, in1017, 68);
__m512 out972 = _mm512_shuffle_f32x4(tmp6588, in1017, 238);
__m512 out965 = _mm512_shuffle_f32x4(tmp6586, tmp6580, 68);
__m512 out973 = _mm512_shuffle_f32x4(tmp6586, tmp6580, 238);
__m512 out966 = _mm512_shuffle_f32x4(tmp6578, in1019, 68);
__m512 out974 = _mm512_shuffle_f32x4(tmp6578, in1019, 238);
__m512 out967 = _mm512_shuffle_f32x4(in1021, tmp6591, 68);
__m512 out975 = _mm512_shuffle_f32x4(in1021, tmp6591, 238);
__m512 out968 = _mm512_shuffle_f32x4(tmp6592, in1025, 68);
__m512 out976 = _mm512_shuffle_f32x4(tmp6592, in1025, 238);
__m512 out969 = _mm512_shuffle_f32x4(tmp6590, tmp6584, 68);
__m512 out977 = _mm512_shuffle_f32x4(tmp6590, tmp6584, 238);
__m512 out970 = _mm512_shuffle_f32x4(tmp6582, in1027, 68);
__m512 out978 = _mm512_shuffle_f32x4(tmp6582, in1027, 238);
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k84, out963);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k84, out971);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k84, out967);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k84, out975);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k84, out964);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k84, out972);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k84, out968);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k84, out976);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k84, out965);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k84, out973);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k84, out969);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k84, out977);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k84, out966);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k84, out974);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k84, out970);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k84, out978);
// Transform one group of input data and write the result to dfPtr6.
// Stages: (1) masked loads of 8 rows x 2 segments plus a lane permutation,
// (2) an 8-point linear combination across the 8 rows, (3) an 8x8
// transposition of every 8-lane half, (4) the same 8-point combination
// applied to the transposed data, (5) repacking of 256-bit halves and 16 stores.
// NOTE(review): the coefficients (5.25, 4.25, 0.25, 1.25, 4, 5, 2) match the
// 8-point input transform commonly used for Winograd F(6x6, 3x3) convolution;
// confirm against the generator before relying on that reading.
//
// Stage 1 -- loads. Mask 16383 (0x3FFF) reads lanes 0..13 and zeroes lanes
// 14..15. Each row is loaded twice, at addresses 48 apart; consecutive rows
// are 224 apart. NOTE(review): datPtr12 is declared outside this view; if it
// is a byte pointer, 48 is 12 floats and 224 is 56 floats -- TODO confirm.
__m512 dat1553 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1554 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
// _mm512_set_epi32 lists lanes from 15 down to 0, so after the permutation
// lanes 0..7 hold source elements 0..7 and lanes 8..15 hold source elements
// 6..13: two 8-element windows that overlap by two elements.
__m512i pm139 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1029 = _mm512_permutexvar_ps(pm139, dat1553);
__m512 in1037 = _mm512_permutexvar_ps(pm139, dat1554);
__m512 dat1555 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1556 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1030 = _mm512_permutexvar_ps(pm139, dat1555);
__m512 in1038 = _mm512_permutexvar_ps(pm139, dat1556);
__m512 dat1557 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1558 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1031 = _mm512_permutexvar_ps(pm139, dat1557);
__m512 in1039 = _mm512_permutexvar_ps(pm139, dat1558);
__m512 dat1559 = _mm512_maskz_loadu_ps(16383, datPtr12+13328+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1560 = _mm512_maskz_loadu_ps(16383, datPtr12+13376+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1032 = _mm512_permutexvar_ps(pm139, dat1559);
__m512 in1040 = _mm512_permutexvar_ps(pm139, dat1560);
__m512 dat1561 = _mm512_maskz_loadu_ps(16383, datPtr12+13552+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1562 = _mm512_maskz_loadu_ps(16383, datPtr12+13600+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1033 = _mm512_permutexvar_ps(pm139, dat1561);
__m512 in1041 = _mm512_permutexvar_ps(pm139, dat1562);
__m512 dat1563 = _mm512_maskz_loadu_ps(16383, datPtr12+13776+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1564 = _mm512_maskz_loadu_ps(16383, datPtr12+13824+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1034 = _mm512_permutexvar_ps(pm139, dat1563);
__m512 in1042 = _mm512_permutexvar_ps(pm139, dat1564);
__m512 dat1565 = _mm512_maskz_loadu_ps(16383, datPtr12+14000+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1566 = _mm512_maskz_loadu_ps(16383, datPtr12+14048+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1035 = _mm512_permutexvar_ps(pm139, dat1565);
__m512 in1043 = _mm512_permutexvar_ps(pm139, dat1566);
__m512 dat1567 = _mm512_maskz_loadu_ps(16383, datPtr12+14224+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 dat1568 = _mm512_maskz_loadu_ps(16383, datPtr12+14272+50432*i27+224*h34+4*w41+50432*s19+25216*k84);
__m512 in1036 = _mm512_permutexvar_ps(pm139, dat1567);
__m512 in1044 = _mm512_permutexvar_ps(pm139, dat1568);
// Stage 2 -- combination across the 8 rows. Statements come in pairs: the
// first of each pair works on the first-segment registers (in1029..in1036),
// the second on the second-segment registers (in1037..in1044). Registers are
// overwritten in place, so statement order matters. With d0..d7 the eight
// loaded rows, the values left at the end of this stage are
//   r0 =  d0 - 5.25*d2 + 5.25*d4 - d6                           (in1029 / in1037)
//   r1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6                 (tmp6643 / tmp6647)
//   r2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6                 (tmp6644 / tmp6648)
//   r3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6       (in1035 / in1043)
//   r4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6       (tmp6642 / tmp6646)
//   r5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6             (in1031 / in1039)
//   r6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6             (in1033 / in1041)
//   r7 = -d1 + 5.25*d3 - 5.25*d5 + d7                           (in1032 / in1040)
__m512 tmp6641 = _mm512_add_ps(in1030, in1034);
__m512 tmp6645 = _mm512_add_ps(in1038, in1042);
__m512 tmp6642 = _mm512_sub_ps(in1033, in1031);
__m512 tmp6646 = _mm512_sub_ps(in1041, in1039);
__m512 tmp6643 = _mm512_add_ps(in1031, in1035);
__m512 tmp6647 = _mm512_add_ps(in1039, in1043);
in1029 = _mm512_sub_ps(in1029, in1035);
in1037 = _mm512_sub_ps(in1037, in1043);
tmp6641 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-4.25e+00f), tmp6641);
tmp6645 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-4.25e+00f), tmp6645);
tmp6643 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-4.25e+00f), tmp6643);
tmp6647 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-4.25e+00f), tmp6647);
in1029 = _mm512_fmadd_ps(tmp6642, _mm512_set1_ps(5.25e+00f), in1029);
in1037 = _mm512_fmadd_ps(tmp6646, _mm512_set1_ps(5.25e+00f), in1037);
tmp6642 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(2.5e-01f), in1035);
tmp6646 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(2.5e-01f), in1043);
in1031 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(4e+00f), in1035);
in1039 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(4e+00f), in1043);
__m512 tmp6644 = _mm512_sub_ps(tmp6643, tmp6641);
__m512 tmp6648 = _mm512_sub_ps(tmp6647, tmp6645);
tmp6643 = _mm512_add_ps(tmp6641, tmp6643);
tmp6647 = _mm512_add_ps(tmp6645, tmp6647);
tmp6641 = _mm512_fmadd_ps(in1030, _mm512_set1_ps(2.5e-01f), in1034);
tmp6645 = _mm512_fmadd_ps(in1038, _mm512_set1_ps(2.5e-01f), in1042);
tmp6642 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-1.25e+00f), tmp6642);
tmp6646 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-1.25e+00f), tmp6646);
in1033 = _mm512_fmadd_ps(in1033, _mm512_set1_ps(-5e+00f), in1031);
in1041 = _mm512_fmadd_ps(in1041, _mm512_set1_ps(-5e+00f), in1039);
tmp6641 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-1.25e+00f), tmp6641);
tmp6645 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-1.25e+00f), tmp6645);
in1035 = _mm512_fmadd_ps(tmp6641, _mm512_set1_ps(2e+00f), tmp6642);
in1043 = _mm512_fmadd_ps(tmp6645, _mm512_set1_ps(2e+00f), tmp6646);
tmp6642 = _mm512_fnmadd_ps(tmp6641, _mm512_set1_ps(2e+00f), tmp6642);
tmp6646 = _mm512_fnmadd_ps(tmp6645, _mm512_set1_ps(2e+00f), tmp6646);
tmp6641 = _mm512_fmadd_ps(in1034, _mm512_set1_ps(2.5e-01f), in1030);
tmp6645 = _mm512_fmadd_ps(in1042, _mm512_set1_ps(2.5e-01f), in1038);
in1030 = _mm512_sub_ps(in1036, in1030);
in1038 = _mm512_sub_ps(in1044, in1038);
tmp6641 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(-1.25e+00f), tmp6641);
tmp6645 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(-1.25e+00f), tmp6645);
in1032 = _mm512_sub_ps(in1032, in1034);
in1040 = _mm512_sub_ps(in1040, in1042);
in1032 = _mm512_fmadd_ps(in1032, _mm512_set1_ps(5.25e+00f), in1030);
in1040 = _mm512_fmadd_ps(in1040, _mm512_set1_ps(5.25e+00f), in1038);
in1031 = _mm512_fmadd_ps(tmp6641, _mm512_set1_ps(2e+00f), in1033);
in1039 = _mm512_fmadd_ps(tmp6645, _mm512_set1_ps(2e+00f), in1041);
in1033 = _mm512_fnmadd_ps(tmp6641, _mm512_set1_ps(2e+00f), in1033);
in1041 = _mm512_fnmadd_ps(tmp6645, _mm512_set1_ps(2e+00f), in1041);
// Stage 3 -- transposition. Operands are taken in the order r0..r7, first
// segment then second segment.
// Step a: unpacklo/unpackhi interleave pairs of rows within each 128-bit lane.
__m512 tmp6657 = _mm512_unpacklo_ps(in1029, tmp6643);
__m512 tmp6658 = _mm512_unpackhi_ps(in1029, tmp6643);
__m512 tmp6659 = _mm512_unpacklo_ps(tmp6644, in1035);
__m512 tmp6660 = _mm512_unpackhi_ps(tmp6644, in1035);
__m512 tmp6661 = _mm512_unpacklo_ps(tmp6642, in1031);
__m512 tmp6662 = _mm512_unpackhi_ps(tmp6642, in1031);
__m512 tmp6663 = _mm512_unpacklo_ps(in1033, in1032);
__m512 tmp6664 = _mm512_unpackhi_ps(in1033, in1032);
__m512 tmp6665 = _mm512_unpacklo_ps(in1037, tmp6647);
__m512 tmp6666 = _mm512_unpackhi_ps(in1037, tmp6647);
__m512 tmp6667 = _mm512_unpacklo_ps(tmp6648, in1043);
__m512 tmp6668 = _mm512_unpackhi_ps(tmp6648, in1043);
__m512 tmp6669 = _mm512_unpacklo_ps(tmp6646, in1039);
__m512 tmp6670 = _mm512_unpackhi_ps(tmp6646, in1039);
__m512 tmp6671 = _mm512_unpacklo_ps(in1041, in1040);
__m512 tmp6672 = _mm512_unpackhi_ps(in1041, in1040);
// Step b: shuffle_ps with 68 keeps elements 0,1 of each operand and 238
// keeps elements 2,3, so every 128-bit lane now holds one column of four rows.
__m512 tmp6673 = _mm512_shuffle_ps(tmp6657, tmp6659, 68);
__m512 tmp6674 = _mm512_shuffle_ps(tmp6657, tmp6659, 238);
__m512 tmp6675 = _mm512_shuffle_ps(tmp6658, tmp6660, 68);
__m512 tmp6676 = _mm512_shuffle_ps(tmp6658, tmp6660, 238);
__m512 tmp6677 = _mm512_shuffle_ps(tmp6661, tmp6663, 68);
__m512 tmp6678 = _mm512_shuffle_ps(tmp6661, tmp6663, 238);
__m512 tmp6679 = _mm512_shuffle_ps(tmp6662, tmp6664, 68);
__m512 tmp6680 = _mm512_shuffle_ps(tmp6662, tmp6664, 238);
__m512 tmp6681 = _mm512_shuffle_ps(tmp6665, tmp6667, 68);
__m512 tmp6682 = _mm512_shuffle_ps(tmp6665, tmp6667, 238);
__m512 tmp6683 = _mm512_shuffle_ps(tmp6666, tmp6668, 68);
__m512 tmp6684 = _mm512_shuffle_ps(tmp6666, tmp6668, 238);
__m512 tmp6685 = _mm512_shuffle_ps(tmp6669, tmp6671, 68);
__m512 tmp6686 = _mm512_shuffle_ps(tmp6669, tmp6671, 238);
__m512 tmp6687 = _mm512_shuffle_ps(tmp6670, tmp6672, 68);
__m512 tmp6688 = _mm512_shuffle_ps(tmp6670, tmp6672, 238);
// Step c: shuffle_f32x4 with 136 gathers 128-bit lanes 0 and 2 of each
// operand, 221 gathers lanes 1 and 3; this joins rows 0..3 with rows 4..7.
__m512 tmp6689 = _mm512_shuffle_f32x4(tmp6673, tmp6677, 136);
__m512 tmp6690 = _mm512_shuffle_f32x4(tmp6673, tmp6677, 221);
__m512 tmp6691 = _mm512_shuffle_f32x4(tmp6674, tmp6678, 136);
__m512 tmp6692 = _mm512_shuffle_f32x4(tmp6674, tmp6678, 221);
__m512 tmp6693 = _mm512_shuffle_f32x4(tmp6675, tmp6679, 136);
__m512 tmp6694 = _mm512_shuffle_f32x4(tmp6675, tmp6679, 221);
__m512 tmp6695 = _mm512_shuffle_f32x4(tmp6676, tmp6680, 136);
__m512 tmp6696 = _mm512_shuffle_f32x4(tmp6676, tmp6680, 221);
__m512 tmp6697 = _mm512_shuffle_f32x4(tmp6681, tmp6685, 136);
__m512 tmp6698 = _mm512_shuffle_f32x4(tmp6681, tmp6685, 221);
__m512 tmp6699 = _mm512_shuffle_f32x4(tmp6682, tmp6686, 136);
__m512 tmp6700 = _mm512_shuffle_f32x4(tmp6682, tmp6686, 221);
__m512 tmp6701 = _mm512_shuffle_f32x4(tmp6683, tmp6687, 136);
__m512 tmp6702 = _mm512_shuffle_f32x4(tmp6683, tmp6687, 221);
__m512 tmp6703 = _mm512_shuffle_f32x4(tmp6684, tmp6688, 136);
__m512 tmp6704 = _mm512_shuffle_f32x4(tmp6684, tmp6688, 221);
// Step d: a second shuffle_f32x4 round merges the two segments. Afterwards
// in1029, tmp6643, tmp6644, in1035, tmp6642, in1031, in1033, in1032 hold
// former lanes 0..7 (one lane index per register) and in1037, tmp6647,
// tmp6648, in1043, tmp6646, in1039, in1041, in1040 hold former lanes 8..15.
// Inside each register, lanes 0..7 are r0..r7 of the first segment and
// lanes 8..15 are r0..r7 of the second segment.
in1029 = _mm512_shuffle_f32x4(tmp6689, tmp6697, 136);
in1037 = _mm512_shuffle_f32x4(tmp6689, tmp6697, 221);
tmp6643 = _mm512_shuffle_f32x4(tmp6691, tmp6699, 136);
tmp6647 = _mm512_shuffle_f32x4(tmp6691, tmp6699, 221);
tmp6644 = _mm512_shuffle_f32x4(tmp6693, tmp6701, 136);
tmp6648 = _mm512_shuffle_f32x4(tmp6693, tmp6701, 221);
in1035 = _mm512_shuffle_f32x4(tmp6695, tmp6703, 136);
in1043 = _mm512_shuffle_f32x4(tmp6695, tmp6703, 221);
tmp6642 = _mm512_shuffle_f32x4(tmp6690, tmp6698, 136);
tmp6646 = _mm512_shuffle_f32x4(tmp6690, tmp6698, 221);
in1031 = _mm512_shuffle_f32x4(tmp6692, tmp6700, 136);
in1039 = _mm512_shuffle_f32x4(tmp6692, tmp6700, 221);
in1033 = _mm512_shuffle_f32x4(tmp6694, tmp6702, 136);
in1041 = _mm512_shuffle_f32x4(tmp6694, tmp6702, 221);
in1032 = _mm512_shuffle_f32x4(tmp6696, tmp6704, 136);
in1040 = _mm512_shuffle_f32x4(tmp6696, tmp6704, 221);
// Stage 4 -- the same 8-point combination as stage 2, now taking the eight
// transposed registers of each window as d0..d7 (in the order listed above).
// Results s0..s7 end up in: in1029, tmp6651, tmp6652, in1033, tmp6650,
// tmp6644, tmp6642, in1035 for the first window, and in1037, tmp6655,
// tmp6656, in1041, tmp6654, tmp6648, tmp6646, in1043 for the second.
__m512 tmp6649 = _mm512_add_ps(tmp6643, in1031);
__m512 tmp6653 = _mm512_add_ps(tmp6647, in1039);
__m512 tmp6650 = _mm512_sub_ps(tmp6642, tmp6644);
__m512 tmp6654 = _mm512_sub_ps(tmp6646, tmp6648);
__m512 tmp6651 = _mm512_add_ps(tmp6644, in1033);
__m512 tmp6655 = _mm512_add_ps(tmp6648, in1041);
in1029 = _mm512_sub_ps(in1029, in1033);
in1037 = _mm512_sub_ps(in1037, in1041);
tmp6649 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-4.25e+00f), tmp6649);
tmp6653 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-4.25e+00f), tmp6653);
tmp6651 = _mm512_fmadd_ps(tmp6642, _mm512_set1_ps(-4.25e+00f), tmp6651);
tmp6655 = _mm512_fmadd_ps(tmp6646, _mm512_set1_ps(-4.25e+00f), tmp6655);
in1029 = _mm512_fmadd_ps(tmp6650, _mm512_set1_ps(5.25e+00f), in1029);
in1037 = _mm512_fmadd_ps(tmp6654, _mm512_set1_ps(5.25e+00f), in1037);
tmp6650 = _mm512_fmadd_ps(tmp6644, _mm512_set1_ps(2.5e-01f), in1033);
tmp6654 = _mm512_fmadd_ps(tmp6648, _mm512_set1_ps(2.5e-01f), in1041);
tmp6644 = _mm512_fmadd_ps(tmp6644, _mm512_set1_ps(4e+00f), in1033);
tmp6648 = _mm512_fmadd_ps(tmp6648, _mm512_set1_ps(4e+00f), in1041);
__m512 tmp6652 = _mm512_sub_ps(tmp6651, tmp6649);
__m512 tmp6656 = _mm512_sub_ps(tmp6655, tmp6653);
tmp6651 = _mm512_add_ps(tmp6649, tmp6651);
tmp6655 = _mm512_add_ps(tmp6653, tmp6655);
tmp6649 = _mm512_fmadd_ps(tmp6643, _mm512_set1_ps(2.5e-01f), in1031);
tmp6653 = _mm512_fmadd_ps(tmp6647, _mm512_set1_ps(2.5e-01f), in1039);
tmp6650 = _mm512_fmadd_ps(tmp6642, _mm512_set1_ps(-1.25e+00f), tmp6650);
tmp6654 = _mm512_fmadd_ps(tmp6646, _mm512_set1_ps(-1.25e+00f), tmp6654);
tmp6642 = _mm512_fmadd_ps(tmp6642, _mm512_set1_ps(-5e+00f), tmp6644);
tmp6646 = _mm512_fmadd_ps(tmp6646, _mm512_set1_ps(-5e+00f), tmp6648);
tmp6649 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-1.25e+00f), tmp6649);
tmp6653 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-1.25e+00f), tmp6653);
in1033 = _mm512_fmadd_ps(tmp6649, _mm512_set1_ps(2e+00f), tmp6650);
in1041 = _mm512_fmadd_ps(tmp6653, _mm512_set1_ps(2e+00f), tmp6654);
tmp6650 = _mm512_fnmadd_ps(tmp6649, _mm512_set1_ps(2e+00f), tmp6650);
tmp6654 = _mm512_fnmadd_ps(tmp6653, _mm512_set1_ps(2e+00f), tmp6654);
tmp6649 = _mm512_fmadd_ps(in1031, _mm512_set1_ps(2.5e-01f), tmp6643);
tmp6653 = _mm512_fmadd_ps(in1039, _mm512_set1_ps(2.5e-01f), tmp6647);
tmp6643 = _mm512_sub_ps(in1032, tmp6643);
tmp6647 = _mm512_sub_ps(in1040, tmp6647);
tmp6649 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(-1.25e+00f), tmp6649);
tmp6653 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(-1.25e+00f), tmp6653);
in1035 = _mm512_sub_ps(in1035, in1031);
in1043 = _mm512_sub_ps(in1043, in1039);
in1035 = _mm512_fmadd_ps(in1035, _mm512_set1_ps(5.25e+00f), tmp6643);
in1043 = _mm512_fmadd_ps(in1043, _mm512_set1_ps(5.25e+00f), tmp6647);
tmp6644 = _mm512_fmadd_ps(tmp6649, _mm512_set1_ps(2e+00f), tmp6642);
tmp6648 = _mm512_fmadd_ps(tmp6653, _mm512_set1_ps(2e+00f), tmp6646);
tmp6642 = _mm512_fnmadd_ps(tmp6649, _mm512_set1_ps(2e+00f), tmp6642);
tmp6646 = _mm512_fnmadd_ps(tmp6653, _mm512_set1_ps(2e+00f), tmp6646);
// Stage 5 -- repack. Each output pairs two consecutive results (s0/s1,
// s2/s3, s4/s5, s6/s7): immediate 68 takes the low 256 bits of both operands
// (first-segment data), 238 takes the high 256 bits (second-segment data).
__m512 out979 = _mm512_shuffle_f32x4(in1029, tmp6651, 68);
__m512 out987 = _mm512_shuffle_f32x4(in1029, tmp6651, 238);
__m512 out980 = _mm512_shuffle_f32x4(tmp6652, in1033, 68);
__m512 out988 = _mm512_shuffle_f32x4(tmp6652, in1033, 238);
__m512 out981 = _mm512_shuffle_f32x4(tmp6650, tmp6644, 68);
__m512 out989 = _mm512_shuffle_f32x4(tmp6650, tmp6644, 238);
__m512 out982 = _mm512_shuffle_f32x4(tmp6642, in1035, 68);
__m512 out990 = _mm512_shuffle_f32x4(tmp6642, in1035, 238);
__m512 out983 = _mm512_shuffle_f32x4(in1037, tmp6655, 68);
__m512 out991 = _mm512_shuffle_f32x4(in1037, tmp6655, 238);
__m512 out984 = _mm512_shuffle_f32x4(tmp6656, in1041, 68);
__m512 out992 = _mm512_shuffle_f32x4(tmp6656, in1041, 238);
__m512 out985 = _mm512_shuffle_f32x4(tmp6654, tmp6648, 68);
__m512 out993 = _mm512_shuffle_f32x4(tmp6654, tmp6648, 238);
__m512 out986 = _mm512_shuffle_f32x4(tmp6646, in1043, 68);
__m512 out994 = _mm512_shuffle_f32x4(tmp6646, in1043, 238);
// Stores: four groups of four vectors. Within a group the offsets are
// base, base+128, base+64, base+192 (first window low half, first window
// high half, second window low half, second window high half); the group
// bases 512, 26112, 51712, 77312 are 25600 apart, one per result pair.
// NOTE(review): dfPtr6 is declared outside this view; offsets are presumably
// bytes (64 per __m512) -- TODO confirm.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k84, out979);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k84, out987);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k84, out983);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k84, out991);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k84, out980);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k84, out988);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k84, out984);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k84, out992);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k84, out981);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k84, out989);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k84, out985);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k84, out993);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k84, out982);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k84, out990);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k84, out986);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k84, out994);
}
++j21;
rel14 = 4;
}
// Indices for the loop that follows. In that loop's load addresses h35
// (base14+12) is scaled by 224 and w42 (36) by 4; k85 is the loop counter
// and takes the values 0 and 1.
// NOTE(review): h35/w42 look like a row and a column position in the input
// -- confirm against the generator.
ptrdiff_t h35 = base14+12;
ptrdiff_t w42 = 36;
ptrdiff_t k85 = 0;
for (; k85 != 2; ++k85) {
__m512 dat1569 = _mm512_maskz_loadu_ps(16383, datPtr12+0+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1570 = _mm512_maskz_loadu_ps(511, datPtr12+48+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512i pm140 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1045 = _mm512_permutexvar_ps(pm140, dat1569);
__m512i pm141 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1053 = _mm512_permutexvar_ps(pm141, dat1570);
__m512 dat1571 = _mm512_maskz_loadu_ps(16383, datPtr12+224+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1572 = _mm512_maskz_loadu_ps(511, datPtr12+272+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1046 = _mm512_permutexvar_ps(pm140, dat1571);
__m512 in1054 = _mm512_permutexvar_ps(pm141, dat1572);
__m512 dat1573 = _mm512_maskz_loadu_ps(16383, datPtr12+448+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1574 = _mm512_maskz_loadu_ps(511, datPtr12+496+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1047 = _mm512_permutexvar_ps(pm140, dat1573);
__m512 in1055 = _mm512_permutexvar_ps(pm141, dat1574);
__m512 dat1575 = _mm512_maskz_loadu_ps(16383, datPtr12+672+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1576 = _mm512_maskz_loadu_ps(511, datPtr12+720+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1048 = _mm512_permutexvar_ps(pm140, dat1575);
__m512 in1056 = _mm512_permutexvar_ps(pm141, dat1576);
__m512 dat1577 = _mm512_maskz_loadu_ps(16383, datPtr12+896+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1578 = _mm512_maskz_loadu_ps(511, datPtr12+944+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1049 = _mm512_permutexvar_ps(pm140, dat1577);
__m512 in1057 = _mm512_permutexvar_ps(pm141, dat1578);
__m512 dat1579 = _mm512_maskz_loadu_ps(16383, datPtr12+1120+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1580 = _mm512_maskz_loadu_ps(511, datPtr12+1168+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1050 = _mm512_permutexvar_ps(pm140, dat1579);
__m512 in1058 = _mm512_permutexvar_ps(pm141, dat1580);
__m512 dat1581 = _mm512_maskz_loadu_ps(16383, datPtr12+1344+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1582 = _mm512_maskz_loadu_ps(511, datPtr12+1392+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1051 = _mm512_permutexvar_ps(pm140, dat1581);
__m512 in1059 = _mm512_permutexvar_ps(pm141, dat1582);
__m512 dat1583 = _mm512_maskz_loadu_ps(16383, datPtr12+1568+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1584 = _mm512_maskz_loadu_ps(511, datPtr12+1616+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1052 = _mm512_permutexvar_ps(pm140, dat1583);
__m512 in1060 = _mm512_permutexvar_ps(pm141, dat1584);
__m512 tmp6705 = _mm512_add_ps(in1046, in1050);
__m512 tmp6709 = _mm512_add_ps(in1054, in1058);
__m512 tmp6706 = _mm512_sub_ps(in1049, in1047);
__m512 tmp6710 = _mm512_sub_ps(in1057, in1055);
__m512 tmp6707 = _mm512_add_ps(in1047, in1051);
__m512 tmp6711 = _mm512_add_ps(in1055, in1059);
in1045 = _mm512_sub_ps(in1045, in1051);
in1053 = _mm512_sub_ps(in1053, in1059);
tmp6705 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-4.25e+00f), tmp6705);
tmp6709 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-4.25e+00f), tmp6709);
tmp6707 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-4.25e+00f), tmp6707);
tmp6711 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-4.25e+00f), tmp6711);
in1045 = _mm512_fmadd_ps(tmp6706, _mm512_set1_ps(5.25e+00f), in1045);
in1053 = _mm512_fmadd_ps(tmp6710, _mm512_set1_ps(5.25e+00f), in1053);
tmp6706 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(2.5e-01f), in1051);
tmp6710 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(2.5e-01f), in1059);
in1047 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(4e+00f), in1051);
in1055 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(4e+00f), in1059);
__m512 tmp6708 = _mm512_sub_ps(tmp6707, tmp6705);
__m512 tmp6712 = _mm512_sub_ps(tmp6711, tmp6709);
tmp6707 = _mm512_add_ps(tmp6705, tmp6707);
tmp6711 = _mm512_add_ps(tmp6709, tmp6711);
tmp6705 = _mm512_fmadd_ps(in1046, _mm512_set1_ps(2.5e-01f), in1050);
tmp6709 = _mm512_fmadd_ps(in1054, _mm512_set1_ps(2.5e-01f), in1058);
tmp6706 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-1.25e+00f), tmp6706);
tmp6710 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-1.25e+00f), tmp6710);
in1049 = _mm512_fmadd_ps(in1049, _mm512_set1_ps(-5e+00f), in1047);
in1057 = _mm512_fmadd_ps(in1057, _mm512_set1_ps(-5e+00f), in1055);
tmp6705 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-1.25e+00f), tmp6705);
tmp6709 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-1.25e+00f), tmp6709);
in1051 = _mm512_fmadd_ps(tmp6705, _mm512_set1_ps(2e+00f), tmp6706);
in1059 = _mm512_fmadd_ps(tmp6709, _mm512_set1_ps(2e+00f), tmp6710);
tmp6706 = _mm512_fnmadd_ps(tmp6705, _mm512_set1_ps(2e+00f), tmp6706);
tmp6710 = _mm512_fnmadd_ps(tmp6709, _mm512_set1_ps(2e+00f), tmp6710);
tmp6705 = _mm512_fmadd_ps(in1050, _mm512_set1_ps(2.5e-01f), in1046);
tmp6709 = _mm512_fmadd_ps(in1058, _mm512_set1_ps(2.5e-01f), in1054);
in1046 = _mm512_sub_ps(in1052, in1046);
in1054 = _mm512_sub_ps(in1060, in1054);
tmp6705 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(-1.25e+00f), tmp6705);
tmp6709 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(-1.25e+00f), tmp6709);
in1048 = _mm512_sub_ps(in1048, in1050);
in1056 = _mm512_sub_ps(in1056, in1058);
in1048 = _mm512_fmadd_ps(in1048, _mm512_set1_ps(5.25e+00f), in1046);
in1056 = _mm512_fmadd_ps(in1056, _mm512_set1_ps(5.25e+00f), in1054);
in1047 = _mm512_fmadd_ps(tmp6705, _mm512_set1_ps(2e+00f), in1049);
in1055 = _mm512_fmadd_ps(tmp6709, _mm512_set1_ps(2e+00f), in1057);
in1049 = _mm512_fnmadd_ps(tmp6705, _mm512_set1_ps(2e+00f), in1049);
in1057 = _mm512_fnmadd_ps(tmp6709, _mm512_set1_ps(2e+00f), in1057);
__m512 tmp6721 = _mm512_unpacklo_ps(in1045, tmp6707);
__m512 tmp6722 = _mm512_unpackhi_ps(in1045, tmp6707);
__m512 tmp6723 = _mm512_unpacklo_ps(tmp6708, in1051);
__m512 tmp6724 = _mm512_unpackhi_ps(tmp6708, in1051);
__m512 tmp6725 = _mm512_unpacklo_ps(tmp6706, in1047);
__m512 tmp6726 = _mm512_unpackhi_ps(tmp6706, in1047);
__m512 tmp6727 = _mm512_unpacklo_ps(in1049, in1048);
__m512 tmp6728 = _mm512_unpackhi_ps(in1049, in1048);
__m512 tmp6729 = _mm512_unpacklo_ps(in1053, tmp6711);
__m512 tmp6730 = _mm512_unpackhi_ps(in1053, tmp6711);
__m512 tmp6731 = _mm512_unpacklo_ps(tmp6712, in1059);
__m512 tmp6732 = _mm512_unpackhi_ps(tmp6712, in1059);
__m512 tmp6733 = _mm512_unpacklo_ps(tmp6710, in1055);
__m512 tmp6734 = _mm512_unpackhi_ps(tmp6710, in1055);
__m512 tmp6735 = _mm512_unpacklo_ps(in1057, in1056);
__m512 tmp6736 = _mm512_unpackhi_ps(in1057, in1056);
__m512 tmp6737 = _mm512_shuffle_ps(tmp6721, tmp6723, 68);
__m512 tmp6738 = _mm512_shuffle_ps(tmp6721, tmp6723, 238);
__m512 tmp6739 = _mm512_shuffle_ps(tmp6722, tmp6724, 68);
__m512 tmp6740 = _mm512_shuffle_ps(tmp6722, tmp6724, 238);
__m512 tmp6741 = _mm512_shuffle_ps(tmp6725, tmp6727, 68);
__m512 tmp6742 = _mm512_shuffle_ps(tmp6725, tmp6727, 238);
__m512 tmp6743 = _mm512_shuffle_ps(tmp6726, tmp6728, 68);
__m512 tmp6744 = _mm512_shuffle_ps(tmp6726, tmp6728, 238);
__m512 tmp6745 = _mm512_shuffle_ps(tmp6729, tmp6731, 68);
__m512 tmp6746 = _mm512_shuffle_ps(tmp6729, tmp6731, 238);
__m512 tmp6747 = _mm512_shuffle_ps(tmp6730, tmp6732, 68);
__m512 tmp6748 = _mm512_shuffle_ps(tmp6730, tmp6732, 238);
__m512 tmp6749 = _mm512_shuffle_ps(tmp6733, tmp6735, 68);
__m512 tmp6750 = _mm512_shuffle_ps(tmp6733, tmp6735, 238);
__m512 tmp6751 = _mm512_shuffle_ps(tmp6734, tmp6736, 68);
__m512 tmp6752 = _mm512_shuffle_ps(tmp6734, tmp6736, 238);
__m512 tmp6753 = _mm512_shuffle_f32x4(tmp6737, tmp6741, 136);
__m512 tmp6754 = _mm512_shuffle_f32x4(tmp6737, tmp6741, 221);
__m512 tmp6755 = _mm512_shuffle_f32x4(tmp6738, tmp6742, 136);
__m512 tmp6756 = _mm512_shuffle_f32x4(tmp6738, tmp6742, 221);
__m512 tmp6757 = _mm512_shuffle_f32x4(tmp6739, tmp6743, 136);
__m512 tmp6758 = _mm512_shuffle_f32x4(tmp6739, tmp6743, 221);
__m512 tmp6759 = _mm512_shuffle_f32x4(tmp6740, tmp6744, 136);
__m512 tmp6760 = _mm512_shuffle_f32x4(tmp6740, tmp6744, 221);
__m512 tmp6761 = _mm512_shuffle_f32x4(tmp6745, tmp6749, 136);
__m512 tmp6762 = _mm512_shuffle_f32x4(tmp6745, tmp6749, 221);
__m512 tmp6763 = _mm512_shuffle_f32x4(tmp6746, tmp6750, 136);
__m512 tmp6764 = _mm512_shuffle_f32x4(tmp6746, tmp6750, 221);
__m512 tmp6765 = _mm512_shuffle_f32x4(tmp6747, tmp6751, 136);
__m512 tmp6766 = _mm512_shuffle_f32x4(tmp6747, tmp6751, 221);
__m512 tmp6767 = _mm512_shuffle_f32x4(tmp6748, tmp6752, 136);
__m512 tmp6768 = _mm512_shuffle_f32x4(tmp6748, tmp6752, 221);
in1045 = _mm512_shuffle_f32x4(tmp6753, tmp6761, 136);
in1053 = _mm512_shuffle_f32x4(tmp6753, tmp6761, 221);
tmp6707 = _mm512_shuffle_f32x4(tmp6755, tmp6763, 136);
tmp6711 = _mm512_shuffle_f32x4(tmp6755, tmp6763, 221);
tmp6708 = _mm512_shuffle_f32x4(tmp6757, tmp6765, 136);
tmp6712 = _mm512_shuffle_f32x4(tmp6757, tmp6765, 221);
in1051 = _mm512_shuffle_f32x4(tmp6759, tmp6767, 136);
in1059 = _mm512_shuffle_f32x4(tmp6759, tmp6767, 221);
tmp6706 = _mm512_shuffle_f32x4(tmp6754, tmp6762, 136);
tmp6710 = _mm512_shuffle_f32x4(tmp6754, tmp6762, 221);
in1047 = _mm512_shuffle_f32x4(tmp6756, tmp6764, 136);
in1055 = _mm512_shuffle_f32x4(tmp6756, tmp6764, 221);
in1049 = _mm512_shuffle_f32x4(tmp6758, tmp6766, 136);
in1057 = _mm512_shuffle_f32x4(tmp6758, tmp6766, 221);
in1048 = _mm512_shuffle_f32x4(tmp6760, tmp6768, 136);
in1056 = _mm512_shuffle_f32x4(tmp6760, tmp6768, 221);
__m512 tmp6713 = _mm512_add_ps(tmp6707, in1047);
__m512 tmp6717 = _mm512_add_ps(tmp6711, in1055);
__m512 tmp6714 = _mm512_sub_ps(tmp6706, tmp6708);
__m512 tmp6718 = _mm512_sub_ps(tmp6710, tmp6712);
__m512 tmp6715 = _mm512_add_ps(tmp6708, in1049);
__m512 tmp6719 = _mm512_add_ps(tmp6712, in1057);
in1045 = _mm512_sub_ps(in1045, in1049);
in1053 = _mm512_sub_ps(in1053, in1057);
tmp6713 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-4.25e+00f), tmp6713);
tmp6717 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-4.25e+00f), tmp6717);
tmp6715 = _mm512_fmadd_ps(tmp6706, _mm512_set1_ps(-4.25e+00f), tmp6715);
tmp6719 = _mm512_fmadd_ps(tmp6710, _mm512_set1_ps(-4.25e+00f), tmp6719);
in1045 = _mm512_fmadd_ps(tmp6714, _mm512_set1_ps(5.25e+00f), in1045);
in1053 = _mm512_fmadd_ps(tmp6718, _mm512_set1_ps(5.25e+00f), in1053);
tmp6714 = _mm512_fmadd_ps(tmp6708, _mm512_set1_ps(2.5e-01f), in1049);
tmp6718 = _mm512_fmadd_ps(tmp6712, _mm512_set1_ps(2.5e-01f), in1057);
tmp6708 = _mm512_fmadd_ps(tmp6708, _mm512_set1_ps(4e+00f), in1049);
tmp6712 = _mm512_fmadd_ps(tmp6712, _mm512_set1_ps(4e+00f), in1057);
__m512 tmp6716 = _mm512_sub_ps(tmp6715, tmp6713);
__m512 tmp6720 = _mm512_sub_ps(tmp6719, tmp6717);
tmp6715 = _mm512_add_ps(tmp6713, tmp6715);
tmp6719 = _mm512_add_ps(tmp6717, tmp6719);
tmp6713 = _mm512_fmadd_ps(tmp6707, _mm512_set1_ps(2.5e-01f), in1047);
tmp6717 = _mm512_fmadd_ps(tmp6711, _mm512_set1_ps(2.5e-01f), in1055);
tmp6714 = _mm512_fmadd_ps(tmp6706, _mm512_set1_ps(-1.25e+00f), tmp6714);
tmp6718 = _mm512_fmadd_ps(tmp6710, _mm512_set1_ps(-1.25e+00f), tmp6718);
tmp6706 = _mm512_fmadd_ps(tmp6706, _mm512_set1_ps(-5e+00f), tmp6708);
tmp6710 = _mm512_fmadd_ps(tmp6710, _mm512_set1_ps(-5e+00f), tmp6712);
tmp6713 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-1.25e+00f), tmp6713);
tmp6717 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-1.25e+00f), tmp6717);
in1049 = _mm512_fmadd_ps(tmp6713, _mm512_set1_ps(2e+00f), tmp6714);
in1057 = _mm512_fmadd_ps(tmp6717, _mm512_set1_ps(2e+00f), tmp6718);
tmp6714 = _mm512_fnmadd_ps(tmp6713, _mm512_set1_ps(2e+00f), tmp6714);
tmp6718 = _mm512_fnmadd_ps(tmp6717, _mm512_set1_ps(2e+00f), tmp6718);
tmp6713 = _mm512_fmadd_ps(in1047, _mm512_set1_ps(2.5e-01f), tmp6707);
tmp6717 = _mm512_fmadd_ps(in1055, _mm512_set1_ps(2.5e-01f), tmp6711);
tmp6707 = _mm512_sub_ps(in1048, tmp6707);
tmp6711 = _mm512_sub_ps(in1056, tmp6711);
tmp6713 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(-1.25e+00f), tmp6713);
tmp6717 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(-1.25e+00f), tmp6717);
in1051 = _mm512_sub_ps(in1051, in1047);
in1059 = _mm512_sub_ps(in1059, in1055);
in1051 = _mm512_fmadd_ps(in1051, _mm512_set1_ps(5.25e+00f), tmp6707);
in1059 = _mm512_fmadd_ps(in1059, _mm512_set1_ps(5.25e+00f), tmp6711);
tmp6708 = _mm512_fmadd_ps(tmp6713, _mm512_set1_ps(2e+00f), tmp6706);
tmp6712 = _mm512_fmadd_ps(tmp6717, _mm512_set1_ps(2e+00f), tmp6710);
tmp6706 = _mm512_fnmadd_ps(tmp6713, _mm512_set1_ps(2e+00f), tmp6706);
tmp6710 = _mm512_fnmadd_ps(tmp6717, _mm512_set1_ps(2e+00f), tmp6710);
__m512 out995 = _mm512_shuffle_f32x4(in1045, tmp6715, 68);
__m512 out1003 = _mm512_shuffle_f32x4(in1045, tmp6715, 238);
__m512 out996 = _mm512_shuffle_f32x4(tmp6716, in1049, 68);
__m512 out1004 = _mm512_shuffle_f32x4(tmp6716, in1049, 238);
__m512 out997 = _mm512_shuffle_f32x4(tmp6714, tmp6708, 68);
__m512 out1005 = _mm512_shuffle_f32x4(tmp6714, tmp6708, 238);
__m512 out998 = _mm512_shuffle_f32x4(tmp6706, in1051, 68);
__m512 out1006 = _mm512_shuffle_f32x4(tmp6706, in1051, 238);
__m512 out999 = _mm512_shuffle_f32x4(in1053, tmp6719, 68);
__m512 out1007 = _mm512_shuffle_f32x4(in1053, tmp6719, 238);
__m512 out1000 = _mm512_shuffle_f32x4(tmp6720, in1057, 68);
__m512 out1008 = _mm512_shuffle_f32x4(tmp6720, in1057, 238);
__m512 out1001 = _mm512_shuffle_f32x4(tmp6718, tmp6712, 68);
__m512 out1009 = _mm512_shuffle_f32x4(tmp6718, tmp6712, 238);
__m512 out1002 = _mm512_shuffle_f32x4(tmp6710, in1059, 68);
__m512 out1010 = _mm512_shuffle_f32x4(tmp6710, in1059, 238);
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k85, out995);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k85, out1003);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k85, out999);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k85, out1007);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k85, out996);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k85, out1004);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k85, out1000);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k85, out1008);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k85, out997);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k85, out1005);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k85, out1001);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k85, out1009);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k85, out998);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k85, out1006);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k85, out1002);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k85, out1010);
// Tile group stored at dfPtr6 offsets 256..448 (plus 25600-spaced groups).
// Reads eight 224-strided rows from two base offsets of datPtr12 (1204 and
// 12608), applies an 8-point linear combination first across the eight row
// vectors and then, after a transpose, across the lane positions, and stores
// the sixteen resulting vectors.
// NOTE(review): the coefficients (5.25, -4.25, 0.25, 4, -1.25, -5, 2) match the
// Winograd F(6,3) input-transform matrix -- confirm against the generator.
// NOTE(review): offsets look like byte offsets (4*w42, 64 between adjacent
// vector stores) -- confirm the declared types of datPtr12 and dfPtr6.
//
// Loads: mask 8191 keeps lanes 0..12, mask 16383 keeps lanes 0..13; maskz
// zeroes the remaining lanes.
__m512 dat1585 = _mm512_maskz_loadu_ps(8191, datPtr12+1204+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1586 = _mm512_maskz_loadu_ps(16383, datPtr12+12608+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
// _mm512_set_epi32 lists lanes 15 down to 0. pm142 yields lanes 0..7 =
// {src15, src0..src6} and lanes 8..15 = src5..src12; src15 is zeroed by the
// 8191 mask, so lane 0 becomes 0. The two 8-lane windows share src5 and src6.
__m512i pm142 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1061 = _mm512_permutexvar_ps(pm142, dat1585);
// pm143 yields lanes 0..7 = src0..src7 and lanes 8..15 = src6..src13
// (windows share src6 and src7).
__m512i pm143 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1069 = _mm512_permutexvar_ps(pm143, dat1586);
// The remaining seven row pairs repeat the pattern, each 224 further on.
__m512 dat1587 = _mm512_maskz_loadu_ps(8191, datPtr12+1428+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1588 = _mm512_maskz_loadu_ps(16383, datPtr12+12832+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1062 = _mm512_permutexvar_ps(pm142, dat1587);
__m512 in1070 = _mm512_permutexvar_ps(pm143, dat1588);
__m512 dat1589 = _mm512_maskz_loadu_ps(8191, datPtr12+1652+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1590 = _mm512_maskz_loadu_ps(16383, datPtr12+13056+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1063 = _mm512_permutexvar_ps(pm142, dat1589);
__m512 in1071 = _mm512_permutexvar_ps(pm143, dat1590);
__m512 dat1591 = _mm512_maskz_loadu_ps(8191, datPtr12+1876+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1592 = _mm512_maskz_loadu_ps(16383, datPtr12+13280+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1064 = _mm512_permutexvar_ps(pm142, dat1591);
__m512 in1072 = _mm512_permutexvar_ps(pm143, dat1592);
__m512 dat1593 = _mm512_maskz_loadu_ps(8191, datPtr12+2100+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1594 = _mm512_maskz_loadu_ps(16383, datPtr12+13504+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1065 = _mm512_permutexvar_ps(pm142, dat1593);
__m512 in1073 = _mm512_permutexvar_ps(pm143, dat1594);
__m512 dat1595 = _mm512_maskz_loadu_ps(8191, datPtr12+2324+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1596 = _mm512_maskz_loadu_ps(16383, datPtr12+13728+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1066 = _mm512_permutexvar_ps(pm142, dat1595);
__m512 in1074 = _mm512_permutexvar_ps(pm143, dat1596);
__m512 dat1597 = _mm512_maskz_loadu_ps(8191, datPtr12+2548+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1598 = _mm512_maskz_loadu_ps(16383, datPtr12+13952+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1067 = _mm512_permutexvar_ps(pm142, dat1597);
__m512 in1075 = _mm512_permutexvar_ps(pm143, dat1598);
__m512 dat1599 = _mm512_maskz_loadu_ps(8191, datPtr12+2772+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1600 = _mm512_maskz_loadu_ps(16383, datPtr12+14176+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1068 = _mm512_permutexvar_ps(pm142, dat1599);
__m512 in1076 = _mm512_permutexvar_ps(pm143, dat1600);
// Pass 1: lane-wise combination of the eight row vectors. The two vector sets
// (in1061..in1068 and in1069..in1076) are processed in lockstep with the same
// coefficients. Registers are reused in place, so statement order matters.
// The eight results end up, in order, in in1061, tmp6771, tmp6772, in1067,
// tmp6770, in1063, in1065, in1064 (second set: in1069, tmp6775, tmp6776,
// in1075, tmp6774, in1071, in1073, in1072).
__m512 tmp6769 = _mm512_add_ps(in1062, in1066);
__m512 tmp6773 = _mm512_add_ps(in1070, in1074);
__m512 tmp6770 = _mm512_sub_ps(in1065, in1063);
__m512 tmp6774 = _mm512_sub_ps(in1073, in1071);
__m512 tmp6771 = _mm512_add_ps(in1063, in1067);
__m512 tmp6775 = _mm512_add_ps(in1071, in1075);
in1061 = _mm512_sub_ps(in1061, in1067);
in1069 = _mm512_sub_ps(in1069, in1075);
tmp6769 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-4.25e+00f), tmp6769);
tmp6773 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-4.25e+00f), tmp6773);
tmp6771 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-4.25e+00f), tmp6771);
tmp6775 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-4.25e+00f), tmp6775);
in1061 = _mm512_fmadd_ps(tmp6770, _mm512_set1_ps(5.25e+00f), in1061);
in1069 = _mm512_fmadd_ps(tmp6774, _mm512_set1_ps(5.25e+00f), in1069);
tmp6770 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(2.5e-01f), in1067);
tmp6774 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(2.5e-01f), in1075);
in1063 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(4e+00f), in1067);
in1071 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(4e+00f), in1075);
__m512 tmp6772 = _mm512_sub_ps(tmp6771, tmp6769);
__m512 tmp6776 = _mm512_sub_ps(tmp6775, tmp6773);
tmp6771 = _mm512_add_ps(tmp6769, tmp6771);
tmp6775 = _mm512_add_ps(tmp6773, tmp6775);
tmp6769 = _mm512_fmadd_ps(in1062, _mm512_set1_ps(2.5e-01f), in1066);
tmp6773 = _mm512_fmadd_ps(in1070, _mm512_set1_ps(2.5e-01f), in1074);
tmp6770 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-1.25e+00f), tmp6770);
tmp6774 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-1.25e+00f), tmp6774);
in1065 = _mm512_fmadd_ps(in1065, _mm512_set1_ps(-5e+00f), in1063);
in1073 = _mm512_fmadd_ps(in1073, _mm512_set1_ps(-5e+00f), in1071);
tmp6769 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-1.25e+00f), tmp6769);
tmp6773 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-1.25e+00f), tmp6773);
in1067 = _mm512_fmadd_ps(tmp6769, _mm512_set1_ps(2e+00f), tmp6770);
in1075 = _mm512_fmadd_ps(tmp6773, _mm512_set1_ps(2e+00f), tmp6774);
tmp6770 = _mm512_fnmadd_ps(tmp6769, _mm512_set1_ps(2e+00f), tmp6770);
tmp6774 = _mm512_fnmadd_ps(tmp6773, _mm512_set1_ps(2e+00f), tmp6774);
tmp6769 = _mm512_fmadd_ps(in1066, _mm512_set1_ps(2.5e-01f), in1062);
tmp6773 = _mm512_fmadd_ps(in1074, _mm512_set1_ps(2.5e-01f), in1070);
in1062 = _mm512_sub_ps(in1068, in1062);
in1070 = _mm512_sub_ps(in1076, in1070);
tmp6769 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(-1.25e+00f), tmp6769);
tmp6773 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(-1.25e+00f), tmp6773);
in1064 = _mm512_sub_ps(in1064, in1066);
in1072 = _mm512_sub_ps(in1072, in1074);
in1064 = _mm512_fmadd_ps(in1064, _mm512_set1_ps(5.25e+00f), in1062);
in1072 = _mm512_fmadd_ps(in1072, _mm512_set1_ps(5.25e+00f), in1070);
in1063 = _mm512_fmadd_ps(tmp6769, _mm512_set1_ps(2e+00f), in1065);
in1071 = _mm512_fmadd_ps(tmp6773, _mm512_set1_ps(2e+00f), in1073);
in1065 = _mm512_fnmadd_ps(tmp6769, _mm512_set1_ps(2e+00f), in1065);
in1073 = _mm512_fnmadd_ps(tmp6773, _mm512_set1_ps(2e+00f), in1073);
// Transpose, stage 1: interleave pairs of pass-1 results within each 128-bit
// lane (unpacklo takes elements 0,1 of the lane; unpackhi takes elements 2,3).
__m512 tmp6785 = _mm512_unpacklo_ps(in1061, tmp6771);
__m512 tmp6786 = _mm512_unpackhi_ps(in1061, tmp6771);
__m512 tmp6787 = _mm512_unpacklo_ps(tmp6772, in1067);
__m512 tmp6788 = _mm512_unpackhi_ps(tmp6772, in1067);
__m512 tmp6789 = _mm512_unpacklo_ps(tmp6770, in1063);
__m512 tmp6790 = _mm512_unpackhi_ps(tmp6770, in1063);
__m512 tmp6791 = _mm512_unpacklo_ps(in1065, in1064);
__m512 tmp6792 = _mm512_unpackhi_ps(in1065, in1064);
__m512 tmp6793 = _mm512_unpacklo_ps(in1069, tmp6775);
__m512 tmp6794 = _mm512_unpackhi_ps(in1069, tmp6775);
__m512 tmp6795 = _mm512_unpacklo_ps(tmp6776, in1075);
__m512 tmp6796 = _mm512_unpackhi_ps(tmp6776, in1075);
__m512 tmp6797 = _mm512_unpacklo_ps(tmp6774, in1071);
__m512 tmp6798 = _mm512_unpackhi_ps(tmp6774, in1071);
__m512 tmp6799 = _mm512_unpacklo_ps(in1073, in1072);
__m512 tmp6800 = _mm512_unpackhi_ps(in1073, in1072);
// Transpose, stage 2: shuffle_ps with 68 keeps the low element pair of each
// operand and 238 the high pair, so each 128-bit lane now holds one lane
// position taken from four consecutive pass-1 results.
__m512 tmp6801 = _mm512_shuffle_ps(tmp6785, tmp6787, 68);
__m512 tmp6802 = _mm512_shuffle_ps(tmp6785, tmp6787, 238);
__m512 tmp6803 = _mm512_shuffle_ps(tmp6786, tmp6788, 68);
__m512 tmp6804 = _mm512_shuffle_ps(tmp6786, tmp6788, 238);
__m512 tmp6805 = _mm512_shuffle_ps(tmp6789, tmp6791, 68);
__m512 tmp6806 = _mm512_shuffle_ps(tmp6789, tmp6791, 238);
__m512 tmp6807 = _mm512_shuffle_ps(tmp6790, tmp6792, 68);
__m512 tmp6808 = _mm512_shuffle_ps(tmp6790, tmp6792, 238);
__m512 tmp6809 = _mm512_shuffle_ps(tmp6793, tmp6795, 68);
__m512 tmp6810 = _mm512_shuffle_ps(tmp6793, tmp6795, 238);
__m512 tmp6811 = _mm512_shuffle_ps(tmp6794, tmp6796, 68);
__m512 tmp6812 = _mm512_shuffle_ps(tmp6794, tmp6796, 238);
__m512 tmp6813 = _mm512_shuffle_ps(tmp6797, tmp6799, 68);
__m512 tmp6814 = _mm512_shuffle_ps(tmp6797, tmp6799, 238);
__m512 tmp6815 = _mm512_shuffle_ps(tmp6798, tmp6800, 68);
__m512 tmp6816 = _mm512_shuffle_ps(tmp6798, tmp6800, 238);
// Transpose, stage 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of
// each operand, 221 picks lanes 1 and 3.
__m512 tmp6817 = _mm512_shuffle_f32x4(tmp6801, tmp6805, 136);
__m512 tmp6818 = _mm512_shuffle_f32x4(tmp6801, tmp6805, 221);
__m512 tmp6819 = _mm512_shuffle_f32x4(tmp6802, tmp6806, 136);
__m512 tmp6820 = _mm512_shuffle_f32x4(tmp6802, tmp6806, 221);
__m512 tmp6821 = _mm512_shuffle_f32x4(tmp6803, tmp6807, 136);
__m512 tmp6822 = _mm512_shuffle_f32x4(tmp6803, tmp6807, 221);
__m512 tmp6823 = _mm512_shuffle_f32x4(tmp6804, tmp6808, 136);
__m512 tmp6824 = _mm512_shuffle_f32x4(tmp6804, tmp6808, 221);
__m512 tmp6825 = _mm512_shuffle_f32x4(tmp6809, tmp6813, 136);
__m512 tmp6826 = _mm512_shuffle_f32x4(tmp6809, tmp6813, 221);
__m512 tmp6827 = _mm512_shuffle_f32x4(tmp6810, tmp6814, 136);
__m512 tmp6828 = _mm512_shuffle_f32x4(tmp6810, tmp6814, 221);
__m512 tmp6829 = _mm512_shuffle_f32x4(tmp6811, tmp6815, 136);
__m512 tmp6830 = _mm512_shuffle_f32x4(tmp6811, tmp6815, 221);
__m512 tmp6831 = _mm512_shuffle_f32x4(tmp6812, tmp6816, 136);
__m512 tmp6832 = _mm512_shuffle_f32x4(tmp6812, tmp6816, 221);
// Transpose, stage 4: merge the two vector sets. Each destination now holds
// one original lane position of all eight pass-1 results: lanes 0..7 from the
// first set, lanes 8..15 from the second. The in1061 family covers original
// lane positions 0..7, the in1069 family positions 8..15.
in1061 = _mm512_shuffle_f32x4(tmp6817, tmp6825, 136);
in1069 = _mm512_shuffle_f32x4(tmp6817, tmp6825, 221);
tmp6771 = _mm512_shuffle_f32x4(tmp6819, tmp6827, 136);
tmp6775 = _mm512_shuffle_f32x4(tmp6819, tmp6827, 221);
tmp6772 = _mm512_shuffle_f32x4(tmp6821, tmp6829, 136);
tmp6776 = _mm512_shuffle_f32x4(tmp6821, tmp6829, 221);
in1067 = _mm512_shuffle_f32x4(tmp6823, tmp6831, 136);
in1075 = _mm512_shuffle_f32x4(tmp6823, tmp6831, 221);
tmp6770 = _mm512_shuffle_f32x4(tmp6818, tmp6826, 136);
tmp6774 = _mm512_shuffle_f32x4(tmp6818, tmp6826, 221);
in1063 = _mm512_shuffle_f32x4(tmp6820, tmp6828, 136);
in1071 = _mm512_shuffle_f32x4(tmp6820, tmp6828, 221);
in1065 = _mm512_shuffle_f32x4(tmp6822, tmp6830, 136);
in1073 = _mm512_shuffle_f32x4(tmp6822, tmp6830, 221);
in1064 = _mm512_shuffle_f32x4(tmp6824, tmp6832, 136);
in1072 = _mm512_shuffle_f32x4(tmp6824, tmp6832, 221);
// Pass 2: the same combination as pass 1, applied to the transposed vectors
// (inputs in order: in1061, tmp6771, tmp6772, in1067, tmp6770, in1063,
// in1065, in1064). Results, in order: in1061, tmp6779, tmp6780, in1065,
// tmp6778, tmp6772, tmp6770, in1067 (and the in1069 counterparts).
__m512 tmp6777 = _mm512_add_ps(tmp6771, in1063);
__m512 tmp6781 = _mm512_add_ps(tmp6775, in1071);
__m512 tmp6778 = _mm512_sub_ps(tmp6770, tmp6772);
__m512 tmp6782 = _mm512_sub_ps(tmp6774, tmp6776);
__m512 tmp6779 = _mm512_add_ps(tmp6772, in1065);
__m512 tmp6783 = _mm512_add_ps(tmp6776, in1073);
in1061 = _mm512_sub_ps(in1061, in1065);
in1069 = _mm512_sub_ps(in1069, in1073);
tmp6777 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-4.25e+00f), tmp6777);
tmp6781 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-4.25e+00f), tmp6781);
tmp6779 = _mm512_fmadd_ps(tmp6770, _mm512_set1_ps(-4.25e+00f), tmp6779);
tmp6783 = _mm512_fmadd_ps(tmp6774, _mm512_set1_ps(-4.25e+00f), tmp6783);
in1061 = _mm512_fmadd_ps(tmp6778, _mm512_set1_ps(5.25e+00f), in1061);
in1069 = _mm512_fmadd_ps(tmp6782, _mm512_set1_ps(5.25e+00f), in1069);
tmp6778 = _mm512_fmadd_ps(tmp6772, _mm512_set1_ps(2.5e-01f), in1065);
tmp6782 = _mm512_fmadd_ps(tmp6776, _mm512_set1_ps(2.5e-01f), in1073);
tmp6772 = _mm512_fmadd_ps(tmp6772, _mm512_set1_ps(4e+00f), in1065);
tmp6776 = _mm512_fmadd_ps(tmp6776, _mm512_set1_ps(4e+00f), in1073);
__m512 tmp6780 = _mm512_sub_ps(tmp6779, tmp6777);
__m512 tmp6784 = _mm512_sub_ps(tmp6783, tmp6781);
tmp6779 = _mm512_add_ps(tmp6777, tmp6779);
tmp6783 = _mm512_add_ps(tmp6781, tmp6783);
tmp6777 = _mm512_fmadd_ps(tmp6771, _mm512_set1_ps(2.5e-01f), in1063);
tmp6781 = _mm512_fmadd_ps(tmp6775, _mm512_set1_ps(2.5e-01f), in1071);
tmp6778 = _mm512_fmadd_ps(tmp6770, _mm512_set1_ps(-1.25e+00f), tmp6778);
tmp6782 = _mm512_fmadd_ps(tmp6774, _mm512_set1_ps(-1.25e+00f), tmp6782);
tmp6770 = _mm512_fmadd_ps(tmp6770, _mm512_set1_ps(-5e+00f), tmp6772);
tmp6774 = _mm512_fmadd_ps(tmp6774, _mm512_set1_ps(-5e+00f), tmp6776);
tmp6777 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-1.25e+00f), tmp6777);
tmp6781 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-1.25e+00f), tmp6781);
in1065 = _mm512_fmadd_ps(tmp6777, _mm512_set1_ps(2e+00f), tmp6778);
in1073 = _mm512_fmadd_ps(tmp6781, _mm512_set1_ps(2e+00f), tmp6782);
tmp6778 = _mm512_fnmadd_ps(tmp6777, _mm512_set1_ps(2e+00f), tmp6778);
tmp6782 = _mm512_fnmadd_ps(tmp6781, _mm512_set1_ps(2e+00f), tmp6782);
tmp6777 = _mm512_fmadd_ps(in1063, _mm512_set1_ps(2.5e-01f), tmp6771);
tmp6781 = _mm512_fmadd_ps(in1071, _mm512_set1_ps(2.5e-01f), tmp6775);
tmp6771 = _mm512_sub_ps(in1064, tmp6771);
tmp6775 = _mm512_sub_ps(in1072, tmp6775);
tmp6777 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(-1.25e+00f), tmp6777);
tmp6781 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(-1.25e+00f), tmp6781);
in1067 = _mm512_sub_ps(in1067, in1063);
in1075 = _mm512_sub_ps(in1075, in1071);
in1067 = _mm512_fmadd_ps(in1067, _mm512_set1_ps(5.25e+00f), tmp6771);
in1075 = _mm512_fmadd_ps(in1075, _mm512_set1_ps(5.25e+00f), tmp6775);
tmp6772 = _mm512_fmadd_ps(tmp6777, _mm512_set1_ps(2e+00f), tmp6770);
tmp6776 = _mm512_fmadd_ps(tmp6781, _mm512_set1_ps(2e+00f), tmp6774);
tmp6770 = _mm512_fnmadd_ps(tmp6777, _mm512_set1_ps(2e+00f), tmp6770);
tmp6774 = _mm512_fnmadd_ps(tmp6781, _mm512_set1_ps(2e+00f), tmp6774);
// Merge: shuffle_f32x4 with 68 joins the low 256 bits of both operands and
// 238 joins the high 256 bits, so each stored vector carries the matching
// half of two consecutive pass-2 results.
__m512 out1011 = _mm512_shuffle_f32x4(in1061, tmp6779, 68);
__m512 out1019 = _mm512_shuffle_f32x4(in1061, tmp6779, 238);
__m512 out1012 = _mm512_shuffle_f32x4(tmp6780, in1065, 68);
__m512 out1020 = _mm512_shuffle_f32x4(tmp6780, in1065, 238);
__m512 out1013 = _mm512_shuffle_f32x4(tmp6778, tmp6772, 68);
__m512 out1021 = _mm512_shuffle_f32x4(tmp6778, tmp6772, 238);
__m512 out1014 = _mm512_shuffle_f32x4(tmp6770, in1067, 68);
__m512 out1022 = _mm512_shuffle_f32x4(tmp6770, in1067, 238);
__m512 out1015 = _mm512_shuffle_f32x4(in1069, tmp6783, 68);
__m512 out1023 = _mm512_shuffle_f32x4(in1069, tmp6783, 238);
__m512 out1016 = _mm512_shuffle_f32x4(tmp6784, in1073, 68);
__m512 out1024 = _mm512_shuffle_f32x4(tmp6784, in1073, 238);
__m512 out1017 = _mm512_shuffle_f32x4(tmp6782, tmp6776, 68);
__m512 out1025 = _mm512_shuffle_f32x4(tmp6782, tmp6776, 238);
__m512 out1018 = _mm512_shuffle_f32x4(tmp6774, in1075, 68);
__m512 out1026 = _mm512_shuffle_f32x4(tmp6774, in1075, 238);
// Stores: four groups 25600 apart (one per pair of pass-2 results), each
// receiving four vectors at offsets +256, +320, +384, +448 within the group.
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k85, out1011);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k85, out1019);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k85, out1015);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k85, out1023);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k85, out1012);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k85, out1020);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k85, out1016);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k85, out1024);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k85, out1013);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k85, out1021);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k85, out1017);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k85, out1025);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k85, out1014);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k85, out1022);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k85, out1018);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k85, out1026);
// Tile group stored at dfPtr6 offsets 512..704 (plus 25600-spaced groups).
// Reads eight 224-strided rows from two base offsets of datPtr12 (12656 and
// 13812), applies an 8-point linear combination first across the eight row
// vectors and then, after a transpose, across the lane positions, and stores
// the sixteen resulting vectors.
// NOTE(review): the coefficients (5.25, -4.25, 0.25, 4, -1.25, -5, 2) match the
// Winograd F(6,3) input-transform matrix -- confirm against the generator.
// NOTE(review): the zero-filled lanes below look like spatial zero padding at
// the tile edges -- confirm against the layer geometry.
//
// Loads: mask 511 keeps lanes 0..8, mask 8191 keeps lanes 0..12; maskz zeroes
// the remaining lanes.
__m512 dat1601 = _mm512_maskz_loadu_ps(511, datPtr12+12656+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1602 = _mm512_maskz_loadu_ps(8191, datPtr12+13812+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
// _mm512_set_epi32 lists lanes 15 down to 0. pm144 yields lanes 0..7 =
// src0..src7, lanes 8..10 = src6..src8, and lanes 11..15 = src15, which the
// 511 mask has zeroed, so those five lanes become 0.
__m512i pm144 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1077 = _mm512_permutexvar_ps(pm144, dat1601);
// pm145 yields lanes 0..7 = {src15, src0..src6} and lanes 8..15 =
// src5..src12; src15 is zeroed by the 8191 mask, so lane 0 becomes 0.
__m512i pm145 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1085 = _mm512_permutexvar_ps(pm145, dat1602);
// The remaining seven row pairs repeat the pattern, each 224 further on.
__m512 dat1603 = _mm512_maskz_loadu_ps(511, datPtr12+12880+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1604 = _mm512_maskz_loadu_ps(8191, datPtr12+14036+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1078 = _mm512_permutexvar_ps(pm144, dat1603);
__m512 in1086 = _mm512_permutexvar_ps(pm145, dat1604);
__m512 dat1605 = _mm512_maskz_loadu_ps(511, datPtr12+13104+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1606 = _mm512_maskz_loadu_ps(8191, datPtr12+14260+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1079 = _mm512_permutexvar_ps(pm144, dat1605);
__m512 in1087 = _mm512_permutexvar_ps(pm145, dat1606);
__m512 dat1607 = _mm512_maskz_loadu_ps(511, datPtr12+13328+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1608 = _mm512_maskz_loadu_ps(8191, datPtr12+14484+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1080 = _mm512_permutexvar_ps(pm144, dat1607);
__m512 in1088 = _mm512_permutexvar_ps(pm145, dat1608);
__m512 dat1609 = _mm512_maskz_loadu_ps(511, datPtr12+13552+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1610 = _mm512_maskz_loadu_ps(8191, datPtr12+14708+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1081 = _mm512_permutexvar_ps(pm144, dat1609);
__m512 in1089 = _mm512_permutexvar_ps(pm145, dat1610);
__m512 dat1611 = _mm512_maskz_loadu_ps(511, datPtr12+13776+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1612 = _mm512_maskz_loadu_ps(8191, datPtr12+14932+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1082 = _mm512_permutexvar_ps(pm144, dat1611);
__m512 in1090 = _mm512_permutexvar_ps(pm145, dat1612);
__m512 dat1613 = _mm512_maskz_loadu_ps(511, datPtr12+14000+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1614 = _mm512_maskz_loadu_ps(8191, datPtr12+15156+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1083 = _mm512_permutexvar_ps(pm144, dat1613);
__m512 in1091 = _mm512_permutexvar_ps(pm145, dat1614);
__m512 dat1615 = _mm512_maskz_loadu_ps(511, datPtr12+14224+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 dat1616 = _mm512_maskz_loadu_ps(8191, datPtr12+15380+50432*i27+224*h35+4*w42+50432*s19+25216*k85);
__m512 in1084 = _mm512_permutexvar_ps(pm144, dat1615);
__m512 in1092 = _mm512_permutexvar_ps(pm145, dat1616);
// Pass 1: lane-wise combination of the eight row vectors. The two vector sets
// (in1077..in1084 and in1085..in1092) are processed in lockstep with the same
// coefficients. Registers are reused in place, so statement order matters.
// The eight results end up, in order, in in1077, tmp6835, tmp6836, in1083,
// tmp6834, in1079, in1081, in1080 (second set: in1085, tmp6839, tmp6840,
// in1091, tmp6838, in1087, in1089, in1088).
__m512 tmp6833 = _mm512_add_ps(in1078, in1082);
__m512 tmp6837 = _mm512_add_ps(in1086, in1090);
__m512 tmp6834 = _mm512_sub_ps(in1081, in1079);
__m512 tmp6838 = _mm512_sub_ps(in1089, in1087);
__m512 tmp6835 = _mm512_add_ps(in1079, in1083);
__m512 tmp6839 = _mm512_add_ps(in1087, in1091);
in1077 = _mm512_sub_ps(in1077, in1083);
in1085 = _mm512_sub_ps(in1085, in1091);
tmp6833 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-4.25e+00f), tmp6833);
tmp6837 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-4.25e+00f), tmp6837);
tmp6835 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-4.25e+00f), tmp6835);
tmp6839 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-4.25e+00f), tmp6839);
in1077 = _mm512_fmadd_ps(tmp6834, _mm512_set1_ps(5.25e+00f), in1077);
in1085 = _mm512_fmadd_ps(tmp6838, _mm512_set1_ps(5.25e+00f), in1085);
tmp6834 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(2.5e-01f), in1083);
tmp6838 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(2.5e-01f), in1091);
in1079 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(4e+00f), in1083);
in1087 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(4e+00f), in1091);
__m512 tmp6836 = _mm512_sub_ps(tmp6835, tmp6833);
__m512 tmp6840 = _mm512_sub_ps(tmp6839, tmp6837);
tmp6835 = _mm512_add_ps(tmp6833, tmp6835);
tmp6839 = _mm512_add_ps(tmp6837, tmp6839);
tmp6833 = _mm512_fmadd_ps(in1078, _mm512_set1_ps(2.5e-01f), in1082);
tmp6837 = _mm512_fmadd_ps(in1086, _mm512_set1_ps(2.5e-01f), in1090);
tmp6834 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-1.25e+00f), tmp6834);
tmp6838 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-1.25e+00f), tmp6838);
in1081 = _mm512_fmadd_ps(in1081, _mm512_set1_ps(-5e+00f), in1079);
in1089 = _mm512_fmadd_ps(in1089, _mm512_set1_ps(-5e+00f), in1087);
tmp6833 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-1.25e+00f), tmp6833);
tmp6837 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-1.25e+00f), tmp6837);
in1083 = _mm512_fmadd_ps(tmp6833, _mm512_set1_ps(2e+00f), tmp6834);
in1091 = _mm512_fmadd_ps(tmp6837, _mm512_set1_ps(2e+00f), tmp6838);
tmp6834 = _mm512_fnmadd_ps(tmp6833, _mm512_set1_ps(2e+00f), tmp6834);
tmp6838 = _mm512_fnmadd_ps(tmp6837, _mm512_set1_ps(2e+00f), tmp6838);
tmp6833 = _mm512_fmadd_ps(in1082, _mm512_set1_ps(2.5e-01f), in1078);
tmp6837 = _mm512_fmadd_ps(in1090, _mm512_set1_ps(2.5e-01f), in1086);
in1078 = _mm512_sub_ps(in1084, in1078);
in1086 = _mm512_sub_ps(in1092, in1086);
tmp6833 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(-1.25e+00f), tmp6833);
tmp6837 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(-1.25e+00f), tmp6837);
in1080 = _mm512_sub_ps(in1080, in1082);
in1088 = _mm512_sub_ps(in1088, in1090);
in1080 = _mm512_fmadd_ps(in1080, _mm512_set1_ps(5.25e+00f), in1078);
in1088 = _mm512_fmadd_ps(in1088, _mm512_set1_ps(5.25e+00f), in1086);
in1079 = _mm512_fmadd_ps(tmp6833, _mm512_set1_ps(2e+00f), in1081);
in1087 = _mm512_fmadd_ps(tmp6837, _mm512_set1_ps(2e+00f), in1089);
in1081 = _mm512_fnmadd_ps(tmp6833, _mm512_set1_ps(2e+00f), in1081);
in1089 = _mm512_fnmadd_ps(tmp6837, _mm512_set1_ps(2e+00f), in1089);
// Transpose, stage 1: interleave pairs of pass-1 results within each 128-bit
// lane (unpacklo takes elements 0,1 of the lane; unpackhi takes elements 2,3).
__m512 tmp6849 = _mm512_unpacklo_ps(in1077, tmp6835);
__m512 tmp6850 = _mm512_unpackhi_ps(in1077, tmp6835);
__m512 tmp6851 = _mm512_unpacklo_ps(tmp6836, in1083);
__m512 tmp6852 = _mm512_unpackhi_ps(tmp6836, in1083);
__m512 tmp6853 = _mm512_unpacklo_ps(tmp6834, in1079);
__m512 tmp6854 = _mm512_unpackhi_ps(tmp6834, in1079);
__m512 tmp6855 = _mm512_unpacklo_ps(in1081, in1080);
__m512 tmp6856 = _mm512_unpackhi_ps(in1081, in1080);
__m512 tmp6857 = _mm512_unpacklo_ps(in1085, tmp6839);
__m512 tmp6858 = _mm512_unpackhi_ps(in1085, tmp6839);
__m512 tmp6859 = _mm512_unpacklo_ps(tmp6840, in1091);
__m512 tmp6860 = _mm512_unpackhi_ps(tmp6840, in1091);
__m512 tmp6861 = _mm512_unpacklo_ps(tmp6838, in1087);
__m512 tmp6862 = _mm512_unpackhi_ps(tmp6838, in1087);
__m512 tmp6863 = _mm512_unpacklo_ps(in1089, in1088);
__m512 tmp6864 = _mm512_unpackhi_ps(in1089, in1088);
// Transpose, stage 2: shuffle_ps with 68 keeps the low element pair of each
// operand and 238 the high pair, so each 128-bit lane now holds one lane
// position taken from four consecutive pass-1 results.
__m512 tmp6865 = _mm512_shuffle_ps(tmp6849, tmp6851, 68);
__m512 tmp6866 = _mm512_shuffle_ps(tmp6849, tmp6851, 238);
__m512 tmp6867 = _mm512_shuffle_ps(tmp6850, tmp6852, 68);
__m512 tmp6868 = _mm512_shuffle_ps(tmp6850, tmp6852, 238);
__m512 tmp6869 = _mm512_shuffle_ps(tmp6853, tmp6855, 68);
__m512 tmp6870 = _mm512_shuffle_ps(tmp6853, tmp6855, 238);
__m512 tmp6871 = _mm512_shuffle_ps(tmp6854, tmp6856, 68);
__m512 tmp6872 = _mm512_shuffle_ps(tmp6854, tmp6856, 238);
__m512 tmp6873 = _mm512_shuffle_ps(tmp6857, tmp6859, 68);
__m512 tmp6874 = _mm512_shuffle_ps(tmp6857, tmp6859, 238);
__m512 tmp6875 = _mm512_shuffle_ps(tmp6858, tmp6860, 68);
__m512 tmp6876 = _mm512_shuffle_ps(tmp6858, tmp6860, 238);
__m512 tmp6877 = _mm512_shuffle_ps(tmp6861, tmp6863, 68);
__m512 tmp6878 = _mm512_shuffle_ps(tmp6861, tmp6863, 238);
__m512 tmp6879 = _mm512_shuffle_ps(tmp6862, tmp6864, 68);
__m512 tmp6880 = _mm512_shuffle_ps(tmp6862, tmp6864, 238);
// Transpose, stage 3: shuffle_f32x4 with 136 picks 128-bit lanes 0 and 2 of
// each operand, 221 picks lanes 1 and 3.
__m512 tmp6881 = _mm512_shuffle_f32x4(tmp6865, tmp6869, 136);
__m512 tmp6882 = _mm512_shuffle_f32x4(tmp6865, tmp6869, 221);
__m512 tmp6883 = _mm512_shuffle_f32x4(tmp6866, tmp6870, 136);
__m512 tmp6884 = _mm512_shuffle_f32x4(tmp6866, tmp6870, 221);
__m512 tmp6885 = _mm512_shuffle_f32x4(tmp6867, tmp6871, 136);
__m512 tmp6886 = _mm512_shuffle_f32x4(tmp6867, tmp6871, 221);
__m512 tmp6887 = _mm512_shuffle_f32x4(tmp6868, tmp6872, 136);
__m512 tmp6888 = _mm512_shuffle_f32x4(tmp6868, tmp6872, 221);
__m512 tmp6889 = _mm512_shuffle_f32x4(tmp6873, tmp6877, 136);
__m512 tmp6890 = _mm512_shuffle_f32x4(tmp6873, tmp6877, 221);
__m512 tmp6891 = _mm512_shuffle_f32x4(tmp6874, tmp6878, 136);
__m512 tmp6892 = _mm512_shuffle_f32x4(tmp6874, tmp6878, 221);
__m512 tmp6893 = _mm512_shuffle_f32x4(tmp6875, tmp6879, 136);
__m512 tmp6894 = _mm512_shuffle_f32x4(tmp6875, tmp6879, 221);
__m512 tmp6895 = _mm512_shuffle_f32x4(tmp6876, tmp6880, 136);
__m512 tmp6896 = _mm512_shuffle_f32x4(tmp6876, tmp6880, 221);
// Transpose, stage 4: merge the two vector sets. Each destination now holds
// one original lane position of all eight pass-1 results: lanes 0..7 from the
// first set, lanes 8..15 from the second. The in1077 family covers original
// lane positions 0..7, the in1085 family positions 8..15.
in1077 = _mm512_shuffle_f32x4(tmp6881, tmp6889, 136);
in1085 = _mm512_shuffle_f32x4(tmp6881, tmp6889, 221);
tmp6835 = _mm512_shuffle_f32x4(tmp6883, tmp6891, 136);
tmp6839 = _mm512_shuffle_f32x4(tmp6883, tmp6891, 221);
tmp6836 = _mm512_shuffle_f32x4(tmp6885, tmp6893, 136);
tmp6840 = _mm512_shuffle_f32x4(tmp6885, tmp6893, 221);
in1083 = _mm512_shuffle_f32x4(tmp6887, tmp6895, 136);
in1091 = _mm512_shuffle_f32x4(tmp6887, tmp6895, 221);
tmp6834 = _mm512_shuffle_f32x4(tmp6882, tmp6890, 136);
tmp6838 = _mm512_shuffle_f32x4(tmp6882, tmp6890, 221);
in1079 = _mm512_shuffle_f32x4(tmp6884, tmp6892, 136);
in1087 = _mm512_shuffle_f32x4(tmp6884, tmp6892, 221);
in1081 = _mm512_shuffle_f32x4(tmp6886, tmp6894, 136);
in1089 = _mm512_shuffle_f32x4(tmp6886, tmp6894, 221);
in1080 = _mm512_shuffle_f32x4(tmp6888, tmp6896, 136);
in1088 = _mm512_shuffle_f32x4(tmp6888, tmp6896, 221);
// Pass 2: the same combination as pass 1, applied to the transposed vectors
// (inputs in order: in1077, tmp6835, tmp6836, in1083, tmp6834, in1079,
// in1081, in1080). Results, in order: in1077, tmp6843, tmp6844, in1081,
// tmp6842, tmp6836, tmp6834, in1083 (and the in1085 counterparts).
__m512 tmp6841 = _mm512_add_ps(tmp6835, in1079);
__m512 tmp6845 = _mm512_add_ps(tmp6839, in1087);
__m512 tmp6842 = _mm512_sub_ps(tmp6834, tmp6836);
__m512 tmp6846 = _mm512_sub_ps(tmp6838, tmp6840);
__m512 tmp6843 = _mm512_add_ps(tmp6836, in1081);
__m512 tmp6847 = _mm512_add_ps(tmp6840, in1089);
in1077 = _mm512_sub_ps(in1077, in1081);
in1085 = _mm512_sub_ps(in1085, in1089);
tmp6841 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-4.25e+00f), tmp6841);
tmp6845 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-4.25e+00f), tmp6845);
tmp6843 = _mm512_fmadd_ps(tmp6834, _mm512_set1_ps(-4.25e+00f), tmp6843);
tmp6847 = _mm512_fmadd_ps(tmp6838, _mm512_set1_ps(-4.25e+00f), tmp6847);
in1077 = _mm512_fmadd_ps(tmp6842, _mm512_set1_ps(5.25e+00f), in1077);
in1085 = _mm512_fmadd_ps(tmp6846, _mm512_set1_ps(5.25e+00f), in1085);
tmp6842 = _mm512_fmadd_ps(tmp6836, _mm512_set1_ps(2.5e-01f), in1081);
tmp6846 = _mm512_fmadd_ps(tmp6840, _mm512_set1_ps(2.5e-01f), in1089);
tmp6836 = _mm512_fmadd_ps(tmp6836, _mm512_set1_ps(4e+00f), in1081);
tmp6840 = _mm512_fmadd_ps(tmp6840, _mm512_set1_ps(4e+00f), in1089);
__m512 tmp6844 = _mm512_sub_ps(tmp6843, tmp6841);
__m512 tmp6848 = _mm512_sub_ps(tmp6847, tmp6845);
tmp6843 = _mm512_add_ps(tmp6841, tmp6843);
tmp6847 = _mm512_add_ps(tmp6845, tmp6847);
tmp6841 = _mm512_fmadd_ps(tmp6835, _mm512_set1_ps(2.5e-01f), in1079);
tmp6845 = _mm512_fmadd_ps(tmp6839, _mm512_set1_ps(2.5e-01f), in1087);
tmp6842 = _mm512_fmadd_ps(tmp6834, _mm512_set1_ps(-1.25e+00f), tmp6842);
tmp6846 = _mm512_fmadd_ps(tmp6838, _mm512_set1_ps(-1.25e+00f), tmp6846);
tmp6834 = _mm512_fmadd_ps(tmp6834, _mm512_set1_ps(-5e+00f), tmp6836);
tmp6838 = _mm512_fmadd_ps(tmp6838, _mm512_set1_ps(-5e+00f), tmp6840);
tmp6841 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-1.25e+00f), tmp6841);
tmp6845 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-1.25e+00f), tmp6845);
in1081 = _mm512_fmadd_ps(tmp6841, _mm512_set1_ps(2e+00f), tmp6842);
in1089 = _mm512_fmadd_ps(tmp6845, _mm512_set1_ps(2e+00f), tmp6846);
tmp6842 = _mm512_fnmadd_ps(tmp6841, _mm512_set1_ps(2e+00f), tmp6842);
tmp6846 = _mm512_fnmadd_ps(tmp6845, _mm512_set1_ps(2e+00f), tmp6846);
tmp6841 = _mm512_fmadd_ps(in1079, _mm512_set1_ps(2.5e-01f), tmp6835);
tmp6845 = _mm512_fmadd_ps(in1087, _mm512_set1_ps(2.5e-01f), tmp6839);
tmp6835 = _mm512_sub_ps(in1080, tmp6835);
tmp6839 = _mm512_sub_ps(in1088, tmp6839);
tmp6841 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(-1.25e+00f), tmp6841);
tmp6845 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(-1.25e+00f), tmp6845);
in1083 = _mm512_sub_ps(in1083, in1079);
in1091 = _mm512_sub_ps(in1091, in1087);
in1083 = _mm512_fmadd_ps(in1083, _mm512_set1_ps(5.25e+00f), tmp6835);
in1091 = _mm512_fmadd_ps(in1091, _mm512_set1_ps(5.25e+00f), tmp6839);
tmp6836 = _mm512_fmadd_ps(tmp6841, _mm512_set1_ps(2e+00f), tmp6834);
tmp6840 = _mm512_fmadd_ps(tmp6845, _mm512_set1_ps(2e+00f), tmp6838);
tmp6834 = _mm512_fnmadd_ps(tmp6841, _mm512_set1_ps(2e+00f), tmp6834);
tmp6838 = _mm512_fnmadd_ps(tmp6845, _mm512_set1_ps(2e+00f), tmp6838);
// Merge: shuffle_f32x4 with 68 joins the low 256 bits of both operands and
// 238 joins the high 256 bits, so each stored vector carries the matching
// half of two consecutive pass-2 results.
__m512 out1027 = _mm512_shuffle_f32x4(in1077, tmp6843, 68);
__m512 out1035 = _mm512_shuffle_f32x4(in1077, tmp6843, 238);
__m512 out1028 = _mm512_shuffle_f32x4(tmp6844, in1081, 68);
__m512 out1036 = _mm512_shuffle_f32x4(tmp6844, in1081, 238);
__m512 out1029 = _mm512_shuffle_f32x4(tmp6842, tmp6836, 68);
__m512 out1037 = _mm512_shuffle_f32x4(tmp6842, tmp6836, 238);
__m512 out1030 = _mm512_shuffle_f32x4(tmp6834, in1083, 68);
__m512 out1038 = _mm512_shuffle_f32x4(tmp6834, in1083, 238);
__m512 out1031 = _mm512_shuffle_f32x4(in1085, tmp6847, 68);
__m512 out1039 = _mm512_shuffle_f32x4(in1085, tmp6847, 238);
__m512 out1032 = _mm512_shuffle_f32x4(tmp6848, in1089, 68);
__m512 out1040 = _mm512_shuffle_f32x4(tmp6848, in1089, 238);
__m512 out1033 = _mm512_shuffle_f32x4(tmp6846, tmp6840, 68);
__m512 out1041 = _mm512_shuffle_f32x4(tmp6846, tmp6840, 238);
__m512 out1034 = _mm512_shuffle_f32x4(tmp6838, in1091, 68);
__m512 out1042 = _mm512_shuffle_f32x4(tmp6838, in1091, 238);
// Stores: four groups 25600 apart (one per pair of pass-2 results), each
// receiving four vectors at offsets +512, +576, +640, +704 within the group.
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k85, out1027);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k85, out1035);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k85, out1031);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k85, out1039);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k85, out1028);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k85, out1036);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k85, out1032);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k85, out1040);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k85, out1029);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k85, out1037);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k85, out1033);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k85, out1041);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k85, out1030);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k85, out1038);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k85, out1034);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k85, out1042);
}
++j21;
}
j21 = 15;
}
// rel15 is j21 measured relative to 15; base15 is the constant 54 used to seed h36.
ptrdiff_t rel15 = j21-15;
ptrdiff_t base15 = 54;
// This branch handles rel15 < 1 only.
if (rel15 < 1) {
// h36 (= 54 here) is multiplied by 224 and w43 (= 0 here) by 4 in the load
// addresses inside the loop below.
// NOTE(review): presumably row and column terms of the source address — confirm.
ptrdiff_t h36 = base15+0;
ptrdiff_t w43 = 0;
// The loop runs for k86 = 0 and k86 = 1; inside it k86 scales the source
// address by 25216 and the destination address by 768.
ptrdiff_t k86 = 0;
for (; k86 != 2; ++k86) {
// Load three pairs of partial vectors from datPtr12 and rearrange each into
// two overlapping 8-lane windows.
// The three pairs use base offsets 4/48, 228/272 and 452/496, i.e. steps of 224
// (the same factor that multiplies h36).
// NOTE(review): presumably three consecutive rows — confirm.
// Mask 8191 loads lanes 0..12 and mask 16383 loads lanes 0..13; all other lanes
// are zeroed by the maskz load.
__m512 dat1617 = _mm512_maskz_loadu_ps(8191, datPtr12+4+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1618 = _mm512_maskz_loadu_ps(16383, datPtr12+48+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
// pm146: result lane 0 takes source lane 15 (zero under mask 8191), lanes 1..7
// take source lanes 0..6, lanes 8..15 take source lanes 5..12.
__m512i pm146 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1093 = _mm512_permutexvar_ps(pm146, dat1617);
// pm147: result lanes 0..7 take source lanes 0..7, lanes 8..15 take source lanes 6..13.
__m512i pm147 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1096 = _mm512_permutexvar_ps(pm147, dat1618);
__m512 dat1619 = _mm512_maskz_loadu_ps(8191, datPtr12+228+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1620 = _mm512_maskz_loadu_ps(16383, datPtr12+272+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1094 = _mm512_permutexvar_ps(pm146, dat1619);
__m512 in1097 = _mm512_permutexvar_ps(pm147, dat1620);
__m512 dat1621 = _mm512_maskz_loadu_ps(8191, datPtr12+452+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1622 = _mm512_maskz_loadu_ps(16383, datPtr12+496+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1095 = _mm512_permutexvar_ps(pm146, dat1621);
__m512 in1098 = _mm512_permutexvar_ps(pm147, dat1622);
// First transform pass over the three loaded registers. Statements come in
// pairs: the first of each pair works on in1093/in1094/in1095, the second on
// the parallel set in1096/in1097/in1098.
// With a = in1093, b = in1094, c = in1095 on entry, the registers consumed by
// the following transpose are (algebraically):
//   in1093  = a - 5.25*c
//   tmp6899 = b + c
//   tmp6900 = c - b
//   tmp6902 = 0.5*b + 0.25*c
//   tmp6898 = 0.25*c - 0.5*b
//   in1095  = 4*c + 2*b
//   tmp6901 = 4*c - 2*b
//   tmp6903 = -b
// The second set yields in1096, tmp6906, tmp6907, tmp6909, tmp6905, in1098,
// tmp6908, tmp6910 in the same way.
// NOTE(review): these coefficients match the first three columns of the 8x8
// Winograd F(6x6, 3x3) input-transform matrix, as if the other five inputs
// were zero — confirm against the generator.
__m512 tmp6897 = in1094;
__m512 tmp6904 = in1097;
__m512 tmp6898 = _mm512_sub_ps(_mm512_setzero_ps(), in1095);
__m512 tmp6905 = _mm512_sub_ps(_mm512_setzero_ps(), in1098);
__m512 tmp6899 = in1095;
__m512 tmp6906 = in1098;
// The self-assignments below are no-ops emitted by the generator.
in1093 = in1093;
in1096 = in1096;
tmp6897 = tmp6897;
tmp6904 = tmp6904;
tmp6899 = tmp6899;
tmp6906 = tmp6906;
in1093 = _mm512_fmadd_ps(tmp6898, _mm512_set1_ps(5.25e+00f), in1093);
in1096 = _mm512_fmadd_ps(tmp6905, _mm512_set1_ps(5.25e+00f), in1096);
tmp6898 = _mm512_mul_ps(in1095, _mm512_set1_ps(2.5e-01f));
tmp6905 = _mm512_mul_ps(in1098, _mm512_set1_ps(2.5e-01f));
in1095 = _mm512_mul_ps(in1095, _mm512_set1_ps(4e+00f));
in1098 = _mm512_mul_ps(in1098, _mm512_set1_ps(4e+00f));
__m512 tmp6900 = _mm512_sub_ps(tmp6899, tmp6897);
__m512 tmp6907 = _mm512_sub_ps(tmp6906, tmp6904);
tmp6899 = _mm512_add_ps(tmp6897, tmp6899);
tmp6906 = _mm512_add_ps(tmp6904, tmp6906);
tmp6897 = _mm512_mul_ps(in1094, _mm512_set1_ps(2.5e-01f));
tmp6904 = _mm512_mul_ps(in1097, _mm512_set1_ps(2.5e-01f));
tmp6898 = tmp6898;
tmp6905 = tmp6905;
__m512 tmp6901 = in1095;
__m512 tmp6908 = in1098;
tmp6897 = tmp6897;
tmp6904 = tmp6904;
__m512 tmp6902 = _mm512_fmadd_ps(tmp6897, _mm512_set1_ps(2e+00f), tmp6898);
__m512 tmp6909 = _mm512_fmadd_ps(tmp6904, _mm512_set1_ps(2e+00f), tmp6905);
tmp6898 = _mm512_fnmadd_ps(tmp6897, _mm512_set1_ps(2e+00f), tmp6898);
tmp6905 = _mm512_fnmadd_ps(tmp6904, _mm512_set1_ps(2e+00f), tmp6905);
tmp6897 = in1094;
tmp6904 = in1097;
in1094 = _mm512_sub_ps(_mm512_setzero_ps(), in1094);
in1097 = _mm512_sub_ps(_mm512_setzero_ps(), in1097);
tmp6897 = tmp6897;
tmp6904 = tmp6904;
__m512 tmp6903 = in1094;
__m512 tmp6910 = in1097;
in1095 = _mm512_fmadd_ps(tmp6897, _mm512_set1_ps(2e+00f), tmp6901);
in1098 = _mm512_fmadd_ps(tmp6904, _mm512_set1_ps(2e+00f), tmp6908);
tmp6901 = _mm512_fnmadd_ps(tmp6897, _mm512_set1_ps(2e+00f), tmp6901);
tmp6908 = _mm512_fnmadd_ps(tmp6904, _mm512_set1_ps(2e+00f), tmp6908);
// 16x16 transpose of the sixteen registers
//   in1093, tmp6899, tmp6900, tmp6902, tmp6898, in1095, tmp6901, tmp6903  (rows 0..7)
//   in1096, tmp6906, tmp6907, tmp6909, tmp6905, in1098, tmp6908, tmp6910  (rows 8..15)
// done in four stages; the same sixteen names are reused for the result.
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp6919 = _mm512_unpacklo_ps(in1093, tmp6899);
__m512 tmp6920 = _mm512_unpackhi_ps(in1093, tmp6899);
__m512 tmp6921 = _mm512_unpacklo_ps(tmp6900, tmp6902);
__m512 tmp6922 = _mm512_unpackhi_ps(tmp6900, tmp6902);
__m512 tmp6923 = _mm512_unpacklo_ps(tmp6898, in1095);
__m512 tmp6924 = _mm512_unpackhi_ps(tmp6898, in1095);
__m512 tmp6925 = _mm512_unpacklo_ps(tmp6901, tmp6903);
__m512 tmp6926 = _mm512_unpackhi_ps(tmp6901, tmp6903);
__m512 tmp6927 = _mm512_unpacklo_ps(in1096, tmp6906);
__m512 tmp6928 = _mm512_unpackhi_ps(in1096, tmp6906);
__m512 tmp6929 = _mm512_unpacklo_ps(tmp6907, tmp6909);
__m512 tmp6930 = _mm512_unpackhi_ps(tmp6907, tmp6909);
__m512 tmp6931 = _mm512_unpacklo_ps(tmp6905, in1098);
__m512 tmp6932 = _mm512_unpackhi_ps(tmp6905, in1098);
__m512 tmp6933 = _mm512_unpacklo_ps(tmp6908, tmp6910);
__m512 tmp6934 = _mm512_unpackhi_ps(tmp6908, tmp6910);
// Stage 2: immediate 68 takes the low two floats of each operand's 128-bit
// lane, 238 the high two, so each 128-bit lane now holds one column of four rows.
__m512 tmp6935 = _mm512_shuffle_ps(tmp6919, tmp6921, 68);
__m512 tmp6936 = _mm512_shuffle_ps(tmp6919, tmp6921, 238);
__m512 tmp6937 = _mm512_shuffle_ps(tmp6920, tmp6922, 68);
__m512 tmp6938 = _mm512_shuffle_ps(tmp6920, tmp6922, 238);
__m512 tmp6939 = _mm512_shuffle_ps(tmp6923, tmp6925, 68);
__m512 tmp6940 = _mm512_shuffle_ps(tmp6923, tmp6925, 238);
__m512 tmp6941 = _mm512_shuffle_ps(tmp6924, tmp6926, 68);
__m512 tmp6942 = _mm512_shuffle_ps(tmp6924, tmp6926, 238);
__m512 tmp6943 = _mm512_shuffle_ps(tmp6927, tmp6929, 68);
__m512 tmp6944 = _mm512_shuffle_ps(tmp6927, tmp6929, 238);
__m512 tmp6945 = _mm512_shuffle_ps(tmp6928, tmp6930, 68);
__m512 tmp6946 = _mm512_shuffle_ps(tmp6928, tmp6930, 238);
__m512 tmp6947 = _mm512_shuffle_ps(tmp6931, tmp6933, 68);
__m512 tmp6948 = _mm512_shuffle_ps(tmp6931, tmp6933, 238);
__m512 tmp6949 = _mm512_shuffle_ps(tmp6932, tmp6934, 68);
__m512 tmp6950 = _mm512_shuffle_ps(tmp6932, tmp6934, 238);
// Stage 3: immediate 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3; this merges rows 0..3 with rows 4..7 (and 8..11 with 12..15).
__m512 tmp6951 = _mm512_shuffle_f32x4(tmp6935, tmp6939, 136);
__m512 tmp6952 = _mm512_shuffle_f32x4(tmp6935, tmp6939, 221);
__m512 tmp6953 = _mm512_shuffle_f32x4(tmp6936, tmp6940, 136);
__m512 tmp6954 = _mm512_shuffle_f32x4(tmp6936, tmp6940, 221);
__m512 tmp6955 = _mm512_shuffle_f32x4(tmp6937, tmp6941, 136);
__m512 tmp6956 = _mm512_shuffle_f32x4(tmp6937, tmp6941, 221);
__m512 tmp6957 = _mm512_shuffle_f32x4(tmp6938, tmp6942, 136);
__m512 tmp6958 = _mm512_shuffle_f32x4(tmp6938, tmp6942, 221);
__m512 tmp6959 = _mm512_shuffle_f32x4(tmp6943, tmp6947, 136);
__m512 tmp6960 = _mm512_shuffle_f32x4(tmp6943, tmp6947, 221);
__m512 tmp6961 = _mm512_shuffle_f32x4(tmp6944, tmp6948, 136);
__m512 tmp6962 = _mm512_shuffle_f32x4(tmp6944, tmp6948, 221);
__m512 tmp6963 = _mm512_shuffle_f32x4(tmp6945, tmp6949, 136);
__m512 tmp6964 = _mm512_shuffle_f32x4(tmp6945, tmp6949, 221);
__m512 tmp6965 = _mm512_shuffle_f32x4(tmp6946, tmp6950, 136);
__m512 tmp6966 = _mm512_shuffle_f32x4(tmp6946, tmp6950, 221);
// Stage 4: merge rows 0..7 with rows 8..15. Afterwards in1093, tmp6899,
// tmp6900, tmp6902, tmp6898, in1095, tmp6901, tmp6903 hold source lanes 0..7
// (one lane index each, across all sixteen rows) and in1096, tmp6906, tmp6907,
// tmp6909, tmp6905, in1098, tmp6908, tmp6910 hold source lanes 8..15.
in1093 = _mm512_shuffle_f32x4(tmp6951, tmp6959, 136);
in1096 = _mm512_shuffle_f32x4(tmp6951, tmp6959, 221);
tmp6899 = _mm512_shuffle_f32x4(tmp6953, tmp6961, 136);
tmp6906 = _mm512_shuffle_f32x4(tmp6953, tmp6961, 221);
tmp6900 = _mm512_shuffle_f32x4(tmp6955, tmp6963, 136);
tmp6907 = _mm512_shuffle_f32x4(tmp6955, tmp6963, 221);
tmp6902 = _mm512_shuffle_f32x4(tmp6957, tmp6965, 136);
tmp6909 = _mm512_shuffle_f32x4(tmp6957, tmp6965, 221);
tmp6898 = _mm512_shuffle_f32x4(tmp6952, tmp6960, 136);
tmp6905 = _mm512_shuffle_f32x4(tmp6952, tmp6960, 221);
in1095 = _mm512_shuffle_f32x4(tmp6954, tmp6962, 136);
in1098 = _mm512_shuffle_f32x4(tmp6954, tmp6962, 221);
tmp6901 = _mm512_shuffle_f32x4(tmp6956, tmp6964, 136);
tmp6908 = _mm512_shuffle_f32x4(tmp6956, tmp6964, 221);
tmp6903 = _mm512_shuffle_f32x4(tmp6958, tmp6966, 136);
tmp6910 = _mm512_shuffle_f32x4(tmp6958, tmp6966, 221);
// Second transform pass: an 8-input, 8-output linear combination applied to two
// register sets in lock-step (statements alternate between the sets).
// Let x0..x7 = in1093, tmp6899, tmp6900, tmp6902, tmp6898, in1095, tmp6901,
// tmp6903 on entry. On exit (algebraically):
//   in1093  =  x0 - 5.25*x2 + 5.25*x4 - x6
//   tmp6913 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   tmp6914 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   tmp6901 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   tmp6912 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   tmp6900 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   tmp6898 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   tmp6902 = -x1 + 5.25*x3 - 5.25*x5 + x7
// The second set (in1096, tmp6906, tmp6907, tmp6909, tmp6905, in1098, tmp6908,
// tmp6910) produces in1096, tmp6917, tmp6918, tmp6908, tmp6916, tmp6907,
// tmp6905, tmp6909 the same way.
// NOTE(review): these rows match the Winograd F(6x6, 3x3) input-transform
// matrix — confirm against the generator.
__m512 tmp6911 = _mm512_add_ps(tmp6899, in1095);
__m512 tmp6915 = _mm512_add_ps(tmp6906, in1098);
__m512 tmp6912 = _mm512_sub_ps(tmp6898, tmp6900);
__m512 tmp6916 = _mm512_sub_ps(tmp6905, tmp6907);
__m512 tmp6913 = _mm512_add_ps(tmp6900, tmp6901);
__m512 tmp6917 = _mm512_add_ps(tmp6907, tmp6908);
in1093 = _mm512_sub_ps(in1093, tmp6901);
in1096 = _mm512_sub_ps(in1096, tmp6908);
tmp6911 = _mm512_fmadd_ps(tmp6902, _mm512_set1_ps(-4.25e+00f), tmp6911);
tmp6915 = _mm512_fmadd_ps(tmp6909, _mm512_set1_ps(-4.25e+00f), tmp6915);
tmp6913 = _mm512_fmadd_ps(tmp6898, _mm512_set1_ps(-4.25e+00f), tmp6913);
tmp6917 = _mm512_fmadd_ps(tmp6905, _mm512_set1_ps(-4.25e+00f), tmp6917);
in1093 = _mm512_fmadd_ps(tmp6912, _mm512_set1_ps(5.25e+00f), in1093);
in1096 = _mm512_fmadd_ps(tmp6916, _mm512_set1_ps(5.25e+00f), in1096);
tmp6912 = _mm512_fmadd_ps(tmp6900, _mm512_set1_ps(2.5e-01f), tmp6901);
tmp6916 = _mm512_fmadd_ps(tmp6907, _mm512_set1_ps(2.5e-01f), tmp6908);
tmp6900 = _mm512_fmadd_ps(tmp6900, _mm512_set1_ps(4e+00f), tmp6901);
tmp6907 = _mm512_fmadd_ps(tmp6907, _mm512_set1_ps(4e+00f), tmp6908);
// Difference and sum of the two partial sums give the second and third outputs.
__m512 tmp6914 = _mm512_sub_ps(tmp6913, tmp6911);
__m512 tmp6918 = _mm512_sub_ps(tmp6917, tmp6915);
tmp6913 = _mm512_add_ps(tmp6911, tmp6913);
tmp6917 = _mm512_add_ps(tmp6915, tmp6917);
// tmp6911/tmp6915 are reused from here on as scratch for the remaining outputs.
tmp6911 = _mm512_fmadd_ps(tmp6899, _mm512_set1_ps(2.5e-01f), in1095);
tmp6915 = _mm512_fmadd_ps(tmp6906, _mm512_set1_ps(2.5e-01f), in1098);
tmp6912 = _mm512_fmadd_ps(tmp6898, _mm512_set1_ps(-1.25e+00f), tmp6912);
tmp6916 = _mm512_fmadd_ps(tmp6905, _mm512_set1_ps(-1.25e+00f), tmp6916);
tmp6898 = _mm512_fmadd_ps(tmp6898, _mm512_set1_ps(-5e+00f), tmp6900);
tmp6905 = _mm512_fmadd_ps(tmp6905, _mm512_set1_ps(-5e+00f), tmp6907);
tmp6911 = _mm512_fmadd_ps(tmp6902, _mm512_set1_ps(-1.25e+00f), tmp6911);
tmp6915 = _mm512_fmadd_ps(tmp6909, _mm512_set1_ps(-1.25e+00f), tmp6915);
tmp6901 = _mm512_fmadd_ps(tmp6911, _mm512_set1_ps(2e+00f), tmp6912);
tmp6908 = _mm512_fmadd_ps(tmp6915, _mm512_set1_ps(2e+00f), tmp6916);
tmp6912 = _mm512_fnmadd_ps(tmp6911, _mm512_set1_ps(2e+00f), tmp6912);
tmp6916 = _mm512_fnmadd_ps(tmp6915, _mm512_set1_ps(2e+00f), tmp6916);
tmp6911 = _mm512_fmadd_ps(in1095, _mm512_set1_ps(2.5e-01f), tmp6899);
tmp6915 = _mm512_fmadd_ps(in1098, _mm512_set1_ps(2.5e-01f), tmp6906);
tmp6899 = _mm512_sub_ps(tmp6903, tmp6899);
tmp6906 = _mm512_sub_ps(tmp6910, tmp6906);
tmp6911 = _mm512_fmadd_ps(tmp6902, _mm512_set1_ps(-1.25e+00f), tmp6911);
tmp6915 = _mm512_fmadd_ps(tmp6909, _mm512_set1_ps(-1.25e+00f), tmp6915);
tmp6902 = _mm512_sub_ps(tmp6902, in1095);
tmp6909 = _mm512_sub_ps(tmp6909, in1098);
tmp6902 = _mm512_fmadd_ps(tmp6902, _mm512_set1_ps(5.25e+00f), tmp6899);
tmp6909 = _mm512_fmadd_ps(tmp6909, _mm512_set1_ps(5.25e+00f), tmp6906);
tmp6900 = _mm512_fmadd_ps(tmp6911, _mm512_set1_ps(2e+00f), tmp6898);
tmp6907 = _mm512_fmadd_ps(tmp6915, _mm512_set1_ps(2e+00f), tmp6905);
tmp6898 = _mm512_fnmadd_ps(tmp6911, _mm512_set1_ps(2e+00f), tmp6898);
tmp6905 = _mm512_fnmadd_ps(tmp6915, _mm512_set1_ps(2e+00f), tmp6905);
// Pack pairs of result registers into 512-bit output vectors and store them.
// _mm512_shuffle_f32x4 with immediate 68 concatenates the low 256 bits of both
// operands; immediate 238 concatenates the high 256 bits of both operands.
// out1043..out1046 / out1051..out1054 come from the first register set,
// out1047..out1050 / out1055..out1058 from the second.
__m512 out1043 = _mm512_shuffle_f32x4(in1093, tmp6913, 68);
__m512 out1051 = _mm512_shuffle_f32x4(in1093, tmp6913, 238);
__m512 out1044 = _mm512_shuffle_f32x4(tmp6914, tmp6901, 68);
__m512 out1052 = _mm512_shuffle_f32x4(tmp6914, tmp6901, 238);
__m512 out1045 = _mm512_shuffle_f32x4(tmp6912, tmp6900, 68);
__m512 out1053 = _mm512_shuffle_f32x4(tmp6912, tmp6900, 238);
__m512 out1046 = _mm512_shuffle_f32x4(tmp6898, tmp6902, 68);
__m512 out1054 = _mm512_shuffle_f32x4(tmp6898, tmp6902, 238);
__m512 out1047 = _mm512_shuffle_f32x4(in1096, tmp6917, 68);
__m512 out1055 = _mm512_shuffle_f32x4(in1096, tmp6917, 238);
__m512 out1048 = _mm512_shuffle_f32x4(tmp6918, tmp6908, 68);
__m512 out1056 = _mm512_shuffle_f32x4(tmp6918, tmp6908, 238);
__m512 out1049 = _mm512_shuffle_f32x4(tmp6916, tmp6907, 68);
__m512 out1057 = _mm512_shuffle_f32x4(tmp6916, tmp6907, 238);
__m512 out1050 = _mm512_shuffle_f32x4(tmp6905, tmp6909, 68);
__m512 out1058 = _mm512_shuffle_f32x4(tmp6905, tmp6909, 238);
// Sixteen unaligned stores into dfPtr6. Each group of four uses offsets
// base+0/64/128/192; the four groups start at 0, 25600, 51200 and 76800.
// The i27, j21, s19 and k86 terms are common to every store.
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1536*s19+768*k86, out1043);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1536*s19+768*k86, out1051);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1536*s19+768*k86, out1047);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1536*s19+768*k86, out1055);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1536*s19+768*k86, out1044);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1536*s19+768*k86, out1052);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1536*s19+768*k86, out1048);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1536*s19+768*k86, out1056);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1536*s19+768*k86, out1045);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1536*s19+768*k86, out1053);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1536*s19+768*k86, out1049);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1536*s19+768*k86, out1057);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1536*s19+768*k86, out1046);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1536*s19+768*k86, out1054);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1536*s19+768*k86, out1050);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1536*s19+768*k86, out1058);
// Load three pairs of partial vectors from datPtr12 and rearrange each into
// two overlapping 8-lane windows.
// The three pairs use base offsets 96/12612, 320/12836 and 544/13060, i.e.
// steps of 224 (the same factor that multiplies h36).
// NOTE(review): presumably three consecutive rows — confirm.
// Mask 16383 loads lanes 0..13 and mask 8191 loads lanes 0..12; all other lanes
// are zeroed by the maskz load.
__m512 dat1623 = _mm512_maskz_loadu_ps(16383, datPtr12+96+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1624 = _mm512_maskz_loadu_ps(8191, datPtr12+12612+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
// pm148: result lanes 0..7 take source lanes 0..7, lanes 8..15 take source lanes 6..13.
__m512i pm148 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1099 = _mm512_permutexvar_ps(pm148, dat1623);
// pm149: result lane 0 takes source lane 15 (zero under mask 8191), lanes 1..7
// take source lanes 0..6, lanes 8..15 take source lanes 5..12.
__m512i pm149 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1102 = _mm512_permutexvar_ps(pm149, dat1624);
__m512 dat1625 = _mm512_maskz_loadu_ps(16383, datPtr12+320+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1626 = _mm512_maskz_loadu_ps(8191, datPtr12+12836+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1100 = _mm512_permutexvar_ps(pm148, dat1625);
__m512 in1103 = _mm512_permutexvar_ps(pm149, dat1626);
__m512 dat1627 = _mm512_maskz_loadu_ps(16383, datPtr12+544+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1628 = _mm512_maskz_loadu_ps(8191, datPtr12+13060+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1101 = _mm512_permutexvar_ps(pm148, dat1627);
__m512 in1104 = _mm512_permutexvar_ps(pm149, dat1628);
// First transform pass over the three loaded registers. Statements come in
// pairs: the first of each pair works on in1099/in1100/in1101, the second on
// the parallel set in1102/in1103/in1104.
// With a = in1099, b = in1100, c = in1101 on entry, the registers consumed by
// the following transpose are (algebraically):
//   in1099  = a - 5.25*c
//   tmp6969 = b + c
//   tmp6970 = c - b
//   tmp6972 = 0.5*b + 0.25*c
//   tmp6968 = 0.25*c - 0.5*b
//   in1101  = 4*c + 2*b
//   tmp6971 = 4*c - 2*b
//   tmp6973 = -b
// The second set yields in1102, tmp6976, tmp6977, tmp6979, tmp6975, in1104,
// tmp6978, tmp6980 in the same way.
// NOTE(review): these coefficients match the first three columns of the 8x8
// Winograd F(6x6, 3x3) input-transform matrix, as if the other five inputs
// were zero — confirm against the generator.
__m512 tmp6967 = in1100;
__m512 tmp6974 = in1103;
__m512 tmp6968 = _mm512_sub_ps(_mm512_setzero_ps(), in1101);
__m512 tmp6975 = _mm512_sub_ps(_mm512_setzero_ps(), in1104);
__m512 tmp6969 = in1101;
__m512 tmp6976 = in1104;
// The self-assignments below are no-ops emitted by the generator.
in1099 = in1099;
in1102 = in1102;
tmp6967 = tmp6967;
tmp6974 = tmp6974;
tmp6969 = tmp6969;
tmp6976 = tmp6976;
in1099 = _mm512_fmadd_ps(tmp6968, _mm512_set1_ps(5.25e+00f), in1099);
in1102 = _mm512_fmadd_ps(tmp6975, _mm512_set1_ps(5.25e+00f), in1102);
tmp6968 = _mm512_mul_ps(in1101, _mm512_set1_ps(2.5e-01f));
tmp6975 = _mm512_mul_ps(in1104, _mm512_set1_ps(2.5e-01f));
in1101 = _mm512_mul_ps(in1101, _mm512_set1_ps(4e+00f));
in1104 = _mm512_mul_ps(in1104, _mm512_set1_ps(4e+00f));
__m512 tmp6970 = _mm512_sub_ps(tmp6969, tmp6967);
__m512 tmp6977 = _mm512_sub_ps(tmp6976, tmp6974);
tmp6969 = _mm512_add_ps(tmp6967, tmp6969);
tmp6976 = _mm512_add_ps(tmp6974, tmp6976);
tmp6967 = _mm512_mul_ps(in1100, _mm512_set1_ps(2.5e-01f));
tmp6974 = _mm512_mul_ps(in1103, _mm512_set1_ps(2.5e-01f));
tmp6968 = tmp6968;
tmp6975 = tmp6975;
__m512 tmp6971 = in1101;
__m512 tmp6978 = in1104;
tmp6967 = tmp6967;
tmp6974 = tmp6974;
__m512 tmp6972 = _mm512_fmadd_ps(tmp6967, _mm512_set1_ps(2e+00f), tmp6968);
__m512 tmp6979 = _mm512_fmadd_ps(tmp6974, _mm512_set1_ps(2e+00f), tmp6975);
tmp6968 = _mm512_fnmadd_ps(tmp6967, _mm512_set1_ps(2e+00f), tmp6968);
tmp6975 = _mm512_fnmadd_ps(tmp6974, _mm512_set1_ps(2e+00f), tmp6975);
tmp6967 = in1100;
tmp6974 = in1103;
in1100 = _mm512_sub_ps(_mm512_setzero_ps(), in1100);
in1103 = _mm512_sub_ps(_mm512_setzero_ps(), in1103);
tmp6967 = tmp6967;
tmp6974 = tmp6974;
__m512 tmp6973 = in1100;
__m512 tmp6980 = in1103;
in1101 = _mm512_fmadd_ps(tmp6967, _mm512_set1_ps(2e+00f), tmp6971);
in1104 = _mm512_fmadd_ps(tmp6974, _mm512_set1_ps(2e+00f), tmp6978);
tmp6971 = _mm512_fnmadd_ps(tmp6967, _mm512_set1_ps(2e+00f), tmp6971);
tmp6978 = _mm512_fnmadd_ps(tmp6974, _mm512_set1_ps(2e+00f), tmp6978);
// 16x16 transpose of the sixteen registers
//   in1099, tmp6969, tmp6970, tmp6972, tmp6968, in1101, tmp6971, tmp6973  (rows 0..7)
//   in1102, tmp6976, tmp6977, tmp6979, tmp6975, in1104, tmp6978, tmp6980  (rows 8..15)
// done in four stages; the same sixteen names are reused for the result.
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp6989 = _mm512_unpacklo_ps(in1099, tmp6969);
__m512 tmp6990 = _mm512_unpackhi_ps(in1099, tmp6969);
__m512 tmp6991 = _mm512_unpacklo_ps(tmp6970, tmp6972);
__m512 tmp6992 = _mm512_unpackhi_ps(tmp6970, tmp6972);
__m512 tmp6993 = _mm512_unpacklo_ps(tmp6968, in1101);
__m512 tmp6994 = _mm512_unpackhi_ps(tmp6968, in1101);
__m512 tmp6995 = _mm512_unpacklo_ps(tmp6971, tmp6973);
__m512 tmp6996 = _mm512_unpackhi_ps(tmp6971, tmp6973);
__m512 tmp6997 = _mm512_unpacklo_ps(in1102, tmp6976);
__m512 tmp6998 = _mm512_unpackhi_ps(in1102, tmp6976);
__m512 tmp6999 = _mm512_unpacklo_ps(tmp6977, tmp6979);
__m512 tmp7000 = _mm512_unpackhi_ps(tmp6977, tmp6979);
__m512 tmp7001 = _mm512_unpacklo_ps(tmp6975, in1104);
__m512 tmp7002 = _mm512_unpackhi_ps(tmp6975, in1104);
__m512 tmp7003 = _mm512_unpacklo_ps(tmp6978, tmp6980);
__m512 tmp7004 = _mm512_unpackhi_ps(tmp6978, tmp6980);
// Stage 2: immediate 68 takes the low two floats of each operand's 128-bit
// lane, 238 the high two, so each 128-bit lane now holds one column of four rows.
__m512 tmp7005 = _mm512_shuffle_ps(tmp6989, tmp6991, 68);
__m512 tmp7006 = _mm512_shuffle_ps(tmp6989, tmp6991, 238);
__m512 tmp7007 = _mm512_shuffle_ps(tmp6990, tmp6992, 68);
__m512 tmp7008 = _mm512_shuffle_ps(tmp6990, tmp6992, 238);
__m512 tmp7009 = _mm512_shuffle_ps(tmp6993, tmp6995, 68);
__m512 tmp7010 = _mm512_shuffle_ps(tmp6993, tmp6995, 238);
__m512 tmp7011 = _mm512_shuffle_ps(tmp6994, tmp6996, 68);
__m512 tmp7012 = _mm512_shuffle_ps(tmp6994, tmp6996, 238);
__m512 tmp7013 = _mm512_shuffle_ps(tmp6997, tmp6999, 68);
__m512 tmp7014 = _mm512_shuffle_ps(tmp6997, tmp6999, 238);
__m512 tmp7015 = _mm512_shuffle_ps(tmp6998, tmp7000, 68);
__m512 tmp7016 = _mm512_shuffle_ps(tmp6998, tmp7000, 238);
__m512 tmp7017 = _mm512_shuffle_ps(tmp7001, tmp7003, 68);
__m512 tmp7018 = _mm512_shuffle_ps(tmp7001, tmp7003, 238);
__m512 tmp7019 = _mm512_shuffle_ps(tmp7002, tmp7004, 68);
__m512 tmp7020 = _mm512_shuffle_ps(tmp7002, tmp7004, 238);
// Stage 3: immediate 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3; this merges rows 0..3 with rows 4..7 (and 8..11 with 12..15).
__m512 tmp7021 = _mm512_shuffle_f32x4(tmp7005, tmp7009, 136);
__m512 tmp7022 = _mm512_shuffle_f32x4(tmp7005, tmp7009, 221);
__m512 tmp7023 = _mm512_shuffle_f32x4(tmp7006, tmp7010, 136);
__m512 tmp7024 = _mm512_shuffle_f32x4(tmp7006, tmp7010, 221);
__m512 tmp7025 = _mm512_shuffle_f32x4(tmp7007, tmp7011, 136);
__m512 tmp7026 = _mm512_shuffle_f32x4(tmp7007, tmp7011, 221);
__m512 tmp7027 = _mm512_shuffle_f32x4(tmp7008, tmp7012, 136);
__m512 tmp7028 = _mm512_shuffle_f32x4(tmp7008, tmp7012, 221);
__m512 tmp7029 = _mm512_shuffle_f32x4(tmp7013, tmp7017, 136);
__m512 tmp7030 = _mm512_shuffle_f32x4(tmp7013, tmp7017, 221);
__m512 tmp7031 = _mm512_shuffle_f32x4(tmp7014, tmp7018, 136);
__m512 tmp7032 = _mm512_shuffle_f32x4(tmp7014, tmp7018, 221);
__m512 tmp7033 = _mm512_shuffle_f32x4(tmp7015, tmp7019, 136);
__m512 tmp7034 = _mm512_shuffle_f32x4(tmp7015, tmp7019, 221);
__m512 tmp7035 = _mm512_shuffle_f32x4(tmp7016, tmp7020, 136);
__m512 tmp7036 = _mm512_shuffle_f32x4(tmp7016, tmp7020, 221);
// Stage 4: merge rows 0..7 with rows 8..15. Afterwards in1099, tmp6969,
// tmp6970, tmp6972, tmp6968, in1101, tmp6971, tmp6973 hold source lanes 0..7
// (one lane index each, across all sixteen rows) and in1102, tmp6976, tmp6977,
// tmp6979, tmp6975, in1104, tmp6978, tmp6980 hold source lanes 8..15.
in1099 = _mm512_shuffle_f32x4(tmp7021, tmp7029, 136);
in1102 = _mm512_shuffle_f32x4(tmp7021, tmp7029, 221);
tmp6969 = _mm512_shuffle_f32x4(tmp7023, tmp7031, 136);
tmp6976 = _mm512_shuffle_f32x4(tmp7023, tmp7031, 221);
tmp6970 = _mm512_shuffle_f32x4(tmp7025, tmp7033, 136);
tmp6977 = _mm512_shuffle_f32x4(tmp7025, tmp7033, 221);
tmp6972 = _mm512_shuffle_f32x4(tmp7027, tmp7035, 136);
tmp6979 = _mm512_shuffle_f32x4(tmp7027, tmp7035, 221);
tmp6968 = _mm512_shuffle_f32x4(tmp7022, tmp7030, 136);
tmp6975 = _mm512_shuffle_f32x4(tmp7022, tmp7030, 221);
in1101 = _mm512_shuffle_f32x4(tmp7024, tmp7032, 136);
in1104 = _mm512_shuffle_f32x4(tmp7024, tmp7032, 221);
tmp6971 = _mm512_shuffle_f32x4(tmp7026, tmp7034, 136);
tmp6978 = _mm512_shuffle_f32x4(tmp7026, tmp7034, 221);
tmp6973 = _mm512_shuffle_f32x4(tmp7028, tmp7036, 136);
tmp6980 = _mm512_shuffle_f32x4(tmp7028, tmp7036, 221);
// Second transform pass: an 8-input, 8-output linear combination applied to two
// register sets in lock-step (statements alternate between the sets).
// Let x0..x7 = in1099, tmp6969, tmp6970, tmp6972, tmp6968, in1101, tmp6971,
// tmp6973 on entry. On exit (algebraically):
//   in1099  =  x0 - 5.25*x2 + 5.25*x4 - x6
//   tmp6983 =  x1 + x2 - 4.25*x3 - 4.25*x4 + x5 + x6
//   tmp6984 = -x1 + x2 + 4.25*x3 - 4.25*x4 - x5 + x6
//   tmp6971 =  0.5*x1 + 0.25*x2 - 2.5*x3 - 1.25*x4 + 2*x5 + x6
//   tmp6982 = -0.5*x1 + 0.25*x2 + 2.5*x3 - 1.25*x4 - 2*x5 + x6
//   tmp6970 =  2*x1 + 4*x2 - 2.5*x3 - 5*x4 + 0.5*x5 + x6
//   tmp6968 = -2*x1 + 4*x2 + 2.5*x3 - 5*x4 - 0.5*x5 + x6
//   tmp6972 = -x1 + 5.25*x3 - 5.25*x5 + x7
// The second set (in1102, tmp6976, tmp6977, tmp6979, tmp6975, in1104, tmp6978,
// tmp6980) produces in1102, tmp6987, tmp6988, tmp6978, tmp6986, tmp6977,
// tmp6975, tmp6979 the same way.
// NOTE(review): these rows match the Winograd F(6x6, 3x3) input-transform
// matrix — confirm against the generator.
__m512 tmp6981 = _mm512_add_ps(tmp6969, in1101);
__m512 tmp6985 = _mm512_add_ps(tmp6976, in1104);
__m512 tmp6982 = _mm512_sub_ps(tmp6968, tmp6970);
__m512 tmp6986 = _mm512_sub_ps(tmp6975, tmp6977);
__m512 tmp6983 = _mm512_add_ps(tmp6970, tmp6971);
__m512 tmp6987 = _mm512_add_ps(tmp6977, tmp6978);
in1099 = _mm512_sub_ps(in1099, tmp6971);
in1102 = _mm512_sub_ps(in1102, tmp6978);
tmp6981 = _mm512_fmadd_ps(tmp6972, _mm512_set1_ps(-4.25e+00f), tmp6981);
tmp6985 = _mm512_fmadd_ps(tmp6979, _mm512_set1_ps(-4.25e+00f), tmp6985);
tmp6983 = _mm512_fmadd_ps(tmp6968, _mm512_set1_ps(-4.25e+00f), tmp6983);
tmp6987 = _mm512_fmadd_ps(tmp6975, _mm512_set1_ps(-4.25e+00f), tmp6987);
in1099 = _mm512_fmadd_ps(tmp6982, _mm512_set1_ps(5.25e+00f), in1099);
in1102 = _mm512_fmadd_ps(tmp6986, _mm512_set1_ps(5.25e+00f), in1102);
tmp6982 = _mm512_fmadd_ps(tmp6970, _mm512_set1_ps(2.5e-01f), tmp6971);
tmp6986 = _mm512_fmadd_ps(tmp6977, _mm512_set1_ps(2.5e-01f), tmp6978);
tmp6970 = _mm512_fmadd_ps(tmp6970, _mm512_set1_ps(4e+00f), tmp6971);
tmp6977 = _mm512_fmadd_ps(tmp6977, _mm512_set1_ps(4e+00f), tmp6978);
// Difference and sum of the two partial sums give the second and third outputs.
__m512 tmp6984 = _mm512_sub_ps(tmp6983, tmp6981);
__m512 tmp6988 = _mm512_sub_ps(tmp6987, tmp6985);
tmp6983 = _mm512_add_ps(tmp6981, tmp6983);
tmp6987 = _mm512_add_ps(tmp6985, tmp6987);
// tmp6981/tmp6985 are reused from here on as scratch for the remaining outputs.
tmp6981 = _mm512_fmadd_ps(tmp6969, _mm512_set1_ps(2.5e-01f), in1101);
tmp6985 = _mm512_fmadd_ps(tmp6976, _mm512_set1_ps(2.5e-01f), in1104);
tmp6982 = _mm512_fmadd_ps(tmp6968, _mm512_set1_ps(-1.25e+00f), tmp6982);
tmp6986 = _mm512_fmadd_ps(tmp6975, _mm512_set1_ps(-1.25e+00f), tmp6986);
tmp6968 = _mm512_fmadd_ps(tmp6968, _mm512_set1_ps(-5e+00f), tmp6970);
tmp6975 = _mm512_fmadd_ps(tmp6975, _mm512_set1_ps(-5e+00f), tmp6977);
tmp6981 = _mm512_fmadd_ps(tmp6972, _mm512_set1_ps(-1.25e+00f), tmp6981);
tmp6985 = _mm512_fmadd_ps(tmp6979, _mm512_set1_ps(-1.25e+00f), tmp6985);
tmp6971 = _mm512_fmadd_ps(tmp6981, _mm512_set1_ps(2e+00f), tmp6982);
tmp6978 = _mm512_fmadd_ps(tmp6985, _mm512_set1_ps(2e+00f), tmp6986);
tmp6982 = _mm512_fnmadd_ps(tmp6981, _mm512_set1_ps(2e+00f), tmp6982);
tmp6986 = _mm512_fnmadd_ps(tmp6985, _mm512_set1_ps(2e+00f), tmp6986);
tmp6981 = _mm512_fmadd_ps(in1101, _mm512_set1_ps(2.5e-01f), tmp6969);
tmp6985 = _mm512_fmadd_ps(in1104, _mm512_set1_ps(2.5e-01f), tmp6976);
tmp6969 = _mm512_sub_ps(tmp6973, tmp6969);
tmp6976 = _mm512_sub_ps(tmp6980, tmp6976);
tmp6981 = _mm512_fmadd_ps(tmp6972, _mm512_set1_ps(-1.25e+00f), tmp6981);
tmp6985 = _mm512_fmadd_ps(tmp6979, _mm512_set1_ps(-1.25e+00f), tmp6985);
tmp6972 = _mm512_sub_ps(tmp6972, in1101);
tmp6979 = _mm512_sub_ps(tmp6979, in1104);
tmp6972 = _mm512_fmadd_ps(tmp6972, _mm512_set1_ps(5.25e+00f), tmp6969);
tmp6979 = _mm512_fmadd_ps(tmp6979, _mm512_set1_ps(5.25e+00f), tmp6976);
tmp6970 = _mm512_fmadd_ps(tmp6981, _mm512_set1_ps(2e+00f), tmp6968);
tmp6977 = _mm512_fmadd_ps(tmp6985, _mm512_set1_ps(2e+00f), tmp6975);
tmp6968 = _mm512_fnmadd_ps(tmp6981, _mm512_set1_ps(2e+00f), tmp6968);
tmp6975 = _mm512_fnmadd_ps(tmp6985, _mm512_set1_ps(2e+00f), tmp6975);
// Pack pairs of result registers into 512-bit output vectors and store them.
// _mm512_shuffle_f32x4 with immediate 68 concatenates the low 256 bits of both
// operands; immediate 238 concatenates the high 256 bits of both operands.
// out1059..out1062 / out1067..out1070 come from the first register set,
// out1063..out1066 / out1071..out1074 from the second.
__m512 out1059 = _mm512_shuffle_f32x4(in1099, tmp6983, 68);
__m512 out1067 = _mm512_shuffle_f32x4(in1099, tmp6983, 238);
__m512 out1060 = _mm512_shuffle_f32x4(tmp6984, tmp6971, 68);
__m512 out1068 = _mm512_shuffle_f32x4(tmp6984, tmp6971, 238);
__m512 out1061 = _mm512_shuffle_f32x4(tmp6982, tmp6970, 68);
__m512 out1069 = _mm512_shuffle_f32x4(tmp6982, tmp6970, 238);
__m512 out1062 = _mm512_shuffle_f32x4(tmp6968, tmp6972, 68);
__m512 out1070 = _mm512_shuffle_f32x4(tmp6968, tmp6972, 238);
__m512 out1063 = _mm512_shuffle_f32x4(in1102, tmp6987, 68);
__m512 out1071 = _mm512_shuffle_f32x4(in1102, tmp6987, 238);
__m512 out1064 = _mm512_shuffle_f32x4(tmp6988, tmp6978, 68);
__m512 out1072 = _mm512_shuffle_f32x4(tmp6988, tmp6978, 238);
__m512 out1065 = _mm512_shuffle_f32x4(tmp6986, tmp6977, 68);
__m512 out1073 = _mm512_shuffle_f32x4(tmp6986, tmp6977, 238);
__m512 out1066 = _mm512_shuffle_f32x4(tmp6975, tmp6979, 68);
__m512 out1074 = _mm512_shuffle_f32x4(tmp6975, tmp6979, 238);
// Sixteen unaligned stores into dfPtr6. Each group of four uses offsets
// base+0/64/128/192; the four groups start at 256, 25856, 51456 and 77056,
// i.e. 25600 apart. The i27, j21, s19 and k86 terms are common to every store.
_mm512_storeu_ps(dfPtr6+256+102400*i27+1536*j21+1536*s19+768*k86, out1059);
_mm512_storeu_ps(dfPtr6+384+102400*i27+1536*j21+1536*s19+768*k86, out1067);
_mm512_storeu_ps(dfPtr6+320+102400*i27+1536*j21+1536*s19+768*k86, out1063);
_mm512_storeu_ps(dfPtr6+448+102400*i27+1536*j21+1536*s19+768*k86, out1071);
_mm512_storeu_ps(dfPtr6+25856+102400*i27+1536*j21+1536*s19+768*k86, out1060);
_mm512_storeu_ps(dfPtr6+25984+102400*i27+1536*j21+1536*s19+768*k86, out1068);
_mm512_storeu_ps(dfPtr6+25920+102400*i27+1536*j21+1536*s19+768*k86, out1064);
_mm512_storeu_ps(dfPtr6+26048+102400*i27+1536*j21+1536*s19+768*k86, out1072);
_mm512_storeu_ps(dfPtr6+51456+102400*i27+1536*j21+1536*s19+768*k86, out1061);
_mm512_storeu_ps(dfPtr6+51584+102400*i27+1536*j21+1536*s19+768*k86, out1069);
_mm512_storeu_ps(dfPtr6+51520+102400*i27+1536*j21+1536*s19+768*k86, out1065);
_mm512_storeu_ps(dfPtr6+51648+102400*i27+1536*j21+1536*s19+768*k86, out1073);
_mm512_storeu_ps(dfPtr6+77056+102400*i27+1536*j21+1536*s19+768*k86, out1062);
_mm512_storeu_ps(dfPtr6+77184+102400*i27+1536*j21+1536*s19+768*k86, out1070);
_mm512_storeu_ps(dfPtr6+77120+102400*i27+1536*j21+1536*s19+768*k86, out1066);
_mm512_storeu_ps(dfPtr6+77248+102400*i27+1536*j21+1536*s19+768*k86, out1074);
// Load three pairs of partial vectors from datPtr12 and rearrange each into
// two overlapping 8-lane windows.
// The three pairs use base offsets 12656/12704, 12880/12928 and 13104/13152,
// i.e. steps of 224 (the same factor that multiplies h36).
// NOTE(review): presumably three consecutive rows — confirm.
// Mask 16383 loads lanes 0..13; lanes 14 and 15 are zeroed by the maskz load.
__m512 dat1629 = _mm512_maskz_loadu_ps(16383, datPtr12+12656+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1630 = _mm512_maskz_loadu_ps(16383, datPtr12+12704+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
// pm150: result lanes 0..7 take source lanes 0..7, lanes 8..15 take source
// lanes 6..13. All six loads in this group share this one permutation.
__m512i pm150 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1105 = _mm512_permutexvar_ps(pm150, dat1629);
__m512 in1108 = _mm512_permutexvar_ps(pm150, dat1630);
__m512 dat1631 = _mm512_maskz_loadu_ps(16383, datPtr12+12880+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1632 = _mm512_maskz_loadu_ps(16383, datPtr12+12928+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1106 = _mm512_permutexvar_ps(pm150, dat1631);
__m512 in1109 = _mm512_permutexvar_ps(pm150, dat1632);
__m512 dat1633 = _mm512_maskz_loadu_ps(16383, datPtr12+13104+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 dat1634 = _mm512_maskz_loadu_ps(16383, datPtr12+13152+50432*i27+224*h36+4*w43+50432*s19+25216*k86);
__m512 in1107 = _mm512_permutexvar_ps(pm150, dat1633);
__m512 in1110 = _mm512_permutexvar_ps(pm150, dat1634);
// First transform pass over the three loaded registers. Statements come in
// pairs: the first of each pair works on in1105/in1106/in1107, the second on
// the parallel set in1108/in1109/in1110.
// With a = in1105, b = in1106, c = in1107 on entry, the registers consumed by
// the following transpose are (algebraically):
//   in1105  = a - 5.25*c
//   tmp7039 = b + c
//   tmp7040 = c - b
//   tmp7042 = 0.5*b + 0.25*c
//   tmp7038 = 0.25*c - 0.5*b
//   in1107  = 4*c + 2*b
//   tmp7041 = 4*c - 2*b
//   tmp7043 = -b
// The second set yields in1108, tmp7046, tmp7047, tmp7049, tmp7045, in1110,
// tmp7048, tmp7050 in the same way.
// NOTE(review): these coefficients match the first three columns of the 8x8
// Winograd F(6x6, 3x3) input-transform matrix, as if the other five inputs
// were zero — confirm against the generator.
__m512 tmp7037 = in1106;
__m512 tmp7044 = in1109;
__m512 tmp7038 = _mm512_sub_ps(_mm512_setzero_ps(), in1107);
__m512 tmp7045 = _mm512_sub_ps(_mm512_setzero_ps(), in1110);
__m512 tmp7039 = in1107;
__m512 tmp7046 = in1110;
// The self-assignments below are no-ops emitted by the generator.
in1105 = in1105;
in1108 = in1108;
tmp7037 = tmp7037;
tmp7044 = tmp7044;
tmp7039 = tmp7039;
tmp7046 = tmp7046;
in1105 = _mm512_fmadd_ps(tmp7038, _mm512_set1_ps(5.25e+00f), in1105);
in1108 = _mm512_fmadd_ps(tmp7045, _mm512_set1_ps(5.25e+00f), in1108);
tmp7038 = _mm512_mul_ps(in1107, _mm512_set1_ps(2.5e-01f));
tmp7045 = _mm512_mul_ps(in1110, _mm512_set1_ps(2.5e-01f));
in1107 = _mm512_mul_ps(in1107, _mm512_set1_ps(4e+00f));
in1110 = _mm512_mul_ps(in1110, _mm512_set1_ps(4e+00f));
__m512 tmp7040 = _mm512_sub_ps(tmp7039, tmp7037);
__m512 tmp7047 = _mm512_sub_ps(tmp7046, tmp7044);
tmp7039 = _mm512_add_ps(tmp7037, tmp7039);
tmp7046 = _mm512_add_ps(tmp7044, tmp7046);
tmp7037 = _mm512_mul_ps(in1106, _mm512_set1_ps(2.5e-01f));
tmp7044 = _mm512_mul_ps(in1109, _mm512_set1_ps(2.5e-01f));
tmp7038 = tmp7038;
tmp7045 = tmp7045;
__m512 tmp7041 = in1107;
__m512 tmp7048 = in1110;
tmp7037 = tmp7037;
tmp7044 = tmp7044;
__m512 tmp7042 = _mm512_fmadd_ps(tmp7037, _mm512_set1_ps(2e+00f), tmp7038);
__m512 tmp7049 = _mm512_fmadd_ps(tmp7044, _mm512_set1_ps(2e+00f), tmp7045);
tmp7038 = _mm512_fnmadd_ps(tmp7037, _mm512_set1_ps(2e+00f), tmp7038);
tmp7045 = _mm512_fnmadd_ps(tmp7044, _mm512_set1_ps(2e+00f), tmp7045);
tmp7037 = in1106;
tmp7044 = in1109;
in1106 = _mm512_sub_ps(_mm512_setzero_ps(), in1106);
in1109 = _mm512_sub_ps(_mm512_setzero_ps(), in1109);
tmp7037 = tmp7037;
tmp7044 = tmp7044;
__m512 tmp7043 = in1106;
__m512 tmp7050 = in1109;
in1107 = _mm512_fmadd_ps(tmp7037, _mm512_set1_ps(2e+00f), tmp7041);
in1110 = _mm512_fmadd_ps(tmp7044, _mm512_set1_ps(2e+00f), tmp7048);
tmp7041 = _mm512_fnmadd_ps(tmp7037, _mm512_set1_ps(2e+00f), tmp7041);
tmp7048 = _mm512_fnmadd_ps(tmp7044, _mm512_set1_ps(2e+00f), tmp7048);
// 16x16 transpose of the sixteen registers
//   in1105, tmp7039, tmp7040, tmp7042, tmp7038, in1107, tmp7041, tmp7043  (rows 0..7)
//   in1108, tmp7046, tmp7047, tmp7049, tmp7045, in1110, tmp7048, tmp7050  (rows 8..15)
// done in four stages; the same sixteen names are reused for the result.
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp7059 = _mm512_unpacklo_ps(in1105, tmp7039);
__m512 tmp7060 = _mm512_unpackhi_ps(in1105, tmp7039);
__m512 tmp7061 = _mm512_unpacklo_ps(tmp7040, tmp7042);
__m512 tmp7062 = _mm512_unpackhi_ps(tmp7040, tmp7042);
__m512 tmp7063 = _mm512_unpacklo_ps(tmp7038, in1107);
__m512 tmp7064 = _mm512_unpackhi_ps(tmp7038, in1107);
__m512 tmp7065 = _mm512_unpacklo_ps(tmp7041, tmp7043);
__m512 tmp7066 = _mm512_unpackhi_ps(tmp7041, tmp7043);
__m512 tmp7067 = _mm512_unpacklo_ps(in1108, tmp7046);
__m512 tmp7068 = _mm512_unpackhi_ps(in1108, tmp7046);
__m512 tmp7069 = _mm512_unpacklo_ps(tmp7047, tmp7049);
__m512 tmp7070 = _mm512_unpackhi_ps(tmp7047, tmp7049);
__m512 tmp7071 = _mm512_unpacklo_ps(tmp7045, in1110);
__m512 tmp7072 = _mm512_unpackhi_ps(tmp7045, in1110);
__m512 tmp7073 = _mm512_unpacklo_ps(tmp7048, tmp7050);
__m512 tmp7074 = _mm512_unpackhi_ps(tmp7048, tmp7050);
// Stage 2: immediate 68 takes the low two floats of each operand's 128-bit
// lane, 238 the high two, so each 128-bit lane now holds one column of four rows.
__m512 tmp7075 = _mm512_shuffle_ps(tmp7059, tmp7061, 68);
__m512 tmp7076 = _mm512_shuffle_ps(tmp7059, tmp7061, 238);
__m512 tmp7077 = _mm512_shuffle_ps(tmp7060, tmp7062, 68);
__m512 tmp7078 = _mm512_shuffle_ps(tmp7060, tmp7062, 238);
__m512 tmp7079 = _mm512_shuffle_ps(tmp7063, tmp7065, 68);
__m512 tmp7080 = _mm512_shuffle_ps(tmp7063, tmp7065, 238);
__m512 tmp7081 = _mm512_shuffle_ps(tmp7064, tmp7066, 68);
__m512 tmp7082 = _mm512_shuffle_ps(tmp7064, tmp7066, 238);
__m512 tmp7083 = _mm512_shuffle_ps(tmp7067, tmp7069, 68);
__m512 tmp7084 = _mm512_shuffle_ps(tmp7067, tmp7069, 238);
__m512 tmp7085 = _mm512_shuffle_ps(tmp7068, tmp7070, 68);
__m512 tmp7086 = _mm512_shuffle_ps(tmp7068, tmp7070, 238);
__m512 tmp7087 = _mm512_shuffle_ps(tmp7071, tmp7073, 68);
__m512 tmp7088 = _mm512_shuffle_ps(tmp7071, tmp7073, 238);
__m512 tmp7089 = _mm512_shuffle_ps(tmp7072, tmp7074, 68);
__m512 tmp7090 = _mm512_shuffle_ps(tmp7072, tmp7074, 238);
// Stage 3: immediate 136 picks 128-bit lanes 0 and 2 of each operand, 221 picks
// lanes 1 and 3; this merges rows 0..3 with rows 4..7 (and 8..11 with 12..15).
__m512 tmp7091 = _mm512_shuffle_f32x4(tmp7075, tmp7079, 136);
__m512 tmp7092 = _mm512_shuffle_f32x4(tmp7075, tmp7079, 221);
__m512 tmp7093 = _mm512_shuffle_f32x4(tmp7076, tmp7080, 136);
__m512 tmp7094 = _mm512_shuffle_f32x4(tmp7076, tmp7080, 221);
__m512 tmp7095 = _mm512_shuffle_f32x4(tmp7077, tmp7081, 136);
__m512 tmp7096 = _mm512_shuffle_f32x4(tmp7077, tmp7081, 221);
__m512 tmp7097 = _mm512_shuffle_f32x4(tmp7078, tmp7082, 136);
__m512 tmp7098 = _mm512_shuffle_f32x4(tmp7078, tmp7082, 221);
__m512 tmp7099 = _mm512_shuffle_f32x4(tmp7083, tmp7087, 136);
__m512 tmp7100 = _mm512_shuffle_f32x4(tmp7083, tmp7087, 221);
__m512 tmp7101 = _mm512_shuffle_f32x4(tmp7084, tmp7088, 136);
__m512 tmp7102 = _mm512_shuffle_f32x4(tmp7084, tmp7088, 221);
__m512 tmp7103 = _mm512_shuffle_f32x4(tmp7085, tmp7089, 136);
__m512 tmp7104 = _mm512_shuffle_f32x4(tmp7085, tmp7089, 221);
__m512 tmp7105 = _mm512_shuffle_f32x4(tmp7086, tmp7090, 136);
__m512 tmp7106 = _mm512_shuffle_f32x4(tmp7086, tmp7090, 221);
// Stage 4: merge rows 0..7 with rows 8..15. Afterwards in1105, tmp7039,
// tmp7040, tmp7042, tmp7038, in1107, tmp7041, tmp7043 hold source lanes 0..7
// (one lane index each, across all sixteen rows) and in1108, tmp7046, tmp7047,
// tmp7049, tmp7045, in1110, tmp7048, tmp7050 hold source lanes 8..15.
in1105 = _mm512_shuffle_f32x4(tmp7091, tmp7099, 136);
in1108 = _mm512_shuffle_f32x4(tmp7091, tmp7099, 221);
tmp7039 = _mm512_shuffle_f32x4(tmp7093, tmp7101, 136);
tmp7046 = _mm512_shuffle_f32x4(tmp7093, tmp7101, 221);
tmp7040 = _mm512_shuffle_f32x4(tmp7095, tmp7103, 136);
tmp7047 = _mm512_shuffle_f32x4(tmp7095, tmp7103, 221);
tmp7042 = _mm512_shuffle_f32x4(tmp7097, tmp7105, 136);
tmp7049 = _mm512_shuffle_f32x4(tmp7097, tmp7105, 221);
tmp7038 = _mm512_shuffle_f32x4(tmp7092, tmp7100, 136);
tmp7045 = _mm512_shuffle_f32x4(tmp7092, tmp7100, 221);
in1107 = _mm512_shuffle_f32x4(tmp7094, tmp7102, 136);
in1110 = _mm512_shuffle_f32x4(tmp7094, tmp7102, 221);
tmp7041 = _mm512_shuffle_f32x4(tmp7096, tmp7104, 136);
tmp7048 = _mm512_shuffle_f32x4(tmp7096, tmp7104, 221);
tmp7043 = _mm512_shuffle_f32x4(tmp7098, tmp7106, 136);
tmp7050 = _mm512_shuffle_f32x4(tmp7098, tmp7106, 221);
__m512 tmp7051 = _mm512_add_ps(tmp7039, in1107);
__m512 tmp7055 = _mm512_add_ps(tmp7046, in1110);
__m512 tmp7052 = _mm512_sub_ps(tmp7038, tmp7040);
__m512 tmp7056 = _mm512_sub_ps(tmp7045, tmp7047);
__m512 tmp7053 = _mm512_add_ps(tmp7040, tmp7041);
__m512 tmp7057 = _mm512_add_ps(tmp7047, tmp7048);
in1105 = _mm512_sub_ps(in1105, tmp7041);
in1108 = _mm512_sub_ps(in1108, tmp7048);
tmp7051 = _mm512_fmadd_ps(tmp7042, _mm512_set1_ps(-4.25e+00f), tmp7051);
tmp7055 = _mm512_fmadd_ps(tmp7049, _mm512_set1_ps(-4.25e+00f), tmp7055);
tmp7053 = _mm512_fmadd_ps(tmp7038, _mm512_set1_ps(-4.25e+00f), tmp7053);
tmp7057 = _mm512_fmadd_ps(tmp7045, _mm512_set1_ps(-4.25e+00f), tmp7057);
in1105 = _mm512_fmadd_ps(tmp7052, _mm512_set1_ps(5.25e+00f), in1105);
in1108 = _mm512_fmadd_ps(tmp7056, _mm512_set1_ps(5.25e+00f), in1108);
tmp7052 = _mm512_fmadd_ps(tmp7040, _mm512_set1_ps(2.5e-01f), tmp7041);
tmp7056 = _mm512_fmadd_ps(tmp7047, _mm512_set1_ps(2.5e-01f), tmp7048);
tmp7040 = _mm512_fmadd_ps(tmp7040, _mm512_set1_ps(4e+00f), tmp7041);
tmp7047 = _mm512_fmadd_ps(tmp7047, _mm512_set1_ps(4e+00f), tmp7048);
__m512 tmp7054 = _mm512_sub_ps(tmp7053, tmp7051);
__m512 tmp7058 = _mm512_sub_ps(tmp7057, tmp7055);
tmp7053 = _mm512_add_ps(tmp7051, tmp7053);
tmp7057 = _mm512_add_ps(tmp7055, tmp7057);
tmp7051 = _mm512_fmadd_ps(tmp7039, _mm512_set1_ps(2.5e-01f), in1107);
tmp7055 = _mm512_fmadd_ps(tmp7046, _mm512_set1_ps(2.5e-01f), in1110);
tmp7052 = _mm512_fmadd_ps(tmp7038, _mm512_set1_ps(-1.25e+00f), tmp7052);
tmp7056 = _mm512_fmadd_ps(tmp7045, _mm512_set1_ps(-1.25e+00f), tmp7056);
tmp7038 = _mm512_fmadd_ps(tmp7038, _mm512_set1_ps(-5e+00f), tmp7040);
tmp7045 = _mm512_fmadd_ps(tmp7045, _mm512_set1_ps(-5e+00f), tmp7047);
tmp7051 = _mm512_fmadd_ps(tmp7042, _mm512_set1_ps(-1.25e+00f), tmp7051);
tmp7055 = _mm512_fmadd_ps(tmp7049, _mm512_set1_ps(-1.25e+00f), tmp7055);
tmp7041 = _mm512_fmadd_ps(tmp7051, _mm512_set1_ps(2e+00f), tmp7052);
tmp7048 = _mm512_fmadd_ps(tmp7055, _mm512_set1_ps(2e+00f), tmp7056);
tmp7052 = _mm512_fnmadd_ps(tmp7051, _mm512_set1_ps(2e+00f), tmp7052);
tmp7056 = _mm512_fnmadd_ps(tmp7055, _mm512_set1_ps(2e+00f), tmp7056);
tmp7051 = _mm512_fmadd_ps(in1107, _mm512_set1_ps(2.5e-01f), tmp7039);
tmp7055 = _mm512_fmadd_ps(in1110, _mm512_set1_ps(2.5e-01f), tmp7046);
tmp7039 = _mm512_sub_ps(tmp7043, tmp7039);
tmp7046 = _mm512_sub_ps(tmp7050, tmp7046);
tmp7051 = _mm512_fmadd_ps(tmp7042, _mm512_set1_ps(-1.25e+00f), tmp7051);
tmp7055 = _mm512_fmadd_ps(tmp7049, _mm512_set1_ps(-1.25e+00f), tmp7055);
tmp7042 = _mm512_sub_ps(tmp7042, in1107);
tmp7049 = _mm512_sub_ps(tmp7049, in1110);
tmp7042 = _mm512_fmadd_ps(tmp7042, _mm512_set1_ps(5.25e+00f), tmp7039);
tmp7049 = _mm512_fmadd_ps(tmp7049, _mm512_set1_ps(5.25e+00f), tmp7046);
tmp7040 = _mm512_fmadd_ps(tmp7051, _mm512_set1_ps(2e+00f), tmp7038);
tmp7047 = _mm512_fmadd_ps(tmp7055, _mm512_set1_ps(2e+00f), tmp7045);
tmp7038 = _mm512_fnmadd_ps(tmp7051, _mm512_set1_ps(2e+00f), tmp7038);
tmp7045 = _mm512_fnmadd_ps(tmp7055, _mm512_set1_ps(2e+00f), tmp7045);
__m512 out1075 = _mm512_shuffle_f32x4(in1105, tmp7053, 68);
__m512 out1083 = _mm512_shuffle_f32x4(in1105, tmp7053, 238);
__m512 out1076 = _mm512_shuffle_f32x4(tmp7054, tmp7041, 68);
__m512 out1084 = _mm512_shuffle_f32x4(tmp7054, tmp7041, 238);
__m512 out1077 = _mm512_shuffle_f32x4(tmp7052, tmp7040, 68);
__m512 out1085 = _mm512_shuffle_f32x4(tmp7052, tmp7040, 238);
__m512 out1078 = _mm512_shuffle_f32x4(tmp7038, tmp7042, 68);
__m512 out1086 = _mm512_shuffle_f32x4(tmp7038, tmp7042, 238);
__m512 out1079 = _mm512_shuffle_f32x4(in1108, tmp7057, 68);
__m512 out1087 = _mm512_shuffle_f32x4(in1108, tmp7057, 238);
__m512 out1080 = _mm512_shuffle_f32x4(tmp7058, tmp7048, 68);
__m512 out1088 = _mm512_shuffle_f32x4(tmp7058, tmp7048, 238);
__m512 out1081 = _mm512_shuffle_f32x4(tmp7056, tmp7047, 68);
__m512 out1089 = _mm512_shuffle_f32x4(tmp7056, tmp7047, 238);
__m512 out1082 = _mm512_shuffle_f32x4(tmp7045, tmp7049, 68);
__m512 out1090 = _mm512_shuffle_f32x4(tmp7045, tmp7049, 238);
_mm512_storeu_ps(dfPtr6+512+102400*i27+1536*j21+1536*s19+768*k86, out1075);
_mm512_storeu_ps(dfPtr6+640+102400*i27+1536*j21+1536*s19+768*k86, out1083);
_mm512_storeu_ps(dfPtr6+576+102400*i27+1536*j21+1536*s19+768*k86, out1079);
_mm512_storeu_ps(dfPtr6+704+102400*i27+1536*j21+1536*s19+768*k86, out1087);
_mm512_storeu_ps(dfPtr6+26112+102400*i27+1536*j21+1536*s19+768*k86, out1076);
_mm512_storeu_ps(dfPtr6+26240+102400*i27+1536*j21+1536*s19+768*k86, out1084);
_mm512_storeu_ps(dfPtr6+26176+102400*i27+1536*j21+1536*s19+768*k86, out1080);
_mm512_storeu_ps(dfPtr6+26304+102400*i27+1536*j21+1536*s19+768*k86, out1088);
_mm512_storeu_ps(dfPtr6+51712+102400*i27+1536*j21+1536*s19+768*k86, out1077);
_mm512_storeu_ps(dfPtr6+51840+102400*i27+1536*j21+1536*s19+768*k86, out1085);
_mm512_storeu_ps(dfPtr6+51776+102400*i27+1536*j21+1536*s19+768*k86, out1081);
_mm512_storeu_ps(dfPtr6+51904+102400*i27+1536*j21+1536*s19+768*k86, out1089);
_mm512_storeu_ps(dfPtr6+77312+102400*i27+1536*j21+1536*s19+768*k86, out1078);
_mm512_storeu_ps(dfPtr6+77440+102400*i27+1536*j21+1536*s19+768*k86, out1086);
_mm512_storeu_ps(dfPtr6+77376+102400*i27+1536*j21+1536*s19+768*k86, out1082);
_mm512_storeu_ps(dfPtr6+77504+102400*i27+1536*j21+1536*s19+768*k86, out1090);
}
++j21;
rel15 = 1;
}
ptrdiff_t h37 = base15+0;
ptrdiff_t w44 = 36;
ptrdiff_t k87 = 0;
for (; k87 != 4; ++k87) {
__m512 dat1635 = _mm512_maskz_loadu_ps(16383, datPtr12+0+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512 dat1636 = _mm512_maskz_loadu_ps(511, datPtr12+48+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512i pm151 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1111 = _mm512_permutexvar_ps(pm151, dat1635);
__m512i pm152 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1114 = _mm512_permutexvar_ps(pm152, dat1636);
__m512 dat1637 = _mm512_maskz_loadu_ps(16383, datPtr12+224+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512 dat1638 = _mm512_maskz_loadu_ps(511, datPtr12+272+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512 in1112 = _mm512_permutexvar_ps(pm151, dat1637);
__m512 in1115 = _mm512_permutexvar_ps(pm152, dat1638);
__m512 dat1639 = _mm512_maskz_loadu_ps(16383, datPtr12+448+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512 dat1640 = _mm512_maskz_loadu_ps(511, datPtr12+496+50432*i27+224*h37+4*w44+50432*s19+12608*k87);
__m512 in1113 = _mm512_permutexvar_ps(pm151, dat1639);
__m512 in1116 = _mm512_permutexvar_ps(pm152, dat1640);
__m512 tmp7107 = in1112;
__m512 tmp7114 = in1115;
__m512 tmp7108 = _mm512_sub_ps(_mm512_setzero_ps(), in1113);
__m512 tmp7115 = _mm512_sub_ps(_mm512_setzero_ps(), in1116);
__m512 tmp7109 = in1113;
__m512 tmp7116 = in1116;
in1111 = in1111;
in1114 = in1114;
tmp7107 = tmp7107;
tmp7114 = tmp7114;
tmp7109 = tmp7109;
tmp7116 = tmp7116;
in1111 = _mm512_fmadd_ps(tmp7108, _mm512_set1_ps(5.25e+00f), in1111);
in1114 = _mm512_fmadd_ps(tmp7115, _mm512_set1_ps(5.25e+00f), in1114);
tmp7108 = _mm512_mul_ps(in1113, _mm512_set1_ps(2.5e-01f));
tmp7115 = _mm512_mul_ps(in1116, _mm512_set1_ps(2.5e-01f));
in1113 = _mm512_mul_ps(in1113, _mm512_set1_ps(4e+00f));
in1116 = _mm512_mul_ps(in1116, _mm512_set1_ps(4e+00f));
__m512 tmp7110 = _mm512_sub_ps(tmp7109, tmp7107);
__m512 tmp7117 = _mm512_sub_ps(tmp7116, tmp7114);
tmp7109 = _mm512_add_ps(tmp7107, tmp7109);
tmp7116 = _mm512_add_ps(tmp7114, tmp7116);
tmp7107 = _mm512_mul_ps(in1112, _mm512_set1_ps(2.5e-01f));
tmp7114 = _mm512_mul_ps(in1115, _mm512_set1_ps(2.5e-01f));
tmp7108 = tmp7108;
tmp7115 = tmp7115;
__m512 tmp7111 = in1113;
__m512 tmp7118 = in1116;
tmp7107 = tmp7107;
tmp7114 = tmp7114;
__m512 tmp7112 = _mm512_fmadd_ps(tmp7107, _mm512_set1_ps(2e+00f), tmp7108);
__m512 tmp7119 = _mm512_fmadd_ps(tmp7114, _mm512_set1_ps(2e+00f), tmp7115);
tmp7108 = _mm512_fnmadd_ps(tmp7107, _mm512_set1_ps(2e+00f), tmp7108);
tmp7115 = _mm512_fnmadd_ps(tmp7114, _mm512_set1_ps(2e+00f), tmp7115);
tmp7107 = in1112;
tmp7114 = in1115;
in1112 = _mm512_sub_ps(_mm512_setzero_ps(), in1112);
in1115 = _mm512_sub_ps(_mm512_setzero_ps(), in1115);
tmp7107 = tmp7107;
tmp7114 = tmp7114;
__m512 tmp7113 = in1112;
__m512 tmp7120 = in1115;
in1113 = _mm512_fmadd_ps(tmp7107, _mm512_set1_ps(2e+00f), tmp7111);
in1116 = _mm512_fmadd_ps(tmp7114, _mm512_set1_ps(2e+00f), tmp7118);
tmp7111 = _mm512_fnmadd_ps(tmp7107, _mm512_set1_ps(2e+00f), tmp7111);
tmp7118 = _mm512_fnmadd_ps(tmp7114, _mm512_set1_ps(2e+00f), tmp7118);
__m512 tmp7129 = _mm512_unpacklo_ps(in1111, tmp7109);
__m512 tmp7130 = _mm512_unpackhi_ps(in1111, tmp7109);
__m512 tmp7131 = _mm512_unpacklo_ps(tmp7110, tmp7112);
__m512 tmp7132 = _mm512_unpackhi_ps(tmp7110, tmp7112);
__m512 tmp7133 = _mm512_unpacklo_ps(tmp7108, in1113);
__m512 tmp7134 = _mm512_unpackhi_ps(tmp7108, in1113);
__m512 tmp7135 = _mm512_unpacklo_ps(tmp7111, tmp7113);
__m512 tmp7136 = _mm512_unpackhi_ps(tmp7111, tmp7113);
__m512 tmp7137 = _mm512_unpacklo_ps(in1114, tmp7116);
__m512 tmp7138 = _mm512_unpackhi_ps(in1114, tmp7116);
__m512 tmp7139 = _mm512_unpacklo_ps(tmp7117, tmp7119);
__m512 tmp7140 = _mm512_unpackhi_ps(tmp7117, tmp7119);
__m512 tmp7141 = _mm512_unpacklo_ps(tmp7115, in1116);
__m512 tmp7142 = _mm512_unpackhi_ps(tmp7115, in1116);
__m512 tmp7143 = _mm512_unpacklo_ps(tmp7118, tmp7120);
__m512 tmp7144 = _mm512_unpackhi_ps(tmp7118, tmp7120);
__m512 tmp7145 = _mm512_shuffle_ps(tmp7129, tmp7131, 68);
__m512 tmp7146 = _mm512_shuffle_ps(tmp7129, tmp7131, 238);
__m512 tmp7147 = _mm512_shuffle_ps(tmp7130, tmp7132, 68);
__m512 tmp7148 = _mm512_shuffle_ps(tmp7130, tmp7132, 238);
__m512 tmp7149 = _mm512_shuffle_ps(tmp7133, tmp7135, 68);
__m512 tmp7150 = _mm512_shuffle_ps(tmp7133, tmp7135, 238);
__m512 tmp7151 = _mm512_shuffle_ps(tmp7134, tmp7136, 68);
__m512 tmp7152 = _mm512_shuffle_ps(tmp7134, tmp7136, 238);
__m512 tmp7153 = _mm512_shuffle_ps(tmp7137, tmp7139, 68);
__m512 tmp7154 = _mm512_shuffle_ps(tmp7137, tmp7139, 238);
__m512 tmp7155 = _mm512_shuffle_ps(tmp7138, tmp7140, 68);
__m512 tmp7156 = _mm512_shuffle_ps(tmp7138, tmp7140, 238);
__m512 tmp7157 = _mm512_shuffle_ps(tmp7141, tmp7143, 68);
__m512 tmp7158 = _mm512_shuffle_ps(tmp7141, tmp7143, 238);
__m512 tmp7159 = _mm512_shuffle_ps(tmp7142, tmp7144, 68);
__m512 tmp7160 = _mm512_shuffle_ps(tmp7142, tmp7144, 238);
__m512 tmp7161 = _mm512_shuffle_f32x4(tmp7145, tmp7149, 136);
__m512 tmp7162 = _mm512_shuffle_f32x4(tmp7145, tmp7149, 221);
__m512 tmp7163 = _mm512_shuffle_f32x4(tmp7146, tmp7150, 136);
__m512 tmp7164 = _mm512_shuffle_f32x4(tmp7146, tmp7150, 221);
__m512 tmp7165 = _mm512_shuffle_f32x4(tmp7147, tmp7151, 136);
__m512 tmp7166 = _mm512_shuffle_f32x4(tmp7147, tmp7151, 221);
__m512 tmp7167 = _mm512_shuffle_f32x4(tmp7148, tmp7152, 136);
__m512 tmp7168 = _mm512_shuffle_f32x4(tmp7148, tmp7152, 221);
__m512 tmp7169 = _mm512_shuffle_f32x4(tmp7153, tmp7157, 136);
__m512 tmp7170 = _mm512_shuffle_f32x4(tmp7153, tmp7157, 221);
__m512 tmp7171 = _mm512_shuffle_f32x4(tmp7154, tmp7158, 136);
__m512 tmp7172 = _mm512_shuffle_f32x4(tmp7154, tmp7158, 221);
__m512 tmp7173 = _mm512_shuffle_f32x4(tmp7155, tmp7159, 136);
__m512 tmp7174 = _mm512_shuffle_f32x4(tmp7155, tmp7159, 221);
__m512 tmp7175 = _mm512_shuffle_f32x4(tmp7156, tmp7160, 136);
__m512 tmp7176 = _mm512_shuffle_f32x4(tmp7156, tmp7160, 221);
in1111 = _mm512_shuffle_f32x4(tmp7161, tmp7169, 136);
in1114 = _mm512_shuffle_f32x4(tmp7161, tmp7169, 221);
tmp7109 = _mm512_shuffle_f32x4(tmp7163, tmp7171, 136);
tmp7116 = _mm512_shuffle_f32x4(tmp7163, tmp7171, 221);
tmp7110 = _mm512_shuffle_f32x4(tmp7165, tmp7173, 136);
tmp7117 = _mm512_shuffle_f32x4(tmp7165, tmp7173, 221);
tmp7112 = _mm512_shuffle_f32x4(tmp7167, tmp7175, 136);
tmp7119 = _mm512_shuffle_f32x4(tmp7167, tmp7175, 221);
tmp7108 = _mm512_shuffle_f32x4(tmp7162, tmp7170, 136);
tmp7115 = _mm512_shuffle_f32x4(tmp7162, tmp7170, 221);
in1113 = _mm512_shuffle_f32x4(tmp7164, tmp7172, 136);
in1116 = _mm512_shuffle_f32x4(tmp7164, tmp7172, 221);
tmp7111 = _mm512_shuffle_f32x4(tmp7166, tmp7174, 136);
tmp7118 = _mm512_shuffle_f32x4(tmp7166, tmp7174, 221);
tmp7113 = _mm512_shuffle_f32x4(tmp7168, tmp7176, 136);
tmp7120 = _mm512_shuffle_f32x4(tmp7168, tmp7176, 221);
__m512 tmp7121 = _mm512_add_ps(tmp7109, in1113);
__m512 tmp7125 = _mm512_add_ps(tmp7116, in1116);
__m512 tmp7122 = _mm512_sub_ps(tmp7108, tmp7110);
__m512 tmp7126 = _mm512_sub_ps(tmp7115, tmp7117);
__m512 tmp7123 = _mm512_add_ps(tmp7110, tmp7111);
__m512 tmp7127 = _mm512_add_ps(tmp7117, tmp7118);
in1111 = _mm512_sub_ps(in1111, tmp7111);
in1114 = _mm512_sub_ps(in1114, tmp7118);
tmp7121 = _mm512_fmadd_ps(tmp7112, _mm512_set1_ps(-4.25e+00f), tmp7121);
tmp7125 = _mm512_fmadd_ps(tmp7119, _mm512_set1_ps(-4.25e+00f), tmp7125);
tmp7123 = _mm512_fmadd_ps(tmp7108, _mm512_set1_ps(-4.25e+00f), tmp7123);
tmp7127 = _mm512_fmadd_ps(tmp7115, _mm512_set1_ps(-4.25e+00f), tmp7127);
in1111 = _mm512_fmadd_ps(tmp7122, _mm512_set1_ps(5.25e+00f), in1111);
in1114 = _mm512_fmadd_ps(tmp7126, _mm512_set1_ps(5.25e+00f), in1114);
tmp7122 = _mm512_fmadd_ps(tmp7110, _mm512_set1_ps(2.5e-01f), tmp7111);
tmp7126 = _mm512_fmadd_ps(tmp7117, _mm512_set1_ps(2.5e-01f), tmp7118);
tmp7110 = _mm512_fmadd_ps(tmp7110, _mm512_set1_ps(4e+00f), tmp7111);
tmp7117 = _mm512_fmadd_ps(tmp7117, _mm512_set1_ps(4e+00f), tmp7118);
__m512 tmp7124 = _mm512_sub_ps(tmp7123, tmp7121);
__m512 tmp7128 = _mm512_sub_ps(tmp7127, tmp7125);
tmp7123 = _mm512_add_ps(tmp7121, tmp7123);
tmp7127 = _mm512_add_ps(tmp7125, tmp7127);
tmp7121 = _mm512_fmadd_ps(tmp7109, _mm512_set1_ps(2.5e-01f), in1113);
tmp7125 = _mm512_fmadd_ps(tmp7116, _mm512_set1_ps(2.5e-01f), in1116);
tmp7122 = _mm512_fmadd_ps(tmp7108, _mm512_set1_ps(-1.25e+00f), tmp7122);
tmp7126 = _mm512_fmadd_ps(tmp7115, _mm512_set1_ps(-1.25e+00f), tmp7126);
tmp7108 = _mm512_fmadd_ps(tmp7108, _mm512_set1_ps(-5e+00f), tmp7110);
tmp7115 = _mm512_fmadd_ps(tmp7115, _mm512_set1_ps(-5e+00f), tmp7117);
tmp7121 = _mm512_fmadd_ps(tmp7112, _mm512_set1_ps(-1.25e+00f), tmp7121);
tmp7125 = _mm512_fmadd_ps(tmp7119, _mm512_set1_ps(-1.25e+00f), tmp7125);
tmp7111 = _mm512_fmadd_ps(tmp7121, _mm512_set1_ps(2e+00f), tmp7122);
tmp7118 = _mm512_fmadd_ps(tmp7125, _mm512_set1_ps(2e+00f), tmp7126);
tmp7122 = _mm512_fnmadd_ps(tmp7121, _mm512_set1_ps(2e+00f), tmp7122);
tmp7126 = _mm512_fnmadd_ps(tmp7125, _mm512_set1_ps(2e+00f), tmp7126);
tmp7121 = _mm512_fmadd_ps(in1113, _mm512_set1_ps(2.5e-01f), tmp7109);
tmp7125 = _mm512_fmadd_ps(in1116, _mm512_set1_ps(2.5e-01f), tmp7116);
tmp7109 = _mm512_sub_ps(tmp7113, tmp7109);
tmp7116 = _mm512_sub_ps(tmp7120, tmp7116);
tmp7121 = _mm512_fmadd_ps(tmp7112, _mm512_set1_ps(-1.25e+00f), tmp7121);
tmp7125 = _mm512_fmadd_ps(tmp7119, _mm512_set1_ps(-1.25e+00f), tmp7125);
tmp7112 = _mm512_sub_ps(tmp7112, in1113);
tmp7119 = _mm512_sub_ps(tmp7119, in1116);
tmp7112 = _mm512_fmadd_ps(tmp7112, _mm512_set1_ps(5.25e+00f), tmp7109);
tmp7119 = _mm512_fmadd_ps(tmp7119, _mm512_set1_ps(5.25e+00f), tmp7116);
tmp7110 = _mm512_fmadd_ps(tmp7121, _mm512_set1_ps(2e+00f), tmp7108);
tmp7117 = _mm512_fmadd_ps(tmp7125, _mm512_set1_ps(2e+00f), tmp7115);
tmp7108 = _mm512_fnmadd_ps(tmp7121, _mm512_set1_ps(2e+00f), tmp7108);
tmp7115 = _mm512_fnmadd_ps(tmp7125, _mm512_set1_ps(2e+00f), tmp7115);
__m512 out1091 = _mm512_shuffle_f32x4(in1111, tmp7123, 68);
__m512 out1099 = _mm512_shuffle_f32x4(in1111, tmp7123, 238);
__m512 out1092 = _mm512_shuffle_f32x4(tmp7124, tmp7111, 68);
__m512 out1100 = _mm512_shuffle_f32x4(tmp7124, tmp7111, 238);
__m512 out1093 = _mm512_shuffle_f32x4(tmp7122, tmp7110, 68);
__m512 out1101 = _mm512_shuffle_f32x4(tmp7122, tmp7110, 238);
__m512 out1094 = _mm512_shuffle_f32x4(tmp7108, tmp7112, 68);
__m512 out1102 = _mm512_shuffle_f32x4(tmp7108, tmp7112, 238);
__m512 out1095 = _mm512_shuffle_f32x4(in1114, tmp7127, 68);
__m512 out1103 = _mm512_shuffle_f32x4(in1114, tmp7127, 238);
__m512 out1096 = _mm512_shuffle_f32x4(tmp7128, tmp7118, 68);
__m512 out1104 = _mm512_shuffle_f32x4(tmp7128, tmp7118, 238);
__m512 out1097 = _mm512_shuffle_f32x4(tmp7126, tmp7117, 68);
__m512 out1105 = _mm512_shuffle_f32x4(tmp7126, tmp7117, 238);
__m512 out1098 = _mm512_shuffle_f32x4(tmp7115, tmp7119, 68);
__m512 out1106 = _mm512_shuffle_f32x4(tmp7115, tmp7119, 238);
_mm512_storeu_ps(dfPtr6+0+102400*i27+1536*j21+1024*s19+256*k87, out1091);
_mm512_storeu_ps(dfPtr6+128+102400*i27+1536*j21+1024*s19+256*k87, out1099);
_mm512_storeu_ps(dfPtr6+64+102400*i27+1536*j21+1024*s19+256*k87, out1095);
_mm512_storeu_ps(dfPtr6+192+102400*i27+1536*j21+1024*s19+256*k87, out1103);
_mm512_storeu_ps(dfPtr6+25600+102400*i27+1536*j21+1024*s19+256*k87, out1092);
_mm512_storeu_ps(dfPtr6+25728+102400*i27+1536*j21+1024*s19+256*k87, out1100);
_mm512_storeu_ps(dfPtr6+25664+102400*i27+1536*j21+1024*s19+256*k87, out1096);
_mm512_storeu_ps(dfPtr6+25792+102400*i27+1536*j21+1024*s19+256*k87, out1104);
_mm512_storeu_ps(dfPtr6+51200+102400*i27+1536*j21+1024*s19+256*k87, out1093);
_mm512_storeu_ps(dfPtr6+51328+102400*i27+1536*j21+1024*s19+256*k87, out1101);
_mm512_storeu_ps(dfPtr6+51264+102400*i27+1536*j21+1024*s19+256*k87, out1097);
_mm512_storeu_ps(dfPtr6+51392+102400*i27+1536*j21+1024*s19+256*k87, out1105);
_mm512_storeu_ps(dfPtr6+76800+102400*i27+1536*j21+1024*s19+256*k87, out1094);
_mm512_storeu_ps(dfPtr6+76928+102400*i27+1536*j21+1024*s19+256*k87, out1102);
_mm512_storeu_ps(dfPtr6+76864+102400*i27+1536*j21+1024*s19+256*k87, out1098);
_mm512_storeu_ps(dfPtr6+76992+102400*i27+1536*j21+1024*s19+256*k87, out1106);
}
++j21;
}
}

static void ResNeXt50ThreeArrangeDats2(ResNeXt50ThreaderTeam1* team33, char** tensors39) {
    // Describes the "arrange dats" step as a 4-dimensional threader task and
    // hands it to ResNeXt50ThreaderDo1 on the given team.
    //
    // team33:    threader team passed through to ResNeXt50ThreaderDo1.
    // tensors39: tensor pointer table; forwarded untouched to the callee
    //            through the task's any1 field.
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    // Hull extents are {1, 1, 16, 1}: only axis 2 has more than one point.
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = (axis == 2) ? 16 : 1;
    }
    job.any1 = tensors39;
    job.callee1 = ResNeXt50ThreeArrangeDats2Callee1;
    ResNeXt50ThreaderDo1(team33, &job);
}

static void ResNeXt50ThreeProduceSums2Callee1(ResNeXt50ThreaderTask1* task44, int64_t* pt27) {
    // Threader callee for the "produce sums" step.
    //
    // task44: task descriptor; task44->any1 points at a pointer array whose
    //         element 0 is the tensor pointer table.
    // pt27:   coordinates of this work item; only pt27[3] is read. Work item
    //         g handles the two consecutive outer indices i = 2g and 2g+1.
    //
    // Tensor table usage (byte offsets taken from the address arithmetic):
    //   tensors[0]       bias floats, four consecutive floats per i (16 bytes)
    //   tensors[0]+512   weights stored as 16-bit halves, widened with
    //                    _mm512_cvtph_ps; 2048 bytes per i, 512 per j,
    //                    128 per accumulation step
    //   tensors[1]       input vectors (read)
    //   tensors[2]       output sums (written); 102400 bytes per i,
    //                    25600 per j, 1536 per tile
    //
    // For every (i, j) with j in 0..3 the routine writes 17 tiles: 16 tiles
    // of 4 rows x 6 vectors followed by one tile of 4 rows x 4 vectors. Each
    // output vector is the sum over 4 steps of weight[row] * data[column].
    void** payload = task44->any1;
    char** tensors = payload[0];
    const ptrdiff_t workItem = pt27[3];
    char*restrict biasBase = tensors[0];
    char*restrict weightBase = tensors[0]+512;
    char*restrict dataBase = tensors[1];
    char*restrict sumBase = tensors[2];
    const ptrdiff_t firstI = 2*workItem;

    for (ptrdiff_t i = firstI; i <= firstI+1; ++i) {
        for (ptrdiff_t j = 0; j <= 3; ++j) {
            // Starting value for each of the 4 accumulator rows. Only the
            // j == 0 pass carries the bias, and only in lane 9 (mask 512);
            // all other lanes and all other passes start from zero.
            __m512 seed[4];
            if (__builtin_expect(!j, 0)) {
                for (int r = 0; r < 4; ++r) {
                    float biasVal = *(float*)(biasBase+4*r+16*i);
                    seed[r] = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(biasVal));
                }
            } else {
                for (int r = 0; r < 4; ++r) {
                    seed[r] = _mm512_setzero_ps();
                }
            }

            // Sixteen full tiles: 4 weight rows x 6 data vectors.
            for (ptrdiff_t k = 0; k != 16; ++k) {
                __m512 acc[4][6];
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 6; ++c) {
                        acc[r][c] = seed[r];
                    }
                }
                for (ptrdiff_t b = 0; b != 4; ++b) {
                    const char* wSrc = weightBase+2048*i+512*j+128*b;
                    const char* dSrc = dataBase+102400*i+25600*j+1536*k+384*b;
                    // Each 64-byte load holds 32 halves: low 256 bits feed
                    // one row, high 256 bits the next.
                    __m512i packedA = _mm512_maskz_loadu_epi32(65535, wSrc);
                    __m512i packedB = _mm512_maskz_loadu_epi32(65535, wSrc+64);
                    __m512 w[4];
                    w[0] = _mm512_cvtph_ps(_mm512_castsi512_si256(packedA));
                    w[1] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedA, 1));
                    w[2] = _mm512_cvtph_ps(_mm512_castsi512_si256(packedB));
                    w[3] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedB, 1));
                    __m512 d[6];
                    for (int c = 0; c < 6; ++c) {
                        d[c] = _mm512_loadu_ps(dSrc+64*c);
                    }
                    for (int r = 0; r < 4; ++r) {
                        for (int c = 0; c < 6; ++c) {
                            acc[r][c] = _mm512_fmadd_ps(w[r], d[c], acc[r][c]);
                        }
                    }
                }
                char* dst = sumBase+102400*i+25600*j+1536*k;
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 6; ++c) {
                        _mm512_storeu_ps(dst+64*(6*r+c), acc[r][c]);
                    }
                }
            }

            // Seventeenth, narrower tile (tile index 16): 4 rows x 4 data
            // vectors, with a 256-byte data stride per accumulation step.
            {
                const ptrdiff_t k = 16;
                __m512 acc[4][4];
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 4; ++c) {
                        acc[r][c] = seed[r];
                    }
                }
                for (ptrdiff_t b = 0; b != 4; ++b) {
                    const char* wSrc = weightBase+2048*i+512*j+128*b;
                    const char* dSrc = dataBase+102400*i+25600*j+1536*k+256*b;
                    __m512i packedA = _mm512_maskz_loadu_epi32(65535, wSrc);
                    __m512i packedB = _mm512_maskz_loadu_epi32(65535, wSrc+64);
                    __m512 w[4];
                    w[0] = _mm512_cvtph_ps(_mm512_castsi512_si256(packedA));
                    w[1] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedA, 1));
                    w[2] = _mm512_cvtph_ps(_mm512_castsi512_si256(packedB));
                    w[3] = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedB, 1));
                    __m512 d[4];
                    for (int c = 0; c < 4; ++c) {
                        d[c] = _mm512_loadu_ps(dSrc+64*c);
                    }
                    for (int r = 0; r < 4; ++r) {
                        for (int c = 0; c < 4; ++c) {
                            acc[r][c] = _mm512_fmadd_ps(w[r], d[c], acc[r][c]);
                        }
                    }
                }
                char* dst = sumBase+102400*i+25600*j+1536*k;
                for (int r = 0; r < 4; ++r) {
                    for (int c = 0; c < 4; ++c) {
                        _mm512_storeu_ps(dst+64*(4*r+c), acc[r][c]);
                    }
                }
            }
        }
    }
}

static void ResNeXt50ThreeProduceSums2(ResNeXt50ThreaderTeam1* team34, char** tensors41) {
    // Describes the "produce sums" step as a 4-dimensional threader task and
    // hands it to ResNeXt50ThreaderDo1 on the given team.
    //
    // team34:    threader team passed through to ResNeXt50ThreaderDo1.
    // tensors41: tensor pointer table; the callee retrieves it from slot 0
    //            of the two-entry pointer array attached to the task.
    void* payload[2];
    payload[0] = tensors41;
    payload[1] = 0;

    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    // Hull extents are {1, 1, 1, 16}: only axis 3 has more than one point,
    // which is the coordinate ResNeXt50ThreeProduceSums2Callee1 reads.
    for (int axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = (axis == 3) ? 16 : 1;
    }
    job.any1 = payload;
    job.callee1 = ResNeXt50ThreeProduceSums2Callee1;
    // NOTE(review): payload lives on this stack frame, so this presumably
    // relies on ResNeXt50ThreaderDo1 not returning before the callee has
    // finished using it; confirm against the threader implementation.
    ResNeXt50ThreaderDo1(team34, &job);
}

static void ResNeXt50ThreeConsumeSums2Callee1(ResNeXt50ThreaderTask1* task46, int64_t* pt28) {
char** tensors44 = task46->any1;
ptrdiff_t w46 = 0;
ptrdiff_t d9 = 0;
ptrdiff_t g16 = pt28[2];
char*restrict sfPtr7 = tensors44[0];
char*restrict datPtr13 = tensors44[1];
ptrdiff_t i29 = 2*g16;
ptrdiff_t ii17 = i29+1;
for (; i29 <= ii17; ++i29) {
ptrdiff_t j23 = 17*d9;
if (j23 < 2) {
ptrdiff_t rel16 = j23-0;
ptrdiff_t base16 = 0;
if (rel16 < 1) {
ptrdiff_t toH29 = base16+0;
ptrdiff_t toW29 = 0;
ptrdiff_t k89 = 1*w46;
for (; k89 != 1; ++k89) {
ptrdiff_t l28 = 0;
for (; l28 != 2; ++l28) {
__m512 sf401 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf402 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1117 = _mm512_shuffle_f32x4(sf401, sf402, 68);
__m512 in1118 = _mm512_shuffle_f32x4(sf401, sf402, 238);
__m512 sf403 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf404 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1125 = _mm512_shuffle_f32x4(sf403, sf404, 68);
__m512 in1126 = _mm512_shuffle_f32x4(sf403, sf404, 238);
__m512 sf405 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf406 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1119 = _mm512_shuffle_f32x4(sf405, sf406, 68);
__m512 in1120 = _mm512_shuffle_f32x4(sf405, sf406, 238);
__m512 sf407 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf408 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1127 = _mm512_shuffle_f32x4(sf407, sf408, 68);
__m512 in1128 = _mm512_shuffle_f32x4(sf407, sf408, 238);
__m512 sf409 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf410 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1121 = _mm512_shuffle_f32x4(sf409, sf410, 68);
__m512 in1122 = _mm512_shuffle_f32x4(sf409, sf410, 238);
__m512 sf411 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf412 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1129 = _mm512_shuffle_f32x4(sf411, sf412, 68);
__m512 in1130 = _mm512_shuffle_f32x4(sf411, sf412, 238);
__m512 sf413 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf414 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1123 = _mm512_shuffle_f32x4(sf413, sf414, 68);
__m512 in1124 = _mm512_shuffle_f32x4(sf413, sf414, 238);
__m512 sf415 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf416 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1131 = _mm512_shuffle_f32x4(sf415, sf416, 68);
__m512 in1132 = _mm512_shuffle_f32x4(sf415, sf416, 238);
__m512 tmp7193 = _mm512_add_ps(in1118, in1119);
__m512 tmp7213 = _mm512_add_ps(in1126, in1127);
__m512 tmp7192 = _mm512_add_ps(in1120, in1121);
__m512 tmp7212 = _mm512_add_ps(in1128, in1129);
__m512 tmp7198 = _mm512_sub_ps(in1120, in1121);
__m512 tmp7218 = _mm512_sub_ps(in1128, in1129);
__m512 tmp7197 = _mm512_sub_ps(in1118, in1119);
__m512 tmp7217 = _mm512_sub_ps(in1126, in1127);
__m512 tmp7194 = _mm512_add_ps(in1122, in1123);
__m512 tmp7214 = _mm512_add_ps(in1130, in1131);
__m512 tmp7199 = _mm512_sub_ps(in1122, in1123);
__m512 tmp7219 = _mm512_sub_ps(in1130, in1131);
__m512 tmp7196 = _mm512_fmadd_ps(tmp7198, _mm512_set1_ps(2e+00f), tmp7197);
__m512 tmp7216 = _mm512_fmadd_ps(tmp7218, _mm512_set1_ps(2e+00f), tmp7217);
__m512 tmp7203 = _mm512_fmadd_ps(tmp7198, _mm512_set1_ps(8e+00f), tmp7197);
__m512 tmp7223 = _mm512_fmadd_ps(tmp7218, _mm512_set1_ps(8e+00f), tmp7217);
__m512 tmp7191 = _mm512_add_ps(tmp7192, tmp7193);
__m512 tmp7211 = _mm512_add_ps(tmp7212, tmp7213);
__m512 tmp7195 = _mm512_fmadd_ps(tmp7199, _mm512_set1_ps(1.6e+01f), tmp7196);
__m512 tmp7215 = _mm512_fmadd_ps(tmp7219, _mm512_set1_ps(1.6e+01f), tmp7216);
__m512 tmp7202 = _mm512_fmadd_ps(tmp7199, _mm512_set1_ps(4e+00f), tmp7203);
__m512 tmp7222 = _mm512_fmadd_ps(tmp7219, _mm512_set1_ps(4e+00f), tmp7223);
__m512 tmp7208 = _mm512_add_ps(tmp7199, tmp7197);
__m512 tmp7228 = _mm512_add_ps(tmp7219, tmp7217);
__m512 tmp7201 = _mm512_fmadd_ps(tmp7192, _mm512_set1_ps(4e+00f), tmp7193);
__m512 tmp7221 = _mm512_fmadd_ps(tmp7212, _mm512_set1_ps(4e+00f), tmp7213);
__m512 tmp7205 = _mm512_fmadd_ps(tmp7192, _mm512_set1_ps(1.6e+01f), tmp7193);
__m512 tmp7225 = _mm512_fmadd_ps(tmp7212, _mm512_set1_ps(1.6e+01f), tmp7213);
__m512 tmp7190 = _mm512_add_ps(tmp7191, in1117);
__m512 tmp7210 = _mm512_add_ps(tmp7211, in1125);
__m512 tmp7207 = _mm512_add_ps(tmp7208, in1124);
__m512 tmp7227 = _mm512_add_ps(tmp7228, in1132);
__m512 tmp7189 = _mm512_fmadd_ps(tmp7194, _mm512_set1_ps(3.2e+01f), tmp7190);
__m512 tmp7209 = _mm512_fmadd_ps(tmp7214, _mm512_set1_ps(3.2e+01f), tmp7210);
__m512 tmp7200 = _mm512_fmadd_ps(tmp7194, _mm512_set1_ps(8e+00f), tmp7201);
__m512 tmp7220 = _mm512_fmadd_ps(tmp7214, _mm512_set1_ps(8e+00f), tmp7221);
__m512 tmp7206 = _mm512_fmadd_ps(tmp7198, _mm512_set1_ps(3.2e+01f), tmp7207);
__m512 tmp7226 = _mm512_fmadd_ps(tmp7218, _mm512_set1_ps(3.2e+01f), tmp7227);
__m512 tmp7204 = _mm512_fmadd_ps(tmp7194, _mm512_set1_ps(2e+00f), tmp7205);
__m512 tmp7224 = _mm512_fmadd_ps(tmp7214, _mm512_set1_ps(2e+00f), tmp7225);
__m512 tmp7177 = tmp7189;
__m512 tmp7183 = tmp7209;
__m512 tmp7178 = tmp7195;
__m512 tmp7184 = tmp7215;
__m512 tmp7179 = tmp7200;
__m512 tmp7185 = tmp7220;
__m512 tmp7180 = tmp7202;
__m512 tmp7186 = tmp7222;
__m512 tmp7181 = tmp7204;
__m512 tmp7187 = tmp7224;
__m512 tmp7182 = tmp7206;
__m512 tmp7188 = tmp7226;
__m512 tmp7273 = _mm512_unpacklo_ps(tmp7177, tmp7178);
__m512 tmp7274 = _mm512_unpackhi_ps(tmp7177, tmp7178);
__m512 tmp7275 = _mm512_unpacklo_ps(tmp7179, tmp7180);
__m512 tmp7276 = _mm512_unpackhi_ps(tmp7179, tmp7180);
__m512 tmp7277 = _mm512_unpacklo_ps(tmp7181, tmp7182);
__m512 tmp7278 = _mm512_unpackhi_ps(tmp7181, tmp7182);
__m512 tmp7279 = _mm512_unpacklo_ps(tmp7183, tmp7184);
__m512 tmp7280 = _mm512_unpackhi_ps(tmp7183, tmp7184);
__m512 tmp7281 = _mm512_unpacklo_ps(tmp7185, tmp7186);
__m512 tmp7282 = _mm512_unpackhi_ps(tmp7185, tmp7186);
__m512 tmp7283 = _mm512_unpacklo_ps(tmp7187, tmp7188);
__m512 tmp7284 = _mm512_unpackhi_ps(tmp7187, tmp7188);
__m512 tmp7285 = _mm512_shuffle_ps(tmp7273, tmp7275, 68);
__m512 tmp7286 = _mm512_shuffle_ps(tmp7273, tmp7275, 238);
__m512 tmp7287 = _mm512_shuffle_ps(tmp7274, tmp7276, 68);
__m512 tmp7288 = _mm512_shuffle_ps(tmp7274, tmp7276, 238);
__m512 tmp7289 = _mm512_shuffle_ps(tmp7277, tmp7279, 68);
__m512 tmp7290 = _mm512_shuffle_ps(tmp7277, tmp7279, 238);
__m512 tmp7291 = _mm512_shuffle_ps(tmp7278, tmp7280, 68);
__m512 tmp7292 = _mm512_shuffle_ps(tmp7278, tmp7280, 238);
__m512 tmp7293 = _mm512_shuffle_ps(tmp7281, tmp7283, 68);
__m512 tmp7294 = _mm512_shuffle_ps(tmp7281, tmp7283, 238);
__m512 tmp7295 = _mm512_shuffle_ps(tmp7282, tmp7284, 68);
__m512 tmp7296 = _mm512_shuffle_ps(tmp7282, tmp7284, 238);
__m512 tmp7297 = _mm512_shuffle_f32x4(tmp7285, tmp7289, 136);
__m512 tmp7298 = _mm512_shuffle_f32x4(tmp7285, tmp7289, 221);
__m512 tmp7299 = _mm512_shuffle_f32x4(tmp7286, tmp7290, 136);
__m512 tmp7300 = _mm512_shuffle_f32x4(tmp7286, tmp7290, 221);
__m512 tmp7301 = _mm512_shuffle_f32x4(tmp7287, tmp7291, 136);
__m512 tmp7302 = _mm512_shuffle_f32x4(tmp7287, tmp7291, 221);
__m512 tmp7303 = _mm512_shuffle_f32x4(tmp7288, tmp7292, 136);
__m512 tmp7304 = _mm512_shuffle_f32x4(tmp7288, tmp7292, 221);
__m512 tmp7305 = _mm512_shuffle_f32x4(tmp7293, tmp7293, 136);
__m512 tmp7306 = _mm512_shuffle_f32x4(tmp7293, tmp7293, 221);
__m512 tmp7307 = _mm512_shuffle_f32x4(tmp7294, tmp7294, 136);
__m512 tmp7308 = _mm512_shuffle_f32x4(tmp7294, tmp7294, 221);
__m512 tmp7309 = _mm512_shuffle_f32x4(tmp7295, tmp7295, 136);
__m512 tmp7310 = _mm512_shuffle_f32x4(tmp7295, tmp7295, 221);
__m512 tmp7311 = _mm512_shuffle_f32x4(tmp7296, tmp7296, 136);
__m512 tmp7312 = _mm512_shuffle_f32x4(tmp7296, tmp7296, 221);
tmp7177 = _mm512_shuffle_f32x4(tmp7297, tmp7305, 136);
tmp7185 = _mm512_shuffle_f32x4(tmp7297, tmp7305, 221);
tmp7178 = _mm512_shuffle_f32x4(tmp7299, tmp7307, 136);
tmp7186 = _mm512_shuffle_f32x4(tmp7299, tmp7307, 221);
tmp7179 = _mm512_shuffle_f32x4(tmp7301, tmp7309, 136);
tmp7187 = _mm512_shuffle_f32x4(tmp7301, tmp7309, 221);
tmp7180 = _mm512_shuffle_f32x4(tmp7303, tmp7311, 136);
tmp7188 = _mm512_shuffle_f32x4(tmp7303, tmp7311, 221);
tmp7181 = _mm512_shuffle_f32x4(tmp7298, tmp7306, 136);
__m512 tmp7229 = _mm512_shuffle_f32x4(tmp7298, tmp7306, 221);
tmp7182 = _mm512_shuffle_f32x4(tmp7300, tmp7308, 136);
__m512 tmp7230 = _mm512_shuffle_f32x4(tmp7300, tmp7308, 221);
tmp7183 = _mm512_shuffle_f32x4(tmp7302, tmp7310, 136);
__m512 tmp7231 = _mm512_shuffle_f32x4(tmp7302, tmp7310, 221);
tmp7184 = _mm512_shuffle_f32x4(tmp7304, tmp7312, 136);
__m512 tmp7232 = _mm512_shuffle_f32x4(tmp7304, tmp7312, 221);
__m512 tmp7237 = _mm512_add_ps(tmp7178, tmp7179);
__m512 tmp7257 = _mm512_add_ps(tmp7186, tmp7187);
__m512 tmp7236 = _mm512_add_ps(tmp7180, tmp7181);
__m512 tmp7256 = _mm512_add_ps(tmp7188, tmp7229);
__m512 tmp7242 = _mm512_sub_ps(tmp7180, tmp7181);
__m512 tmp7262 = _mm512_sub_ps(tmp7188, tmp7229);
__m512 tmp7241 = _mm512_sub_ps(tmp7178, tmp7179);
__m512 tmp7261 = _mm512_sub_ps(tmp7186, tmp7187);
__m512 tmp7238 = _mm512_add_ps(tmp7182, tmp7183);
__m512 tmp7258 = _mm512_add_ps(tmp7230, tmp7231);
__m512 tmp7243 = _mm512_sub_ps(tmp7182, tmp7183);
__m512 tmp7263 = _mm512_sub_ps(tmp7230, tmp7231);
__m512 tmp7240 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(2e+00f), tmp7241);
__m512 tmp7260 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(2e+00f), tmp7261);
__m512 tmp7247 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(8e+00f), tmp7241);
__m512 tmp7267 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(8e+00f), tmp7261);
__m512 tmp7235 = _mm512_add_ps(tmp7236, tmp7237);
__m512 tmp7255 = _mm512_add_ps(tmp7256, tmp7257);
__m512 tmp7239 = _mm512_fmadd_ps(tmp7243, _mm512_set1_ps(1.6e+01f), tmp7240);
__m512 tmp7259 = _mm512_fmadd_ps(tmp7263, _mm512_set1_ps(1.6e+01f), tmp7260);
__m512 tmp7246 = _mm512_fmadd_ps(tmp7243, _mm512_set1_ps(4e+00f), tmp7247);
__m512 tmp7266 = _mm512_fmadd_ps(tmp7263, _mm512_set1_ps(4e+00f), tmp7267);
__m512 tmp7252 = _mm512_add_ps(tmp7243, tmp7241);
__m512 tmp7272 = _mm512_add_ps(tmp7263, tmp7261);
__m512 tmp7245 = _mm512_fmadd_ps(tmp7236, _mm512_set1_ps(4e+00f), tmp7237);
__m512 tmp7265 = _mm512_fmadd_ps(tmp7256, _mm512_set1_ps(4e+00f), tmp7257);
__m512 tmp7249 = _mm512_fmadd_ps(tmp7236, _mm512_set1_ps(1.6e+01f), tmp7237);
__m512 tmp7269 = _mm512_fmadd_ps(tmp7256, _mm512_set1_ps(1.6e+01f), tmp7257);
__m512 tmp7234 = _mm512_add_ps(tmp7235, tmp7177);
__m512 tmp7254 = _mm512_add_ps(tmp7255, tmp7185);
__m512 tmp7251 = _mm512_add_ps(tmp7252, tmp7184);
__m512 tmp7271 = _mm512_add_ps(tmp7272, tmp7232);
__m512 tmp7233 = _mm512_fmadd_ps(tmp7238, _mm512_set1_ps(3.2e+01f), tmp7234);
__m512 tmp7253 = _mm512_fmadd_ps(tmp7258, _mm512_set1_ps(3.2e+01f), tmp7254);
__m512 tmp7244 = _mm512_fmadd_ps(tmp7238, _mm512_set1_ps(8e+00f), tmp7245);
__m512 tmp7264 = _mm512_fmadd_ps(tmp7258, _mm512_set1_ps(8e+00f), tmp7265);
__m512 tmp7250 = _mm512_fmadd_ps(tmp7242, _mm512_set1_ps(3.2e+01f), tmp7251);
__m512 tmp7270 = _mm512_fmadd_ps(tmp7262, _mm512_set1_ps(3.2e+01f), tmp7271);
__m512 tmp7248 = _mm512_fmadd_ps(tmp7238, _mm512_set1_ps(2e+00f), tmp7249);
__m512 tmp7268 = _mm512_fmadd_ps(tmp7258, _mm512_set1_ps(2e+00f), tmp7269);
__m512 out1107 = tmp7233;
__m512 out1113 = tmp7253;
__m512 out1108 = tmp7239;
__m512 out1114 = tmp7259;
__m512 out1109 = tmp7244;
__m512 out1115 = tmp7264;
__m512 out1110 = tmp7246;
__m512 out1116 = tmp7266;
__m512 out1111 = tmp7248;
__m512 out1117 = tmp7268;
__m512 out1112 = tmp7250;
__m512 out1118 = tmp7270;
out1107 = _mm512_max_ps(_mm512_setzero_ps(), out1107);
out1113 = _mm512_max_ps(_mm512_setzero_ps(), out1113);
out1108 = _mm512_max_ps(_mm512_setzero_ps(), out1108);
out1114 = _mm512_max_ps(_mm512_setzero_ps(), out1114);
out1109 = _mm512_max_ps(_mm512_setzero_ps(), out1109);
out1115 = _mm512_max_ps(_mm512_setzero_ps(), out1115);
out1110 = _mm512_max_ps(_mm512_setzero_ps(), out1110);
out1116 = _mm512_max_ps(_mm512_setzero_ps(), out1116);
out1111 = _mm512_max_ps(_mm512_setzero_ps(), out1111);
out1117 = _mm512_max_ps(_mm512_setzero_ps(), out1117);
out1112 = _mm512_max_ps(_mm512_setzero_ps(), out1112);
out1118 = _mm512_max_ps(_mm512_setzero_ps(), out1118);
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1107);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1113);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1108);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1114);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1109);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1115);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1110);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1116);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1111);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1117);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1112);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1118);
__m512 sf417 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf418 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1133 = _mm512_shuffle_f32x4(sf417, sf418, 68);
__m512 in1134 = _mm512_shuffle_f32x4(sf417, sf418, 238);
__m512 sf419 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf420 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1141 = _mm512_shuffle_f32x4(sf419, sf420, 68);
__m512 in1142 = _mm512_shuffle_f32x4(sf419, sf420, 238);
__m512 sf421 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf422 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1135 = _mm512_shuffle_f32x4(sf421, sf422, 68);
__m512 in1136 = _mm512_shuffle_f32x4(sf421, sf422, 238);
__m512 sf423 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf424 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1143 = _mm512_shuffle_f32x4(sf423, sf424, 68);
__m512 in1144 = _mm512_shuffle_f32x4(sf423, sf424, 238);
__m512 sf425 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf426 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1137 = _mm512_shuffle_f32x4(sf425, sf426, 68);
__m512 in1138 = _mm512_shuffle_f32x4(sf425, sf426, 238);
__m512 sf427 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf428 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1145 = _mm512_shuffle_f32x4(sf427, sf428, 68);
__m512 in1146 = _mm512_shuffle_f32x4(sf427, sf428, 238);
__m512 sf429 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf430 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1139 = _mm512_shuffle_f32x4(sf429, sf430, 68);
__m512 in1140 = _mm512_shuffle_f32x4(sf429, sf430, 238);
__m512 sf431 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf432 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1147 = _mm512_shuffle_f32x4(sf431, sf432, 68);
__m512 in1148 = _mm512_shuffle_f32x4(sf431, sf432, 238);
__m512 tmp7329 = _mm512_add_ps(in1134, in1135);
__m512 tmp7349 = _mm512_add_ps(in1142, in1143);
__m512 tmp7328 = _mm512_add_ps(in1136, in1137);
__m512 tmp7348 = _mm512_add_ps(in1144, in1145);
__m512 tmp7334 = _mm512_sub_ps(in1136, in1137);
__m512 tmp7354 = _mm512_sub_ps(in1144, in1145);
__m512 tmp7333 = _mm512_sub_ps(in1134, in1135);
__m512 tmp7353 = _mm512_sub_ps(in1142, in1143);
__m512 tmp7330 = _mm512_add_ps(in1138, in1139);
__m512 tmp7350 = _mm512_add_ps(in1146, in1147);
__m512 tmp7335 = _mm512_sub_ps(in1138, in1139);
__m512 tmp7355 = _mm512_sub_ps(in1146, in1147);
__m512 tmp7332 = _mm512_fmadd_ps(tmp7334, _mm512_set1_ps(2e+00f), tmp7333);
__m512 tmp7352 = _mm512_fmadd_ps(tmp7354, _mm512_set1_ps(2e+00f), tmp7353);
__m512 tmp7339 = _mm512_fmadd_ps(tmp7334, _mm512_set1_ps(8e+00f), tmp7333);
__m512 tmp7359 = _mm512_fmadd_ps(tmp7354, _mm512_set1_ps(8e+00f), tmp7353);
__m512 tmp7327 = _mm512_add_ps(tmp7328, tmp7329);
__m512 tmp7347 = _mm512_add_ps(tmp7348, tmp7349);
__m512 tmp7331 = _mm512_fmadd_ps(tmp7335, _mm512_set1_ps(1.6e+01f), tmp7332);
__m512 tmp7351 = _mm512_fmadd_ps(tmp7355, _mm512_set1_ps(1.6e+01f), tmp7352);
__m512 tmp7338 = _mm512_fmadd_ps(tmp7335, _mm512_set1_ps(4e+00f), tmp7339);
__m512 tmp7358 = _mm512_fmadd_ps(tmp7355, _mm512_set1_ps(4e+00f), tmp7359);
__m512 tmp7344 = _mm512_add_ps(tmp7335, tmp7333);
__m512 tmp7364 = _mm512_add_ps(tmp7355, tmp7353);
__m512 tmp7337 = _mm512_fmadd_ps(tmp7328, _mm512_set1_ps(4e+00f), tmp7329);
__m512 tmp7357 = _mm512_fmadd_ps(tmp7348, _mm512_set1_ps(4e+00f), tmp7349);
__m512 tmp7341 = _mm512_fmadd_ps(tmp7328, _mm512_set1_ps(1.6e+01f), tmp7329);
__m512 tmp7361 = _mm512_fmadd_ps(tmp7348, _mm512_set1_ps(1.6e+01f), tmp7349);
__m512 tmp7326 = _mm512_add_ps(tmp7327, in1133);
__m512 tmp7346 = _mm512_add_ps(tmp7347, in1141);
__m512 tmp7343 = _mm512_add_ps(tmp7344, in1140);
__m512 tmp7363 = _mm512_add_ps(tmp7364, in1148);
__m512 tmp7325 = _mm512_fmadd_ps(tmp7330, _mm512_set1_ps(3.2e+01f), tmp7326);
__m512 tmp7345 = _mm512_fmadd_ps(tmp7350, _mm512_set1_ps(3.2e+01f), tmp7346);
__m512 tmp7336 = _mm512_fmadd_ps(tmp7330, _mm512_set1_ps(8e+00f), tmp7337);
__m512 tmp7356 = _mm512_fmadd_ps(tmp7350, _mm512_set1_ps(8e+00f), tmp7357);
__m512 tmp7342 = _mm512_fmadd_ps(tmp7334, _mm512_set1_ps(3.2e+01f), tmp7343);
__m512 tmp7362 = _mm512_fmadd_ps(tmp7354, _mm512_set1_ps(3.2e+01f), tmp7363);
__m512 tmp7340 = _mm512_fmadd_ps(tmp7330, _mm512_set1_ps(2e+00f), tmp7341);
__m512 tmp7360 = _mm512_fmadd_ps(tmp7350, _mm512_set1_ps(2e+00f), tmp7361);
__m512 tmp7313 = tmp7325;
__m512 tmp7319 = tmp7345;
__m512 tmp7314 = tmp7331;
__m512 tmp7320 = tmp7351;
__m512 tmp7315 = tmp7336;
__m512 tmp7321 = tmp7356;
__m512 tmp7316 = tmp7338;
__m512 tmp7322 = tmp7358;
__m512 tmp7317 = tmp7340;
__m512 tmp7323 = tmp7360;
__m512 tmp7318 = tmp7342;
__m512 tmp7324 = tmp7362;
__m512 tmp7409 = _mm512_unpacklo_ps(tmp7313, tmp7314);
__m512 tmp7410 = _mm512_unpackhi_ps(tmp7313, tmp7314);
__m512 tmp7411 = _mm512_unpacklo_ps(tmp7315, tmp7316);
__m512 tmp7412 = _mm512_unpackhi_ps(tmp7315, tmp7316);
__m512 tmp7413 = _mm512_unpacklo_ps(tmp7317, tmp7318);
__m512 tmp7414 = _mm512_unpackhi_ps(tmp7317, tmp7318);
__m512 tmp7415 = _mm512_unpacklo_ps(tmp7319, tmp7320);
__m512 tmp7416 = _mm512_unpackhi_ps(tmp7319, tmp7320);
__m512 tmp7417 = _mm512_unpacklo_ps(tmp7321, tmp7322);
__m512 tmp7418 = _mm512_unpackhi_ps(tmp7321, tmp7322);
__m512 tmp7419 = _mm512_unpacklo_ps(tmp7323, tmp7324);
__m512 tmp7420 = _mm512_unpackhi_ps(tmp7323, tmp7324);
__m512 tmp7421 = _mm512_shuffle_ps(tmp7409, tmp7411, 68);
__m512 tmp7422 = _mm512_shuffle_ps(tmp7409, tmp7411, 238);
__m512 tmp7423 = _mm512_shuffle_ps(tmp7410, tmp7412, 68);
__m512 tmp7424 = _mm512_shuffle_ps(tmp7410, tmp7412, 238);
__m512 tmp7425 = _mm512_shuffle_ps(tmp7413, tmp7415, 68);
__m512 tmp7426 = _mm512_shuffle_ps(tmp7413, tmp7415, 238);
__m512 tmp7427 = _mm512_shuffle_ps(tmp7414, tmp7416, 68);
__m512 tmp7428 = _mm512_shuffle_ps(tmp7414, tmp7416, 238);
__m512 tmp7429 = _mm512_shuffle_ps(tmp7417, tmp7419, 68);
__m512 tmp7430 = _mm512_shuffle_ps(tmp7417, tmp7419, 238);
__m512 tmp7431 = _mm512_shuffle_ps(tmp7418, tmp7420, 68);
__m512 tmp7432 = _mm512_shuffle_ps(tmp7418, tmp7420, 238);
__m512 tmp7433 = _mm512_shuffle_f32x4(tmp7421, tmp7425, 136);
__m512 tmp7434 = _mm512_shuffle_f32x4(tmp7421, tmp7425, 221);
__m512 tmp7435 = _mm512_shuffle_f32x4(tmp7422, tmp7426, 136);
__m512 tmp7436 = _mm512_shuffle_f32x4(tmp7422, tmp7426, 221);
__m512 tmp7437 = _mm512_shuffle_f32x4(tmp7423, tmp7427, 136);
__m512 tmp7438 = _mm512_shuffle_f32x4(tmp7423, tmp7427, 221);
__m512 tmp7439 = _mm512_shuffle_f32x4(tmp7424, tmp7428, 136);
__m512 tmp7440 = _mm512_shuffle_f32x4(tmp7424, tmp7428, 221);
__m512 tmp7441 = _mm512_shuffle_f32x4(tmp7429, tmp7429, 136);
__m512 tmp7442 = _mm512_shuffle_f32x4(tmp7429, tmp7429, 221);
__m512 tmp7443 = _mm512_shuffle_f32x4(tmp7430, tmp7430, 136);
__m512 tmp7444 = _mm512_shuffle_f32x4(tmp7430, tmp7430, 221);
__m512 tmp7445 = _mm512_shuffle_f32x4(tmp7431, tmp7431, 136);
__m512 tmp7446 = _mm512_shuffle_f32x4(tmp7431, tmp7431, 221);
__m512 tmp7447 = _mm512_shuffle_f32x4(tmp7432, tmp7432, 136);
__m512 tmp7448 = _mm512_shuffle_f32x4(tmp7432, tmp7432, 221);
tmp7313 = _mm512_shuffle_f32x4(tmp7433, tmp7441, 136);
tmp7321 = _mm512_shuffle_f32x4(tmp7433, tmp7441, 221);
tmp7314 = _mm512_shuffle_f32x4(tmp7435, tmp7443, 136);
tmp7322 = _mm512_shuffle_f32x4(tmp7435, tmp7443, 221);
tmp7315 = _mm512_shuffle_f32x4(tmp7437, tmp7445, 136);
tmp7323 = _mm512_shuffle_f32x4(tmp7437, tmp7445, 221);
tmp7316 = _mm512_shuffle_f32x4(tmp7439, tmp7447, 136);
tmp7324 = _mm512_shuffle_f32x4(tmp7439, tmp7447, 221);
tmp7317 = _mm512_shuffle_f32x4(tmp7434, tmp7442, 136);
__m512 tmp7365 = _mm512_shuffle_f32x4(tmp7434, tmp7442, 221);
tmp7318 = _mm512_shuffle_f32x4(tmp7436, tmp7444, 136);
__m512 tmp7366 = _mm512_shuffle_f32x4(tmp7436, tmp7444, 221);
tmp7319 = _mm512_shuffle_f32x4(tmp7438, tmp7446, 136);
__m512 tmp7367 = _mm512_shuffle_f32x4(tmp7438, tmp7446, 221);
tmp7320 = _mm512_shuffle_f32x4(tmp7440, tmp7448, 136);
__m512 tmp7368 = _mm512_shuffle_f32x4(tmp7440, tmp7448, 221);
__m512 tmp7373 = _mm512_add_ps(tmp7314, tmp7315);
__m512 tmp7393 = _mm512_add_ps(tmp7322, tmp7323);
__m512 tmp7372 = _mm512_add_ps(tmp7316, tmp7317);
__m512 tmp7392 = _mm512_add_ps(tmp7324, tmp7365);
__m512 tmp7378 = _mm512_sub_ps(tmp7316, tmp7317);
__m512 tmp7398 = _mm512_sub_ps(tmp7324, tmp7365);
__m512 tmp7377 = _mm512_sub_ps(tmp7314, tmp7315);
__m512 tmp7397 = _mm512_sub_ps(tmp7322, tmp7323);
__m512 tmp7374 = _mm512_add_ps(tmp7318, tmp7319);
__m512 tmp7394 = _mm512_add_ps(tmp7366, tmp7367);
__m512 tmp7379 = _mm512_sub_ps(tmp7318, tmp7319);
__m512 tmp7399 = _mm512_sub_ps(tmp7366, tmp7367);
__m512 tmp7376 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(2e+00f), tmp7377);
__m512 tmp7396 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(2e+00f), tmp7397);
__m512 tmp7383 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(8e+00f), tmp7377);
__m512 tmp7403 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(8e+00f), tmp7397);
__m512 tmp7371 = _mm512_add_ps(tmp7372, tmp7373);
__m512 tmp7391 = _mm512_add_ps(tmp7392, tmp7393);
__m512 tmp7375 = _mm512_fmadd_ps(tmp7379, _mm512_set1_ps(1.6e+01f), tmp7376);
__m512 tmp7395 = _mm512_fmadd_ps(tmp7399, _mm512_set1_ps(1.6e+01f), tmp7396);
__m512 tmp7382 = _mm512_fmadd_ps(tmp7379, _mm512_set1_ps(4e+00f), tmp7383);
__m512 tmp7402 = _mm512_fmadd_ps(tmp7399, _mm512_set1_ps(4e+00f), tmp7403);
__m512 tmp7388 = _mm512_add_ps(tmp7379, tmp7377);
__m512 tmp7408 = _mm512_add_ps(tmp7399, tmp7397);
__m512 tmp7381 = _mm512_fmadd_ps(tmp7372, _mm512_set1_ps(4e+00f), tmp7373);
__m512 tmp7401 = _mm512_fmadd_ps(tmp7392, _mm512_set1_ps(4e+00f), tmp7393);
__m512 tmp7385 = _mm512_fmadd_ps(tmp7372, _mm512_set1_ps(1.6e+01f), tmp7373);
__m512 tmp7405 = _mm512_fmadd_ps(tmp7392, _mm512_set1_ps(1.6e+01f), tmp7393);
__m512 tmp7370 = _mm512_add_ps(tmp7371, tmp7313);
__m512 tmp7390 = _mm512_add_ps(tmp7391, tmp7321);
__m512 tmp7387 = _mm512_add_ps(tmp7388, tmp7320);
__m512 tmp7407 = _mm512_add_ps(tmp7408, tmp7368);
__m512 tmp7369 = _mm512_fmadd_ps(tmp7374, _mm512_set1_ps(3.2e+01f), tmp7370);
__m512 tmp7389 = _mm512_fmadd_ps(tmp7394, _mm512_set1_ps(3.2e+01f), tmp7390);
__m512 tmp7380 = _mm512_fmadd_ps(tmp7374, _mm512_set1_ps(8e+00f), tmp7381);
__m512 tmp7400 = _mm512_fmadd_ps(tmp7394, _mm512_set1_ps(8e+00f), tmp7401);
__m512 tmp7386 = _mm512_fmadd_ps(tmp7378, _mm512_set1_ps(3.2e+01f), tmp7387);
__m512 tmp7406 = _mm512_fmadd_ps(tmp7398, _mm512_set1_ps(3.2e+01f), tmp7407);
__m512 tmp7384 = _mm512_fmadd_ps(tmp7374, _mm512_set1_ps(2e+00f), tmp7385);
__m512 tmp7404 = _mm512_fmadd_ps(tmp7394, _mm512_set1_ps(2e+00f), tmp7405);
__m512 out1119 = tmp7369;
__m512 out1125 = tmp7389;
__m512 out1120 = tmp7375;
__m512 out1126 = tmp7395;
__m512 out1121 = tmp7380;
__m512 out1127 = tmp7400;
__m512 out1122 = tmp7382;
__m512 out1128 = tmp7402;
__m512 out1123 = tmp7384;
__m512 out1129 = tmp7404;
__m512 out1124 = tmp7386;
__m512 out1130 = tmp7406;
out1119 = _mm512_max_ps(_mm512_setzero_ps(), out1119);
out1125 = _mm512_max_ps(_mm512_setzero_ps(), out1125);
out1120 = _mm512_max_ps(_mm512_setzero_ps(), out1120);
out1126 = _mm512_max_ps(_mm512_setzero_ps(), out1126);
out1121 = _mm512_max_ps(_mm512_setzero_ps(), out1121);
out1127 = _mm512_max_ps(_mm512_setzero_ps(), out1127);
out1122 = _mm512_max_ps(_mm512_setzero_ps(), out1122);
out1128 = _mm512_max_ps(_mm512_setzero_ps(), out1128);
out1123 = _mm512_max_ps(_mm512_setzero_ps(), out1123);
out1129 = _mm512_max_ps(_mm512_setzero_ps(), out1129);
out1124 = _mm512_max_ps(_mm512_setzero_ps(), out1124);
out1130 = _mm512_max_ps(_mm512_setzero_ps(), out1130);
_mm512_mask_storeu_ps(datPtr13+96+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1119);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1125);
_mm512_mask_storeu_ps(datPtr13+320+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1120);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1126);
_mm512_mask_storeu_ps(datPtr13+544+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1121);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1127);
_mm512_mask_storeu_ps(datPtr13+768+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1122);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1128);
_mm512_mask_storeu_ps(datPtr13+992+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1123);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1129);
_mm512_mask_storeu_ps(datPtr13+1216+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1124);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1130);
__m512 sf433 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf434 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1149 = _mm512_shuffle_f32x4(sf433, sf434, 68);
__m512 in1150 = _mm512_shuffle_f32x4(sf433, sf434, 238);
__m512 sf435 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf436 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1157 = _mm512_shuffle_f32x4(sf435, sf436, 68);
__m512 in1158 = _mm512_shuffle_f32x4(sf435, sf436, 238);
__m512 sf437 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf438 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1151 = _mm512_shuffle_f32x4(sf437, sf438, 68);
__m512 in1152 = _mm512_shuffle_f32x4(sf437, sf438, 238);
__m512 sf439 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf440 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1159 = _mm512_shuffle_f32x4(sf439, sf440, 68);
__m512 in1160 = _mm512_shuffle_f32x4(sf439, sf440, 238);
__m512 sf441 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf442 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1153 = _mm512_shuffle_f32x4(sf441, sf442, 68);
__m512 in1154 = _mm512_shuffle_f32x4(sf441, sf442, 238);
__m512 sf443 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf444 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1161 = _mm512_shuffle_f32x4(sf443, sf444, 68);
__m512 in1162 = _mm512_shuffle_f32x4(sf443, sf444, 238);
__m512 sf445 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf446 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1155 = _mm512_shuffle_f32x4(sf445, sf446, 68);
__m512 in1156 = _mm512_shuffle_f32x4(sf445, sf446, 238);
__m512 sf447 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k89+768*l28);
__m512 sf448 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k89+768*l28);
__m512 in1163 = _mm512_shuffle_f32x4(sf447, sf448, 68);
__m512 in1164 = _mm512_shuffle_f32x4(sf447, sf448, 238);
__m512 tmp7465 = _mm512_add_ps(in1150, in1151);
__m512 tmp7485 = _mm512_add_ps(in1158, in1159);
__m512 tmp7464 = _mm512_add_ps(in1152, in1153);
__m512 tmp7484 = _mm512_add_ps(in1160, in1161);
__m512 tmp7470 = _mm512_sub_ps(in1152, in1153);
__m512 tmp7490 = _mm512_sub_ps(in1160, in1161);
__m512 tmp7469 = _mm512_sub_ps(in1150, in1151);
__m512 tmp7489 = _mm512_sub_ps(in1158, in1159);
__m512 tmp7466 = _mm512_add_ps(in1154, in1155);
__m512 tmp7486 = _mm512_add_ps(in1162, in1163);
__m512 tmp7471 = _mm512_sub_ps(in1154, in1155);
__m512 tmp7491 = _mm512_sub_ps(in1162, in1163);
__m512 tmp7468 = _mm512_fmadd_ps(tmp7470, _mm512_set1_ps(2e+00f), tmp7469);
__m512 tmp7488 = _mm512_fmadd_ps(tmp7490, _mm512_set1_ps(2e+00f), tmp7489);
__m512 tmp7475 = _mm512_fmadd_ps(tmp7470, _mm512_set1_ps(8e+00f), tmp7469);
__m512 tmp7495 = _mm512_fmadd_ps(tmp7490, _mm512_set1_ps(8e+00f), tmp7489);
__m512 tmp7463 = _mm512_add_ps(tmp7464, tmp7465);
__m512 tmp7483 = _mm512_add_ps(tmp7484, tmp7485);
__m512 tmp7467 = _mm512_fmadd_ps(tmp7471, _mm512_set1_ps(1.6e+01f), tmp7468);
__m512 tmp7487 = _mm512_fmadd_ps(tmp7491, _mm512_set1_ps(1.6e+01f), tmp7488);
__m512 tmp7474 = _mm512_fmadd_ps(tmp7471, _mm512_set1_ps(4e+00f), tmp7475);
__m512 tmp7494 = _mm512_fmadd_ps(tmp7491, _mm512_set1_ps(4e+00f), tmp7495);
__m512 tmp7480 = _mm512_add_ps(tmp7471, tmp7469);
__m512 tmp7500 = _mm512_add_ps(tmp7491, tmp7489);
__m512 tmp7473 = _mm512_fmadd_ps(tmp7464, _mm512_set1_ps(4e+00f), tmp7465);
__m512 tmp7493 = _mm512_fmadd_ps(tmp7484, _mm512_set1_ps(4e+00f), tmp7485);
__m512 tmp7477 = _mm512_fmadd_ps(tmp7464, _mm512_set1_ps(1.6e+01f), tmp7465);
__m512 tmp7497 = _mm512_fmadd_ps(tmp7484, _mm512_set1_ps(1.6e+01f), tmp7485);
__m512 tmp7462 = _mm512_add_ps(tmp7463, in1149);
__m512 tmp7482 = _mm512_add_ps(tmp7483, in1157);
__m512 tmp7479 = _mm512_add_ps(tmp7480, in1156);
__m512 tmp7499 = _mm512_add_ps(tmp7500, in1164);
__m512 tmp7461 = _mm512_fmadd_ps(tmp7466, _mm512_set1_ps(3.2e+01f), tmp7462);
__m512 tmp7481 = _mm512_fmadd_ps(tmp7486, _mm512_set1_ps(3.2e+01f), tmp7482);
__m512 tmp7472 = _mm512_fmadd_ps(tmp7466, _mm512_set1_ps(8e+00f), tmp7473);
__m512 tmp7492 = _mm512_fmadd_ps(tmp7486, _mm512_set1_ps(8e+00f), tmp7493);
__m512 tmp7478 = _mm512_fmadd_ps(tmp7470, _mm512_set1_ps(3.2e+01f), tmp7479);
__m512 tmp7498 = _mm512_fmadd_ps(tmp7490, _mm512_set1_ps(3.2e+01f), tmp7499);
__m512 tmp7476 = _mm512_fmadd_ps(tmp7466, _mm512_set1_ps(2e+00f), tmp7477);
__m512 tmp7496 = _mm512_fmadd_ps(tmp7486, _mm512_set1_ps(2e+00f), tmp7497);
__m512 tmp7449 = tmp7461;
__m512 tmp7455 = tmp7481;
__m512 tmp7450 = tmp7467;
__m512 tmp7456 = tmp7487;
__m512 tmp7451 = tmp7472;
__m512 tmp7457 = tmp7492;
__m512 tmp7452 = tmp7474;
__m512 tmp7458 = tmp7494;
__m512 tmp7453 = tmp7476;
__m512 tmp7459 = tmp7496;
__m512 tmp7454 = tmp7478;
__m512 tmp7460 = tmp7498;
__m512 tmp7545 = _mm512_unpacklo_ps(tmp7449, tmp7450);
__m512 tmp7546 = _mm512_unpackhi_ps(tmp7449, tmp7450);
__m512 tmp7547 = _mm512_unpacklo_ps(tmp7451, tmp7452);
__m512 tmp7548 = _mm512_unpackhi_ps(tmp7451, tmp7452);
__m512 tmp7549 = _mm512_unpacklo_ps(tmp7453, tmp7454);
__m512 tmp7550 = _mm512_unpackhi_ps(tmp7453, tmp7454);
__m512 tmp7551 = _mm512_unpacklo_ps(tmp7455, tmp7456);
__m512 tmp7552 = _mm512_unpackhi_ps(tmp7455, tmp7456);
__m512 tmp7553 = _mm512_unpacklo_ps(tmp7457, tmp7458);
__m512 tmp7554 = _mm512_unpackhi_ps(tmp7457, tmp7458);
__m512 tmp7555 = _mm512_unpacklo_ps(tmp7459, tmp7460);
__m512 tmp7556 = _mm512_unpackhi_ps(tmp7459, tmp7460);
__m512 tmp7557 = _mm512_shuffle_ps(tmp7545, tmp7547, 68);
__m512 tmp7558 = _mm512_shuffle_ps(tmp7545, tmp7547, 238);
__m512 tmp7559 = _mm512_shuffle_ps(tmp7546, tmp7548, 68);
__m512 tmp7560 = _mm512_shuffle_ps(tmp7546, tmp7548, 238);
__m512 tmp7561 = _mm512_shuffle_ps(tmp7549, tmp7551, 68);
__m512 tmp7562 = _mm512_shuffle_ps(tmp7549, tmp7551, 238);
__m512 tmp7563 = _mm512_shuffle_ps(tmp7550, tmp7552, 68);
__m512 tmp7564 = _mm512_shuffle_ps(tmp7550, tmp7552, 238);
__m512 tmp7565 = _mm512_shuffle_ps(tmp7553, tmp7555, 68);
__m512 tmp7566 = _mm512_shuffle_ps(tmp7553, tmp7555, 238);
__m512 tmp7567 = _mm512_shuffle_ps(tmp7554, tmp7556, 68);
__m512 tmp7568 = _mm512_shuffle_ps(tmp7554, tmp7556, 238);
__m512 tmp7569 = _mm512_shuffle_f32x4(tmp7557, tmp7561, 136);
__m512 tmp7570 = _mm512_shuffle_f32x4(tmp7557, tmp7561, 221);
__m512 tmp7571 = _mm512_shuffle_f32x4(tmp7558, tmp7562, 136);
__m512 tmp7572 = _mm512_shuffle_f32x4(tmp7558, tmp7562, 221);
__m512 tmp7573 = _mm512_shuffle_f32x4(tmp7559, tmp7563, 136);
__m512 tmp7574 = _mm512_shuffle_f32x4(tmp7559, tmp7563, 221);
__m512 tmp7575 = _mm512_shuffle_f32x4(tmp7560, tmp7564, 136);
__m512 tmp7576 = _mm512_shuffle_f32x4(tmp7560, tmp7564, 221);
__m512 tmp7577 = _mm512_shuffle_f32x4(tmp7565, tmp7565, 136);
__m512 tmp7578 = _mm512_shuffle_f32x4(tmp7565, tmp7565, 221);
__m512 tmp7579 = _mm512_shuffle_f32x4(tmp7566, tmp7566, 136);
__m512 tmp7580 = _mm512_shuffle_f32x4(tmp7566, tmp7566, 221);
__m512 tmp7581 = _mm512_shuffle_f32x4(tmp7567, tmp7567, 136);
__m512 tmp7582 = _mm512_shuffle_f32x4(tmp7567, tmp7567, 221);
__m512 tmp7583 = _mm512_shuffle_f32x4(tmp7568, tmp7568, 136);
__m512 tmp7584 = _mm512_shuffle_f32x4(tmp7568, tmp7568, 221);
tmp7449 = _mm512_shuffle_f32x4(tmp7569, tmp7577, 136);
tmp7457 = _mm512_shuffle_f32x4(tmp7569, tmp7577, 221);
tmp7450 = _mm512_shuffle_f32x4(tmp7571, tmp7579, 136);
tmp7458 = _mm512_shuffle_f32x4(tmp7571, tmp7579, 221);
tmp7451 = _mm512_shuffle_f32x4(tmp7573, tmp7581, 136);
tmp7459 = _mm512_shuffle_f32x4(tmp7573, tmp7581, 221);
tmp7452 = _mm512_shuffle_f32x4(tmp7575, tmp7583, 136);
tmp7460 = _mm512_shuffle_f32x4(tmp7575, tmp7583, 221);
tmp7453 = _mm512_shuffle_f32x4(tmp7570, tmp7578, 136);
__m512 tmp7501 = _mm512_shuffle_f32x4(tmp7570, tmp7578, 221);
tmp7454 = _mm512_shuffle_f32x4(tmp7572, tmp7580, 136);
__m512 tmp7502 = _mm512_shuffle_f32x4(tmp7572, tmp7580, 221);
tmp7455 = _mm512_shuffle_f32x4(tmp7574, tmp7582, 136);
__m512 tmp7503 = _mm512_shuffle_f32x4(tmp7574, tmp7582, 221);
tmp7456 = _mm512_shuffle_f32x4(tmp7576, tmp7584, 136);
__m512 tmp7504 = _mm512_shuffle_f32x4(tmp7576, tmp7584, 221);
__m512 tmp7509 = _mm512_add_ps(tmp7450, tmp7451);
__m512 tmp7529 = _mm512_add_ps(tmp7458, tmp7459);
__m512 tmp7508 = _mm512_add_ps(tmp7452, tmp7453);
__m512 tmp7528 = _mm512_add_ps(tmp7460, tmp7501);
__m512 tmp7514 = _mm512_sub_ps(tmp7452, tmp7453);
__m512 tmp7534 = _mm512_sub_ps(tmp7460, tmp7501);
__m512 tmp7513 = _mm512_sub_ps(tmp7450, tmp7451);
__m512 tmp7533 = _mm512_sub_ps(tmp7458, tmp7459);
__m512 tmp7510 = _mm512_add_ps(tmp7454, tmp7455);
__m512 tmp7530 = _mm512_add_ps(tmp7502, tmp7503);
__m512 tmp7515 = _mm512_sub_ps(tmp7454, tmp7455);
__m512 tmp7535 = _mm512_sub_ps(tmp7502, tmp7503);
__m512 tmp7512 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(2e+00f), tmp7513);
__m512 tmp7532 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(2e+00f), tmp7533);
__m512 tmp7519 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(8e+00f), tmp7513);
__m512 tmp7539 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(8e+00f), tmp7533);
__m512 tmp7507 = _mm512_add_ps(tmp7508, tmp7509);
__m512 tmp7527 = _mm512_add_ps(tmp7528, tmp7529);
__m512 tmp7511 = _mm512_fmadd_ps(tmp7515, _mm512_set1_ps(1.6e+01f), tmp7512);
__m512 tmp7531 = _mm512_fmadd_ps(tmp7535, _mm512_set1_ps(1.6e+01f), tmp7532);
__m512 tmp7518 = _mm512_fmadd_ps(tmp7515, _mm512_set1_ps(4e+00f), tmp7519);
__m512 tmp7538 = _mm512_fmadd_ps(tmp7535, _mm512_set1_ps(4e+00f), tmp7539);
__m512 tmp7524 = _mm512_add_ps(tmp7515, tmp7513);
__m512 tmp7544 = _mm512_add_ps(tmp7535, tmp7533);
__m512 tmp7517 = _mm512_fmadd_ps(tmp7508, _mm512_set1_ps(4e+00f), tmp7509);
__m512 tmp7537 = _mm512_fmadd_ps(tmp7528, _mm512_set1_ps(4e+00f), tmp7529);
__m512 tmp7521 = _mm512_fmadd_ps(tmp7508, _mm512_set1_ps(1.6e+01f), tmp7509);
__m512 tmp7541 = _mm512_fmadd_ps(tmp7528, _mm512_set1_ps(1.6e+01f), tmp7529);
__m512 tmp7506 = _mm512_add_ps(tmp7507, tmp7449);
__m512 tmp7526 = _mm512_add_ps(tmp7527, tmp7457);
__m512 tmp7523 = _mm512_add_ps(tmp7524, tmp7456);
__m512 tmp7543 = _mm512_add_ps(tmp7544, tmp7504);
__m512 tmp7505 = _mm512_fmadd_ps(tmp7510, _mm512_set1_ps(3.2e+01f), tmp7506);
__m512 tmp7525 = _mm512_fmadd_ps(tmp7530, _mm512_set1_ps(3.2e+01f), tmp7526);
__m512 tmp7516 = _mm512_fmadd_ps(tmp7510, _mm512_set1_ps(8e+00f), tmp7517);
__m512 tmp7536 = _mm512_fmadd_ps(tmp7530, _mm512_set1_ps(8e+00f), tmp7537);
__m512 tmp7522 = _mm512_fmadd_ps(tmp7514, _mm512_set1_ps(3.2e+01f), tmp7523);
__m512 tmp7542 = _mm512_fmadd_ps(tmp7534, _mm512_set1_ps(3.2e+01f), tmp7543);
__m512 tmp7520 = _mm512_fmadd_ps(tmp7510, _mm512_set1_ps(2e+00f), tmp7521);
__m512 tmp7540 = _mm512_fmadd_ps(tmp7530, _mm512_set1_ps(2e+00f), tmp7541);
__m512 out1131 = tmp7505;
__m512 out1137 = tmp7525;
__m512 out1132 = tmp7511;
__m512 out1138 = tmp7531;
__m512 out1133 = tmp7516;
__m512 out1139 = tmp7536;
__m512 out1134 = tmp7518;
__m512 out1140 = tmp7538;
__m512 out1135 = tmp7520;
__m512 out1141 = tmp7540;
__m512 out1136 = tmp7522;
__m512 out1142 = tmp7542;
out1131 = _mm512_max_ps(_mm512_setzero_ps(), out1131);
out1137 = _mm512_max_ps(_mm512_setzero_ps(), out1137);
out1132 = _mm512_max_ps(_mm512_setzero_ps(), out1132);
out1138 = _mm512_max_ps(_mm512_setzero_ps(), out1138);
out1133 = _mm512_max_ps(_mm512_setzero_ps(), out1133);
out1139 = _mm512_max_ps(_mm512_setzero_ps(), out1139);
out1134 = _mm512_max_ps(_mm512_setzero_ps(), out1134);
out1140 = _mm512_max_ps(_mm512_setzero_ps(), out1140);
out1135 = _mm512_max_ps(_mm512_setzero_ps(), out1135);
out1141 = _mm512_max_ps(_mm512_setzero_ps(), out1141);
out1136 = _mm512_max_ps(_mm512_setzero_ps(), out1136);
out1142 = _mm512_max_ps(_mm512_setzero_ps(), out1142);
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1131);
_mm512_mask_storeu_ps(datPtr13+12704+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1137);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1132);
_mm512_mask_storeu_ps(datPtr13+12928+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1138);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1133);
_mm512_mask_storeu_ps(datPtr13+13152+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1139);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1134);
_mm512_mask_storeu_ps(datPtr13+13376+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1140);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1135);
_mm512_mask_storeu_ps(datPtr13+13600+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1141);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1136);
_mm512_mask_storeu_ps(datPtr13+13824+50432*i29+224*toH29+4*toW29+50432*k89+25216*l28, 4095, out1142);
}
}
++j23;
rel16 = 1;
}
// Coordinates used by the stores in the loop that follows: they enter the
// store addresses as 224*toH30 and 4*toW30.
// NOTE(review): presumably the output row/column of the tile's top-left
// corner (row = base16, column = 36) — confirm against the generator.
ptrdiff_t toH30 = base16+0;
ptrdiff_t toW30 = 36;
// k90 starts at w46; the loop right below counts it upward until it equals 1,
// so the body runs once when w46 is 0 and is skipped when w46 is 1.
// NOTE(review): assumes w46 is only ever 0 or 1 — verify where w46 is set.
ptrdiff_t k90 = 1*w46;
for (; k90 != 1; ++k90) {
ptrdiff_t l29 = 0;
for (; l29 != 2; ++l29) {
// Load sixteen 512-bit vectors from sfPtr7 and regroup their 256-bit halves.
// For each loaded pair (a, b):
//   _mm512_shuffle_f32x4(a, b, 68)  -> [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) -> [high 256 bits of a | high 256 bits of b]
// The pairs at +0/+128 (and the same pattern at +25600, +51200, +76800) produce
// in1165..in1172; the pairs at +64/+192 produce in1173..in1180.
// NOTE(review): offsets are presumably byte offsets (sfPtr7 presumably a char
// pointer declared outside this view) — confirm.
__m512 sf449 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf450 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1165 = _mm512_shuffle_f32x4(sf449, sf450, 68);
__m512 in1166 = _mm512_shuffle_f32x4(sf449, sf450, 238);
__m512 sf451 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf452 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1173 = _mm512_shuffle_f32x4(sf451, sf452, 68);
__m512 in1174 = _mm512_shuffle_f32x4(sf451, sf452, 238);
// Same pattern, base offset 25600.
__m512 sf453 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf454 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1167 = _mm512_shuffle_f32x4(sf453, sf454, 68);
__m512 in1168 = _mm512_shuffle_f32x4(sf453, sf454, 238);
__m512 sf455 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf456 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1175 = _mm512_shuffle_f32x4(sf455, sf456, 68);
__m512 in1176 = _mm512_shuffle_f32x4(sf455, sf456, 238);
// Same pattern, base offset 51200.
__m512 sf457 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf458 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1169 = _mm512_shuffle_f32x4(sf457, sf458, 68);
__m512 in1170 = _mm512_shuffle_f32x4(sf457, sf458, 238);
__m512 sf459 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf460 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1177 = _mm512_shuffle_f32x4(sf459, sf460, 68);
__m512 in1178 = _mm512_shuffle_f32x4(sf459, sf460, 238);
// Same pattern, base offset 76800.
__m512 sf461 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf462 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1171 = _mm512_shuffle_f32x4(sf461, sf462, 68);
__m512 in1172 = _mm512_shuffle_f32x4(sf461, sf462, 238);
__m512 sf463 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf464 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1179 = _mm512_shuffle_f32x4(sf463, sf464, 68);
__m512 in1180 = _mm512_shuffle_f32x4(sf463, sf464, 238);
// First linear pass: reduce each set of eight input vectors to six.
// With x0..x7 = in1165..in1172 (second set: in1173..in1180; the two sets are
// interleaved statement by statement), the results are
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)       -> tmp7597 / tmp7617
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)       -> tmp7603 / tmp7623
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)       -> tmp7608 / tmp7628
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)       -> tmp7610 / tmp7630
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)       -> tmp7612 / tmp7632
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7  -> tmp7614 / tmp7634
// NOTE(review): the coefficients look like a Winograd-style 8-to-6 output
// transform for a 3x3 convolution — confirm against the generator.
// Pairwise sums and differences shared by several results.
__m512 tmp7601 = _mm512_add_ps(in1166, in1167);
__m512 tmp7621 = _mm512_add_ps(in1174, in1175);
__m512 tmp7600 = _mm512_add_ps(in1168, in1169);
__m512 tmp7620 = _mm512_add_ps(in1176, in1177);
__m512 tmp7606 = _mm512_sub_ps(in1168, in1169);
__m512 tmp7626 = _mm512_sub_ps(in1176, in1177);
__m512 tmp7605 = _mm512_sub_ps(in1166, in1167);
__m512 tmp7625 = _mm512_sub_ps(in1174, in1175);
__m512 tmp7602 = _mm512_add_ps(in1170, in1171);
__m512 tmp7622 = _mm512_add_ps(in1178, in1179);
__m512 tmp7607 = _mm512_sub_ps(in1170, in1171);
__m512 tmp7627 = _mm512_sub_ps(in1178, in1179);
// Weighted accumulation via fused multiply-add (a*b + c).
__m512 tmp7604 = _mm512_fmadd_ps(tmp7606, _mm512_set1_ps(2e+00f), tmp7605);
__m512 tmp7624 = _mm512_fmadd_ps(tmp7626, _mm512_set1_ps(2e+00f), tmp7625);
__m512 tmp7611 = _mm512_fmadd_ps(tmp7606, _mm512_set1_ps(8e+00f), tmp7605);
__m512 tmp7631 = _mm512_fmadd_ps(tmp7626, _mm512_set1_ps(8e+00f), tmp7625);
__m512 tmp7599 = _mm512_add_ps(tmp7600, tmp7601);
__m512 tmp7619 = _mm512_add_ps(tmp7620, tmp7621);
__m512 tmp7603 = _mm512_fmadd_ps(tmp7607, _mm512_set1_ps(1.6e+01f), tmp7604);
__m512 tmp7623 = _mm512_fmadd_ps(tmp7627, _mm512_set1_ps(1.6e+01f), tmp7624);
__m512 tmp7610 = _mm512_fmadd_ps(tmp7607, _mm512_set1_ps(4e+00f), tmp7611);
__m512 tmp7630 = _mm512_fmadd_ps(tmp7627, _mm512_set1_ps(4e+00f), tmp7631);
__m512 tmp7616 = _mm512_add_ps(tmp7607, tmp7605);
__m512 tmp7636 = _mm512_add_ps(tmp7627, tmp7625);
__m512 tmp7609 = _mm512_fmadd_ps(tmp7600, _mm512_set1_ps(4e+00f), tmp7601);
__m512 tmp7629 = _mm512_fmadd_ps(tmp7620, _mm512_set1_ps(4e+00f), tmp7621);
__m512 tmp7613 = _mm512_fmadd_ps(tmp7600, _mm512_set1_ps(1.6e+01f), tmp7601);
__m512 tmp7633 = _mm512_fmadd_ps(tmp7620, _mm512_set1_ps(1.6e+01f), tmp7621);
// x0 contributes only to r0, x7 only to r5.
__m512 tmp7598 = _mm512_add_ps(tmp7599, in1165);
__m512 tmp7618 = _mm512_add_ps(tmp7619, in1173);
__m512 tmp7615 = _mm512_add_ps(tmp7616, in1172);
__m512 tmp7635 = _mm512_add_ps(tmp7636, in1180);
__m512 tmp7597 = _mm512_fmadd_ps(tmp7602, _mm512_set1_ps(3.2e+01f), tmp7598);
__m512 tmp7617 = _mm512_fmadd_ps(tmp7622, _mm512_set1_ps(3.2e+01f), tmp7618);
__m512 tmp7608 = _mm512_fmadd_ps(tmp7602, _mm512_set1_ps(8e+00f), tmp7609);
__m512 tmp7628 = _mm512_fmadd_ps(tmp7622, _mm512_set1_ps(8e+00f), tmp7629);
__m512 tmp7614 = _mm512_fmadd_ps(tmp7606, _mm512_set1_ps(3.2e+01f), tmp7615);
__m512 tmp7634 = _mm512_fmadd_ps(tmp7626, _mm512_set1_ps(3.2e+01f), tmp7635);
__m512 tmp7612 = _mm512_fmadd_ps(tmp7602, _mm512_set1_ps(2e+00f), tmp7613);
__m512 tmp7632 = _mm512_fmadd_ps(tmp7622, _mm512_set1_ps(2e+00f), tmp7633);
// Gather r0..r5 of the first set into tmp7585..tmp7590 and r0..r5 of the
// second set into tmp7591..tmp7596; these names are reassigned later.
__m512 tmp7585 = tmp7597;
__m512 tmp7591 = tmp7617;
__m512 tmp7586 = tmp7603;
__m512 tmp7592 = tmp7623;
__m512 tmp7587 = tmp7608;
__m512 tmp7593 = tmp7628;
__m512 tmp7588 = tmp7610;
__m512 tmp7594 = tmp7630;
__m512 tmp7589 = tmp7612;
__m512 tmp7595 = tmp7632;
__m512 tmp7590 = tmp7614;
__m512 tmp7596 = tmp7634;
// Lane permutation of the twelve vectors tmp7585..tmp7596 into sixteen vectors
// (tmp7585..tmp7596 reassigned, plus new tmp7637..tmp7640).
// NOTE(review): the unpack -> shuffle_ps -> shuffle_f32x4 sequence looks like a
// register transpose so the next pass can run along the other axis — confirm.
// Stage 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp7681 = _mm512_unpacklo_ps(tmp7585, tmp7586);
__m512 tmp7682 = _mm512_unpackhi_ps(tmp7585, tmp7586);
__m512 tmp7683 = _mm512_unpacklo_ps(tmp7587, tmp7588);
__m512 tmp7684 = _mm512_unpackhi_ps(tmp7587, tmp7588);
__m512 tmp7685 = _mm512_unpacklo_ps(tmp7589, tmp7590);
__m512 tmp7686 = _mm512_unpackhi_ps(tmp7589, tmp7590);
__m512 tmp7687 = _mm512_unpacklo_ps(tmp7591, tmp7592);
__m512 tmp7688 = _mm512_unpackhi_ps(tmp7591, tmp7592);
__m512 tmp7689 = _mm512_unpacklo_ps(tmp7593, tmp7594);
__m512 tmp7690 = _mm512_unpackhi_ps(tmp7593, tmp7594);
__m512 tmp7691 = _mm512_unpacklo_ps(tmp7595, tmp7596);
__m512 tmp7692 = _mm512_unpackhi_ps(tmp7595, tmp7596);
// Stage 2: within each 128-bit lane, pair up 64-bit halves
// (68 takes the low halves of both operands, 238 the high halves).
__m512 tmp7693 = _mm512_shuffle_ps(tmp7681, tmp7683, 68);
__m512 tmp7694 = _mm512_shuffle_ps(tmp7681, tmp7683, 238);
__m512 tmp7695 = _mm512_shuffle_ps(tmp7682, tmp7684, 68);
__m512 tmp7696 = _mm512_shuffle_ps(tmp7682, tmp7684, 238);
__m512 tmp7697 = _mm512_shuffle_ps(tmp7685, tmp7687, 68);
__m512 tmp7698 = _mm512_shuffle_ps(tmp7685, tmp7687, 238);
__m512 tmp7699 = _mm512_shuffle_ps(tmp7686, tmp7688, 68);
__m512 tmp7700 = _mm512_shuffle_ps(tmp7686, tmp7688, 238);
__m512 tmp7701 = _mm512_shuffle_ps(tmp7689, tmp7691, 68);
__m512 tmp7702 = _mm512_shuffle_ps(tmp7689, tmp7691, 238);
__m512 tmp7703 = _mm512_shuffle_ps(tmp7690, tmp7692, 68);
__m512 tmp7704 = _mm512_shuffle_ps(tmp7690, tmp7692, 238);
// Stage 3: select whole 128-bit lanes (136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3).
__m512 tmp7705 = _mm512_shuffle_f32x4(tmp7693, tmp7697, 136);
__m512 tmp7706 = _mm512_shuffle_f32x4(tmp7693, tmp7697, 221);
__m512 tmp7707 = _mm512_shuffle_f32x4(tmp7694, tmp7698, 136);
__m512 tmp7708 = _mm512_shuffle_f32x4(tmp7694, tmp7698, 221);
__m512 tmp7709 = _mm512_shuffle_f32x4(tmp7695, tmp7699, 136);
__m512 tmp7710 = _mm512_shuffle_f32x4(tmp7695, tmp7699, 221);
__m512 tmp7711 = _mm512_shuffle_f32x4(tmp7696, tmp7700, 136);
__m512 tmp7712 = _mm512_shuffle_f32x4(tmp7696, tmp7700, 221);
// Only twelve source vectors exist, so the last group is shuffled against
// itself rather than against a fourth group.
__m512 tmp7713 = _mm512_shuffle_f32x4(tmp7701, tmp7701, 136);
__m512 tmp7714 = _mm512_shuffle_f32x4(tmp7701, tmp7701, 221);
__m512 tmp7715 = _mm512_shuffle_f32x4(tmp7702, tmp7702, 136);
__m512 tmp7716 = _mm512_shuffle_f32x4(tmp7702, tmp7702, 221);
__m512 tmp7717 = _mm512_shuffle_f32x4(tmp7703, tmp7703, 136);
__m512 tmp7718 = _mm512_shuffle_f32x4(tmp7703, tmp7703, 221);
__m512 tmp7719 = _mm512_shuffle_f32x4(tmp7704, tmp7704, 136);
__m512 tmp7720 = _mm512_shuffle_f32x4(tmp7704, tmp7704, 221);
// Stage 4: final 128-bit lane selection. The results overwrite
// tmp7585..tmp7596; four additional vectors are declared as tmp7637..tmp7640.
tmp7585 = _mm512_shuffle_f32x4(tmp7705, tmp7713, 136);
tmp7593 = _mm512_shuffle_f32x4(tmp7705, tmp7713, 221);
tmp7586 = _mm512_shuffle_f32x4(tmp7707, tmp7715, 136);
tmp7594 = _mm512_shuffle_f32x4(tmp7707, tmp7715, 221);
tmp7587 = _mm512_shuffle_f32x4(tmp7709, tmp7717, 136);
tmp7595 = _mm512_shuffle_f32x4(tmp7709, tmp7717, 221);
tmp7588 = _mm512_shuffle_f32x4(tmp7711, tmp7719, 136);
tmp7596 = _mm512_shuffle_f32x4(tmp7711, tmp7719, 221);
tmp7589 = _mm512_shuffle_f32x4(tmp7706, tmp7714, 136);
__m512 tmp7637 = _mm512_shuffle_f32x4(tmp7706, tmp7714, 221);
tmp7590 = _mm512_shuffle_f32x4(tmp7708, tmp7716, 136);
__m512 tmp7638 = _mm512_shuffle_f32x4(tmp7708, tmp7716, 221);
tmp7591 = _mm512_shuffle_f32x4(tmp7710, tmp7718, 136);
__m512 tmp7639 = _mm512_shuffle_f32x4(tmp7710, tmp7718, 221);
tmp7592 = _mm512_shuffle_f32x4(tmp7712, tmp7720, 136);
__m512 tmp7640 = _mm512_shuffle_f32x4(tmp7712, tmp7720, 221);
// Second linear pass: the same 8-to-6 combination as the first pass, applied
// to the permuted vectors. Inputs x0..x7 are tmp7585..tmp7592 for the first
// set and tmp7593, tmp7594, tmp7595, tmp7596, tmp7637..tmp7640 for the second.
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)       -> tmp7641 / tmp7661
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)       -> tmp7647 / tmp7667
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)       -> tmp7652 / tmp7672
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)       -> tmp7654 / tmp7674
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)       -> tmp7656 / tmp7676
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7  -> tmp7658 / tmp7678
// Pairwise sums and differences.
__m512 tmp7645 = _mm512_add_ps(tmp7586, tmp7587);
__m512 tmp7665 = _mm512_add_ps(tmp7594, tmp7595);
__m512 tmp7644 = _mm512_add_ps(tmp7588, tmp7589);
__m512 tmp7664 = _mm512_add_ps(tmp7596, tmp7637);
__m512 tmp7650 = _mm512_sub_ps(tmp7588, tmp7589);
__m512 tmp7670 = _mm512_sub_ps(tmp7596, tmp7637);
__m512 tmp7649 = _mm512_sub_ps(tmp7586, tmp7587);
__m512 tmp7669 = _mm512_sub_ps(tmp7594, tmp7595);
__m512 tmp7646 = _mm512_add_ps(tmp7590, tmp7591);
__m512 tmp7666 = _mm512_add_ps(tmp7638, tmp7639);
__m512 tmp7651 = _mm512_sub_ps(tmp7590, tmp7591);
__m512 tmp7671 = _mm512_sub_ps(tmp7638, tmp7639);
// Weighted accumulation via fused multiply-add (a*b + c).
__m512 tmp7648 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(2e+00f), tmp7649);
__m512 tmp7668 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(2e+00f), tmp7669);
__m512 tmp7655 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(8e+00f), tmp7649);
__m512 tmp7675 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(8e+00f), tmp7669);
__m512 tmp7643 = _mm512_add_ps(tmp7644, tmp7645);
__m512 tmp7663 = _mm512_add_ps(tmp7664, tmp7665);
__m512 tmp7647 = _mm512_fmadd_ps(tmp7651, _mm512_set1_ps(1.6e+01f), tmp7648);
__m512 tmp7667 = _mm512_fmadd_ps(tmp7671, _mm512_set1_ps(1.6e+01f), tmp7668);
__m512 tmp7654 = _mm512_fmadd_ps(tmp7651, _mm512_set1_ps(4e+00f), tmp7655);
__m512 tmp7674 = _mm512_fmadd_ps(tmp7671, _mm512_set1_ps(4e+00f), tmp7675);
__m512 tmp7660 = _mm512_add_ps(tmp7651, tmp7649);
__m512 tmp7680 = _mm512_add_ps(tmp7671, tmp7669);
__m512 tmp7653 = _mm512_fmadd_ps(tmp7644, _mm512_set1_ps(4e+00f), tmp7645);
__m512 tmp7673 = _mm512_fmadd_ps(tmp7664, _mm512_set1_ps(4e+00f), tmp7665);
__m512 tmp7657 = _mm512_fmadd_ps(tmp7644, _mm512_set1_ps(1.6e+01f), tmp7645);
__m512 tmp7677 = _mm512_fmadd_ps(tmp7664, _mm512_set1_ps(1.6e+01f), tmp7665);
// x0 contributes only to r0, x7 only to r5.
__m512 tmp7642 = _mm512_add_ps(tmp7643, tmp7585);
__m512 tmp7662 = _mm512_add_ps(tmp7663, tmp7593);
__m512 tmp7659 = _mm512_add_ps(tmp7660, tmp7592);
__m512 tmp7679 = _mm512_add_ps(tmp7680, tmp7640);
__m512 tmp7641 = _mm512_fmadd_ps(tmp7646, _mm512_set1_ps(3.2e+01f), tmp7642);
__m512 tmp7661 = _mm512_fmadd_ps(tmp7666, _mm512_set1_ps(3.2e+01f), tmp7662);
__m512 tmp7652 = _mm512_fmadd_ps(tmp7646, _mm512_set1_ps(8e+00f), tmp7653);
__m512 tmp7672 = _mm512_fmadd_ps(tmp7666, _mm512_set1_ps(8e+00f), tmp7673);
__m512 tmp7658 = _mm512_fmadd_ps(tmp7650, _mm512_set1_ps(3.2e+01f), tmp7659);
__m512 tmp7678 = _mm512_fmadd_ps(tmp7670, _mm512_set1_ps(3.2e+01f), tmp7679);
__m512 tmp7656 = _mm512_fmadd_ps(tmp7646, _mm512_set1_ps(2e+00f), tmp7657);
__m512 tmp7676 = _mm512_fmadd_ps(tmp7666, _mm512_set1_ps(2e+00f), tmp7677);
// Name the twelve second-pass results as outputs: out1143..out1148 come from
// the first set (r0..r5), out1149..out1154 from the second set (r0..r5).
__m512 out1143 = tmp7641;
__m512 out1149 = tmp7661;
__m512 out1144 = tmp7647;
__m512 out1150 = tmp7667;
__m512 out1145 = tmp7652;
__m512 out1151 = tmp7672;
__m512 out1146 = tmp7654;
__m512 out1152 = tmp7674;
__m512 out1147 = tmp7656;
__m512 out1153 = tmp7676;
__m512 out1148 = tmp7658;
__m512 out1154 = tmp7678;
// Element-wise max against zero (a ReLU-style clamp) on every output vector.
out1143 = _mm512_max_ps(_mm512_setzero_ps(), out1143);
out1149 = _mm512_max_ps(_mm512_setzero_ps(), out1149);
out1144 = _mm512_max_ps(_mm512_setzero_ps(), out1144);
out1150 = _mm512_max_ps(_mm512_setzero_ps(), out1150);
out1145 = _mm512_max_ps(_mm512_setzero_ps(), out1145);
out1151 = _mm512_max_ps(_mm512_setzero_ps(), out1151);
out1146 = _mm512_max_ps(_mm512_setzero_ps(), out1146);
out1152 = _mm512_max_ps(_mm512_setzero_ps(), out1152);
out1147 = _mm512_max_ps(_mm512_setzero_ps(), out1147);
out1153 = _mm512_max_ps(_mm512_setzero_ps(), out1153);
out1148 = _mm512_max_ps(_mm512_setzero_ps(), out1148);
out1154 = _mm512_max_ps(_mm512_setzero_ps(), out1154);
// Masked unaligned stores of the twelve outputs into datPtr13.
// Mask 4095 (0xFFF) writes lanes 0..11 only; mask 255 (0xFF) writes lanes 0..7.
// Consecutive results of one set are 224 apart (0, 224, ..., 1120); the second
// set starts 48 past the first (48, 272, ..., 1168).
// NOTE(review): offsets are presumably bytes, making 224 a row of 56 floats
// and 48 a 12-float step; the narrower 8-lane mask would then stop at the end
// of the row for column 36 — confirm against datPtr13's declaration.
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1143);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1149);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1144);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1150);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1145);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1151);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1146);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1152);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1147);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1153);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1148);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1154);
// Second tile of this loop iteration: load sixteen 512-bit vectors from sfPtr7
// starting at base offset 256 and regroup their 256-bit halves.
// For each loaded pair (a, b):
//   _mm512_shuffle_f32x4(a, b, 68)  -> [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) -> [high 256 bits of a | high 256 bits of b]
// The pairs at +256/+384 (and the same pattern at +25856, +51456, +77056)
// produce in1181..in1188; the pairs at +320/+448 produce in1189..in1196.
// NOTE(review): offsets are presumably byte offsets — confirm against the
// declaration of sfPtr7.
__m512 sf465 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf466 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1181 = _mm512_shuffle_f32x4(sf465, sf466, 68);
__m512 in1182 = _mm512_shuffle_f32x4(sf465, sf466, 238);
__m512 sf467 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf468 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1189 = _mm512_shuffle_f32x4(sf467, sf468, 68);
__m512 in1190 = _mm512_shuffle_f32x4(sf467, sf468, 238);
// Same pattern, base offset 25856.
__m512 sf469 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf470 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1183 = _mm512_shuffle_f32x4(sf469, sf470, 68);
__m512 in1184 = _mm512_shuffle_f32x4(sf469, sf470, 238);
__m512 sf471 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf472 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1191 = _mm512_shuffle_f32x4(sf471, sf472, 68);
__m512 in1192 = _mm512_shuffle_f32x4(sf471, sf472, 238);
// Same pattern, base offset 51456.
__m512 sf473 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf474 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1185 = _mm512_shuffle_f32x4(sf473, sf474, 68);
__m512 in1186 = _mm512_shuffle_f32x4(sf473, sf474, 238);
__m512 sf475 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf476 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1193 = _mm512_shuffle_f32x4(sf475, sf476, 68);
__m512 in1194 = _mm512_shuffle_f32x4(sf475, sf476, 238);
// Same pattern, base offset 77056.
__m512 sf477 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf478 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1187 = _mm512_shuffle_f32x4(sf477, sf478, 68);
__m512 in1188 = _mm512_shuffle_f32x4(sf477, sf478, 238);
__m512 sf479 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf480 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1195 = _mm512_shuffle_f32x4(sf479, sf480, 68);
__m512 in1196 = _mm512_shuffle_f32x4(sf479, sf480, 238);
// First linear pass for the second tile: reduce each set of eight input
// vectors to six. With x0..x7 = in1181..in1188 (second set: in1189..in1196;
// the two sets are interleaved statement by statement), the results are
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)       -> tmp7733 / tmp7753
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)       -> tmp7739 / tmp7759
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)       -> tmp7744 / tmp7764
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)       -> tmp7746 / tmp7766
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)       -> tmp7748 / tmp7768
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7  -> tmp7750 / tmp7770
// NOTE(review): the coefficients look like a Winograd-style 8-to-6 output
// transform for a 3x3 convolution — confirm against the generator.
// Pairwise sums and differences shared by several results.
__m512 tmp7737 = _mm512_add_ps(in1182, in1183);
__m512 tmp7757 = _mm512_add_ps(in1190, in1191);
__m512 tmp7736 = _mm512_add_ps(in1184, in1185);
__m512 tmp7756 = _mm512_add_ps(in1192, in1193);
__m512 tmp7742 = _mm512_sub_ps(in1184, in1185);
__m512 tmp7762 = _mm512_sub_ps(in1192, in1193);
__m512 tmp7741 = _mm512_sub_ps(in1182, in1183);
__m512 tmp7761 = _mm512_sub_ps(in1190, in1191);
__m512 tmp7738 = _mm512_add_ps(in1186, in1187);
__m512 tmp7758 = _mm512_add_ps(in1194, in1195);
__m512 tmp7743 = _mm512_sub_ps(in1186, in1187);
__m512 tmp7763 = _mm512_sub_ps(in1194, in1195);
// Weighted accumulation via fused multiply-add (a*b + c).
__m512 tmp7740 = _mm512_fmadd_ps(tmp7742, _mm512_set1_ps(2e+00f), tmp7741);
__m512 tmp7760 = _mm512_fmadd_ps(tmp7762, _mm512_set1_ps(2e+00f), tmp7761);
__m512 tmp7747 = _mm512_fmadd_ps(tmp7742, _mm512_set1_ps(8e+00f), tmp7741);
__m512 tmp7767 = _mm512_fmadd_ps(tmp7762, _mm512_set1_ps(8e+00f), tmp7761);
__m512 tmp7735 = _mm512_add_ps(tmp7736, tmp7737);
__m512 tmp7755 = _mm512_add_ps(tmp7756, tmp7757);
__m512 tmp7739 = _mm512_fmadd_ps(tmp7743, _mm512_set1_ps(1.6e+01f), tmp7740);
__m512 tmp7759 = _mm512_fmadd_ps(tmp7763, _mm512_set1_ps(1.6e+01f), tmp7760);
__m512 tmp7746 = _mm512_fmadd_ps(tmp7743, _mm512_set1_ps(4e+00f), tmp7747);
__m512 tmp7766 = _mm512_fmadd_ps(tmp7763, _mm512_set1_ps(4e+00f), tmp7767);
__m512 tmp7752 = _mm512_add_ps(tmp7743, tmp7741);
__m512 tmp7772 = _mm512_add_ps(tmp7763, tmp7761);
__m512 tmp7745 = _mm512_fmadd_ps(tmp7736, _mm512_set1_ps(4e+00f), tmp7737);
__m512 tmp7765 = _mm512_fmadd_ps(tmp7756, _mm512_set1_ps(4e+00f), tmp7757);
__m512 tmp7749 = _mm512_fmadd_ps(tmp7736, _mm512_set1_ps(1.6e+01f), tmp7737);
__m512 tmp7769 = _mm512_fmadd_ps(tmp7756, _mm512_set1_ps(1.6e+01f), tmp7757);
// x0 contributes only to r0, x7 only to r5.
__m512 tmp7734 = _mm512_add_ps(tmp7735, in1181);
__m512 tmp7754 = _mm512_add_ps(tmp7755, in1189);
__m512 tmp7751 = _mm512_add_ps(tmp7752, in1188);
__m512 tmp7771 = _mm512_add_ps(tmp7772, in1196);
__m512 tmp7733 = _mm512_fmadd_ps(tmp7738, _mm512_set1_ps(3.2e+01f), tmp7734);
__m512 tmp7753 = _mm512_fmadd_ps(tmp7758, _mm512_set1_ps(3.2e+01f), tmp7754);
__m512 tmp7744 = _mm512_fmadd_ps(tmp7738, _mm512_set1_ps(8e+00f), tmp7745);
__m512 tmp7764 = _mm512_fmadd_ps(tmp7758, _mm512_set1_ps(8e+00f), tmp7765);
__m512 tmp7750 = _mm512_fmadd_ps(tmp7742, _mm512_set1_ps(3.2e+01f), tmp7751);
__m512 tmp7770 = _mm512_fmadd_ps(tmp7762, _mm512_set1_ps(3.2e+01f), tmp7771);
__m512 tmp7748 = _mm512_fmadd_ps(tmp7738, _mm512_set1_ps(2e+00f), tmp7749);
__m512 tmp7768 = _mm512_fmadd_ps(tmp7758, _mm512_set1_ps(2e+00f), tmp7769);
// Gather r0..r5 of the first set into tmp7721..tmp7726 and r0..r5 of the
// second set into tmp7727..tmp7732; these names are reassigned later.
__m512 tmp7721 = tmp7733;
__m512 tmp7727 = tmp7753;
__m512 tmp7722 = tmp7739;
__m512 tmp7728 = tmp7759;
__m512 tmp7723 = tmp7744;
__m512 tmp7729 = tmp7764;
__m512 tmp7724 = tmp7746;
__m512 tmp7730 = tmp7766;
__m512 tmp7725 = tmp7748;
__m512 tmp7731 = tmp7768;
__m512 tmp7726 = tmp7750;
__m512 tmp7732 = tmp7770;
// Lane permutation of the twelve vectors tmp7721..tmp7732 into sixteen vectors
// (tmp7721..tmp7732 reassigned, plus new tmp7773..tmp7776).
// NOTE(review): the unpack -> shuffle_ps -> shuffle_f32x4 sequence looks like a
// register transpose so the next pass can run along the other axis — confirm.
// Stage 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp7817 = _mm512_unpacklo_ps(tmp7721, tmp7722);
__m512 tmp7818 = _mm512_unpackhi_ps(tmp7721, tmp7722);
__m512 tmp7819 = _mm512_unpacklo_ps(tmp7723, tmp7724);
__m512 tmp7820 = _mm512_unpackhi_ps(tmp7723, tmp7724);
__m512 tmp7821 = _mm512_unpacklo_ps(tmp7725, tmp7726);
__m512 tmp7822 = _mm512_unpackhi_ps(tmp7725, tmp7726);
__m512 tmp7823 = _mm512_unpacklo_ps(tmp7727, tmp7728);
__m512 tmp7824 = _mm512_unpackhi_ps(tmp7727, tmp7728);
__m512 tmp7825 = _mm512_unpacklo_ps(tmp7729, tmp7730);
__m512 tmp7826 = _mm512_unpackhi_ps(tmp7729, tmp7730);
__m512 tmp7827 = _mm512_unpacklo_ps(tmp7731, tmp7732);
__m512 tmp7828 = _mm512_unpackhi_ps(tmp7731, tmp7732);
// Stage 2: within each 128-bit lane, pair up 64-bit halves
// (68 takes the low halves of both operands, 238 the high halves).
__m512 tmp7829 = _mm512_shuffle_ps(tmp7817, tmp7819, 68);
__m512 tmp7830 = _mm512_shuffle_ps(tmp7817, tmp7819, 238);
__m512 tmp7831 = _mm512_shuffle_ps(tmp7818, tmp7820, 68);
__m512 tmp7832 = _mm512_shuffle_ps(tmp7818, tmp7820, 238);
__m512 tmp7833 = _mm512_shuffle_ps(tmp7821, tmp7823, 68);
__m512 tmp7834 = _mm512_shuffle_ps(tmp7821, tmp7823, 238);
__m512 tmp7835 = _mm512_shuffle_ps(tmp7822, tmp7824, 68);
__m512 tmp7836 = _mm512_shuffle_ps(tmp7822, tmp7824, 238);
__m512 tmp7837 = _mm512_shuffle_ps(tmp7825, tmp7827, 68);
__m512 tmp7838 = _mm512_shuffle_ps(tmp7825, tmp7827, 238);
__m512 tmp7839 = _mm512_shuffle_ps(tmp7826, tmp7828, 68);
__m512 tmp7840 = _mm512_shuffle_ps(tmp7826, tmp7828, 238);
// Stage 3: select whole 128-bit lanes (136 picks lanes 0 and 2 of each
// operand, 221 picks lanes 1 and 3).
__m512 tmp7841 = _mm512_shuffle_f32x4(tmp7829, tmp7833, 136);
__m512 tmp7842 = _mm512_shuffle_f32x4(tmp7829, tmp7833, 221);
__m512 tmp7843 = _mm512_shuffle_f32x4(tmp7830, tmp7834, 136);
__m512 tmp7844 = _mm512_shuffle_f32x4(tmp7830, tmp7834, 221);
__m512 tmp7845 = _mm512_shuffle_f32x4(tmp7831, tmp7835, 136);
__m512 tmp7846 = _mm512_shuffle_f32x4(tmp7831, tmp7835, 221);
__m512 tmp7847 = _mm512_shuffle_f32x4(tmp7832, tmp7836, 136);
__m512 tmp7848 = _mm512_shuffle_f32x4(tmp7832, tmp7836, 221);
// Only twelve source vectors exist, so the last group is shuffled against
// itself rather than against a fourth group.
__m512 tmp7849 = _mm512_shuffle_f32x4(tmp7837, tmp7837, 136);
__m512 tmp7850 = _mm512_shuffle_f32x4(tmp7837, tmp7837, 221);
__m512 tmp7851 = _mm512_shuffle_f32x4(tmp7838, tmp7838, 136);
__m512 tmp7852 = _mm512_shuffle_f32x4(tmp7838, tmp7838, 221);
__m512 tmp7853 = _mm512_shuffle_f32x4(tmp7839, tmp7839, 136);
__m512 tmp7854 = _mm512_shuffle_f32x4(tmp7839, tmp7839, 221);
__m512 tmp7855 = _mm512_shuffle_f32x4(tmp7840, tmp7840, 136);
__m512 tmp7856 = _mm512_shuffle_f32x4(tmp7840, tmp7840, 221);
// Stage 4: final 128-bit lane selection. The results overwrite
// tmp7721..tmp7732; four additional vectors are declared as tmp7773..tmp7776.
tmp7721 = _mm512_shuffle_f32x4(tmp7841, tmp7849, 136);
tmp7729 = _mm512_shuffle_f32x4(tmp7841, tmp7849, 221);
tmp7722 = _mm512_shuffle_f32x4(tmp7843, tmp7851, 136);
tmp7730 = _mm512_shuffle_f32x4(tmp7843, tmp7851, 221);
tmp7723 = _mm512_shuffle_f32x4(tmp7845, tmp7853, 136);
tmp7731 = _mm512_shuffle_f32x4(tmp7845, tmp7853, 221);
tmp7724 = _mm512_shuffle_f32x4(tmp7847, tmp7855, 136);
tmp7732 = _mm512_shuffle_f32x4(tmp7847, tmp7855, 221);
tmp7725 = _mm512_shuffle_f32x4(tmp7842, tmp7850, 136);
__m512 tmp7773 = _mm512_shuffle_f32x4(tmp7842, tmp7850, 221);
tmp7726 = _mm512_shuffle_f32x4(tmp7844, tmp7852, 136);
__m512 tmp7774 = _mm512_shuffle_f32x4(tmp7844, tmp7852, 221);
tmp7727 = _mm512_shuffle_f32x4(tmp7846, tmp7854, 136);
__m512 tmp7775 = _mm512_shuffle_f32x4(tmp7846, tmp7854, 221);
tmp7728 = _mm512_shuffle_f32x4(tmp7848, tmp7856, 136);
__m512 tmp7776 = _mm512_shuffle_f32x4(tmp7848, tmp7856, 221);
// Second linear pass for the second tile: the same 8-to-6 combination as the
// first pass, applied to the permuted vectors. Inputs x0..x7 are
// tmp7721..tmp7728 for the first set and tmp7729, tmp7730, tmp7731, tmp7732,
// tmp7773..tmp7776 for the second.
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)       -> tmp7777 / tmp7797
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)       -> tmp7783 / tmp7803
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)       -> tmp7788 / tmp7808
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)       -> tmp7790 / tmp7810
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)       -> tmp7792 / tmp7812
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7  -> tmp7794 / tmp7814
// Pairwise sums and differences.
__m512 tmp7781 = _mm512_add_ps(tmp7722, tmp7723);
__m512 tmp7801 = _mm512_add_ps(tmp7730, tmp7731);
__m512 tmp7780 = _mm512_add_ps(tmp7724, tmp7725);
__m512 tmp7800 = _mm512_add_ps(tmp7732, tmp7773);
__m512 tmp7786 = _mm512_sub_ps(tmp7724, tmp7725);
__m512 tmp7806 = _mm512_sub_ps(tmp7732, tmp7773);
__m512 tmp7785 = _mm512_sub_ps(tmp7722, tmp7723);
__m512 tmp7805 = _mm512_sub_ps(tmp7730, tmp7731);
__m512 tmp7782 = _mm512_add_ps(tmp7726, tmp7727);
__m512 tmp7802 = _mm512_add_ps(tmp7774, tmp7775);
__m512 tmp7787 = _mm512_sub_ps(tmp7726, tmp7727);
__m512 tmp7807 = _mm512_sub_ps(tmp7774, tmp7775);
// Weighted accumulation via fused multiply-add (a*b + c).
__m512 tmp7784 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(2e+00f), tmp7785);
__m512 tmp7804 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(2e+00f), tmp7805);
__m512 tmp7791 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(8e+00f), tmp7785);
__m512 tmp7811 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(8e+00f), tmp7805);
__m512 tmp7779 = _mm512_add_ps(tmp7780, tmp7781);
__m512 tmp7799 = _mm512_add_ps(tmp7800, tmp7801);
__m512 tmp7783 = _mm512_fmadd_ps(tmp7787, _mm512_set1_ps(1.6e+01f), tmp7784);
__m512 tmp7803 = _mm512_fmadd_ps(tmp7807, _mm512_set1_ps(1.6e+01f), tmp7804);
__m512 tmp7790 = _mm512_fmadd_ps(tmp7787, _mm512_set1_ps(4e+00f), tmp7791);
__m512 tmp7810 = _mm512_fmadd_ps(tmp7807, _mm512_set1_ps(4e+00f), tmp7811);
__m512 tmp7796 = _mm512_add_ps(tmp7787, tmp7785);
__m512 tmp7816 = _mm512_add_ps(tmp7807, tmp7805);
__m512 tmp7789 = _mm512_fmadd_ps(tmp7780, _mm512_set1_ps(4e+00f), tmp7781);
__m512 tmp7809 = _mm512_fmadd_ps(tmp7800, _mm512_set1_ps(4e+00f), tmp7801);
__m512 tmp7793 = _mm512_fmadd_ps(tmp7780, _mm512_set1_ps(1.6e+01f), tmp7781);
__m512 tmp7813 = _mm512_fmadd_ps(tmp7800, _mm512_set1_ps(1.6e+01f), tmp7801);
// x0 contributes only to r0, x7 only to r5.
__m512 tmp7778 = _mm512_add_ps(tmp7779, tmp7721);
__m512 tmp7798 = _mm512_add_ps(tmp7799, tmp7729);
__m512 tmp7795 = _mm512_add_ps(tmp7796, tmp7728);
__m512 tmp7815 = _mm512_add_ps(tmp7816, tmp7776);
__m512 tmp7777 = _mm512_fmadd_ps(tmp7782, _mm512_set1_ps(3.2e+01f), tmp7778);
__m512 tmp7797 = _mm512_fmadd_ps(tmp7802, _mm512_set1_ps(3.2e+01f), tmp7798);
__m512 tmp7788 = _mm512_fmadd_ps(tmp7782, _mm512_set1_ps(8e+00f), tmp7789);
__m512 tmp7808 = _mm512_fmadd_ps(tmp7802, _mm512_set1_ps(8e+00f), tmp7809);
__m512 tmp7794 = _mm512_fmadd_ps(tmp7786, _mm512_set1_ps(3.2e+01f), tmp7795);
__m512 tmp7814 = _mm512_fmadd_ps(tmp7806, _mm512_set1_ps(3.2e+01f), tmp7815);
__m512 tmp7792 = _mm512_fmadd_ps(tmp7782, _mm512_set1_ps(2e+00f), tmp7793);
__m512 tmp7812 = _mm512_fmadd_ps(tmp7802, _mm512_set1_ps(2e+00f), tmp7813);
// Name the twelve second-pass results as outputs: out1155..out1160 come from
// the first set (r0..r5), out1161..out1166 from the second set (r0..r5).
__m512 out1155 = tmp7777;
__m512 out1161 = tmp7797;
__m512 out1156 = tmp7783;
__m512 out1162 = tmp7803;
__m512 out1157 = tmp7788;
__m512 out1163 = tmp7808;
__m512 out1158 = tmp7790;
__m512 out1164 = tmp7810;
__m512 out1159 = tmp7792;
__m512 out1165 = tmp7812;
__m512 out1160 = tmp7794;
__m512 out1166 = tmp7814;
// Element-wise max against zero (a ReLU-style clamp) on every output vector.
out1155 = _mm512_max_ps(_mm512_setzero_ps(), out1155);
out1161 = _mm512_max_ps(_mm512_setzero_ps(), out1161);
out1156 = _mm512_max_ps(_mm512_setzero_ps(), out1156);
out1162 = _mm512_max_ps(_mm512_setzero_ps(), out1162);
out1157 = _mm512_max_ps(_mm512_setzero_ps(), out1157);
out1163 = _mm512_max_ps(_mm512_setzero_ps(), out1163);
out1158 = _mm512_max_ps(_mm512_setzero_ps(), out1158);
out1164 = _mm512_max_ps(_mm512_setzero_ps(), out1164);
out1159 = _mm512_max_ps(_mm512_setzero_ps(), out1159);
out1165 = _mm512_max_ps(_mm512_setzero_ps(), out1165);
out1160 = _mm512_max_ps(_mm512_setzero_ps(), out1160);
out1166 = _mm512_max_ps(_mm512_setzero_ps(), out1166);
_mm512_mask_storeu_ps(datPtr13+1200+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1155);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1161);
_mm512_mask_storeu_ps(datPtr13+1424+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1156);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1162);
_mm512_mask_storeu_ps(datPtr13+1648+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1157);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1163);
_mm512_mask_storeu_ps(datPtr13+1872+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1158);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1164);
_mm512_mask_storeu_ps(datPtr13+2096+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1159);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1165);
_mm512_mask_storeu_ps(datPtr13+2320+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1160);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1166);
// Load phase: read sixteen 512-bit vectors from sfPtr7, arranged as four
// groups whose base offsets are 25600 units apart (512, 26112, 51712, 77312),
// with four vectors per group at +0, +64, +128 and +192 from the group base.
// Each pair of vectors 128 units apart is recombined with
// _mm512_shuffle_f32x4: immediate 68 concatenates the low 256 bits of both
// operands, immediate 238 concatenates their high 256 bits.
// The +0/+128 pairs yield in1197..in1204; the +64/+192 pairs yield
// in1205..in1212.
// NOTE(review): sfPtr7 is declared outside this excerpt; the 64-unit spacing
// between full-width loads suggests byte offsets -- confirm.
__m512 sf481 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf482 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1197 = _mm512_shuffle_f32x4(sf481, sf482, 68);
__m512 in1198 = _mm512_shuffle_f32x4(sf481, sf482, 238);
__m512 sf483 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf484 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1205 = _mm512_shuffle_f32x4(sf483, sf484, 68);
__m512 in1206 = _mm512_shuffle_f32x4(sf483, sf484, 238);
__m512 sf485 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf486 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1199 = _mm512_shuffle_f32x4(sf485, sf486, 68);
__m512 in1200 = _mm512_shuffle_f32x4(sf485, sf486, 238);
__m512 sf487 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf488 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1207 = _mm512_shuffle_f32x4(sf487, sf488, 68);
__m512 in1208 = _mm512_shuffle_f32x4(sf487, sf488, 238);
__m512 sf489 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf490 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1201 = _mm512_shuffle_f32x4(sf489, sf490, 68);
__m512 in1202 = _mm512_shuffle_f32x4(sf489, sf490, 238);
__m512 sf491 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf492 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1209 = _mm512_shuffle_f32x4(sf491, sf492, 68);
__m512 in1210 = _mm512_shuffle_f32x4(sf491, sf492, 238);
__m512 sf493 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf494 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1203 = _mm512_shuffle_f32x4(sf493, sf494, 68);
__m512 in1204 = _mm512_shuffle_f32x4(sf493, sf494, 238);
__m512 sf495 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k90+768*l29);
__m512 sf496 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k90+768*l29);
__m512 in1211 = _mm512_shuffle_f32x4(sf495, sf496, 68);
__m512 in1212 = _mm512_shuffle_f32x4(sf495, sf496, 238);
// First pass: one fixed 8-input -> 6-output linear combination, applied
// lane-wise and independently to x0..x7 = in1197..in1204 and to
// x0..x7 = in1205..in1212 (the two sets are interleaved line by line).
// Algebraically (the code evaluates these with fused multiply-adds in the
// order written below):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +     (x5-x6) + x7
// y0..y5 are tmp7869, tmp7875, tmp7880, tmp7882, tmp7884, tmp7886 for the
// first set and tmp7889, tmp7895, tmp7900, tmp7902, tmp7904, tmp7906 for the
// second.
// NOTE(review): the 1/2/4/8/16/32 weights look like a Winograd-style output
// transform for a 3x3 convolution -- confirm against the generator.
__m512 tmp7873 = _mm512_add_ps(in1198, in1199);
__m512 tmp7893 = _mm512_add_ps(in1206, in1207);
__m512 tmp7872 = _mm512_add_ps(in1200, in1201);
__m512 tmp7892 = _mm512_add_ps(in1208, in1209);
__m512 tmp7878 = _mm512_sub_ps(in1200, in1201);
__m512 tmp7898 = _mm512_sub_ps(in1208, in1209);
__m512 tmp7877 = _mm512_sub_ps(in1198, in1199);
__m512 tmp7897 = _mm512_sub_ps(in1206, in1207);
__m512 tmp7874 = _mm512_add_ps(in1202, in1203);
__m512 tmp7894 = _mm512_add_ps(in1210, in1211);
__m512 tmp7879 = _mm512_sub_ps(in1202, in1203);
__m512 tmp7899 = _mm512_sub_ps(in1210, in1211);
__m512 tmp7876 = _mm512_fmadd_ps(tmp7878, _mm512_set1_ps(2e+00f), tmp7877);
__m512 tmp7896 = _mm512_fmadd_ps(tmp7898, _mm512_set1_ps(2e+00f), tmp7897);
__m512 tmp7883 = _mm512_fmadd_ps(tmp7878, _mm512_set1_ps(8e+00f), tmp7877);
__m512 tmp7903 = _mm512_fmadd_ps(tmp7898, _mm512_set1_ps(8e+00f), tmp7897);
__m512 tmp7871 = _mm512_add_ps(tmp7872, tmp7873);
__m512 tmp7891 = _mm512_add_ps(tmp7892, tmp7893);
__m512 tmp7875 = _mm512_fmadd_ps(tmp7879, _mm512_set1_ps(1.6e+01f), tmp7876);
__m512 tmp7895 = _mm512_fmadd_ps(tmp7899, _mm512_set1_ps(1.6e+01f), tmp7896);
__m512 tmp7882 = _mm512_fmadd_ps(tmp7879, _mm512_set1_ps(4e+00f), tmp7883);
__m512 tmp7902 = _mm512_fmadd_ps(tmp7899, _mm512_set1_ps(4e+00f), tmp7903);
__m512 tmp7888 = _mm512_add_ps(tmp7879, tmp7877);
__m512 tmp7908 = _mm512_add_ps(tmp7899, tmp7897);
__m512 tmp7881 = _mm512_fmadd_ps(tmp7872, _mm512_set1_ps(4e+00f), tmp7873);
__m512 tmp7901 = _mm512_fmadd_ps(tmp7892, _mm512_set1_ps(4e+00f), tmp7893);
__m512 tmp7885 = _mm512_fmadd_ps(tmp7872, _mm512_set1_ps(1.6e+01f), tmp7873);
__m512 tmp7905 = _mm512_fmadd_ps(tmp7892, _mm512_set1_ps(1.6e+01f), tmp7893);
__m512 tmp7870 = _mm512_add_ps(tmp7871, in1197);
__m512 tmp7890 = _mm512_add_ps(tmp7891, in1205);
__m512 tmp7887 = _mm512_add_ps(tmp7888, in1204);
__m512 tmp7907 = _mm512_add_ps(tmp7908, in1212);
__m512 tmp7869 = _mm512_fmadd_ps(tmp7874, _mm512_set1_ps(3.2e+01f), tmp7870);
__m512 tmp7889 = _mm512_fmadd_ps(tmp7894, _mm512_set1_ps(3.2e+01f), tmp7890);
__m512 tmp7880 = _mm512_fmadd_ps(tmp7874, _mm512_set1_ps(8e+00f), tmp7881);
__m512 tmp7900 = _mm512_fmadd_ps(tmp7894, _mm512_set1_ps(8e+00f), tmp7901);
__m512 tmp7886 = _mm512_fmadd_ps(tmp7878, _mm512_set1_ps(3.2e+01f), tmp7887);
__m512 tmp7906 = _mm512_fmadd_ps(tmp7898, _mm512_set1_ps(3.2e+01f), tmp7907);
__m512 tmp7884 = _mm512_fmadd_ps(tmp7874, _mm512_set1_ps(2e+00f), tmp7885);
__m512 tmp7904 = _mm512_fmadd_ps(tmp7894, _mm512_set1_ps(2e+00f), tmp7905);
// Collect the results under consecutive names: tmp7857..tmp7862 hold y0..y5
// of the first set, tmp7863..tmp7868 hold y0..y5 of the second set. Several
// of these names are reassigned by the shuffle sequence that follows.
__m512 tmp7857 = tmp7869;
__m512 tmp7863 = tmp7889;
__m512 tmp7858 = tmp7875;
__m512 tmp7864 = tmp7895;
__m512 tmp7859 = tmp7880;
__m512 tmp7865 = tmp7900;
__m512 tmp7860 = tmp7882;
__m512 tmp7866 = tmp7902;
__m512 tmp7861 = tmp7884;
__m512 tmp7867 = tmp7904;
__m512 tmp7862 = tmp7886;
__m512 tmp7868 = tmp7906;
// Transpose phase. Treat tmp7857..tmp7868 as a matrix of 12 rows by 16 lanes
// and regroup it by lane index ("column") in three shuffle stages.
// Stage 1: interleave adjacent row pairs within every 128-bit lane
// (unpacklo takes elements 0,1 of each lane, unpackhi takes elements 2,3).
__m512 tmp7953 = _mm512_unpacklo_ps(tmp7857, tmp7858);
__m512 tmp7954 = _mm512_unpackhi_ps(tmp7857, tmp7858);
__m512 tmp7955 = _mm512_unpacklo_ps(tmp7859, tmp7860);
__m512 tmp7956 = _mm512_unpackhi_ps(tmp7859, tmp7860);
__m512 tmp7957 = _mm512_unpacklo_ps(tmp7861, tmp7862);
__m512 tmp7958 = _mm512_unpackhi_ps(tmp7861, tmp7862);
__m512 tmp7959 = _mm512_unpacklo_ps(tmp7863, tmp7864);
__m512 tmp7960 = _mm512_unpackhi_ps(tmp7863, tmp7864);
__m512 tmp7961 = _mm512_unpacklo_ps(tmp7865, tmp7866);
__m512 tmp7962 = _mm512_unpackhi_ps(tmp7865, tmp7866);
__m512 tmp7963 = _mm512_unpacklo_ps(tmp7867, tmp7868);
__m512 tmp7964 = _mm512_unpackhi_ps(tmp7867, tmp7868);
// Stage 2: _mm512_shuffle_ps with immediate 68 keeps the low 64 bits of each
// operand's 128-bit lane, 238 keeps the high 64 bits. Each result holds one
// in-lane element position for four consecutive rows (rows 0-3, 4-7 or 8-11).
__m512 tmp7965 = _mm512_shuffle_ps(tmp7953, tmp7955, 68);
__m512 tmp7966 = _mm512_shuffle_ps(tmp7953, tmp7955, 238);
__m512 tmp7967 = _mm512_shuffle_ps(tmp7954, tmp7956, 68);
__m512 tmp7968 = _mm512_shuffle_ps(tmp7954, tmp7956, 238);
__m512 tmp7969 = _mm512_shuffle_ps(tmp7957, tmp7959, 68);
__m512 tmp7970 = _mm512_shuffle_ps(tmp7957, tmp7959, 238);
__m512 tmp7971 = _mm512_shuffle_ps(tmp7958, tmp7960, 68);
__m512 tmp7972 = _mm512_shuffle_ps(tmp7958, tmp7960, 238);
__m512 tmp7973 = _mm512_shuffle_ps(tmp7961, tmp7963, 68);
__m512 tmp7974 = _mm512_shuffle_ps(tmp7961, tmp7963, 238);
__m512 tmp7975 = _mm512_shuffle_ps(tmp7962, tmp7964, 68);
__m512 tmp7976 = _mm512_shuffle_ps(tmp7962, tmp7964, 238);
// Stage 3a: _mm512_shuffle_f32x4 with immediate 136 picks 128-bit lanes 0 and
// 2 of each operand, 221 picks lanes 1 and 3. Rows 0-3 are paired with rows
// 4-7 here.
__m512 tmp7977 = _mm512_shuffle_f32x4(tmp7965, tmp7969, 136);
__m512 tmp7978 = _mm512_shuffle_f32x4(tmp7965, tmp7969, 221);
__m512 tmp7979 = _mm512_shuffle_f32x4(tmp7966, tmp7970, 136);
__m512 tmp7980 = _mm512_shuffle_f32x4(tmp7966, tmp7970, 221);
__m512 tmp7981 = _mm512_shuffle_f32x4(tmp7967, tmp7971, 136);
__m512 tmp7982 = _mm512_shuffle_f32x4(tmp7967, tmp7971, 221);
__m512 tmp7983 = _mm512_shuffle_f32x4(tmp7968, tmp7972, 136);
__m512 tmp7984 = _mm512_shuffle_f32x4(tmp7968, tmp7972, 221);
// Only twelve rows exist, so the rows 8-11 vectors are passed as both
// operands; the upper half of each result duplicates the lower half.
__m512 tmp7985 = _mm512_shuffle_f32x4(tmp7973, tmp7973, 136);
__m512 tmp7986 = _mm512_shuffle_f32x4(tmp7973, tmp7973, 221);
__m512 tmp7987 = _mm512_shuffle_f32x4(tmp7974, tmp7974, 136);
__m512 tmp7988 = _mm512_shuffle_f32x4(tmp7974, tmp7974, 221);
__m512 tmp7989 = _mm512_shuffle_f32x4(tmp7975, tmp7975, 136);
__m512 tmp7990 = _mm512_shuffle_f32x4(tmp7975, tmp7975, 221);
__m512 tmp7991 = _mm512_shuffle_f32x4(tmp7976, tmp7976, 136);
__m512 tmp7992 = _mm512_shuffle_f32x4(tmp7976, tmp7976, 221);
// Stage 3b: final lane selection. Afterwards lane r (r = 0..11) of the vector
// for column c holds element c of original row r, and lanes 12..15 repeat
// lanes 8..11. Columns 0..7 land in tmp7857..tmp7864, columns 8..11 in
// tmp7865..tmp7868 and columns 12..15 in the new names tmp7909..tmp7912.
tmp7857 = _mm512_shuffle_f32x4(tmp7977, tmp7985, 136);
tmp7865 = _mm512_shuffle_f32x4(tmp7977, tmp7985, 221);
tmp7858 = _mm512_shuffle_f32x4(tmp7979, tmp7987, 136);
tmp7866 = _mm512_shuffle_f32x4(tmp7979, tmp7987, 221);
tmp7859 = _mm512_shuffle_f32x4(tmp7981, tmp7989, 136);
tmp7867 = _mm512_shuffle_f32x4(tmp7981, tmp7989, 221);
tmp7860 = _mm512_shuffle_f32x4(tmp7983, tmp7991, 136);
tmp7868 = _mm512_shuffle_f32x4(tmp7983, tmp7991, 221);
tmp7861 = _mm512_shuffle_f32x4(tmp7978, tmp7986, 136);
__m512 tmp7909 = _mm512_shuffle_f32x4(tmp7978, tmp7986, 221);
tmp7862 = _mm512_shuffle_f32x4(tmp7980, tmp7988, 136);
__m512 tmp7910 = _mm512_shuffle_f32x4(tmp7980, tmp7988, 221);
tmp7863 = _mm512_shuffle_f32x4(tmp7982, tmp7990, 136);
__m512 tmp7911 = _mm512_shuffle_f32x4(tmp7982, tmp7990, 221);
tmp7864 = _mm512_shuffle_f32x4(tmp7984, tmp7992, 136);
__m512 tmp7912 = _mm512_shuffle_f32x4(tmp7984, tmp7992, 221);
// Second pass: the same 8-input -> 6-output lane-wise combination, applied to
// x0..x7 = tmp7857..tmp7864 and, interleaved line by line, to
// x0..x7 = tmp7865, tmp7866, tmp7867, tmp7868, tmp7909, tmp7910, tmp7911,
// tmp7912. Algebraically (evaluated with fused multiply-adds in the order
// written below):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +     (x5-x6) + x7
// y0..y5 are tmp7913, tmp7919, tmp7924, tmp7926, tmp7928, tmp7930 for the
// first set and tmp7933, tmp7939, tmp7944, tmp7946, tmp7948, tmp7950 for the
// second.
__m512 tmp7917 = _mm512_add_ps(tmp7858, tmp7859);
__m512 tmp7937 = _mm512_add_ps(tmp7866, tmp7867);
__m512 tmp7916 = _mm512_add_ps(tmp7860, tmp7861);
__m512 tmp7936 = _mm512_add_ps(tmp7868, tmp7909);
__m512 tmp7922 = _mm512_sub_ps(tmp7860, tmp7861);
__m512 tmp7942 = _mm512_sub_ps(tmp7868, tmp7909);
__m512 tmp7921 = _mm512_sub_ps(tmp7858, tmp7859);
__m512 tmp7941 = _mm512_sub_ps(tmp7866, tmp7867);
__m512 tmp7918 = _mm512_add_ps(tmp7862, tmp7863);
__m512 tmp7938 = _mm512_add_ps(tmp7910, tmp7911);
__m512 tmp7923 = _mm512_sub_ps(tmp7862, tmp7863);
__m512 tmp7943 = _mm512_sub_ps(tmp7910, tmp7911);
__m512 tmp7920 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(2e+00f), tmp7921);
__m512 tmp7940 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(2e+00f), tmp7941);
__m512 tmp7927 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(8e+00f), tmp7921);
__m512 tmp7947 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(8e+00f), tmp7941);
__m512 tmp7915 = _mm512_add_ps(tmp7916, tmp7917);
__m512 tmp7935 = _mm512_add_ps(tmp7936, tmp7937);
__m512 tmp7919 = _mm512_fmadd_ps(tmp7923, _mm512_set1_ps(1.6e+01f), tmp7920);
__m512 tmp7939 = _mm512_fmadd_ps(tmp7943, _mm512_set1_ps(1.6e+01f), tmp7940);
__m512 tmp7926 = _mm512_fmadd_ps(tmp7923, _mm512_set1_ps(4e+00f), tmp7927);
__m512 tmp7946 = _mm512_fmadd_ps(tmp7943, _mm512_set1_ps(4e+00f), tmp7947);
__m512 tmp7932 = _mm512_add_ps(tmp7923, tmp7921);
__m512 tmp7952 = _mm512_add_ps(tmp7943, tmp7941);
__m512 tmp7925 = _mm512_fmadd_ps(tmp7916, _mm512_set1_ps(4e+00f), tmp7917);
__m512 tmp7945 = _mm512_fmadd_ps(tmp7936, _mm512_set1_ps(4e+00f), tmp7937);
__m512 tmp7929 = _mm512_fmadd_ps(tmp7916, _mm512_set1_ps(1.6e+01f), tmp7917);
__m512 tmp7949 = _mm512_fmadd_ps(tmp7936, _mm512_set1_ps(1.6e+01f), tmp7937);
__m512 tmp7914 = _mm512_add_ps(tmp7915, tmp7857);
__m512 tmp7934 = _mm512_add_ps(tmp7935, tmp7865);
__m512 tmp7931 = _mm512_add_ps(tmp7932, tmp7864);
__m512 tmp7951 = _mm512_add_ps(tmp7952, tmp7912);
__m512 tmp7913 = _mm512_fmadd_ps(tmp7918, _mm512_set1_ps(3.2e+01f), tmp7914);
__m512 tmp7933 = _mm512_fmadd_ps(tmp7938, _mm512_set1_ps(3.2e+01f), tmp7934);
__m512 tmp7924 = _mm512_fmadd_ps(tmp7918, _mm512_set1_ps(8e+00f), tmp7925);
__m512 tmp7944 = _mm512_fmadd_ps(tmp7938, _mm512_set1_ps(8e+00f), tmp7945);
__m512 tmp7930 = _mm512_fmadd_ps(tmp7922, _mm512_set1_ps(3.2e+01f), tmp7931);
__m512 tmp7950 = _mm512_fmadd_ps(tmp7942, _mm512_set1_ps(3.2e+01f), tmp7951);
__m512 tmp7928 = _mm512_fmadd_ps(tmp7918, _mm512_set1_ps(2e+00f), tmp7929);
__m512 tmp7948 = _mm512_fmadd_ps(tmp7938, _mm512_set1_ps(2e+00f), tmp7949);
// Alias the results as out1167..out1172 (first set) and out1173..out1178
// (second set).
__m512 out1167 = tmp7913;
__m512 out1173 = tmp7933;
__m512 out1168 = tmp7919;
__m512 out1174 = tmp7939;
__m512 out1169 = tmp7924;
__m512 out1175 = tmp7944;
__m512 out1170 = tmp7926;
__m512 out1176 = tmp7946;
__m512 out1171 = tmp7928;
__m512 out1177 = tmp7948;
__m512 out1172 = tmp7930;
__m512 out1178 = tmp7950;
// ReLU: take the per-lane maximum with zero so no stored lane is negative.
out1167 = _mm512_max_ps(_mm512_setzero_ps(), out1167);
out1173 = _mm512_max_ps(_mm512_setzero_ps(), out1173);
out1168 = _mm512_max_ps(_mm512_setzero_ps(), out1168);
out1174 = _mm512_max_ps(_mm512_setzero_ps(), out1174);
out1169 = _mm512_max_ps(_mm512_setzero_ps(), out1169);
out1175 = _mm512_max_ps(_mm512_setzero_ps(), out1175);
out1170 = _mm512_max_ps(_mm512_setzero_ps(), out1170);
out1176 = _mm512_max_ps(_mm512_setzero_ps(), out1176);
out1171 = _mm512_max_ps(_mm512_setzero_ps(), out1171);
out1177 = _mm512_max_ps(_mm512_setzero_ps(), out1177);
out1172 = _mm512_max_ps(_mm512_setzero_ps(), out1172);
out1178 = _mm512_max_ps(_mm512_setzero_ps(), out1178);
// Masked unaligned stores. The first set uses mask 255 (0xFF), writing only
// lanes 0..7; the second set uses mask 4095 (0xFFF), writing lanes 0..11.
// Within each set the six outputs are written 224 offset units apart
// (12656, 12880, ... and 13808, 14032, ...).
// NOTE(review): datPtr13 is declared outside this excerpt; the offsets are
// presumably bytes (224 bytes = 56 floats) -- confirm at the declaration.
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1167);
_mm512_mask_storeu_ps(datPtr13+13808+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1173);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1168);
_mm512_mask_storeu_ps(datPtr13+14032+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1174);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1169);
_mm512_mask_storeu_ps(datPtr13+14256+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1175);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1170);
_mm512_mask_storeu_ps(datPtr13+14480+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1176);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1171);
_mm512_mask_storeu_ps(datPtr13+14704+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1177);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 255, out1172);
_mm512_mask_storeu_ps(datPtr13+14928+50432*i29+224*toH30+4*toW30+50432*k90+25216*l29, 4095, out1178);
// NOTE(review): these two braces close scopes opened before this excerpt --
// presumably the l29 and k90 loops, mirroring the k91/l30 nest below; confirm.
}
}
// NOTE(review): the increment is immediately overwritten by the assignment
// on the next line, so j23 is 2 after this pair of statements either way.
++j23;
j23 = 2;
// Closes a scope opened before this excerpt.
}
// Section for j23 < 15. j23-2 is decomposed into rel17 (position within a
// group of five) and base17 (6 plus 18 per completed group of five); the
// size_t casts make the modulo and division unsigned.
if (j23 < 15) {
ptrdiff_t rel17 = (size_t)(j23-2)%5;
ptrdiff_t base17 = 6+(size_t)(j23-2)/5*18;
// No loop condition: each further iteration restarts at rel17 = 0 and
// advances base17 by 18. The exit from this loop is not visible here.
for (; ; rel17 = 0, base17 += 18) {
// Nested dispatch on rel17. The innermost branch is taken when rel17 is 0
// (rel17 is only ever assigned a value in 0..4 in the visible code).
if (rel17 < 2) {
if (rel17 < 1) {
// toH31 and toW31 enter the store addresses in the loop body below as
// 224*toH31 + 4*toW31.
ptrdiff_t toH31 = base17+0;
ptrdiff_t toW31 = 12;
// k91 starts at w46 (declared outside this excerpt) and the loop runs until
// it reaches 1; l30 takes the values 0 and 1.
ptrdiff_t k91 = 1*w46;
for (; k91 != 1; ++k91) {
ptrdiff_t l30 = 0;
for (; l30 != 2; ++l30) {
// Load phase: read sixteen 512-bit vectors from sfPtr7, arranged as four
// groups whose base offsets are 25600 units apart (0, 25600, 51200, 76800),
// with four vectors per group at +0, +64, +128 and +192 from the group base.
// Each pair of vectors 128 units apart is recombined with
// _mm512_shuffle_f32x4: immediate 68 concatenates the low 256 bits of both
// operands, immediate 238 concatenates their high 256 bits.
// The +0/+128 pairs yield in1213..in1220; the +64/+192 pairs yield
// in1221..in1228.
// NOTE(review): sfPtr7 is declared outside this excerpt; the 64-unit spacing
// between full-width loads suggests byte offsets -- confirm.
__m512 sf497 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf498 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1213 = _mm512_shuffle_f32x4(sf497, sf498, 68);
__m512 in1214 = _mm512_shuffle_f32x4(sf497, sf498, 238);
__m512 sf499 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf500 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1221 = _mm512_shuffle_f32x4(sf499, sf500, 68);
__m512 in1222 = _mm512_shuffle_f32x4(sf499, sf500, 238);
__m512 sf501 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf502 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1215 = _mm512_shuffle_f32x4(sf501, sf502, 68);
__m512 in1216 = _mm512_shuffle_f32x4(sf501, sf502, 238);
__m512 sf503 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf504 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1223 = _mm512_shuffle_f32x4(sf503, sf504, 68);
__m512 in1224 = _mm512_shuffle_f32x4(sf503, sf504, 238);
__m512 sf505 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf506 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1217 = _mm512_shuffle_f32x4(sf505, sf506, 68);
__m512 in1218 = _mm512_shuffle_f32x4(sf505, sf506, 238);
__m512 sf507 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf508 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1225 = _mm512_shuffle_f32x4(sf507, sf508, 68);
__m512 in1226 = _mm512_shuffle_f32x4(sf507, sf508, 238);
__m512 sf509 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf510 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1219 = _mm512_shuffle_f32x4(sf509, sf510, 68);
__m512 in1220 = _mm512_shuffle_f32x4(sf509, sf510, 238);
__m512 sf511 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf512 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1227 = _mm512_shuffle_f32x4(sf511, sf512, 68);
__m512 in1228 = _mm512_shuffle_f32x4(sf511, sf512, 238);
// First pass: one fixed 8-input -> 6-output linear combination, applied
// lane-wise and independently to x0..x7 = in1213..in1220 and to
// x0..x7 = in1221..in1228 (the two sets are interleaved line by line).
// Algebraically (the code evaluates these with fused multiply-adds in the
// order written below):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +     (x5-x6) + x7
// y0..y5 are tmp8005, tmp8011, tmp8016, tmp8018, tmp8020, tmp8022 for the
// first set and tmp8025, tmp8031, tmp8036, tmp8038, tmp8040, tmp8042 for the
// second.
// NOTE(review): the 1/2/4/8/16/32 weights look like a Winograd-style output
// transform for a 3x3 convolution -- confirm against the generator.
__m512 tmp8009 = _mm512_add_ps(in1214, in1215);
__m512 tmp8029 = _mm512_add_ps(in1222, in1223);
__m512 tmp8008 = _mm512_add_ps(in1216, in1217);
__m512 tmp8028 = _mm512_add_ps(in1224, in1225);
__m512 tmp8014 = _mm512_sub_ps(in1216, in1217);
__m512 tmp8034 = _mm512_sub_ps(in1224, in1225);
__m512 tmp8013 = _mm512_sub_ps(in1214, in1215);
__m512 tmp8033 = _mm512_sub_ps(in1222, in1223);
__m512 tmp8010 = _mm512_add_ps(in1218, in1219);
__m512 tmp8030 = _mm512_add_ps(in1226, in1227);
__m512 tmp8015 = _mm512_sub_ps(in1218, in1219);
__m512 tmp8035 = _mm512_sub_ps(in1226, in1227);
__m512 tmp8012 = _mm512_fmadd_ps(tmp8014, _mm512_set1_ps(2e+00f), tmp8013);
__m512 tmp8032 = _mm512_fmadd_ps(tmp8034, _mm512_set1_ps(2e+00f), tmp8033);
__m512 tmp8019 = _mm512_fmadd_ps(tmp8014, _mm512_set1_ps(8e+00f), tmp8013);
__m512 tmp8039 = _mm512_fmadd_ps(tmp8034, _mm512_set1_ps(8e+00f), tmp8033);
__m512 tmp8007 = _mm512_add_ps(tmp8008, tmp8009);
__m512 tmp8027 = _mm512_add_ps(tmp8028, tmp8029);
__m512 tmp8011 = _mm512_fmadd_ps(tmp8015, _mm512_set1_ps(1.6e+01f), tmp8012);
__m512 tmp8031 = _mm512_fmadd_ps(tmp8035, _mm512_set1_ps(1.6e+01f), tmp8032);
__m512 tmp8018 = _mm512_fmadd_ps(tmp8015, _mm512_set1_ps(4e+00f), tmp8019);
__m512 tmp8038 = _mm512_fmadd_ps(tmp8035, _mm512_set1_ps(4e+00f), tmp8039);
__m512 tmp8024 = _mm512_add_ps(tmp8015, tmp8013);
__m512 tmp8044 = _mm512_add_ps(tmp8035, tmp8033);
__m512 tmp8017 = _mm512_fmadd_ps(tmp8008, _mm512_set1_ps(4e+00f), tmp8009);
__m512 tmp8037 = _mm512_fmadd_ps(tmp8028, _mm512_set1_ps(4e+00f), tmp8029);
__m512 tmp8021 = _mm512_fmadd_ps(tmp8008, _mm512_set1_ps(1.6e+01f), tmp8009);
__m512 tmp8041 = _mm512_fmadd_ps(tmp8028, _mm512_set1_ps(1.6e+01f), tmp8029);
__m512 tmp8006 = _mm512_add_ps(tmp8007, in1213);
__m512 tmp8026 = _mm512_add_ps(tmp8027, in1221);
__m512 tmp8023 = _mm512_add_ps(tmp8024, in1220);
__m512 tmp8043 = _mm512_add_ps(tmp8044, in1228);
__m512 tmp8005 = _mm512_fmadd_ps(tmp8010, _mm512_set1_ps(3.2e+01f), tmp8006);
__m512 tmp8025 = _mm512_fmadd_ps(tmp8030, _mm512_set1_ps(3.2e+01f), tmp8026);
__m512 tmp8016 = _mm512_fmadd_ps(tmp8010, _mm512_set1_ps(8e+00f), tmp8017);
__m512 tmp8036 = _mm512_fmadd_ps(tmp8030, _mm512_set1_ps(8e+00f), tmp8037);
__m512 tmp8022 = _mm512_fmadd_ps(tmp8014, _mm512_set1_ps(3.2e+01f), tmp8023);
__m512 tmp8042 = _mm512_fmadd_ps(tmp8034, _mm512_set1_ps(3.2e+01f), tmp8043);
__m512 tmp8020 = _mm512_fmadd_ps(tmp8010, _mm512_set1_ps(2e+00f), tmp8021);
__m512 tmp8040 = _mm512_fmadd_ps(tmp8030, _mm512_set1_ps(2e+00f), tmp8041);
// Collect the results under consecutive names: tmp7993..tmp7998 hold y0..y5
// of the first set, tmp7999..tmp8004 hold y0..y5 of the second set. Several
// of these names are reassigned by the shuffle sequence that follows.
__m512 tmp7993 = tmp8005;
__m512 tmp7999 = tmp8025;
__m512 tmp7994 = tmp8011;
__m512 tmp8000 = tmp8031;
__m512 tmp7995 = tmp8016;
__m512 tmp8001 = tmp8036;
__m512 tmp7996 = tmp8018;
__m512 tmp8002 = tmp8038;
__m512 tmp7997 = tmp8020;
__m512 tmp8003 = tmp8040;
__m512 tmp7998 = tmp8022;
__m512 tmp8004 = tmp8042;
// Transpose phase. Treat tmp7993..tmp8004 as a matrix of 12 rows by 16 lanes
// and regroup it by lane index ("column") in three shuffle stages.
// Stage 1: interleave adjacent row pairs within every 128-bit lane
// (unpacklo takes elements 0,1 of each lane, unpackhi takes elements 2,3).
__m512 tmp8089 = _mm512_unpacklo_ps(tmp7993, tmp7994);
__m512 tmp8090 = _mm512_unpackhi_ps(tmp7993, tmp7994);
__m512 tmp8091 = _mm512_unpacklo_ps(tmp7995, tmp7996);
__m512 tmp8092 = _mm512_unpackhi_ps(tmp7995, tmp7996);
__m512 tmp8093 = _mm512_unpacklo_ps(tmp7997, tmp7998);
__m512 tmp8094 = _mm512_unpackhi_ps(tmp7997, tmp7998);
__m512 tmp8095 = _mm512_unpacklo_ps(tmp7999, tmp8000);
__m512 tmp8096 = _mm512_unpackhi_ps(tmp7999, tmp8000);
__m512 tmp8097 = _mm512_unpacklo_ps(tmp8001, tmp8002);
__m512 tmp8098 = _mm512_unpackhi_ps(tmp8001, tmp8002);
__m512 tmp8099 = _mm512_unpacklo_ps(tmp8003, tmp8004);
__m512 tmp8100 = _mm512_unpackhi_ps(tmp8003, tmp8004);
// Stage 2: _mm512_shuffle_ps with immediate 68 keeps the low 64 bits of each
// operand's 128-bit lane, 238 keeps the high 64 bits. Each result holds one
// in-lane element position for four consecutive rows (rows 0-3, 4-7 or 8-11).
__m512 tmp8101 = _mm512_shuffle_ps(tmp8089, tmp8091, 68);
__m512 tmp8102 = _mm512_shuffle_ps(tmp8089, tmp8091, 238);
__m512 tmp8103 = _mm512_shuffle_ps(tmp8090, tmp8092, 68);
__m512 tmp8104 = _mm512_shuffle_ps(tmp8090, tmp8092, 238);
__m512 tmp8105 = _mm512_shuffle_ps(tmp8093, tmp8095, 68);
__m512 tmp8106 = _mm512_shuffle_ps(tmp8093, tmp8095, 238);
__m512 tmp8107 = _mm512_shuffle_ps(tmp8094, tmp8096, 68);
__m512 tmp8108 = _mm512_shuffle_ps(tmp8094, tmp8096, 238);
__m512 tmp8109 = _mm512_shuffle_ps(tmp8097, tmp8099, 68);
__m512 tmp8110 = _mm512_shuffle_ps(tmp8097, tmp8099, 238);
__m512 tmp8111 = _mm512_shuffle_ps(tmp8098, tmp8100, 68);
__m512 tmp8112 = _mm512_shuffle_ps(tmp8098, tmp8100, 238);
// Stage 3a: _mm512_shuffle_f32x4 with immediate 136 picks 128-bit lanes 0 and
// 2 of each operand, 221 picks lanes 1 and 3. Rows 0-3 are paired with rows
// 4-7 here.
__m512 tmp8113 = _mm512_shuffle_f32x4(tmp8101, tmp8105, 136);
__m512 tmp8114 = _mm512_shuffle_f32x4(tmp8101, tmp8105, 221);
__m512 tmp8115 = _mm512_shuffle_f32x4(tmp8102, tmp8106, 136);
__m512 tmp8116 = _mm512_shuffle_f32x4(tmp8102, tmp8106, 221);
__m512 tmp8117 = _mm512_shuffle_f32x4(tmp8103, tmp8107, 136);
__m512 tmp8118 = _mm512_shuffle_f32x4(tmp8103, tmp8107, 221);
__m512 tmp8119 = _mm512_shuffle_f32x4(tmp8104, tmp8108, 136);
__m512 tmp8120 = _mm512_shuffle_f32x4(tmp8104, tmp8108, 221);
// Only twelve rows exist, so the rows 8-11 vectors are passed as both
// operands; the upper half of each result duplicates the lower half.
__m512 tmp8121 = _mm512_shuffle_f32x4(tmp8109, tmp8109, 136);
__m512 tmp8122 = _mm512_shuffle_f32x4(tmp8109, tmp8109, 221);
__m512 tmp8123 = _mm512_shuffle_f32x4(tmp8110, tmp8110, 136);
__m512 tmp8124 = _mm512_shuffle_f32x4(tmp8110, tmp8110, 221);
__m512 tmp8125 = _mm512_shuffle_f32x4(tmp8111, tmp8111, 136);
__m512 tmp8126 = _mm512_shuffle_f32x4(tmp8111, tmp8111, 221);
__m512 tmp8127 = _mm512_shuffle_f32x4(tmp8112, tmp8112, 136);
__m512 tmp8128 = _mm512_shuffle_f32x4(tmp8112, tmp8112, 221);
// Stage 3b: final lane selection. Afterwards lane r (r = 0..11) of the vector
// for column c holds element c of original row r, and lanes 12..15 repeat
// lanes 8..11. Columns 0..7 land in tmp7993..tmp8000, columns 8..11 in
// tmp8001..tmp8004 and columns 12..15 in the new names tmp8045..tmp8048.
tmp7993 = _mm512_shuffle_f32x4(tmp8113, tmp8121, 136);
tmp8001 = _mm512_shuffle_f32x4(tmp8113, tmp8121, 221);
tmp7994 = _mm512_shuffle_f32x4(tmp8115, tmp8123, 136);
tmp8002 = _mm512_shuffle_f32x4(tmp8115, tmp8123, 221);
tmp7995 = _mm512_shuffle_f32x4(tmp8117, tmp8125, 136);
tmp8003 = _mm512_shuffle_f32x4(tmp8117, tmp8125, 221);
tmp7996 = _mm512_shuffle_f32x4(tmp8119, tmp8127, 136);
tmp8004 = _mm512_shuffle_f32x4(tmp8119, tmp8127, 221);
tmp7997 = _mm512_shuffle_f32x4(tmp8114, tmp8122, 136);
__m512 tmp8045 = _mm512_shuffle_f32x4(tmp8114, tmp8122, 221);
tmp7998 = _mm512_shuffle_f32x4(tmp8116, tmp8124, 136);
__m512 tmp8046 = _mm512_shuffle_f32x4(tmp8116, tmp8124, 221);
tmp7999 = _mm512_shuffle_f32x4(tmp8118, tmp8126, 136);
__m512 tmp8047 = _mm512_shuffle_f32x4(tmp8118, tmp8126, 221);
tmp8000 = _mm512_shuffle_f32x4(tmp8120, tmp8128, 136);
__m512 tmp8048 = _mm512_shuffle_f32x4(tmp8120, tmp8128, 221);
// Second pass: the same 8-input -> 6-output lane-wise combination, applied to
// x0..x7 = tmp7993..tmp8000 and, interleaved line by line, to
// x0..x7 = tmp8001, tmp8002, tmp8003, tmp8004, tmp8045, tmp8046, tmp8047,
// tmp8048. Algebraically (evaluated with fused multiply-adds in the order
// written below):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +     (x5-x6) + x7
// y0..y5 are tmp8049, tmp8055, tmp8060, tmp8062, tmp8064, tmp8066 for the
// first set and tmp8069, tmp8075, tmp8080, tmp8082, tmp8084, tmp8086 for the
// second.
__m512 tmp8053 = _mm512_add_ps(tmp7994, tmp7995);
__m512 tmp8073 = _mm512_add_ps(tmp8002, tmp8003);
__m512 tmp8052 = _mm512_add_ps(tmp7996, tmp7997);
__m512 tmp8072 = _mm512_add_ps(tmp8004, tmp8045);
__m512 tmp8058 = _mm512_sub_ps(tmp7996, tmp7997);
__m512 tmp8078 = _mm512_sub_ps(tmp8004, tmp8045);
__m512 tmp8057 = _mm512_sub_ps(tmp7994, tmp7995);
__m512 tmp8077 = _mm512_sub_ps(tmp8002, tmp8003);
__m512 tmp8054 = _mm512_add_ps(tmp7998, tmp7999);
__m512 tmp8074 = _mm512_add_ps(tmp8046, tmp8047);
__m512 tmp8059 = _mm512_sub_ps(tmp7998, tmp7999);
__m512 tmp8079 = _mm512_sub_ps(tmp8046, tmp8047);
__m512 tmp8056 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(2e+00f), tmp8057);
__m512 tmp8076 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(2e+00f), tmp8077);
__m512 tmp8063 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(8e+00f), tmp8057);
__m512 tmp8083 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(8e+00f), tmp8077);
__m512 tmp8051 = _mm512_add_ps(tmp8052, tmp8053);
__m512 tmp8071 = _mm512_add_ps(tmp8072, tmp8073);
__m512 tmp8055 = _mm512_fmadd_ps(tmp8059, _mm512_set1_ps(1.6e+01f), tmp8056);
__m512 tmp8075 = _mm512_fmadd_ps(tmp8079, _mm512_set1_ps(1.6e+01f), tmp8076);
__m512 tmp8062 = _mm512_fmadd_ps(tmp8059, _mm512_set1_ps(4e+00f), tmp8063);
__m512 tmp8082 = _mm512_fmadd_ps(tmp8079, _mm512_set1_ps(4e+00f), tmp8083);
__m512 tmp8068 = _mm512_add_ps(tmp8059, tmp8057);
__m512 tmp8088 = _mm512_add_ps(tmp8079, tmp8077);
__m512 tmp8061 = _mm512_fmadd_ps(tmp8052, _mm512_set1_ps(4e+00f), tmp8053);
__m512 tmp8081 = _mm512_fmadd_ps(tmp8072, _mm512_set1_ps(4e+00f), tmp8073);
__m512 tmp8065 = _mm512_fmadd_ps(tmp8052, _mm512_set1_ps(1.6e+01f), tmp8053);
__m512 tmp8085 = _mm512_fmadd_ps(tmp8072, _mm512_set1_ps(1.6e+01f), tmp8073);
__m512 tmp8050 = _mm512_add_ps(tmp8051, tmp7993);
__m512 tmp8070 = _mm512_add_ps(tmp8071, tmp8001);
__m512 tmp8067 = _mm512_add_ps(tmp8068, tmp8000);
__m512 tmp8087 = _mm512_add_ps(tmp8088, tmp8048);
__m512 tmp8049 = _mm512_fmadd_ps(tmp8054, _mm512_set1_ps(3.2e+01f), tmp8050);
__m512 tmp8069 = _mm512_fmadd_ps(tmp8074, _mm512_set1_ps(3.2e+01f), tmp8070);
__m512 tmp8060 = _mm512_fmadd_ps(tmp8054, _mm512_set1_ps(8e+00f), tmp8061);
__m512 tmp8080 = _mm512_fmadd_ps(tmp8074, _mm512_set1_ps(8e+00f), tmp8081);
__m512 tmp8066 = _mm512_fmadd_ps(tmp8058, _mm512_set1_ps(3.2e+01f), tmp8067);
__m512 tmp8086 = _mm512_fmadd_ps(tmp8078, _mm512_set1_ps(3.2e+01f), tmp8087);
__m512 tmp8064 = _mm512_fmadd_ps(tmp8054, _mm512_set1_ps(2e+00f), tmp8065);
__m512 tmp8084 = _mm512_fmadd_ps(tmp8074, _mm512_set1_ps(2e+00f), tmp8085);
// Alias the results as out1179..out1184 (first set) and out1185..out1190
// (second set).
__m512 out1179 = tmp8049;
__m512 out1185 = tmp8069;
__m512 out1180 = tmp8055;
__m512 out1186 = tmp8075;
__m512 out1181 = tmp8060;
__m512 out1187 = tmp8080;
__m512 out1182 = tmp8062;
__m512 out1188 = tmp8082;
__m512 out1183 = tmp8064;
__m512 out1189 = tmp8084;
__m512 out1184 = tmp8066;
__m512 out1190 = tmp8086;
// ReLU: take the per-lane maximum with zero so no stored lane is negative.
out1179 = _mm512_max_ps(_mm512_setzero_ps(), out1179);
out1185 = _mm512_max_ps(_mm512_setzero_ps(), out1185);
out1180 = _mm512_max_ps(_mm512_setzero_ps(), out1180);
out1186 = _mm512_max_ps(_mm512_setzero_ps(), out1186);
out1181 = _mm512_max_ps(_mm512_setzero_ps(), out1181);
out1187 = _mm512_max_ps(_mm512_setzero_ps(), out1187);
out1182 = _mm512_max_ps(_mm512_setzero_ps(), out1182);
out1188 = _mm512_max_ps(_mm512_setzero_ps(), out1188);
out1183 = _mm512_max_ps(_mm512_setzero_ps(), out1183);
out1189 = _mm512_max_ps(_mm512_setzero_ps(), out1189);
out1184 = _mm512_max_ps(_mm512_setzero_ps(), out1184);
out1190 = _mm512_max_ps(_mm512_setzero_ps(), out1190);
// Masked unaligned stores: mask 4095 (0xFFF) writes lanes 0..11 only.
// Within each set the six outputs are written 224 offset units apart
// (0, 224, ... for the first set; 48, 272, ... for the second, i.e. the
// second set starts 48 units after the first).
// NOTE(review): datPtr13 is declared outside this excerpt; the offsets are
// presumably bytes (224 bytes = 56 floats, 48 bytes = 12 floats) -- confirm.
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1179);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1185);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1180);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1186);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1181);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1187);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1182);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1188);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1183);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1189);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1184);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1190);
// Load phase: read sixteen 512-bit vectors from sfPtr7, arranged as four
// groups whose base offsets are 25600 units apart (256, 25856, 51456, 77056),
// with four vectors per group at +0, +64, +128 and +192 from the group base.
// Each pair of vectors 128 units apart is recombined with
// _mm512_shuffle_f32x4: immediate 68 concatenates the low 256 bits of both
// operands, immediate 238 concatenates their high 256 bits.
// The +0/+128 pairs yield in1229..in1236; the +64/+192 pairs yield
// in1237..in1244.
// NOTE(review): sfPtr7 is declared outside this excerpt; the 64-unit spacing
// between full-width loads suggests byte offsets -- confirm.
__m512 sf513 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf514 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1229 = _mm512_shuffle_f32x4(sf513, sf514, 68);
__m512 in1230 = _mm512_shuffle_f32x4(sf513, sf514, 238);
__m512 sf515 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf516 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1237 = _mm512_shuffle_f32x4(sf515, sf516, 68);
__m512 in1238 = _mm512_shuffle_f32x4(sf515, sf516, 238);
__m512 sf517 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf518 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1231 = _mm512_shuffle_f32x4(sf517, sf518, 68);
__m512 in1232 = _mm512_shuffle_f32x4(sf517, sf518, 238);
__m512 sf519 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf520 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1239 = _mm512_shuffle_f32x4(sf519, sf520, 68);
__m512 in1240 = _mm512_shuffle_f32x4(sf519, sf520, 238);
__m512 sf521 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf522 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1233 = _mm512_shuffle_f32x4(sf521, sf522, 68);
__m512 in1234 = _mm512_shuffle_f32x4(sf521, sf522, 238);
__m512 sf523 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf524 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1241 = _mm512_shuffle_f32x4(sf523, sf524, 68);
__m512 in1242 = _mm512_shuffle_f32x4(sf523, sf524, 238);
__m512 sf525 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf526 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1235 = _mm512_shuffle_f32x4(sf525, sf526, 68);
__m512 in1236 = _mm512_shuffle_f32x4(sf525, sf526, 238);
__m512 sf527 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf528 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1243 = _mm512_shuffle_f32x4(sf527, sf528, 68);
__m512 in1244 = _mm512_shuffle_f32x4(sf527, sf528, 238);
// First pass: one fixed 8-input -> 6-output linear combination, applied
// lane-wise and independently to x0..x7 = in1229..in1236 and to
// x0..x7 = in1237..in1244 (the two sets are interleaved line by line).
// Algebraically (the code evaluates these with fused multiply-adds in the
// order written below):
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +     (x5-x6) + x7
// y0..y5 are tmp8141, tmp8147, tmp8152, tmp8154, tmp8156, tmp8158 for the
// first set and tmp8161, tmp8167, tmp8172, tmp8174, tmp8176, tmp8178 for the
// second.
// NOTE(review): the 1/2/4/8/16/32 weights look like a Winograd-style output
// transform for a 3x3 convolution -- confirm against the generator.
__m512 tmp8145 = _mm512_add_ps(in1230, in1231);
__m512 tmp8165 = _mm512_add_ps(in1238, in1239);
__m512 tmp8144 = _mm512_add_ps(in1232, in1233);
__m512 tmp8164 = _mm512_add_ps(in1240, in1241);
__m512 tmp8150 = _mm512_sub_ps(in1232, in1233);
__m512 tmp8170 = _mm512_sub_ps(in1240, in1241);
__m512 tmp8149 = _mm512_sub_ps(in1230, in1231);
__m512 tmp8169 = _mm512_sub_ps(in1238, in1239);
__m512 tmp8146 = _mm512_add_ps(in1234, in1235);
__m512 tmp8166 = _mm512_add_ps(in1242, in1243);
__m512 tmp8151 = _mm512_sub_ps(in1234, in1235);
__m512 tmp8171 = _mm512_sub_ps(in1242, in1243);
__m512 tmp8148 = _mm512_fmadd_ps(tmp8150, _mm512_set1_ps(2e+00f), tmp8149);
__m512 tmp8168 = _mm512_fmadd_ps(tmp8170, _mm512_set1_ps(2e+00f), tmp8169);
__m512 tmp8155 = _mm512_fmadd_ps(tmp8150, _mm512_set1_ps(8e+00f), tmp8149);
__m512 tmp8175 = _mm512_fmadd_ps(tmp8170, _mm512_set1_ps(8e+00f), tmp8169);
__m512 tmp8143 = _mm512_add_ps(tmp8144, tmp8145);
__m512 tmp8163 = _mm512_add_ps(tmp8164, tmp8165);
__m512 tmp8147 = _mm512_fmadd_ps(tmp8151, _mm512_set1_ps(1.6e+01f), tmp8148);
__m512 tmp8167 = _mm512_fmadd_ps(tmp8171, _mm512_set1_ps(1.6e+01f), tmp8168);
__m512 tmp8154 = _mm512_fmadd_ps(tmp8151, _mm512_set1_ps(4e+00f), tmp8155);
__m512 tmp8174 = _mm512_fmadd_ps(tmp8171, _mm512_set1_ps(4e+00f), tmp8175);
__m512 tmp8160 = _mm512_add_ps(tmp8151, tmp8149);
__m512 tmp8180 = _mm512_add_ps(tmp8171, tmp8169);
__m512 tmp8153 = _mm512_fmadd_ps(tmp8144, _mm512_set1_ps(4e+00f), tmp8145);
__m512 tmp8173 = _mm512_fmadd_ps(tmp8164, _mm512_set1_ps(4e+00f), tmp8165);
__m512 tmp8157 = _mm512_fmadd_ps(tmp8144, _mm512_set1_ps(1.6e+01f), tmp8145);
__m512 tmp8177 = _mm512_fmadd_ps(tmp8164, _mm512_set1_ps(1.6e+01f), tmp8165);
__m512 tmp8142 = _mm512_add_ps(tmp8143, in1229);
__m512 tmp8162 = _mm512_add_ps(tmp8163, in1237);
__m512 tmp8159 = _mm512_add_ps(tmp8160, in1236);
__m512 tmp8179 = _mm512_add_ps(tmp8180, in1244);
__m512 tmp8141 = _mm512_fmadd_ps(tmp8146, _mm512_set1_ps(3.2e+01f), tmp8142);
__m512 tmp8161 = _mm512_fmadd_ps(tmp8166, _mm512_set1_ps(3.2e+01f), tmp8162);
__m512 tmp8152 = _mm512_fmadd_ps(tmp8146, _mm512_set1_ps(8e+00f), tmp8153);
__m512 tmp8172 = _mm512_fmadd_ps(tmp8166, _mm512_set1_ps(8e+00f), tmp8173);
__m512 tmp8158 = _mm512_fmadd_ps(tmp8150, _mm512_set1_ps(3.2e+01f), tmp8159);
__m512 tmp8178 = _mm512_fmadd_ps(tmp8170, _mm512_set1_ps(3.2e+01f), tmp8179);
__m512 tmp8156 = _mm512_fmadd_ps(tmp8146, _mm512_set1_ps(2e+00f), tmp8157);
__m512 tmp8176 = _mm512_fmadd_ps(tmp8166, _mm512_set1_ps(2e+00f), tmp8177);
// Collect the results under consecutive names: tmp8129..tmp8134 hold y0..y5
// of the first set, tmp8135..tmp8140 hold y0..y5 of the second set. They feed
// the unpack/shuffle sequence that follows.
__m512 tmp8129 = tmp8141;
__m512 tmp8135 = tmp8161;
__m512 tmp8130 = tmp8147;
__m512 tmp8136 = tmp8167;
__m512 tmp8131 = tmp8152;
__m512 tmp8137 = tmp8172;
__m512 tmp8132 = tmp8154;
__m512 tmp8138 = tmp8174;
__m512 tmp8133 = tmp8156;
__m512 tmp8139 = tmp8176;
__m512 tmp8134 = tmp8158;
__m512 tmp8140 = tmp8178;
__m512 tmp8225 = _mm512_unpacklo_ps(tmp8129, tmp8130);
__m512 tmp8226 = _mm512_unpackhi_ps(tmp8129, tmp8130);
__m512 tmp8227 = _mm512_unpacklo_ps(tmp8131, tmp8132);
__m512 tmp8228 = _mm512_unpackhi_ps(tmp8131, tmp8132);
__m512 tmp8229 = _mm512_unpacklo_ps(tmp8133, tmp8134);
__m512 tmp8230 = _mm512_unpackhi_ps(tmp8133, tmp8134);
__m512 tmp8231 = _mm512_unpacklo_ps(tmp8135, tmp8136);
__m512 tmp8232 = _mm512_unpackhi_ps(tmp8135, tmp8136);
__m512 tmp8233 = _mm512_unpacklo_ps(tmp8137, tmp8138);
__m512 tmp8234 = _mm512_unpackhi_ps(tmp8137, tmp8138);
__m512 tmp8235 = _mm512_unpacklo_ps(tmp8139, tmp8140);
__m512 tmp8236 = _mm512_unpackhi_ps(tmp8139, tmp8140);
__m512 tmp8237 = _mm512_shuffle_ps(tmp8225, tmp8227, 68);
__m512 tmp8238 = _mm512_shuffle_ps(tmp8225, tmp8227, 238);
__m512 tmp8239 = _mm512_shuffle_ps(tmp8226, tmp8228, 68);
__m512 tmp8240 = _mm512_shuffle_ps(tmp8226, tmp8228, 238);
__m512 tmp8241 = _mm512_shuffle_ps(tmp8229, tmp8231, 68);
__m512 tmp8242 = _mm512_shuffle_ps(tmp8229, tmp8231, 238);
__m512 tmp8243 = _mm512_shuffle_ps(tmp8230, tmp8232, 68);
__m512 tmp8244 = _mm512_shuffle_ps(tmp8230, tmp8232, 238);
__m512 tmp8245 = _mm512_shuffle_ps(tmp8233, tmp8235, 68);
__m512 tmp8246 = _mm512_shuffle_ps(tmp8233, tmp8235, 238);
__m512 tmp8247 = _mm512_shuffle_ps(tmp8234, tmp8236, 68);
__m512 tmp8248 = _mm512_shuffle_ps(tmp8234, tmp8236, 238);
__m512 tmp8249 = _mm512_shuffle_f32x4(tmp8237, tmp8241, 136);
__m512 tmp8250 = _mm512_shuffle_f32x4(tmp8237, tmp8241, 221);
__m512 tmp8251 = _mm512_shuffle_f32x4(tmp8238, tmp8242, 136);
__m512 tmp8252 = _mm512_shuffle_f32x4(tmp8238, tmp8242, 221);
__m512 tmp8253 = _mm512_shuffle_f32x4(tmp8239, tmp8243, 136);
__m512 tmp8254 = _mm512_shuffle_f32x4(tmp8239, tmp8243, 221);
__m512 tmp8255 = _mm512_shuffle_f32x4(tmp8240, tmp8244, 136);
__m512 tmp8256 = _mm512_shuffle_f32x4(tmp8240, tmp8244, 221);
__m512 tmp8257 = _mm512_shuffle_f32x4(tmp8245, tmp8245, 136);
__m512 tmp8258 = _mm512_shuffle_f32x4(tmp8245, tmp8245, 221);
__m512 tmp8259 = _mm512_shuffle_f32x4(tmp8246, tmp8246, 136);
__m512 tmp8260 = _mm512_shuffle_f32x4(tmp8246, tmp8246, 221);
__m512 tmp8261 = _mm512_shuffle_f32x4(tmp8247, tmp8247, 136);
__m512 tmp8262 = _mm512_shuffle_f32x4(tmp8247, tmp8247, 221);
__m512 tmp8263 = _mm512_shuffle_f32x4(tmp8248, tmp8248, 136);
__m512 tmp8264 = _mm512_shuffle_f32x4(tmp8248, tmp8248, 221);
tmp8129 = _mm512_shuffle_f32x4(tmp8249, tmp8257, 136);
tmp8137 = _mm512_shuffle_f32x4(tmp8249, tmp8257, 221);
tmp8130 = _mm512_shuffle_f32x4(tmp8251, tmp8259, 136);
tmp8138 = _mm512_shuffle_f32x4(tmp8251, tmp8259, 221);
tmp8131 = _mm512_shuffle_f32x4(tmp8253, tmp8261, 136);
tmp8139 = _mm512_shuffle_f32x4(tmp8253, tmp8261, 221);
tmp8132 = _mm512_shuffle_f32x4(tmp8255, tmp8263, 136);
tmp8140 = _mm512_shuffle_f32x4(tmp8255, tmp8263, 221);
tmp8133 = _mm512_shuffle_f32x4(tmp8250, tmp8258, 136);
__m512 tmp8181 = _mm512_shuffle_f32x4(tmp8250, tmp8258, 221);
tmp8134 = _mm512_shuffle_f32x4(tmp8252, tmp8260, 136);
__m512 tmp8182 = _mm512_shuffle_f32x4(tmp8252, tmp8260, 221);
tmp8135 = _mm512_shuffle_f32x4(tmp8254, tmp8262, 136);
__m512 tmp8183 = _mm512_shuffle_f32x4(tmp8254, tmp8262, 221);
tmp8136 = _mm512_shuffle_f32x4(tmp8256, tmp8264, 136);
__m512 tmp8184 = _mm512_shuffle_f32x4(tmp8256, tmp8264, 221);
__m512 tmp8189 = _mm512_add_ps(tmp8130, tmp8131);
__m512 tmp8209 = _mm512_add_ps(tmp8138, tmp8139);
__m512 tmp8188 = _mm512_add_ps(tmp8132, tmp8133);
__m512 tmp8208 = _mm512_add_ps(tmp8140, tmp8181);
__m512 tmp8194 = _mm512_sub_ps(tmp8132, tmp8133);
__m512 tmp8214 = _mm512_sub_ps(tmp8140, tmp8181);
__m512 tmp8193 = _mm512_sub_ps(tmp8130, tmp8131);
__m512 tmp8213 = _mm512_sub_ps(tmp8138, tmp8139);
__m512 tmp8190 = _mm512_add_ps(tmp8134, tmp8135);
__m512 tmp8210 = _mm512_add_ps(tmp8182, tmp8183);
__m512 tmp8195 = _mm512_sub_ps(tmp8134, tmp8135);
__m512 tmp8215 = _mm512_sub_ps(tmp8182, tmp8183);
__m512 tmp8192 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(2e+00f), tmp8193);
__m512 tmp8212 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(2e+00f), tmp8213);
__m512 tmp8199 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(8e+00f), tmp8193);
__m512 tmp8219 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(8e+00f), tmp8213);
__m512 tmp8187 = _mm512_add_ps(tmp8188, tmp8189);
__m512 tmp8207 = _mm512_add_ps(tmp8208, tmp8209);
__m512 tmp8191 = _mm512_fmadd_ps(tmp8195, _mm512_set1_ps(1.6e+01f), tmp8192);
__m512 tmp8211 = _mm512_fmadd_ps(tmp8215, _mm512_set1_ps(1.6e+01f), tmp8212);
__m512 tmp8198 = _mm512_fmadd_ps(tmp8195, _mm512_set1_ps(4e+00f), tmp8199);
__m512 tmp8218 = _mm512_fmadd_ps(tmp8215, _mm512_set1_ps(4e+00f), tmp8219);
__m512 tmp8204 = _mm512_add_ps(tmp8195, tmp8193);
__m512 tmp8224 = _mm512_add_ps(tmp8215, tmp8213);
__m512 tmp8197 = _mm512_fmadd_ps(tmp8188, _mm512_set1_ps(4e+00f), tmp8189);
__m512 tmp8217 = _mm512_fmadd_ps(tmp8208, _mm512_set1_ps(4e+00f), tmp8209);
__m512 tmp8201 = _mm512_fmadd_ps(tmp8188, _mm512_set1_ps(1.6e+01f), tmp8189);
__m512 tmp8221 = _mm512_fmadd_ps(tmp8208, _mm512_set1_ps(1.6e+01f), tmp8209);
__m512 tmp8186 = _mm512_add_ps(tmp8187, tmp8129);
__m512 tmp8206 = _mm512_add_ps(tmp8207, tmp8137);
__m512 tmp8203 = _mm512_add_ps(tmp8204, tmp8136);
__m512 tmp8223 = _mm512_add_ps(tmp8224, tmp8184);
__m512 tmp8185 = _mm512_fmadd_ps(tmp8190, _mm512_set1_ps(3.2e+01f), tmp8186);
__m512 tmp8205 = _mm512_fmadd_ps(tmp8210, _mm512_set1_ps(3.2e+01f), tmp8206);
__m512 tmp8196 = _mm512_fmadd_ps(tmp8190, _mm512_set1_ps(8e+00f), tmp8197);
__m512 tmp8216 = _mm512_fmadd_ps(tmp8210, _mm512_set1_ps(8e+00f), tmp8217);
__m512 tmp8202 = _mm512_fmadd_ps(tmp8194, _mm512_set1_ps(3.2e+01f), tmp8203);
__m512 tmp8222 = _mm512_fmadd_ps(tmp8214, _mm512_set1_ps(3.2e+01f), tmp8223);
__m512 tmp8200 = _mm512_fmadd_ps(tmp8190, _mm512_set1_ps(2e+00f), tmp8201);
__m512 tmp8220 = _mm512_fmadd_ps(tmp8210, _mm512_set1_ps(2e+00f), tmp8221);
__m512 out1191 = tmp8185;
__m512 out1197 = tmp8205;
__m512 out1192 = tmp8191;
__m512 out1198 = tmp8211;
__m512 out1193 = tmp8196;
__m512 out1199 = tmp8216;
__m512 out1194 = tmp8198;
__m512 out1200 = tmp8218;
__m512 out1195 = tmp8200;
__m512 out1201 = tmp8220;
__m512 out1196 = tmp8202;
__m512 out1202 = tmp8222;
out1191 = _mm512_max_ps(_mm512_setzero_ps(), out1191);
out1197 = _mm512_max_ps(_mm512_setzero_ps(), out1197);
out1192 = _mm512_max_ps(_mm512_setzero_ps(), out1192);
out1198 = _mm512_max_ps(_mm512_setzero_ps(), out1198);
out1193 = _mm512_max_ps(_mm512_setzero_ps(), out1193);
out1199 = _mm512_max_ps(_mm512_setzero_ps(), out1199);
out1194 = _mm512_max_ps(_mm512_setzero_ps(), out1194);
out1200 = _mm512_max_ps(_mm512_setzero_ps(), out1200);
out1195 = _mm512_max_ps(_mm512_setzero_ps(), out1195);
out1201 = _mm512_max_ps(_mm512_setzero_ps(), out1201);
out1196 = _mm512_max_ps(_mm512_setzero_ps(), out1196);
out1202 = _mm512_max_ps(_mm512_setzero_ps(), out1202);
_mm512_mask_storeu_ps(datPtr13+96+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1191);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1197);
_mm512_mask_storeu_ps(datPtr13+320+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1192);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1198);
_mm512_mask_storeu_ps(datPtr13+544+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1193);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1199);
_mm512_mask_storeu_ps(datPtr13+768+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1194);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1200);
_mm512_mask_storeu_ps(datPtr13+992+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1195);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1201);
_mm512_mask_storeu_ps(datPtr13+1216+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1196);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1202);
// ---- One tile pair: load 16 vectors from sfPtr7, reduce 8 -> 6 along each of
// two axes, clamp at zero, and store 6 rows x 2 column blocks to datPtr13. ----
// NOTE(review): the coefficient pattern below (powers of two applied to sums
// and differences of paired inputs, 8 inputs -> 6 outputs) looks like a
// Winograd F(6,3)-style output transform -- confirm against the generator.
//
// Load stage. Four source regions are read, their base offsets 25600 apart
// (512, 26112, 51712, 77312). From each region four vectors are loaded at
// +0, +128 and +64, +192. Immediate 68 of _mm512_shuffle_f32x4 takes 128-bit
// lanes 0-1 of both operands, 238 takes lanes 2-3, so every load pair yields
// two inputs. Left group: in1245..in1252 (called x0..x7 in the comments
// below). Right group: in1253..in1260, processed in lockstep with the left.
__m512 sf529 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf530 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1245 = _mm512_shuffle_f32x4(sf529, sf530, 68);
__m512 in1246 = _mm512_shuffle_f32x4(sf529, sf530, 238);
__m512 sf531 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf532 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1253 = _mm512_shuffle_f32x4(sf531, sf532, 68);
__m512 in1254 = _mm512_shuffle_f32x4(sf531, sf532, 238);
__m512 sf533 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf534 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1247 = _mm512_shuffle_f32x4(sf533, sf534, 68);
__m512 in1248 = _mm512_shuffle_f32x4(sf533, sf534, 238);
__m512 sf535 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf536 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1255 = _mm512_shuffle_f32x4(sf535, sf536, 68);
__m512 in1256 = _mm512_shuffle_f32x4(sf535, sf536, 238);
__m512 sf537 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf538 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1249 = _mm512_shuffle_f32x4(sf537, sf538, 68);
__m512 in1250 = _mm512_shuffle_f32x4(sf537, sf538, 238);
__m512 sf539 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf540 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1257 = _mm512_shuffle_f32x4(sf539, sf540, 68);
__m512 in1258 = _mm512_shuffle_f32x4(sf539, sf540, 238);
__m512 sf541 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf542 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1251 = _mm512_shuffle_f32x4(sf541, sf542, 68);
__m512 in1252 = _mm512_shuffle_f32x4(sf541, sf542, 238);
__m512 sf543 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k91+768*l30);
__m512 sf544 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k91+768*l30);
__m512 in1259 = _mm512_shuffle_f32x4(sf543, sf544, 68);
__m512 in1260 = _mm512_shuffle_f32x4(sf543, sf544, 238);
// First pass: 8 inputs -> 6 outputs per group. Statements for the left and
// right groups are interleaved; formulas are given for the left group.
// Pairwise sums and differences: (x1+x2), (x3+x4), (x3-x4), (x1-x2),
// (x5+x6), (x5-x6).
__m512 tmp8281 = _mm512_add_ps(in1246, in1247);
__m512 tmp8301 = _mm512_add_ps(in1254, in1255);
__m512 tmp8280 = _mm512_add_ps(in1248, in1249);
__m512 tmp8300 = _mm512_add_ps(in1256, in1257);
__m512 tmp8286 = _mm512_sub_ps(in1248, in1249);
__m512 tmp8306 = _mm512_sub_ps(in1256, in1257);
__m512 tmp8285 = _mm512_sub_ps(in1246, in1247);
__m512 tmp8305 = _mm512_sub_ps(in1254, in1255);
__m512 tmp8282 = _mm512_add_ps(in1250, in1251);
__m512 tmp8302 = _mm512_add_ps(in1258, in1259);
__m512 tmp8287 = _mm512_sub_ps(in1250, in1251);
__m512 tmp8307 = _mm512_sub_ps(in1258, in1259);
// Partial sums feeding y1 (weight 2) and y3 (weight 8) on (x3-x4).
__m512 tmp8284 = _mm512_fmadd_ps(tmp8286, _mm512_set1_ps(2e+00f), tmp8285);
__m512 tmp8304 = _mm512_fmadd_ps(tmp8306, _mm512_set1_ps(2e+00f), tmp8305);
__m512 tmp8291 = _mm512_fmadd_ps(tmp8286, _mm512_set1_ps(8e+00f), tmp8285);
__m512 tmp8311 = _mm512_fmadd_ps(tmp8306, _mm512_set1_ps(8e+00f), tmp8305);
__m512 tmp8279 = _mm512_add_ps(tmp8280, tmp8281);
__m512 tmp8299 = _mm512_add_ps(tmp8300, tmp8301);
// y1 = (x1-x2) + 2*(x3-x4) + 16*(x5-x6)
__m512 tmp8283 = _mm512_fmadd_ps(tmp8287, _mm512_set1_ps(1.6e+01f), tmp8284);
__m512 tmp8303 = _mm512_fmadd_ps(tmp8307, _mm512_set1_ps(1.6e+01f), tmp8304);
// y3 = (x1-x2) + 8*(x3-x4) + 4*(x5-x6)
__m512 tmp8290 = _mm512_fmadd_ps(tmp8287, _mm512_set1_ps(4e+00f), tmp8291);
__m512 tmp8310 = _mm512_fmadd_ps(tmp8307, _mm512_set1_ps(4e+00f), tmp8311);
__m512 tmp8296 = _mm512_add_ps(tmp8287, tmp8285);
__m512 tmp8316 = _mm512_add_ps(tmp8307, tmp8305);
__m512 tmp8289 = _mm512_fmadd_ps(tmp8280, _mm512_set1_ps(4e+00f), tmp8281);
__m512 tmp8309 = _mm512_fmadd_ps(tmp8300, _mm512_set1_ps(4e+00f), tmp8301);
__m512 tmp8293 = _mm512_fmadd_ps(tmp8280, _mm512_set1_ps(1.6e+01f), tmp8281);
__m512 tmp8313 = _mm512_fmadd_ps(tmp8300, _mm512_set1_ps(1.6e+01f), tmp8301);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8278 = _mm512_add_ps(tmp8279, in1245);
__m512 tmp8298 = _mm512_add_ps(tmp8299, in1253);
__m512 tmp8295 = _mm512_add_ps(tmp8296, in1252);
__m512 tmp8315 = _mm512_add_ps(tmp8316, in1260);
// y0 = x0 + (x1+x2) + (x3+x4) + 32*(x5+x6)
__m512 tmp8277 = _mm512_fmadd_ps(tmp8282, _mm512_set1_ps(3.2e+01f), tmp8278);
__m512 tmp8297 = _mm512_fmadd_ps(tmp8302, _mm512_set1_ps(3.2e+01f), tmp8298);
// y2 = (x1+x2) + 4*(x3+x4) + 8*(x5+x6)
__m512 tmp8288 = _mm512_fmadd_ps(tmp8282, _mm512_set1_ps(8e+00f), tmp8289);
__m512 tmp8308 = _mm512_fmadd_ps(tmp8302, _mm512_set1_ps(8e+00f), tmp8309);
// y5 = (x1-x2) + 32*(x3-x4) + (x5-x6) + x7
__m512 tmp8294 = _mm512_fmadd_ps(tmp8286, _mm512_set1_ps(3.2e+01f), tmp8295);
__m512 tmp8314 = _mm512_fmadd_ps(tmp8306, _mm512_set1_ps(3.2e+01f), tmp8315);
// y4 = (x1+x2) + 16*(x3+x4) + 2*(x5+x6)
__m512 tmp8292 = _mm512_fmadd_ps(tmp8282, _mm512_set1_ps(2e+00f), tmp8293);
__m512 tmp8312 = _mm512_fmadd_ps(tmp8302, _mm512_set1_ps(2e+00f), tmp8313);
// Collect the 12 first-pass results: tmp8265..tmp8270 = y0..y5 of the left
// group, tmp8271..tmp8276 = y0..y5 of the right group.
__m512 tmp8265 = tmp8277;
__m512 tmp8271 = tmp8297;
__m512 tmp8266 = tmp8283;
__m512 tmp8272 = tmp8303;
__m512 tmp8267 = tmp8288;
__m512 tmp8273 = tmp8308;
__m512 tmp8268 = tmp8290;
__m512 tmp8274 = tmp8310;
__m512 tmp8269 = tmp8292;
__m512 tmp8275 = tmp8312;
__m512 tmp8270 = tmp8294;
__m512 tmp8276 = tmp8314;
// Rearrangement stage: the 12 vectors are shuffled into 16 vectors
// (tmp8265..tmp8276 are overwritten and tmp8317..tmp8320 are new) so that
// the same 8 -> 6 combination can be applied again below.
// NOTE(review): this reads as a register transpose (32-bit interleave, then
// 64-bit pair selection, then 128-bit lane selection) -- confirm.
// Step 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp8361 = _mm512_unpacklo_ps(tmp8265, tmp8266);
__m512 tmp8362 = _mm512_unpackhi_ps(tmp8265, tmp8266);
__m512 tmp8363 = _mm512_unpacklo_ps(tmp8267, tmp8268);
__m512 tmp8364 = _mm512_unpackhi_ps(tmp8267, tmp8268);
__m512 tmp8365 = _mm512_unpacklo_ps(tmp8269, tmp8270);
__m512 tmp8366 = _mm512_unpackhi_ps(tmp8269, tmp8270);
__m512 tmp8367 = _mm512_unpacklo_ps(tmp8271, tmp8272);
__m512 tmp8368 = _mm512_unpackhi_ps(tmp8271, tmp8272);
__m512 tmp8369 = _mm512_unpacklo_ps(tmp8273, tmp8274);
__m512 tmp8370 = _mm512_unpackhi_ps(tmp8273, tmp8274);
__m512 tmp8371 = _mm512_unpacklo_ps(tmp8275, tmp8276);
__m512 tmp8372 = _mm512_unpackhi_ps(tmp8275, tmp8276);
// Step 2: within each 128-bit lane, immediate 68 keeps the low element pair
// of both operands and 238 keeps the high element pair.
__m512 tmp8373 = _mm512_shuffle_ps(tmp8361, tmp8363, 68);
__m512 tmp8374 = _mm512_shuffle_ps(tmp8361, tmp8363, 238);
__m512 tmp8375 = _mm512_shuffle_ps(tmp8362, tmp8364, 68);
__m512 tmp8376 = _mm512_shuffle_ps(tmp8362, tmp8364, 238);
__m512 tmp8377 = _mm512_shuffle_ps(tmp8365, tmp8367, 68);
__m512 tmp8378 = _mm512_shuffle_ps(tmp8365, tmp8367, 238);
__m512 tmp8379 = _mm512_shuffle_ps(tmp8366, tmp8368, 68);
__m512 tmp8380 = _mm512_shuffle_ps(tmp8366, tmp8368, 238);
__m512 tmp8381 = _mm512_shuffle_ps(tmp8369, tmp8371, 68);
__m512 tmp8382 = _mm512_shuffle_ps(tmp8369, tmp8371, 238);
__m512 tmp8383 = _mm512_shuffle_ps(tmp8370, tmp8372, 68);
__m512 tmp8384 = _mm512_shuffle_ps(tmp8370, tmp8372, 238);
// Step 3: 128-bit lane selection. Immediate 136 takes lanes 0 and 2 of each
// operand, 221 takes lanes 1 and 3.
__m512 tmp8385 = _mm512_shuffle_f32x4(tmp8373, tmp8377, 136);
__m512 tmp8386 = _mm512_shuffle_f32x4(tmp8373, tmp8377, 221);
__m512 tmp8387 = _mm512_shuffle_f32x4(tmp8374, tmp8378, 136);
__m512 tmp8388 = _mm512_shuffle_f32x4(tmp8374, tmp8378, 221);
__m512 tmp8389 = _mm512_shuffle_f32x4(tmp8375, tmp8379, 136);
__m512 tmp8390 = _mm512_shuffle_f32x4(tmp8375, tmp8379, 221);
__m512 tmp8391 = _mm512_shuffle_f32x4(tmp8376, tmp8380, 136);
__m512 tmp8392 = _mm512_shuffle_f32x4(tmp8376, tmp8380, 221);
// Only 12 vectors exist, so the last four inputs are shuffled with
// themselves as both operands.
__m512 tmp8393 = _mm512_shuffle_f32x4(tmp8381, tmp8381, 136);
__m512 tmp8394 = _mm512_shuffle_f32x4(tmp8381, tmp8381, 221);
__m512 tmp8395 = _mm512_shuffle_f32x4(tmp8382, tmp8382, 136);
__m512 tmp8396 = _mm512_shuffle_f32x4(tmp8382, tmp8382, 221);
__m512 tmp8397 = _mm512_shuffle_f32x4(tmp8383, tmp8383, 136);
__m512 tmp8398 = _mm512_shuffle_f32x4(tmp8383, tmp8383, 221);
__m512 tmp8399 = _mm512_shuffle_f32x4(tmp8384, tmp8384, 136);
__m512 tmp8400 = _mm512_shuffle_f32x4(tmp8384, tmp8384, 221);
// Step 4: final lane selection produces the 16 second-pass inputs.
// First set  (x0..x7): tmp8265..tmp8272.
// Second set (x0..x7): tmp8273..tmp8276 followed by tmp8317..tmp8320.
tmp8265 = _mm512_shuffle_f32x4(tmp8385, tmp8393, 136);
tmp8273 = _mm512_shuffle_f32x4(tmp8385, tmp8393, 221);
tmp8266 = _mm512_shuffle_f32x4(tmp8387, tmp8395, 136);
tmp8274 = _mm512_shuffle_f32x4(tmp8387, tmp8395, 221);
tmp8267 = _mm512_shuffle_f32x4(tmp8389, tmp8397, 136);
tmp8275 = _mm512_shuffle_f32x4(tmp8389, tmp8397, 221);
tmp8268 = _mm512_shuffle_f32x4(tmp8391, tmp8399, 136);
tmp8276 = _mm512_shuffle_f32x4(tmp8391, tmp8399, 221);
tmp8269 = _mm512_shuffle_f32x4(tmp8386, tmp8394, 136);
__m512 tmp8317 = _mm512_shuffle_f32x4(tmp8386, tmp8394, 221);
tmp8270 = _mm512_shuffle_f32x4(tmp8388, tmp8396, 136);
__m512 tmp8318 = _mm512_shuffle_f32x4(tmp8388, tmp8396, 221);
tmp8271 = _mm512_shuffle_f32x4(tmp8390, tmp8398, 136);
__m512 tmp8319 = _mm512_shuffle_f32x4(tmp8390, tmp8398, 221);
tmp8272 = _mm512_shuffle_f32x4(tmp8392, tmp8400, 136);
__m512 tmp8320 = _mm512_shuffle_f32x4(tmp8392, tmp8400, 221);
// Second pass: the same 8 -> 6 combination as the first pass (identical
// coefficients and statement order), applied to the rearranged vectors.
__m512 tmp8325 = _mm512_add_ps(tmp8266, tmp8267);
__m512 tmp8345 = _mm512_add_ps(tmp8274, tmp8275);
__m512 tmp8324 = _mm512_add_ps(tmp8268, tmp8269);
__m512 tmp8344 = _mm512_add_ps(tmp8276, tmp8317);
__m512 tmp8330 = _mm512_sub_ps(tmp8268, tmp8269);
__m512 tmp8350 = _mm512_sub_ps(tmp8276, tmp8317);
__m512 tmp8329 = _mm512_sub_ps(tmp8266, tmp8267);
__m512 tmp8349 = _mm512_sub_ps(tmp8274, tmp8275);
__m512 tmp8326 = _mm512_add_ps(tmp8270, tmp8271);
__m512 tmp8346 = _mm512_add_ps(tmp8318, tmp8319);
__m512 tmp8331 = _mm512_sub_ps(tmp8270, tmp8271);
__m512 tmp8351 = _mm512_sub_ps(tmp8318, tmp8319);
__m512 tmp8328 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(2e+00f), tmp8329);
__m512 tmp8348 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(2e+00f), tmp8349);
__m512 tmp8335 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(8e+00f), tmp8329);
__m512 tmp8355 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(8e+00f), tmp8349);
__m512 tmp8323 = _mm512_add_ps(tmp8324, tmp8325);
__m512 tmp8343 = _mm512_add_ps(tmp8344, tmp8345);
__m512 tmp8327 = _mm512_fmadd_ps(tmp8331, _mm512_set1_ps(1.6e+01f), tmp8328);
__m512 tmp8347 = _mm512_fmadd_ps(tmp8351, _mm512_set1_ps(1.6e+01f), tmp8348);
__m512 tmp8334 = _mm512_fmadd_ps(tmp8331, _mm512_set1_ps(4e+00f), tmp8335);
__m512 tmp8354 = _mm512_fmadd_ps(tmp8351, _mm512_set1_ps(4e+00f), tmp8355);
__m512 tmp8340 = _mm512_add_ps(tmp8331, tmp8329);
__m512 tmp8360 = _mm512_add_ps(tmp8351, tmp8349);
__m512 tmp8333 = _mm512_fmadd_ps(tmp8324, _mm512_set1_ps(4e+00f), tmp8325);
__m512 tmp8353 = _mm512_fmadd_ps(tmp8344, _mm512_set1_ps(4e+00f), tmp8345);
__m512 tmp8337 = _mm512_fmadd_ps(tmp8324, _mm512_set1_ps(1.6e+01f), tmp8325);
__m512 tmp8357 = _mm512_fmadd_ps(tmp8344, _mm512_set1_ps(1.6e+01f), tmp8345);
__m512 tmp8322 = _mm512_add_ps(tmp8323, tmp8265);
__m512 tmp8342 = _mm512_add_ps(tmp8343, tmp8273);
__m512 tmp8339 = _mm512_add_ps(tmp8340, tmp8272);
__m512 tmp8359 = _mm512_add_ps(tmp8360, tmp8320);
__m512 tmp8321 = _mm512_fmadd_ps(tmp8326, _mm512_set1_ps(3.2e+01f), tmp8322);
__m512 tmp8341 = _mm512_fmadd_ps(tmp8346, _mm512_set1_ps(3.2e+01f), tmp8342);
__m512 tmp8332 = _mm512_fmadd_ps(tmp8326, _mm512_set1_ps(8e+00f), tmp8333);
__m512 tmp8352 = _mm512_fmadd_ps(tmp8346, _mm512_set1_ps(8e+00f), tmp8353);
__m512 tmp8338 = _mm512_fmadd_ps(tmp8330, _mm512_set1_ps(3.2e+01f), tmp8339);
__m512 tmp8358 = _mm512_fmadd_ps(tmp8350, _mm512_set1_ps(3.2e+01f), tmp8359);
__m512 tmp8336 = _mm512_fmadd_ps(tmp8326, _mm512_set1_ps(2e+00f), tmp8337);
__m512 tmp8356 = _mm512_fmadd_ps(tmp8346, _mm512_set1_ps(2e+00f), tmp8357);
// Final results: out1203..out1208 = y0..y5 from the first set,
// out1209..out1214 = y0..y5 from the second set.
__m512 out1203 = tmp8321;
__m512 out1209 = tmp8341;
__m512 out1204 = tmp8327;
__m512 out1210 = tmp8347;
__m512 out1205 = tmp8332;
__m512 out1211 = tmp8352;
__m512 out1206 = tmp8334;
__m512 out1212 = tmp8354;
__m512 out1207 = tmp8336;
__m512 out1213 = tmp8356;
__m512 out1208 = tmp8338;
__m512 out1214 = tmp8358;
// Clamp every lane to max(0, value), i.e. a ReLU-style activation.
out1203 = _mm512_max_ps(_mm512_setzero_ps(), out1203);
out1209 = _mm512_max_ps(_mm512_setzero_ps(), out1209);
out1204 = _mm512_max_ps(_mm512_setzero_ps(), out1204);
out1210 = _mm512_max_ps(_mm512_setzero_ps(), out1210);
out1205 = _mm512_max_ps(_mm512_setzero_ps(), out1205);
out1211 = _mm512_max_ps(_mm512_setzero_ps(), out1211);
out1206 = _mm512_max_ps(_mm512_setzero_ps(), out1206);
out1212 = _mm512_max_ps(_mm512_setzero_ps(), out1212);
out1207 = _mm512_max_ps(_mm512_setzero_ps(), out1207);
out1213 = _mm512_max_ps(_mm512_setzero_ps(), out1213);
out1208 = _mm512_max_ps(_mm512_setzero_ps(), out1208);
out1214 = _mm512_max_ps(_mm512_setzero_ps(), out1214);
// Store stage. Mask 4095 (0x0FFF) writes only the low 12 of the 16 float
// lanes. Consecutive result rows are 224 apart in the constant offset
// (12656, 12880, 13104, 13328, 13552, 13776) and the second set is written
// 48 past the first in every row.
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1203);
_mm512_mask_storeu_ps(datPtr13+12704+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1209);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1204);
_mm512_mask_storeu_ps(datPtr13+12928+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1210);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1205);
_mm512_mask_storeu_ps(datPtr13+13152+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1211);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1206);
_mm512_mask_storeu_ps(datPtr13+13376+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1212);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1207);
_mm512_mask_storeu_ps(datPtr13+13600+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1213);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1208);
_mm512_mask_storeu_ps(datPtr13+13824+50432*i29+224*toH31+4*toW31+50432*k91+25216*l30, 4095, out1214);
}
}
++j23;
rel17 = 1;
}
ptrdiff_t toH32 = base17+0;
ptrdiff_t toW32 = 48;
ptrdiff_t k92 = 1*w46;
for (; k92 != 1; ++k92) {
ptrdiff_t l31 = 0;
for (; l31 != 2; ++l31) {
__m512 sf545 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf546 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1261 = _mm512_shuffle_f32x4(sf545, sf546, 68);
__m512 in1262 = _mm512_shuffle_f32x4(sf545, sf546, 238);
__m512 sf547 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf548 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1269 = _mm512_shuffle_f32x4(sf547, sf548, 68);
__m512 in1270 = _mm512_shuffle_f32x4(sf547, sf548, 238);
__m512 sf549 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf550 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1263 = _mm512_shuffle_f32x4(sf549, sf550, 68);
__m512 in1264 = _mm512_shuffle_f32x4(sf549, sf550, 238);
__m512 sf551 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf552 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1271 = _mm512_shuffle_f32x4(sf551, sf552, 68);
__m512 in1272 = _mm512_shuffle_f32x4(sf551, sf552, 238);
__m512 sf553 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf554 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1265 = _mm512_shuffle_f32x4(sf553, sf554, 68);
__m512 in1266 = _mm512_shuffle_f32x4(sf553, sf554, 238);
__m512 sf555 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf556 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1273 = _mm512_shuffle_f32x4(sf555, sf556, 68);
__m512 in1274 = _mm512_shuffle_f32x4(sf555, sf556, 238);
__m512 sf557 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf558 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1267 = _mm512_shuffle_f32x4(sf557, sf558, 68);
__m512 in1268 = _mm512_shuffle_f32x4(sf557, sf558, 238);
__m512 sf559 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf560 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1275 = _mm512_shuffle_f32x4(sf559, sf560, 68);
__m512 in1276 = _mm512_shuffle_f32x4(sf559, sf560, 238);
__m512 tmp8417 = _mm512_add_ps(in1262, in1263);
__m512 tmp8437 = _mm512_add_ps(in1270, in1271);
__m512 tmp8416 = _mm512_add_ps(in1264, in1265);
__m512 tmp8436 = _mm512_add_ps(in1272, in1273);
__m512 tmp8422 = _mm512_sub_ps(in1264, in1265);
__m512 tmp8442 = _mm512_sub_ps(in1272, in1273);
__m512 tmp8421 = _mm512_sub_ps(in1262, in1263);
__m512 tmp8441 = _mm512_sub_ps(in1270, in1271);
__m512 tmp8418 = _mm512_add_ps(in1266, in1267);
__m512 tmp8438 = _mm512_add_ps(in1274, in1275);
__m512 tmp8423 = _mm512_sub_ps(in1266, in1267);
__m512 tmp8443 = _mm512_sub_ps(in1274, in1275);
__m512 tmp8420 = _mm512_fmadd_ps(tmp8422, _mm512_set1_ps(2e+00f), tmp8421);
__m512 tmp8440 = _mm512_fmadd_ps(tmp8442, _mm512_set1_ps(2e+00f), tmp8441);
__m512 tmp8427 = _mm512_fmadd_ps(tmp8422, _mm512_set1_ps(8e+00f), tmp8421);
__m512 tmp8447 = _mm512_fmadd_ps(tmp8442, _mm512_set1_ps(8e+00f), tmp8441);
__m512 tmp8415 = _mm512_add_ps(tmp8416, tmp8417);
__m512 tmp8435 = _mm512_add_ps(tmp8436, tmp8437);
__m512 tmp8419 = _mm512_fmadd_ps(tmp8423, _mm512_set1_ps(1.6e+01f), tmp8420);
__m512 tmp8439 = _mm512_fmadd_ps(tmp8443, _mm512_set1_ps(1.6e+01f), tmp8440);
__m512 tmp8426 = _mm512_fmadd_ps(tmp8423, _mm512_set1_ps(4e+00f), tmp8427);
__m512 tmp8446 = _mm512_fmadd_ps(tmp8443, _mm512_set1_ps(4e+00f), tmp8447);
__m512 tmp8432 = _mm512_add_ps(tmp8423, tmp8421);
__m512 tmp8452 = _mm512_add_ps(tmp8443, tmp8441);
__m512 tmp8425 = _mm512_fmadd_ps(tmp8416, _mm512_set1_ps(4e+00f), tmp8417);
__m512 tmp8445 = _mm512_fmadd_ps(tmp8436, _mm512_set1_ps(4e+00f), tmp8437);
__m512 tmp8429 = _mm512_fmadd_ps(tmp8416, _mm512_set1_ps(1.6e+01f), tmp8417);
__m512 tmp8449 = _mm512_fmadd_ps(tmp8436, _mm512_set1_ps(1.6e+01f), tmp8437);
__m512 tmp8414 = _mm512_add_ps(tmp8415, in1261);
__m512 tmp8434 = _mm512_add_ps(tmp8435, in1269);
__m512 tmp8431 = _mm512_add_ps(tmp8432, in1268);
__m512 tmp8451 = _mm512_add_ps(tmp8452, in1276);
__m512 tmp8413 = _mm512_fmadd_ps(tmp8418, _mm512_set1_ps(3.2e+01f), tmp8414);
__m512 tmp8433 = _mm512_fmadd_ps(tmp8438, _mm512_set1_ps(3.2e+01f), tmp8434);
__m512 tmp8424 = _mm512_fmadd_ps(tmp8418, _mm512_set1_ps(8e+00f), tmp8425);
__m512 tmp8444 = _mm512_fmadd_ps(tmp8438, _mm512_set1_ps(8e+00f), tmp8445);
__m512 tmp8430 = _mm512_fmadd_ps(tmp8422, _mm512_set1_ps(3.2e+01f), tmp8431);
__m512 tmp8450 = _mm512_fmadd_ps(tmp8442, _mm512_set1_ps(3.2e+01f), tmp8451);
__m512 tmp8428 = _mm512_fmadd_ps(tmp8418, _mm512_set1_ps(2e+00f), tmp8429);
__m512 tmp8448 = _mm512_fmadd_ps(tmp8438, _mm512_set1_ps(2e+00f), tmp8449);
__m512 tmp8401 = tmp8413;
__m512 tmp8407 = tmp8433;
__m512 tmp8402 = tmp8419;
__m512 tmp8408 = tmp8439;
__m512 tmp8403 = tmp8424;
__m512 tmp8409 = tmp8444;
__m512 tmp8404 = tmp8426;
__m512 tmp8410 = tmp8446;
__m512 tmp8405 = tmp8428;
__m512 tmp8411 = tmp8448;
__m512 tmp8406 = tmp8430;
__m512 tmp8412 = tmp8450;
__m512 tmp8497 = _mm512_unpacklo_ps(tmp8401, tmp8402);
__m512 tmp8498 = _mm512_unpackhi_ps(tmp8401, tmp8402);
__m512 tmp8499 = _mm512_unpacklo_ps(tmp8403, tmp8404);
__m512 tmp8500 = _mm512_unpackhi_ps(tmp8403, tmp8404);
__m512 tmp8501 = _mm512_unpacklo_ps(tmp8405, tmp8406);
__m512 tmp8502 = _mm512_unpackhi_ps(tmp8405, tmp8406);
__m512 tmp8503 = _mm512_unpacklo_ps(tmp8407, tmp8408);
__m512 tmp8504 = _mm512_unpackhi_ps(tmp8407, tmp8408);
__m512 tmp8505 = _mm512_unpacklo_ps(tmp8409, tmp8410);
__m512 tmp8506 = _mm512_unpackhi_ps(tmp8409, tmp8410);
__m512 tmp8507 = _mm512_unpacklo_ps(tmp8411, tmp8412);
__m512 tmp8508 = _mm512_unpackhi_ps(tmp8411, tmp8412);
__m512 tmp8509 = _mm512_shuffle_ps(tmp8497, tmp8499, 68);
__m512 tmp8510 = _mm512_shuffle_ps(tmp8497, tmp8499, 238);
__m512 tmp8511 = _mm512_shuffle_ps(tmp8498, tmp8500, 68);
__m512 tmp8512 = _mm512_shuffle_ps(tmp8498, tmp8500, 238);
__m512 tmp8513 = _mm512_shuffle_ps(tmp8501, tmp8503, 68);
__m512 tmp8514 = _mm512_shuffle_ps(tmp8501, tmp8503, 238);
__m512 tmp8515 = _mm512_shuffle_ps(tmp8502, tmp8504, 68);
__m512 tmp8516 = _mm512_shuffle_ps(tmp8502, tmp8504, 238);
__m512 tmp8517 = _mm512_shuffle_ps(tmp8505, tmp8507, 68);
__m512 tmp8518 = _mm512_shuffle_ps(tmp8505, tmp8507, 238);
__m512 tmp8519 = _mm512_shuffle_ps(tmp8506, tmp8508, 68);
__m512 tmp8520 = _mm512_shuffle_ps(tmp8506, tmp8508, 238);
__m512 tmp8521 = _mm512_shuffle_f32x4(tmp8509, tmp8513, 136);
__m512 tmp8522 = _mm512_shuffle_f32x4(tmp8509, tmp8513, 221);
__m512 tmp8523 = _mm512_shuffle_f32x4(tmp8510, tmp8514, 136);
__m512 tmp8524 = _mm512_shuffle_f32x4(tmp8510, tmp8514, 221);
__m512 tmp8525 = _mm512_shuffle_f32x4(tmp8511, tmp8515, 136);
__m512 tmp8526 = _mm512_shuffle_f32x4(tmp8511, tmp8515, 221);
__m512 tmp8527 = _mm512_shuffle_f32x4(tmp8512, tmp8516, 136);
__m512 tmp8528 = _mm512_shuffle_f32x4(tmp8512, tmp8516, 221);
__m512 tmp8529 = _mm512_shuffle_f32x4(tmp8517, tmp8517, 136);
__m512 tmp8530 = _mm512_shuffle_f32x4(tmp8517, tmp8517, 221);
__m512 tmp8531 = _mm512_shuffle_f32x4(tmp8518, tmp8518, 136);
__m512 tmp8532 = _mm512_shuffle_f32x4(tmp8518, tmp8518, 221);
__m512 tmp8533 = _mm512_shuffle_f32x4(tmp8519, tmp8519, 136);
__m512 tmp8534 = _mm512_shuffle_f32x4(tmp8519, tmp8519, 221);
__m512 tmp8535 = _mm512_shuffle_f32x4(tmp8520, tmp8520, 136);
__m512 tmp8536 = _mm512_shuffle_f32x4(tmp8520, tmp8520, 221);
tmp8401 = _mm512_shuffle_f32x4(tmp8521, tmp8529, 136);
tmp8409 = _mm512_shuffle_f32x4(tmp8521, tmp8529, 221);
tmp8402 = _mm512_shuffle_f32x4(tmp8523, tmp8531, 136);
tmp8410 = _mm512_shuffle_f32x4(tmp8523, tmp8531, 221);
tmp8403 = _mm512_shuffle_f32x4(tmp8525, tmp8533, 136);
tmp8411 = _mm512_shuffle_f32x4(tmp8525, tmp8533, 221);
tmp8404 = _mm512_shuffle_f32x4(tmp8527, tmp8535, 136);
tmp8412 = _mm512_shuffle_f32x4(tmp8527, tmp8535, 221);
tmp8405 = _mm512_shuffle_f32x4(tmp8522, tmp8530, 136);
__m512 tmp8453 = _mm512_shuffle_f32x4(tmp8522, tmp8530, 221);
tmp8406 = _mm512_shuffle_f32x4(tmp8524, tmp8532, 136);
__m512 tmp8454 = _mm512_shuffle_f32x4(tmp8524, tmp8532, 221);
tmp8407 = _mm512_shuffle_f32x4(tmp8526, tmp8534, 136);
__m512 tmp8455 = _mm512_shuffle_f32x4(tmp8526, tmp8534, 221);
tmp8408 = _mm512_shuffle_f32x4(tmp8528, tmp8536, 136);
__m512 tmp8456 = _mm512_shuffle_f32x4(tmp8528, tmp8536, 221);
// Combination pass over the vectors produced by the preceding shuffle stage.
// Two independent sets of eight vectors are processed, interleaved line by line:
//   set A: x0..x7 = tmp8401, tmp8402, ..., tmp8408
//   set B: x0..x7 = tmp8409, tmp8410, tmp8411, tmp8412, tmp8453, tmp8454, tmp8455, tmp8456
// Algebraically (the code evaluates these with fused multiply-adds), each set yields:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Results y0..y5: set A -> tmp8457, tmp8463, tmp8468, tmp8470, tmp8472, tmp8474
//                 set B -> tmp8477, tmp8483, tmp8488, tmp8490, tmp8492, tmp8494
// NOTE(review): the 1/2/4/8/16/32 coefficients look like a Winograd-style
// output transform (8 inputs -> 6 outputs) -- confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp8461 = _mm512_add_ps(tmp8402, tmp8403);
__m512 tmp8481 = _mm512_add_ps(tmp8410, tmp8411);
__m512 tmp8460 = _mm512_add_ps(tmp8404, tmp8405);
__m512 tmp8480 = _mm512_add_ps(tmp8412, tmp8453);
__m512 tmp8466 = _mm512_sub_ps(tmp8404, tmp8405);
__m512 tmp8486 = _mm512_sub_ps(tmp8412, tmp8453);
__m512 tmp8465 = _mm512_sub_ps(tmp8402, tmp8403);
__m512 tmp8485 = _mm512_sub_ps(tmp8410, tmp8411);
__m512 tmp8462 = _mm512_add_ps(tmp8406, tmp8407);
__m512 tmp8482 = _mm512_add_ps(tmp8454, tmp8455);
__m512 tmp8467 = _mm512_sub_ps(tmp8406, tmp8407);
__m512 tmp8487 = _mm512_sub_ps(tmp8454, tmp8455);
// Partial accumulations; each fmadd computes (a * constant) + c.
__m512 tmp8464 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(2e+00f), tmp8465);
__m512 tmp8484 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(2e+00f), tmp8485);
__m512 tmp8471 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(8e+00f), tmp8465);
__m512 tmp8491 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(8e+00f), tmp8485);
__m512 tmp8459 = _mm512_add_ps(tmp8460, tmp8461);
__m512 tmp8479 = _mm512_add_ps(tmp8480, tmp8481);
__m512 tmp8463 = _mm512_fmadd_ps(tmp8467, _mm512_set1_ps(1.6e+01f), tmp8464);
__m512 tmp8483 = _mm512_fmadd_ps(tmp8487, _mm512_set1_ps(1.6e+01f), tmp8484);
__m512 tmp8470 = _mm512_fmadd_ps(tmp8467, _mm512_set1_ps(4e+00f), tmp8471);
__m512 tmp8490 = _mm512_fmadd_ps(tmp8487, _mm512_set1_ps(4e+00f), tmp8491);
__m512 tmp8476 = _mm512_add_ps(tmp8467, tmp8465);
__m512 tmp8496 = _mm512_add_ps(tmp8487, tmp8485);
__m512 tmp8469 = _mm512_fmadd_ps(tmp8460, _mm512_set1_ps(4e+00f), tmp8461);
__m512 tmp8489 = _mm512_fmadd_ps(tmp8480, _mm512_set1_ps(4e+00f), tmp8481);
__m512 tmp8473 = _mm512_fmadd_ps(tmp8460, _mm512_set1_ps(1.6e+01f), tmp8461);
__m512 tmp8493 = _mm512_fmadd_ps(tmp8480, _mm512_set1_ps(1.6e+01f), tmp8481);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8458 = _mm512_add_ps(tmp8459, tmp8401);
__m512 tmp8478 = _mm512_add_ps(tmp8479, tmp8409);
__m512 tmp8475 = _mm512_add_ps(tmp8476, tmp8408);
__m512 tmp8495 = _mm512_add_ps(tmp8496, tmp8456);
// Final terms for y0, y2, y5 and y4.
__m512 tmp8457 = _mm512_fmadd_ps(tmp8462, _mm512_set1_ps(3.2e+01f), tmp8458);
__m512 tmp8477 = _mm512_fmadd_ps(tmp8482, _mm512_set1_ps(3.2e+01f), tmp8478);
__m512 tmp8468 = _mm512_fmadd_ps(tmp8462, _mm512_set1_ps(8e+00f), tmp8469);
__m512 tmp8488 = _mm512_fmadd_ps(tmp8482, _mm512_set1_ps(8e+00f), tmp8489);
__m512 tmp8474 = _mm512_fmadd_ps(tmp8466, _mm512_set1_ps(3.2e+01f), tmp8475);
__m512 tmp8494 = _mm512_fmadd_ps(tmp8486, _mm512_set1_ps(3.2e+01f), tmp8495);
__m512 tmp8472 = _mm512_fmadd_ps(tmp8462, _mm512_set1_ps(2e+00f), tmp8473);
__m512 tmp8492 = _mm512_fmadd_ps(tmp8482, _mm512_set1_ps(2e+00f), tmp8493);
// Name the twelve results as outputs: out1215..out1220 take the six results
// of one set (tmp8457, tmp8463, tmp8468, tmp8470, tmp8472, tmp8474) and
// out1221..out1226 the six results of the other set.
__m512 out1215 = tmp8457;
__m512 out1221 = tmp8477;
__m512 out1216 = tmp8463;
__m512 out1222 = tmp8483;
__m512 out1217 = tmp8468;
__m512 out1223 = tmp8488;
__m512 out1218 = tmp8470;
__m512 out1224 = tmp8490;
__m512 out1219 = tmp8472;
__m512 out1225 = tmp8492;
__m512 out1220 = tmp8474;
__m512 out1226 = tmp8494;
// Clamp every output at zero: max(0, x), i.e. a ReLU applied before the store.
out1215 = _mm512_max_ps(_mm512_setzero_ps(), out1215);
out1221 = _mm512_max_ps(_mm512_setzero_ps(), out1221);
out1216 = _mm512_max_ps(_mm512_setzero_ps(), out1216);
out1222 = _mm512_max_ps(_mm512_setzero_ps(), out1222);
out1217 = _mm512_max_ps(_mm512_setzero_ps(), out1217);
out1223 = _mm512_max_ps(_mm512_setzero_ps(), out1223);
out1218 = _mm512_max_ps(_mm512_setzero_ps(), out1218);
out1224 = _mm512_max_ps(_mm512_setzero_ps(), out1224);
out1219 = _mm512_max_ps(_mm512_setzero_ps(), out1219);
out1225 = _mm512_max_ps(_mm512_setzero_ps(), out1225);
out1220 = _mm512_max_ps(_mm512_setzero_ps(), out1220);
out1226 = _mm512_max_ps(_mm512_setzero_ps(), out1226);
// Masked unaligned stores into datPtr13. Mask 255 writes only the low 8 float
// lanes, mask 4095 only the low 12. Within each group of six, consecutive
// outputs are 224 apart (0, 224, ..., 1120 and 1152, 1376, ..., 2272).
// NOTE(review): offsets are presumably byte offsets (224 = 56 floats per row,
// 4*toW32 = one float per column) -- confirm against datPtr13's declaration.
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1215);
_mm512_mask_storeu_ps(datPtr13+1152+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1221);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1216);
_mm512_mask_storeu_ps(datPtr13+1376+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1222);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1217);
_mm512_mask_storeu_ps(datPtr13+1600+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1223);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1218);
_mm512_mask_storeu_ps(datPtr13+1824+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1224);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1219);
_mm512_mask_storeu_ps(datPtr13+2048+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1225);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1220);
_mm512_mask_storeu_ps(datPtr13+2272+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1226);
// Load sixteen 512-bit vectors from sfPtr7 and split them into 256-bit halves.
// Four source regions are read, at offsets 256, 25856, 51456 and 77056
// (25600 apart); within each region the vectors sit at +0, +64, +128, +192.
// For a pair (a, b) of loaded vectors:
//   _mm512_shuffle_f32x4(a, b, 68)  = [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) = [high 256 bits of a | high 256 bits of b]
// The (+0, +128) pairs produce in1277..in1284, the (+64, +192) pairs produce
// in1285..in1292.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- confirm
// against sfPtr7's declaration.
// Region at offset 256.
__m512 sf561 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf562 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1277 = _mm512_shuffle_f32x4(sf561, sf562, 68);
__m512 in1278 = _mm512_shuffle_f32x4(sf561, sf562, 238);
__m512 sf563 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf564 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1285 = _mm512_shuffle_f32x4(sf563, sf564, 68);
__m512 in1286 = _mm512_shuffle_f32x4(sf563, sf564, 238);
// Region at offset 25856.
__m512 sf565 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf566 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1279 = _mm512_shuffle_f32x4(sf565, sf566, 68);
__m512 in1280 = _mm512_shuffle_f32x4(sf565, sf566, 238);
__m512 sf567 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf568 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1287 = _mm512_shuffle_f32x4(sf567, sf568, 68);
__m512 in1288 = _mm512_shuffle_f32x4(sf567, sf568, 238);
// Region at offset 51456.
__m512 sf569 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf570 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1281 = _mm512_shuffle_f32x4(sf569, sf570, 68);
__m512 in1282 = _mm512_shuffle_f32x4(sf569, sf570, 238);
__m512 sf571 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf572 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1289 = _mm512_shuffle_f32x4(sf571, sf572, 68);
__m512 in1290 = _mm512_shuffle_f32x4(sf571, sf572, 238);
// Region at offset 77056.
__m512 sf573 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf574 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1283 = _mm512_shuffle_f32x4(sf573, sf574, 68);
__m512 in1284 = _mm512_shuffle_f32x4(sf573, sf574, 238);
__m512 sf575 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf576 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1291 = _mm512_shuffle_f32x4(sf575, sf576, 68);
__m512 in1292 = _mm512_shuffle_f32x4(sf575, sf576, 238);
// First combination pass over the freshly loaded vectors. Two independent
// sets of eight vectors are processed, interleaved line by line:
//   set 1: x0..x7 = in1277..in1284
//   set 2: x0..x7 = in1285..in1292
// Algebraically (the code evaluates these with fused multiply-adds), each set yields:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Results y0..y5: set 1 -> tmp8549, tmp8555, tmp8560, tmp8562, tmp8564, tmp8566
//                 set 2 -> tmp8569, tmp8575, tmp8580, tmp8582, tmp8584, tmp8586
// NOTE(review): the 1/2/4/8/16/32 coefficients look like a Winograd-style
// output transform (8 inputs -> 6 outputs) -- confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp8553 = _mm512_add_ps(in1278, in1279);
__m512 tmp8573 = _mm512_add_ps(in1286, in1287);
__m512 tmp8552 = _mm512_add_ps(in1280, in1281);
__m512 tmp8572 = _mm512_add_ps(in1288, in1289);
__m512 tmp8558 = _mm512_sub_ps(in1280, in1281);
__m512 tmp8578 = _mm512_sub_ps(in1288, in1289);
__m512 tmp8557 = _mm512_sub_ps(in1278, in1279);
__m512 tmp8577 = _mm512_sub_ps(in1286, in1287);
__m512 tmp8554 = _mm512_add_ps(in1282, in1283);
__m512 tmp8574 = _mm512_add_ps(in1290, in1291);
__m512 tmp8559 = _mm512_sub_ps(in1282, in1283);
__m512 tmp8579 = _mm512_sub_ps(in1290, in1291);
// Partial accumulations; each fmadd computes (a * constant) + c.
__m512 tmp8556 = _mm512_fmadd_ps(tmp8558, _mm512_set1_ps(2e+00f), tmp8557);
__m512 tmp8576 = _mm512_fmadd_ps(tmp8578, _mm512_set1_ps(2e+00f), tmp8577);
__m512 tmp8563 = _mm512_fmadd_ps(tmp8558, _mm512_set1_ps(8e+00f), tmp8557);
__m512 tmp8583 = _mm512_fmadd_ps(tmp8578, _mm512_set1_ps(8e+00f), tmp8577);
__m512 tmp8551 = _mm512_add_ps(tmp8552, tmp8553);
__m512 tmp8571 = _mm512_add_ps(tmp8572, tmp8573);
__m512 tmp8555 = _mm512_fmadd_ps(tmp8559, _mm512_set1_ps(1.6e+01f), tmp8556);
__m512 tmp8575 = _mm512_fmadd_ps(tmp8579, _mm512_set1_ps(1.6e+01f), tmp8576);
__m512 tmp8562 = _mm512_fmadd_ps(tmp8559, _mm512_set1_ps(4e+00f), tmp8563);
__m512 tmp8582 = _mm512_fmadd_ps(tmp8579, _mm512_set1_ps(4e+00f), tmp8583);
__m512 tmp8568 = _mm512_add_ps(tmp8559, tmp8557);
__m512 tmp8588 = _mm512_add_ps(tmp8579, tmp8577);
__m512 tmp8561 = _mm512_fmadd_ps(tmp8552, _mm512_set1_ps(4e+00f), tmp8553);
__m512 tmp8581 = _mm512_fmadd_ps(tmp8572, _mm512_set1_ps(4e+00f), tmp8573);
__m512 tmp8565 = _mm512_fmadd_ps(tmp8552, _mm512_set1_ps(1.6e+01f), tmp8553);
__m512 tmp8585 = _mm512_fmadd_ps(tmp8572, _mm512_set1_ps(1.6e+01f), tmp8573);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8550 = _mm512_add_ps(tmp8551, in1277);
__m512 tmp8570 = _mm512_add_ps(tmp8571, in1285);
__m512 tmp8567 = _mm512_add_ps(tmp8568, in1284);
__m512 tmp8587 = _mm512_add_ps(tmp8588, in1292);
// Final terms for y0, y2, y5 and y4.
__m512 tmp8549 = _mm512_fmadd_ps(tmp8554, _mm512_set1_ps(3.2e+01f), tmp8550);
__m512 tmp8569 = _mm512_fmadd_ps(tmp8574, _mm512_set1_ps(3.2e+01f), tmp8570);
__m512 tmp8560 = _mm512_fmadd_ps(tmp8554, _mm512_set1_ps(8e+00f), tmp8561);
__m512 tmp8580 = _mm512_fmadd_ps(tmp8574, _mm512_set1_ps(8e+00f), tmp8581);
__m512 tmp8566 = _mm512_fmadd_ps(tmp8558, _mm512_set1_ps(3.2e+01f), tmp8567);
__m512 tmp8586 = _mm512_fmadd_ps(tmp8578, _mm512_set1_ps(3.2e+01f), tmp8587);
__m512 tmp8564 = _mm512_fmadd_ps(tmp8554, _mm512_set1_ps(2e+00f), tmp8565);
__m512 tmp8584 = _mm512_fmadd_ps(tmp8574, _mm512_set1_ps(2e+00f), tmp8585);
// Shuffle network that redistributes the twelve first-pass results across
// sixteen vectors (tmp8537..tmp8548 plus the new tmp8589..tmp8592).
// NOTE(review): this has the shape of a register transpose, presumably so the
// next pass combines along the other tile axis -- confirm against the generator.
// Stage 0: copy the pass results into a contiguous set of names.
__m512 tmp8537 = tmp8549;
__m512 tmp8543 = tmp8569;
__m512 tmp8538 = tmp8555;
__m512 tmp8544 = tmp8575;
__m512 tmp8539 = tmp8560;
__m512 tmp8545 = tmp8580;
__m512 tmp8540 = tmp8562;
__m512 tmp8546 = tmp8582;
__m512 tmp8541 = tmp8564;
__m512 tmp8547 = tmp8584;
__m512 tmp8542 = tmp8566;
__m512 tmp8548 = tmp8586;
// Stage 1: interleave the 32-bit elements of adjacent vector pairs within
// each 128-bit lane (low halves via unpacklo, high halves via unpackhi).
__m512 tmp8633 = _mm512_unpacklo_ps(tmp8537, tmp8538);
__m512 tmp8634 = _mm512_unpackhi_ps(tmp8537, tmp8538);
__m512 tmp8635 = _mm512_unpacklo_ps(tmp8539, tmp8540);
__m512 tmp8636 = _mm512_unpackhi_ps(tmp8539, tmp8540);
__m512 tmp8637 = _mm512_unpacklo_ps(tmp8541, tmp8542);
__m512 tmp8638 = _mm512_unpackhi_ps(tmp8541, tmp8542);
__m512 tmp8639 = _mm512_unpacklo_ps(tmp8543, tmp8544);
__m512 tmp8640 = _mm512_unpackhi_ps(tmp8543, tmp8544);
__m512 tmp8641 = _mm512_unpacklo_ps(tmp8545, tmp8546);
__m512 tmp8642 = _mm512_unpackhi_ps(tmp8545, tmp8546);
__m512 tmp8643 = _mm512_unpacklo_ps(tmp8547, tmp8548);
__m512 tmp8644 = _mm512_unpackhi_ps(tmp8547, tmp8548);
// Stage 2: per 128-bit lane, imm 68 takes elements 0-1 of each operand and
// imm 238 takes elements 2-3 of each operand.
__m512 tmp8645 = _mm512_shuffle_ps(tmp8633, tmp8635, 68);
__m512 tmp8646 = _mm512_shuffle_ps(tmp8633, tmp8635, 238);
__m512 tmp8647 = _mm512_shuffle_ps(tmp8634, tmp8636, 68);
__m512 tmp8648 = _mm512_shuffle_ps(tmp8634, tmp8636, 238);
__m512 tmp8649 = _mm512_shuffle_ps(tmp8637, tmp8639, 68);
__m512 tmp8650 = _mm512_shuffle_ps(tmp8637, tmp8639, 238);
__m512 tmp8651 = _mm512_shuffle_ps(tmp8638, tmp8640, 68);
__m512 tmp8652 = _mm512_shuffle_ps(tmp8638, tmp8640, 238);
__m512 tmp8653 = _mm512_shuffle_ps(tmp8641, tmp8643, 68);
__m512 tmp8654 = _mm512_shuffle_ps(tmp8641, tmp8643, 238);
__m512 tmp8655 = _mm512_shuffle_ps(tmp8642, tmp8644, 68);
__m512 tmp8656 = _mm512_shuffle_ps(tmp8642, tmp8644, 238);
// Stage 3: imm 136 gathers the even 128-bit lanes (0 and 2) of both operands,
// imm 221 the odd lanes (1 and 3).
__m512 tmp8657 = _mm512_shuffle_f32x4(tmp8645, tmp8649, 136);
__m512 tmp8658 = _mm512_shuffle_f32x4(tmp8645, tmp8649, 221);
__m512 tmp8659 = _mm512_shuffle_f32x4(tmp8646, tmp8650, 136);
__m512 tmp8660 = _mm512_shuffle_f32x4(tmp8646, tmp8650, 221);
__m512 tmp8661 = _mm512_shuffle_f32x4(tmp8647, tmp8651, 136);
__m512 tmp8662 = _mm512_shuffle_f32x4(tmp8647, tmp8651, 221);
__m512 tmp8663 = _mm512_shuffle_f32x4(tmp8648, tmp8652, 136);
__m512 tmp8664 = _mm512_shuffle_f32x4(tmp8648, tmp8652, 221);
// tmp8653..tmp8656 are each combined with themselves (same vector as both operands).
__m512 tmp8665 = _mm512_shuffle_f32x4(tmp8653, tmp8653, 136);
__m512 tmp8666 = _mm512_shuffle_f32x4(tmp8653, tmp8653, 221);
__m512 tmp8667 = _mm512_shuffle_f32x4(tmp8654, tmp8654, 136);
__m512 tmp8668 = _mm512_shuffle_f32x4(tmp8654, tmp8654, 221);
__m512 tmp8669 = _mm512_shuffle_f32x4(tmp8655, tmp8655, 136);
__m512 tmp8670 = _mm512_shuffle_f32x4(tmp8655, tmp8655, 221);
__m512 tmp8671 = _mm512_shuffle_f32x4(tmp8656, tmp8656, 136);
__m512 tmp8672 = _mm512_shuffle_f32x4(tmp8656, tmp8656, 221);
// Stage 4: final even/odd 128-bit lane split. tmp8537..tmp8548 are
// overwritten and four additional vectors tmp8589..tmp8592 are declared.
tmp8537 = _mm512_shuffle_f32x4(tmp8657, tmp8665, 136);
tmp8545 = _mm512_shuffle_f32x4(tmp8657, tmp8665, 221);
tmp8538 = _mm512_shuffle_f32x4(tmp8659, tmp8667, 136);
tmp8546 = _mm512_shuffle_f32x4(tmp8659, tmp8667, 221);
tmp8539 = _mm512_shuffle_f32x4(tmp8661, tmp8669, 136);
tmp8547 = _mm512_shuffle_f32x4(tmp8661, tmp8669, 221);
tmp8540 = _mm512_shuffle_f32x4(tmp8663, tmp8671, 136);
tmp8548 = _mm512_shuffle_f32x4(tmp8663, tmp8671, 221);
tmp8541 = _mm512_shuffle_f32x4(tmp8658, tmp8666, 136);
__m512 tmp8589 = _mm512_shuffle_f32x4(tmp8658, tmp8666, 221);
tmp8542 = _mm512_shuffle_f32x4(tmp8660, tmp8668, 136);
__m512 tmp8590 = _mm512_shuffle_f32x4(tmp8660, tmp8668, 221);
tmp8543 = _mm512_shuffle_f32x4(tmp8662, tmp8670, 136);
__m512 tmp8591 = _mm512_shuffle_f32x4(tmp8662, tmp8670, 221);
tmp8544 = _mm512_shuffle_f32x4(tmp8664, tmp8672, 136);
__m512 tmp8592 = _mm512_shuffle_f32x4(tmp8664, tmp8672, 221);
// Second combination pass, applied to the vectors produced by the shuffle
// network. Two independent sets of eight vectors, interleaved line by line:
//   set A: x0..x7 = tmp8537, tmp8538, ..., tmp8544
//   set B: x0..x7 = tmp8545, tmp8546, tmp8547, tmp8548, tmp8589, tmp8590, tmp8591, tmp8592
// Algebraically (the code evaluates these with fused multiply-adds), each set yields:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Results y0..y5: set A -> tmp8593, tmp8599, tmp8604, tmp8606, tmp8608, tmp8610
//                 set B -> tmp8613, tmp8619, tmp8624, tmp8626, tmp8628, tmp8630
// Pairwise sums and differences shared by several outputs.
__m512 tmp8597 = _mm512_add_ps(tmp8538, tmp8539);
__m512 tmp8617 = _mm512_add_ps(tmp8546, tmp8547);
__m512 tmp8596 = _mm512_add_ps(tmp8540, tmp8541);
__m512 tmp8616 = _mm512_add_ps(tmp8548, tmp8589);
__m512 tmp8602 = _mm512_sub_ps(tmp8540, tmp8541);
__m512 tmp8622 = _mm512_sub_ps(tmp8548, tmp8589);
__m512 tmp8601 = _mm512_sub_ps(tmp8538, tmp8539);
__m512 tmp8621 = _mm512_sub_ps(tmp8546, tmp8547);
__m512 tmp8598 = _mm512_add_ps(tmp8542, tmp8543);
__m512 tmp8618 = _mm512_add_ps(tmp8590, tmp8591);
__m512 tmp8603 = _mm512_sub_ps(tmp8542, tmp8543);
__m512 tmp8623 = _mm512_sub_ps(tmp8590, tmp8591);
// Partial accumulations; each fmadd computes (a * constant) + c.
__m512 tmp8600 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(2e+00f), tmp8601);
__m512 tmp8620 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(2e+00f), tmp8621);
__m512 tmp8607 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(8e+00f), tmp8601);
__m512 tmp8627 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(8e+00f), tmp8621);
__m512 tmp8595 = _mm512_add_ps(tmp8596, tmp8597);
__m512 tmp8615 = _mm512_add_ps(tmp8616, tmp8617);
__m512 tmp8599 = _mm512_fmadd_ps(tmp8603, _mm512_set1_ps(1.6e+01f), tmp8600);
__m512 tmp8619 = _mm512_fmadd_ps(tmp8623, _mm512_set1_ps(1.6e+01f), tmp8620);
__m512 tmp8606 = _mm512_fmadd_ps(tmp8603, _mm512_set1_ps(4e+00f), tmp8607);
__m512 tmp8626 = _mm512_fmadd_ps(tmp8623, _mm512_set1_ps(4e+00f), tmp8627);
__m512 tmp8612 = _mm512_add_ps(tmp8603, tmp8601);
__m512 tmp8632 = _mm512_add_ps(tmp8623, tmp8621);
__m512 tmp8605 = _mm512_fmadd_ps(tmp8596, _mm512_set1_ps(4e+00f), tmp8597);
__m512 tmp8625 = _mm512_fmadd_ps(tmp8616, _mm512_set1_ps(4e+00f), tmp8617);
__m512 tmp8609 = _mm512_fmadd_ps(tmp8596, _mm512_set1_ps(1.6e+01f), tmp8597);
__m512 tmp8629 = _mm512_fmadd_ps(tmp8616, _mm512_set1_ps(1.6e+01f), tmp8617);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8594 = _mm512_add_ps(tmp8595, tmp8537);
__m512 tmp8614 = _mm512_add_ps(tmp8615, tmp8545);
__m512 tmp8611 = _mm512_add_ps(tmp8612, tmp8544);
__m512 tmp8631 = _mm512_add_ps(tmp8632, tmp8592);
// Final terms for y0, y2, y5 and y4.
__m512 tmp8593 = _mm512_fmadd_ps(tmp8598, _mm512_set1_ps(3.2e+01f), tmp8594);
__m512 tmp8613 = _mm512_fmadd_ps(tmp8618, _mm512_set1_ps(3.2e+01f), tmp8614);
__m512 tmp8604 = _mm512_fmadd_ps(tmp8598, _mm512_set1_ps(8e+00f), tmp8605);
__m512 tmp8624 = _mm512_fmadd_ps(tmp8618, _mm512_set1_ps(8e+00f), tmp8625);
__m512 tmp8610 = _mm512_fmadd_ps(tmp8602, _mm512_set1_ps(3.2e+01f), tmp8611);
__m512 tmp8630 = _mm512_fmadd_ps(tmp8622, _mm512_set1_ps(3.2e+01f), tmp8631);
__m512 tmp8608 = _mm512_fmadd_ps(tmp8598, _mm512_set1_ps(2e+00f), tmp8609);
__m512 tmp8628 = _mm512_fmadd_ps(tmp8618, _mm512_set1_ps(2e+00f), tmp8629);
// Name the twelve results as outputs: out1227..out1232 take the six results
// of one set (tmp8593, tmp8599, tmp8604, tmp8606, tmp8608, tmp8610) and
// out1233..out1238 the six results of the other set.
__m512 out1227 = tmp8593;
__m512 out1233 = tmp8613;
__m512 out1228 = tmp8599;
__m512 out1234 = tmp8619;
__m512 out1229 = tmp8604;
__m512 out1235 = tmp8624;
__m512 out1230 = tmp8606;
__m512 out1236 = tmp8626;
__m512 out1231 = tmp8608;
__m512 out1237 = tmp8628;
__m512 out1232 = tmp8610;
__m512 out1238 = tmp8630;
// Clamp every output at zero: max(0, x), i.e. a ReLU applied before the store.
out1227 = _mm512_max_ps(_mm512_setzero_ps(), out1227);
out1233 = _mm512_max_ps(_mm512_setzero_ps(), out1233);
out1228 = _mm512_max_ps(_mm512_setzero_ps(), out1228);
out1234 = _mm512_max_ps(_mm512_setzero_ps(), out1234);
out1229 = _mm512_max_ps(_mm512_setzero_ps(), out1229);
out1235 = _mm512_max_ps(_mm512_setzero_ps(), out1235);
out1230 = _mm512_max_ps(_mm512_setzero_ps(), out1230);
out1236 = _mm512_max_ps(_mm512_setzero_ps(), out1236);
out1231 = _mm512_max_ps(_mm512_setzero_ps(), out1231);
out1237 = _mm512_max_ps(_mm512_setzero_ps(), out1237);
out1232 = _mm512_max_ps(_mm512_setzero_ps(), out1232);
out1238 = _mm512_max_ps(_mm512_setzero_ps(), out1238);
// Masked unaligned stores into datPtr13. Mask 4095 writes only the low 12
// float lanes, mask 255 only the low 8. Within each group of six, consecutive
// outputs are 224 apart (1200, 1424, ..., 2320 and 12608, 12832, ..., 13728).
// NOTE(review): offsets are presumably byte offsets (224 = 56 floats per row,
// 4*toW32 = one float per column) -- confirm against datPtr13's declaration.
_mm512_mask_storeu_ps(datPtr13+1200+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1227);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1233);
_mm512_mask_storeu_ps(datPtr13+1424+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1228);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1234);
_mm512_mask_storeu_ps(datPtr13+1648+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1229);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1235);
_mm512_mask_storeu_ps(datPtr13+1872+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1230);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1236);
_mm512_mask_storeu_ps(datPtr13+2096+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1231);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1237);
_mm512_mask_storeu_ps(datPtr13+2320+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1232);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 255, out1238);
// Load sixteen 512-bit vectors from sfPtr7 and split them into 256-bit halves.
// Four source regions are read, at offsets 512, 26112, 51712 and 77312
// (25600 apart); within each region the vectors sit at +0, +64, +128, +192.
// For a pair (a, b) of loaded vectors:
//   _mm512_shuffle_f32x4(a, b, 68)  = [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) = [high 256 bits of a | high 256 bits of b]
// The (+0, +128) pairs produce in1293..in1300, the (+64, +192) pairs produce
// in1301..in1308.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- confirm
// against sfPtr7's declaration.
// Region at offset 512.
__m512 sf577 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf578 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1293 = _mm512_shuffle_f32x4(sf577, sf578, 68);
__m512 in1294 = _mm512_shuffle_f32x4(sf577, sf578, 238);
__m512 sf579 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf580 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1301 = _mm512_shuffle_f32x4(sf579, sf580, 68);
__m512 in1302 = _mm512_shuffle_f32x4(sf579, sf580, 238);
// Region at offset 26112.
__m512 sf581 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf582 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1295 = _mm512_shuffle_f32x4(sf581, sf582, 68);
__m512 in1296 = _mm512_shuffle_f32x4(sf581, sf582, 238);
__m512 sf583 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf584 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1303 = _mm512_shuffle_f32x4(sf583, sf584, 68);
__m512 in1304 = _mm512_shuffle_f32x4(sf583, sf584, 238);
// Region at offset 51712.
__m512 sf585 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf586 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1297 = _mm512_shuffle_f32x4(sf585, sf586, 68);
__m512 in1298 = _mm512_shuffle_f32x4(sf585, sf586, 238);
__m512 sf587 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf588 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1305 = _mm512_shuffle_f32x4(sf587, sf588, 68);
__m512 in1306 = _mm512_shuffle_f32x4(sf587, sf588, 238);
// Region at offset 77312.
__m512 sf589 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf590 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1299 = _mm512_shuffle_f32x4(sf589, sf590, 68);
__m512 in1300 = _mm512_shuffle_f32x4(sf589, sf590, 238);
__m512 sf591 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k92+768*l31);
__m512 sf592 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k92+768*l31);
__m512 in1307 = _mm512_shuffle_f32x4(sf591, sf592, 68);
__m512 in1308 = _mm512_shuffle_f32x4(sf591, sf592, 238);
// First combination pass over the freshly loaded vectors. Two independent
// sets of eight vectors are processed, interleaved line by line:
//   set 1: x0..x7 = in1293..in1300
//   set 2: x0..x7 = in1301..in1308
// Algebraically (the code evaluates these with fused multiply-adds), each set yields:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Results y0..y5: set 1 -> tmp8685, tmp8691, tmp8696, tmp8698, tmp8700, tmp8702
//                 set 2 -> tmp8705, tmp8711, tmp8716, tmp8718, tmp8720, tmp8722
// NOTE(review): the 1/2/4/8/16/32 coefficients look like a Winograd-style
// output transform (8 inputs -> 6 outputs) -- confirm against the generator.
// Pairwise sums and differences shared by several outputs.
__m512 tmp8689 = _mm512_add_ps(in1294, in1295);
__m512 tmp8709 = _mm512_add_ps(in1302, in1303);
__m512 tmp8688 = _mm512_add_ps(in1296, in1297);
__m512 tmp8708 = _mm512_add_ps(in1304, in1305);
__m512 tmp8694 = _mm512_sub_ps(in1296, in1297);
__m512 tmp8714 = _mm512_sub_ps(in1304, in1305);
__m512 tmp8693 = _mm512_sub_ps(in1294, in1295);
__m512 tmp8713 = _mm512_sub_ps(in1302, in1303);
__m512 tmp8690 = _mm512_add_ps(in1298, in1299);
__m512 tmp8710 = _mm512_add_ps(in1306, in1307);
__m512 tmp8695 = _mm512_sub_ps(in1298, in1299);
__m512 tmp8715 = _mm512_sub_ps(in1306, in1307);
// Partial accumulations; each fmadd computes (a * constant) + c.
__m512 tmp8692 = _mm512_fmadd_ps(tmp8694, _mm512_set1_ps(2e+00f), tmp8693);
__m512 tmp8712 = _mm512_fmadd_ps(tmp8714, _mm512_set1_ps(2e+00f), tmp8713);
__m512 tmp8699 = _mm512_fmadd_ps(tmp8694, _mm512_set1_ps(8e+00f), tmp8693);
__m512 tmp8719 = _mm512_fmadd_ps(tmp8714, _mm512_set1_ps(8e+00f), tmp8713);
__m512 tmp8687 = _mm512_add_ps(tmp8688, tmp8689);
__m512 tmp8707 = _mm512_add_ps(tmp8708, tmp8709);
__m512 tmp8691 = _mm512_fmadd_ps(tmp8695, _mm512_set1_ps(1.6e+01f), tmp8692);
__m512 tmp8711 = _mm512_fmadd_ps(tmp8715, _mm512_set1_ps(1.6e+01f), tmp8712);
__m512 tmp8698 = _mm512_fmadd_ps(tmp8695, _mm512_set1_ps(4e+00f), tmp8699);
__m512 tmp8718 = _mm512_fmadd_ps(tmp8715, _mm512_set1_ps(4e+00f), tmp8719);
__m512 tmp8704 = _mm512_add_ps(tmp8695, tmp8693);
__m512 tmp8724 = _mm512_add_ps(tmp8715, tmp8713);
__m512 tmp8697 = _mm512_fmadd_ps(tmp8688, _mm512_set1_ps(4e+00f), tmp8689);
__m512 tmp8717 = _mm512_fmadd_ps(tmp8708, _mm512_set1_ps(4e+00f), tmp8709);
__m512 tmp8701 = _mm512_fmadd_ps(tmp8688, _mm512_set1_ps(1.6e+01f), tmp8689);
__m512 tmp8721 = _mm512_fmadd_ps(tmp8708, _mm512_set1_ps(1.6e+01f), tmp8709);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8686 = _mm512_add_ps(tmp8687, in1293);
__m512 tmp8706 = _mm512_add_ps(tmp8707, in1301);
__m512 tmp8703 = _mm512_add_ps(tmp8704, in1300);
__m512 tmp8723 = _mm512_add_ps(tmp8724, in1308);
// Final terms for y0, y2, y5 and y4.
__m512 tmp8685 = _mm512_fmadd_ps(tmp8690, _mm512_set1_ps(3.2e+01f), tmp8686);
__m512 tmp8705 = _mm512_fmadd_ps(tmp8710, _mm512_set1_ps(3.2e+01f), tmp8706);
__m512 tmp8696 = _mm512_fmadd_ps(tmp8690, _mm512_set1_ps(8e+00f), tmp8697);
__m512 tmp8716 = _mm512_fmadd_ps(tmp8710, _mm512_set1_ps(8e+00f), tmp8717);
__m512 tmp8702 = _mm512_fmadd_ps(tmp8694, _mm512_set1_ps(3.2e+01f), tmp8703);
__m512 tmp8722 = _mm512_fmadd_ps(tmp8714, _mm512_set1_ps(3.2e+01f), tmp8723);
__m512 tmp8700 = _mm512_fmadd_ps(tmp8690, _mm512_set1_ps(2e+00f), tmp8701);
__m512 tmp8720 = _mm512_fmadd_ps(tmp8710, _mm512_set1_ps(2e+00f), tmp8721);
// Shuffle network that redistributes the twelve first-pass results across
// sixteen vectors (tmp8673..tmp8684 plus the new tmp8725..tmp8728).
// NOTE(review): this has the shape of a register transpose, presumably so the
// next pass combines along the other tile axis -- confirm against the generator.
// Stage 0: copy the pass results into a contiguous set of names.
__m512 tmp8673 = tmp8685;
__m512 tmp8679 = tmp8705;
__m512 tmp8674 = tmp8691;
__m512 tmp8680 = tmp8711;
__m512 tmp8675 = tmp8696;
__m512 tmp8681 = tmp8716;
__m512 tmp8676 = tmp8698;
__m512 tmp8682 = tmp8718;
__m512 tmp8677 = tmp8700;
__m512 tmp8683 = tmp8720;
__m512 tmp8678 = tmp8702;
__m512 tmp8684 = tmp8722;
// Stage 1: interleave the 32-bit elements of adjacent vector pairs within
// each 128-bit lane (low halves via unpacklo, high halves via unpackhi).
__m512 tmp8769 = _mm512_unpacklo_ps(tmp8673, tmp8674);
__m512 tmp8770 = _mm512_unpackhi_ps(tmp8673, tmp8674);
__m512 tmp8771 = _mm512_unpacklo_ps(tmp8675, tmp8676);
__m512 tmp8772 = _mm512_unpackhi_ps(tmp8675, tmp8676);
__m512 tmp8773 = _mm512_unpacklo_ps(tmp8677, tmp8678);
__m512 tmp8774 = _mm512_unpackhi_ps(tmp8677, tmp8678);
__m512 tmp8775 = _mm512_unpacklo_ps(tmp8679, tmp8680);
__m512 tmp8776 = _mm512_unpackhi_ps(tmp8679, tmp8680);
__m512 tmp8777 = _mm512_unpacklo_ps(tmp8681, tmp8682);
__m512 tmp8778 = _mm512_unpackhi_ps(tmp8681, tmp8682);
__m512 tmp8779 = _mm512_unpacklo_ps(tmp8683, tmp8684);
__m512 tmp8780 = _mm512_unpackhi_ps(tmp8683, tmp8684);
// Stage 2: per 128-bit lane, imm 68 takes elements 0-1 of each operand and
// imm 238 takes elements 2-3 of each operand.
__m512 tmp8781 = _mm512_shuffle_ps(tmp8769, tmp8771, 68);
__m512 tmp8782 = _mm512_shuffle_ps(tmp8769, tmp8771, 238);
__m512 tmp8783 = _mm512_shuffle_ps(tmp8770, tmp8772, 68);
__m512 tmp8784 = _mm512_shuffle_ps(tmp8770, tmp8772, 238);
__m512 tmp8785 = _mm512_shuffle_ps(tmp8773, tmp8775, 68);
__m512 tmp8786 = _mm512_shuffle_ps(tmp8773, tmp8775, 238);
__m512 tmp8787 = _mm512_shuffle_ps(tmp8774, tmp8776, 68);
__m512 tmp8788 = _mm512_shuffle_ps(tmp8774, tmp8776, 238);
__m512 tmp8789 = _mm512_shuffle_ps(tmp8777, tmp8779, 68);
__m512 tmp8790 = _mm512_shuffle_ps(tmp8777, tmp8779, 238);
__m512 tmp8791 = _mm512_shuffle_ps(tmp8778, tmp8780, 68);
__m512 tmp8792 = _mm512_shuffle_ps(tmp8778, tmp8780, 238);
// Stage 3: imm 136 gathers the even 128-bit lanes (0 and 2) of both operands,
// imm 221 the odd lanes (1 and 3).
__m512 tmp8793 = _mm512_shuffle_f32x4(tmp8781, tmp8785, 136);
__m512 tmp8794 = _mm512_shuffle_f32x4(tmp8781, tmp8785, 221);
__m512 tmp8795 = _mm512_shuffle_f32x4(tmp8782, tmp8786, 136);
__m512 tmp8796 = _mm512_shuffle_f32x4(tmp8782, tmp8786, 221);
__m512 tmp8797 = _mm512_shuffle_f32x4(tmp8783, tmp8787, 136);
__m512 tmp8798 = _mm512_shuffle_f32x4(tmp8783, tmp8787, 221);
__m512 tmp8799 = _mm512_shuffle_f32x4(tmp8784, tmp8788, 136);
__m512 tmp8800 = _mm512_shuffle_f32x4(tmp8784, tmp8788, 221);
// tmp8789..tmp8792 are each combined with themselves (same vector as both operands).
__m512 tmp8801 = _mm512_shuffle_f32x4(tmp8789, tmp8789, 136);
__m512 tmp8802 = _mm512_shuffle_f32x4(tmp8789, tmp8789, 221);
__m512 tmp8803 = _mm512_shuffle_f32x4(tmp8790, tmp8790, 136);
__m512 tmp8804 = _mm512_shuffle_f32x4(tmp8790, tmp8790, 221);
__m512 tmp8805 = _mm512_shuffle_f32x4(tmp8791, tmp8791, 136);
__m512 tmp8806 = _mm512_shuffle_f32x4(tmp8791, tmp8791, 221);
__m512 tmp8807 = _mm512_shuffle_f32x4(tmp8792, tmp8792, 136);
__m512 tmp8808 = _mm512_shuffle_f32x4(tmp8792, tmp8792, 221);
// Stage 4: final even/odd 128-bit lane split. tmp8673..tmp8684 are
// overwritten and four additional vectors tmp8725..tmp8728 are declared.
tmp8673 = _mm512_shuffle_f32x4(tmp8793, tmp8801, 136);
tmp8681 = _mm512_shuffle_f32x4(tmp8793, tmp8801, 221);
tmp8674 = _mm512_shuffle_f32x4(tmp8795, tmp8803, 136);
tmp8682 = _mm512_shuffle_f32x4(tmp8795, tmp8803, 221);
tmp8675 = _mm512_shuffle_f32x4(tmp8797, tmp8805, 136);
tmp8683 = _mm512_shuffle_f32x4(tmp8797, tmp8805, 221);
tmp8676 = _mm512_shuffle_f32x4(tmp8799, tmp8807, 136);
tmp8684 = _mm512_shuffle_f32x4(tmp8799, tmp8807, 221);
tmp8677 = _mm512_shuffle_f32x4(tmp8794, tmp8802, 136);
__m512 tmp8725 = _mm512_shuffle_f32x4(tmp8794, tmp8802, 221);
tmp8678 = _mm512_shuffle_f32x4(tmp8796, tmp8804, 136);
__m512 tmp8726 = _mm512_shuffle_f32x4(tmp8796, tmp8804, 221);
tmp8679 = _mm512_shuffle_f32x4(tmp8798, tmp8806, 136);
__m512 tmp8727 = _mm512_shuffle_f32x4(tmp8798, tmp8806, 221);
tmp8680 = _mm512_shuffle_f32x4(tmp8800, tmp8808, 136);
__m512 tmp8728 = _mm512_shuffle_f32x4(tmp8800, tmp8808, 221);
// Second combination pass, applied to the vectors produced by the shuffle
// network. Two independent sets of eight vectors, interleaved line by line:
//   set A: x0..x7 = tmp8673, tmp8674, ..., tmp8680
//   set B: x0..x7 = tmp8681, tmp8682, tmp8683, tmp8684, tmp8725, tmp8726, tmp8727, tmp8728
// Algebraically (the code evaluates these with fused multiply-adds), each set yields:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// Results y0..y5: set A -> tmp8729, tmp8735, tmp8740, tmp8742, tmp8744, tmp8746
//                 set B -> tmp8749, tmp8755, tmp8760, tmp8762, tmp8764, tmp8766
// Pairwise sums and differences shared by several outputs.
__m512 tmp8733 = _mm512_add_ps(tmp8674, tmp8675);
__m512 tmp8753 = _mm512_add_ps(tmp8682, tmp8683);
__m512 tmp8732 = _mm512_add_ps(tmp8676, tmp8677);
__m512 tmp8752 = _mm512_add_ps(tmp8684, tmp8725);
__m512 tmp8738 = _mm512_sub_ps(tmp8676, tmp8677);
__m512 tmp8758 = _mm512_sub_ps(tmp8684, tmp8725);
__m512 tmp8737 = _mm512_sub_ps(tmp8674, tmp8675);
__m512 tmp8757 = _mm512_sub_ps(tmp8682, tmp8683);
__m512 tmp8734 = _mm512_add_ps(tmp8678, tmp8679);
__m512 tmp8754 = _mm512_add_ps(tmp8726, tmp8727);
__m512 tmp8739 = _mm512_sub_ps(tmp8678, tmp8679);
__m512 tmp8759 = _mm512_sub_ps(tmp8726, tmp8727);
// Partial accumulations; each fmadd computes (a * constant) + c.
__m512 tmp8736 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(2e+00f), tmp8737);
__m512 tmp8756 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(2e+00f), tmp8757);
__m512 tmp8743 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(8e+00f), tmp8737);
__m512 tmp8763 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(8e+00f), tmp8757);
__m512 tmp8731 = _mm512_add_ps(tmp8732, tmp8733);
__m512 tmp8751 = _mm512_add_ps(tmp8752, tmp8753);
__m512 tmp8735 = _mm512_fmadd_ps(tmp8739, _mm512_set1_ps(1.6e+01f), tmp8736);
__m512 tmp8755 = _mm512_fmadd_ps(tmp8759, _mm512_set1_ps(1.6e+01f), tmp8756);
__m512 tmp8742 = _mm512_fmadd_ps(tmp8739, _mm512_set1_ps(4e+00f), tmp8743);
__m512 tmp8762 = _mm512_fmadd_ps(tmp8759, _mm512_set1_ps(4e+00f), tmp8763);
__m512 tmp8748 = _mm512_add_ps(tmp8739, tmp8737);
__m512 tmp8768 = _mm512_add_ps(tmp8759, tmp8757);
__m512 tmp8741 = _mm512_fmadd_ps(tmp8732, _mm512_set1_ps(4e+00f), tmp8733);
__m512 tmp8761 = _mm512_fmadd_ps(tmp8752, _mm512_set1_ps(4e+00f), tmp8753);
__m512 tmp8745 = _mm512_fmadd_ps(tmp8732, _mm512_set1_ps(1.6e+01f), tmp8733);
__m512 tmp8765 = _mm512_fmadd_ps(tmp8752, _mm512_set1_ps(1.6e+01f), tmp8753);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp8730 = _mm512_add_ps(tmp8731, tmp8673);
__m512 tmp8750 = _mm512_add_ps(tmp8751, tmp8681);
__m512 tmp8747 = _mm512_add_ps(tmp8748, tmp8680);
__m512 tmp8767 = _mm512_add_ps(tmp8768, tmp8728);
// Final terms for y0, y2, y5 and y4.
__m512 tmp8729 = _mm512_fmadd_ps(tmp8734, _mm512_set1_ps(3.2e+01f), tmp8730);
__m512 tmp8749 = _mm512_fmadd_ps(tmp8754, _mm512_set1_ps(3.2e+01f), tmp8750);
__m512 tmp8740 = _mm512_fmadd_ps(tmp8734, _mm512_set1_ps(8e+00f), tmp8741);
__m512 tmp8760 = _mm512_fmadd_ps(tmp8754, _mm512_set1_ps(8e+00f), tmp8761);
__m512 tmp8746 = _mm512_fmadd_ps(tmp8738, _mm512_set1_ps(3.2e+01f), tmp8747);
__m512 tmp8766 = _mm512_fmadd_ps(tmp8758, _mm512_set1_ps(3.2e+01f), tmp8767);
__m512 tmp8744 = _mm512_fmadd_ps(tmp8734, _mm512_set1_ps(2e+00f), tmp8745);
__m512 tmp8764 = _mm512_fmadd_ps(tmp8754, _mm512_set1_ps(2e+00f), tmp8765);
// Name the twelve results as outputs: out1239..out1244 take the six results
// of one set (tmp8729, tmp8735, tmp8740, tmp8742, tmp8744, tmp8746) and
// out1245..out1250 the six results of the other set.
__m512 out1239 = tmp8729;
__m512 out1245 = tmp8749;
__m512 out1240 = tmp8735;
__m512 out1246 = tmp8755;
__m512 out1241 = tmp8740;
__m512 out1247 = tmp8760;
__m512 out1242 = tmp8742;
__m512 out1248 = tmp8762;
__m512 out1243 = tmp8744;
__m512 out1249 = tmp8764;
__m512 out1244 = tmp8746;
__m512 out1250 = tmp8766;
// Clamp every output at zero: max(0, x), i.e. a ReLU applied before the store.
out1239 = _mm512_max_ps(_mm512_setzero_ps(), out1239);
out1245 = _mm512_max_ps(_mm512_setzero_ps(), out1245);
out1240 = _mm512_max_ps(_mm512_setzero_ps(), out1240);
out1246 = _mm512_max_ps(_mm512_setzero_ps(), out1246);
out1241 = _mm512_max_ps(_mm512_setzero_ps(), out1241);
out1247 = _mm512_max_ps(_mm512_setzero_ps(), out1247);
out1242 = _mm512_max_ps(_mm512_setzero_ps(), out1242);
out1248 = _mm512_max_ps(_mm512_setzero_ps(), out1248);
out1243 = _mm512_max_ps(_mm512_setzero_ps(), out1243);
out1249 = _mm512_max_ps(_mm512_setzero_ps(), out1249);
out1244 = _mm512_max_ps(_mm512_setzero_ps(), out1244);
out1250 = _mm512_max_ps(_mm512_setzero_ps(), out1250);
// Masked unaligned stores into datPtr13. Every store here uses mask 4095,
// which writes only the low 12 float lanes. Within each group of six,
// consecutive outputs are 224 apart (13760, 13984, ..., 14880 and
// 13808, 14032, ..., 14928); the two groups are offset by 48 from each other.
// NOTE(review): offsets are presumably byte offsets (224 = 56 floats per row,
// 48 = 12 floats) -- confirm against datPtr13's declaration.
_mm512_mask_storeu_ps(datPtr13+13760+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1239);
_mm512_mask_storeu_ps(datPtr13+13808+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1245);
_mm512_mask_storeu_ps(datPtr13+13984+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1240);
_mm512_mask_storeu_ps(datPtr13+14032+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1246);
_mm512_mask_storeu_ps(datPtr13+14208+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1241);
_mm512_mask_storeu_ps(datPtr13+14256+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1247);
_mm512_mask_storeu_ps(datPtr13+14432+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1242);
_mm512_mask_storeu_ps(datPtr13+14480+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1248);
_mm512_mask_storeu_ps(datPtr13+14656+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1243);
_mm512_mask_storeu_ps(datPtr13+14704+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1249);
_mm512_mask_storeu_ps(datPtr13+14880+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1244);
_mm512_mask_storeu_ps(datPtr13+14928+50432*i29+224*toH32+4*toW32+50432*k92+25216*l31, 4095, out1250);
}
}
++j23;
rel17 = 2;
}
if (rel17 < 3) {
ptrdiff_t toH33 = base17+6;
ptrdiff_t toW33 = 24;
ptrdiff_t k93 = 1*w46;
for (; k93 != 1; ++k93) {
ptrdiff_t l32 = 0;
for (; l32 != 2; ++l32) {
// Load sixteen 512-bit vectors from sfPtr7 and split them into 256-bit halves.
// Four source regions are read, at offsets 0, 25600, 51200 and 76800
// (25600 apart); within each region the vectors sit at +0, +64, +128, +192.
// The address here is indexed by k93 and l32.
// For a pair (a, b) of loaded vectors:
//   _mm512_shuffle_f32x4(a, b, 68)  = [low 256 bits of a | low 256 bits of b]
//   _mm512_shuffle_f32x4(a, b, 238) = [high 256 bits of a | high 256 bits of b]
// The (+0, +128) pairs produce in1309..in1316, the (+64, +192) pairs produce
// in1317..in1324.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- confirm
// against sfPtr7's declaration.
// Region at offset 0.
__m512 sf593 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf594 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1309 = _mm512_shuffle_f32x4(sf593, sf594, 68);
__m512 in1310 = _mm512_shuffle_f32x4(sf593, sf594, 238);
__m512 sf595 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf596 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1317 = _mm512_shuffle_f32x4(sf595, sf596, 68);
__m512 in1318 = _mm512_shuffle_f32x4(sf595, sf596, 238);
// Region at offset 25600.
__m512 sf597 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf598 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1311 = _mm512_shuffle_f32x4(sf597, sf598, 68);
__m512 in1312 = _mm512_shuffle_f32x4(sf597, sf598, 238);
__m512 sf599 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf600 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1319 = _mm512_shuffle_f32x4(sf599, sf600, 68);
__m512 in1320 = _mm512_shuffle_f32x4(sf599, sf600, 238);
// Region at offset 51200.
__m512 sf601 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf602 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1313 = _mm512_shuffle_f32x4(sf601, sf602, 68);
__m512 in1314 = _mm512_shuffle_f32x4(sf601, sf602, 238);
__m512 sf603 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf604 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1321 = _mm512_shuffle_f32x4(sf603, sf604, 68);
__m512 in1322 = _mm512_shuffle_f32x4(sf603, sf604, 238);
// Region at offset 76800.
__m512 sf605 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf606 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1315 = _mm512_shuffle_f32x4(sf605, sf606, 68);
__m512 in1316 = _mm512_shuffle_f32x4(sf605, sf606, 238);
__m512 sf607 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf608 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1323 = _mm512_shuffle_f32x4(sf607, sf608, 68);
__m512 in1324 = _mm512_shuffle_f32x4(sf607, sf608, 238);
__m512 tmp8825 = _mm512_add_ps(in1310, in1311);
__m512 tmp8845 = _mm512_add_ps(in1318, in1319);
__m512 tmp8824 = _mm512_add_ps(in1312, in1313);
__m512 tmp8844 = _mm512_add_ps(in1320, in1321);
__m512 tmp8830 = _mm512_sub_ps(in1312, in1313);
__m512 tmp8850 = _mm512_sub_ps(in1320, in1321);
__m512 tmp8829 = _mm512_sub_ps(in1310, in1311);
__m512 tmp8849 = _mm512_sub_ps(in1318, in1319);
__m512 tmp8826 = _mm512_add_ps(in1314, in1315);
__m512 tmp8846 = _mm512_add_ps(in1322, in1323);
__m512 tmp8831 = _mm512_sub_ps(in1314, in1315);
__m512 tmp8851 = _mm512_sub_ps(in1322, in1323);
__m512 tmp8828 = _mm512_fmadd_ps(tmp8830, _mm512_set1_ps(2e+00f), tmp8829);
__m512 tmp8848 = _mm512_fmadd_ps(tmp8850, _mm512_set1_ps(2e+00f), tmp8849);
__m512 tmp8835 = _mm512_fmadd_ps(tmp8830, _mm512_set1_ps(8e+00f), tmp8829);
__m512 tmp8855 = _mm512_fmadd_ps(tmp8850, _mm512_set1_ps(8e+00f), tmp8849);
__m512 tmp8823 = _mm512_add_ps(tmp8824, tmp8825);
__m512 tmp8843 = _mm512_add_ps(tmp8844, tmp8845);
__m512 tmp8827 = _mm512_fmadd_ps(tmp8831, _mm512_set1_ps(1.6e+01f), tmp8828);
__m512 tmp8847 = _mm512_fmadd_ps(tmp8851, _mm512_set1_ps(1.6e+01f), tmp8848);
__m512 tmp8834 = _mm512_fmadd_ps(tmp8831, _mm512_set1_ps(4e+00f), tmp8835);
__m512 tmp8854 = _mm512_fmadd_ps(tmp8851, _mm512_set1_ps(4e+00f), tmp8855);
__m512 tmp8840 = _mm512_add_ps(tmp8831, tmp8829);
__m512 tmp8860 = _mm512_add_ps(tmp8851, tmp8849);
__m512 tmp8833 = _mm512_fmadd_ps(tmp8824, _mm512_set1_ps(4e+00f), tmp8825);
__m512 tmp8853 = _mm512_fmadd_ps(tmp8844, _mm512_set1_ps(4e+00f), tmp8845);
__m512 tmp8837 = _mm512_fmadd_ps(tmp8824, _mm512_set1_ps(1.6e+01f), tmp8825);
__m512 tmp8857 = _mm512_fmadd_ps(tmp8844, _mm512_set1_ps(1.6e+01f), tmp8845);
__m512 tmp8822 = _mm512_add_ps(tmp8823, in1309);
__m512 tmp8842 = _mm512_add_ps(tmp8843, in1317);
__m512 tmp8839 = _mm512_add_ps(tmp8840, in1316);
__m512 tmp8859 = _mm512_add_ps(tmp8860, in1324);
__m512 tmp8821 = _mm512_fmadd_ps(tmp8826, _mm512_set1_ps(3.2e+01f), tmp8822);
__m512 tmp8841 = _mm512_fmadd_ps(tmp8846, _mm512_set1_ps(3.2e+01f), tmp8842);
__m512 tmp8832 = _mm512_fmadd_ps(tmp8826, _mm512_set1_ps(8e+00f), tmp8833);
__m512 tmp8852 = _mm512_fmadd_ps(tmp8846, _mm512_set1_ps(8e+00f), tmp8853);
__m512 tmp8838 = _mm512_fmadd_ps(tmp8830, _mm512_set1_ps(3.2e+01f), tmp8839);
__m512 tmp8858 = _mm512_fmadd_ps(tmp8850, _mm512_set1_ps(3.2e+01f), tmp8859);
__m512 tmp8836 = _mm512_fmadd_ps(tmp8826, _mm512_set1_ps(2e+00f), tmp8837);
__m512 tmp8856 = _mm512_fmadd_ps(tmp8846, _mm512_set1_ps(2e+00f), tmp8857);
__m512 tmp8809 = tmp8821;
__m512 tmp8815 = tmp8841;
__m512 tmp8810 = tmp8827;
__m512 tmp8816 = tmp8847;
__m512 tmp8811 = tmp8832;
__m512 tmp8817 = tmp8852;
__m512 tmp8812 = tmp8834;
__m512 tmp8818 = tmp8854;
__m512 tmp8813 = tmp8836;
__m512 tmp8819 = tmp8856;
__m512 tmp8814 = tmp8838;
__m512 tmp8820 = tmp8858;
__m512 tmp8905 = _mm512_unpacklo_ps(tmp8809, tmp8810);
__m512 tmp8906 = _mm512_unpackhi_ps(tmp8809, tmp8810);
__m512 tmp8907 = _mm512_unpacklo_ps(tmp8811, tmp8812);
__m512 tmp8908 = _mm512_unpackhi_ps(tmp8811, tmp8812);
__m512 tmp8909 = _mm512_unpacklo_ps(tmp8813, tmp8814);
__m512 tmp8910 = _mm512_unpackhi_ps(tmp8813, tmp8814);
__m512 tmp8911 = _mm512_unpacklo_ps(tmp8815, tmp8816);
__m512 tmp8912 = _mm512_unpackhi_ps(tmp8815, tmp8816);
__m512 tmp8913 = _mm512_unpacklo_ps(tmp8817, tmp8818);
__m512 tmp8914 = _mm512_unpackhi_ps(tmp8817, tmp8818);
__m512 tmp8915 = _mm512_unpacklo_ps(tmp8819, tmp8820);
__m512 tmp8916 = _mm512_unpackhi_ps(tmp8819, tmp8820);
__m512 tmp8917 = _mm512_shuffle_ps(tmp8905, tmp8907, 68);
__m512 tmp8918 = _mm512_shuffle_ps(tmp8905, tmp8907, 238);
__m512 tmp8919 = _mm512_shuffle_ps(tmp8906, tmp8908, 68);
__m512 tmp8920 = _mm512_shuffle_ps(tmp8906, tmp8908, 238);
__m512 tmp8921 = _mm512_shuffle_ps(tmp8909, tmp8911, 68);
__m512 tmp8922 = _mm512_shuffle_ps(tmp8909, tmp8911, 238);
__m512 tmp8923 = _mm512_shuffle_ps(tmp8910, tmp8912, 68);
__m512 tmp8924 = _mm512_shuffle_ps(tmp8910, tmp8912, 238);
__m512 tmp8925 = _mm512_shuffle_ps(tmp8913, tmp8915, 68);
__m512 tmp8926 = _mm512_shuffle_ps(tmp8913, tmp8915, 238);
__m512 tmp8927 = _mm512_shuffle_ps(tmp8914, tmp8916, 68);
__m512 tmp8928 = _mm512_shuffle_ps(tmp8914, tmp8916, 238);
__m512 tmp8929 = _mm512_shuffle_f32x4(tmp8917, tmp8921, 136);
__m512 tmp8930 = _mm512_shuffle_f32x4(tmp8917, tmp8921, 221);
__m512 tmp8931 = _mm512_shuffle_f32x4(tmp8918, tmp8922, 136);
__m512 tmp8932 = _mm512_shuffle_f32x4(tmp8918, tmp8922, 221);
__m512 tmp8933 = _mm512_shuffle_f32x4(tmp8919, tmp8923, 136);
__m512 tmp8934 = _mm512_shuffle_f32x4(tmp8919, tmp8923, 221);
__m512 tmp8935 = _mm512_shuffle_f32x4(tmp8920, tmp8924, 136);
__m512 tmp8936 = _mm512_shuffle_f32x4(tmp8920, tmp8924, 221);
__m512 tmp8937 = _mm512_shuffle_f32x4(tmp8925, tmp8925, 136);
__m512 tmp8938 = _mm512_shuffle_f32x4(tmp8925, tmp8925, 221);
__m512 tmp8939 = _mm512_shuffle_f32x4(tmp8926, tmp8926, 136);
__m512 tmp8940 = _mm512_shuffle_f32x4(tmp8926, tmp8926, 221);
__m512 tmp8941 = _mm512_shuffle_f32x4(tmp8927, tmp8927, 136);
__m512 tmp8942 = _mm512_shuffle_f32x4(tmp8927, tmp8927, 221);
__m512 tmp8943 = _mm512_shuffle_f32x4(tmp8928, tmp8928, 136);
__m512 tmp8944 = _mm512_shuffle_f32x4(tmp8928, tmp8928, 221);
tmp8809 = _mm512_shuffle_f32x4(tmp8929, tmp8937, 136);
tmp8817 = _mm512_shuffle_f32x4(tmp8929, tmp8937, 221);
tmp8810 = _mm512_shuffle_f32x4(tmp8931, tmp8939, 136);
tmp8818 = _mm512_shuffle_f32x4(tmp8931, tmp8939, 221);
tmp8811 = _mm512_shuffle_f32x4(tmp8933, tmp8941, 136);
tmp8819 = _mm512_shuffle_f32x4(tmp8933, tmp8941, 221);
tmp8812 = _mm512_shuffle_f32x4(tmp8935, tmp8943, 136);
tmp8820 = _mm512_shuffle_f32x4(tmp8935, tmp8943, 221);
tmp8813 = _mm512_shuffle_f32x4(tmp8930, tmp8938, 136);
__m512 tmp8861 = _mm512_shuffle_f32x4(tmp8930, tmp8938, 221);
tmp8814 = _mm512_shuffle_f32x4(tmp8932, tmp8940, 136);
__m512 tmp8862 = _mm512_shuffle_f32x4(tmp8932, tmp8940, 221);
tmp8815 = _mm512_shuffle_f32x4(tmp8934, tmp8942, 136);
__m512 tmp8863 = _mm512_shuffle_f32x4(tmp8934, tmp8942, 221);
tmp8816 = _mm512_shuffle_f32x4(tmp8936, tmp8944, 136);
__m512 tmp8864 = _mm512_shuffle_f32x4(tmp8936, tmp8944, 221);
__m512 tmp8869 = _mm512_add_ps(tmp8810, tmp8811);
__m512 tmp8889 = _mm512_add_ps(tmp8818, tmp8819);
__m512 tmp8868 = _mm512_add_ps(tmp8812, tmp8813);
__m512 tmp8888 = _mm512_add_ps(tmp8820, tmp8861);
__m512 tmp8874 = _mm512_sub_ps(tmp8812, tmp8813);
__m512 tmp8894 = _mm512_sub_ps(tmp8820, tmp8861);
__m512 tmp8873 = _mm512_sub_ps(tmp8810, tmp8811);
__m512 tmp8893 = _mm512_sub_ps(tmp8818, tmp8819);
__m512 tmp8870 = _mm512_add_ps(tmp8814, tmp8815);
__m512 tmp8890 = _mm512_add_ps(tmp8862, tmp8863);
__m512 tmp8875 = _mm512_sub_ps(tmp8814, tmp8815);
__m512 tmp8895 = _mm512_sub_ps(tmp8862, tmp8863);
__m512 tmp8872 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(2e+00f), tmp8873);
__m512 tmp8892 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(2e+00f), tmp8893);
__m512 tmp8879 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(8e+00f), tmp8873);
__m512 tmp8899 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(8e+00f), tmp8893);
__m512 tmp8867 = _mm512_add_ps(tmp8868, tmp8869);
__m512 tmp8887 = _mm512_add_ps(tmp8888, tmp8889);
__m512 tmp8871 = _mm512_fmadd_ps(tmp8875, _mm512_set1_ps(1.6e+01f), tmp8872);
__m512 tmp8891 = _mm512_fmadd_ps(tmp8895, _mm512_set1_ps(1.6e+01f), tmp8892);
__m512 tmp8878 = _mm512_fmadd_ps(tmp8875, _mm512_set1_ps(4e+00f), tmp8879);
__m512 tmp8898 = _mm512_fmadd_ps(tmp8895, _mm512_set1_ps(4e+00f), tmp8899);
__m512 tmp8884 = _mm512_add_ps(tmp8875, tmp8873);
__m512 tmp8904 = _mm512_add_ps(tmp8895, tmp8893);
__m512 tmp8877 = _mm512_fmadd_ps(tmp8868, _mm512_set1_ps(4e+00f), tmp8869);
__m512 tmp8897 = _mm512_fmadd_ps(tmp8888, _mm512_set1_ps(4e+00f), tmp8889);
__m512 tmp8881 = _mm512_fmadd_ps(tmp8868, _mm512_set1_ps(1.6e+01f), tmp8869);
__m512 tmp8901 = _mm512_fmadd_ps(tmp8888, _mm512_set1_ps(1.6e+01f), tmp8889);
__m512 tmp8866 = _mm512_add_ps(tmp8867, tmp8809);
__m512 tmp8886 = _mm512_add_ps(tmp8887, tmp8817);
__m512 tmp8883 = _mm512_add_ps(tmp8884, tmp8816);
__m512 tmp8903 = _mm512_add_ps(tmp8904, tmp8864);
__m512 tmp8865 = _mm512_fmadd_ps(tmp8870, _mm512_set1_ps(3.2e+01f), tmp8866);
__m512 tmp8885 = _mm512_fmadd_ps(tmp8890, _mm512_set1_ps(3.2e+01f), tmp8886);
__m512 tmp8876 = _mm512_fmadd_ps(tmp8870, _mm512_set1_ps(8e+00f), tmp8877);
__m512 tmp8896 = _mm512_fmadd_ps(tmp8890, _mm512_set1_ps(8e+00f), tmp8897);
__m512 tmp8882 = _mm512_fmadd_ps(tmp8874, _mm512_set1_ps(3.2e+01f), tmp8883);
__m512 tmp8902 = _mm512_fmadd_ps(tmp8894, _mm512_set1_ps(3.2e+01f), tmp8903);
__m512 tmp8880 = _mm512_fmadd_ps(tmp8870, _mm512_set1_ps(2e+00f), tmp8881);
__m512 tmp8900 = _mm512_fmadd_ps(tmp8890, _mm512_set1_ps(2e+00f), tmp8901);
__m512 out1251 = tmp8865;
__m512 out1257 = tmp8885;
__m512 out1252 = tmp8871;
__m512 out1258 = tmp8891;
__m512 out1253 = tmp8876;
__m512 out1259 = tmp8896;
__m512 out1254 = tmp8878;
__m512 out1260 = tmp8898;
__m512 out1255 = tmp8880;
__m512 out1261 = tmp8900;
__m512 out1256 = tmp8882;
__m512 out1262 = tmp8902;
out1251 = _mm512_max_ps(_mm512_setzero_ps(), out1251);
out1257 = _mm512_max_ps(_mm512_setzero_ps(), out1257);
out1252 = _mm512_max_ps(_mm512_setzero_ps(), out1252);
out1258 = _mm512_max_ps(_mm512_setzero_ps(), out1258);
out1253 = _mm512_max_ps(_mm512_setzero_ps(), out1253);
out1259 = _mm512_max_ps(_mm512_setzero_ps(), out1259);
out1254 = _mm512_max_ps(_mm512_setzero_ps(), out1254);
out1260 = _mm512_max_ps(_mm512_setzero_ps(), out1260);
out1255 = _mm512_max_ps(_mm512_setzero_ps(), out1255);
out1261 = _mm512_max_ps(_mm512_setzero_ps(), out1261);
out1256 = _mm512_max_ps(_mm512_setzero_ps(), out1256);
out1262 = _mm512_max_ps(_mm512_setzero_ps(), out1262);
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1251);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1257);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1252);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1258);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1253);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1259);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1254);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1260);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1255);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1261);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1256);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1262);
// Tile: load 16 vectors from sfPtr7, regroup them into two sets of eight
// inputs (in1325..in1332 and in1333..in1340), apply an eight-input ->
// six-result linear transform to each set, transpose the 12 results in
// registers, apply the same transform again, clamp at zero, and store with
// element masks through datPtr13.
//
// Loads come in four groups whose constant offsets are 25600 apart (256,
// 25856, 51456, 77056). Within a group, a pair is two loads 128 apart and the
// second-set pair sits 64 after the first-set pair.
// _mm512_shuffle_f32x4 with imm 68 keeps 128-bit lanes {a0,a1,b0,b1}; imm 238
// keeps {a2,a3,b2,b3}. Each in* vector is therefore the low (68) or high (238)
// 256-bit half of the first load followed by the same half of the second.
// NOTE(review): the 64-per-vector spacing suggests sfPtr7 is a byte pointer — confirm.
__m512 sf609 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf610 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1325 = _mm512_shuffle_f32x4(sf609, sf610, 68);
__m512 in1326 = _mm512_shuffle_f32x4(sf609, sf610, 238);
__m512 sf611 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf612 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1333 = _mm512_shuffle_f32x4(sf611, sf612, 68);
__m512 in1334 = _mm512_shuffle_f32x4(sf611, sf612, 238);
__m512 sf613 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf614 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1327 = _mm512_shuffle_f32x4(sf613, sf614, 68);
__m512 in1328 = _mm512_shuffle_f32x4(sf613, sf614, 238);
__m512 sf615 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf616 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1335 = _mm512_shuffle_f32x4(sf615, sf616, 68);
__m512 in1336 = _mm512_shuffle_f32x4(sf615, sf616, 238);
__m512 sf617 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf618 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1329 = _mm512_shuffle_f32x4(sf617, sf618, 68);
__m512 in1330 = _mm512_shuffle_f32x4(sf617, sf618, 238);
__m512 sf619 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf620 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1337 = _mm512_shuffle_f32x4(sf619, sf620, 68);
__m512 in1338 = _mm512_shuffle_f32x4(sf619, sf620, 238);
__m512 sf621 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf622 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1331 = _mm512_shuffle_f32x4(sf621, sf622, 68);
__m512 in1332 = _mm512_shuffle_f32x4(sf621, sf622, 238);
__m512 sf623 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf624 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1339 = _mm512_shuffle_f32x4(sf623, sf624, 68);
__m512 in1340 = _mm512_shuffle_f32x4(sf623, sf624, 238);
// First transform pass, two sets interleaved statement by statement.
// With x0..x7 = in1325..in1332 (the second set uses in1333..in1340 and tmp
// names 20 higher):
//   tmp8957 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   tmp8963 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   tmp8968 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   tmp8970 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   tmp8972 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   tmp8974 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform — confirm against the code generator.
__m512 tmp8961 = _mm512_add_ps(in1326, in1327);
__m512 tmp8981 = _mm512_add_ps(in1334, in1335);
__m512 tmp8960 = _mm512_add_ps(in1328, in1329);
__m512 tmp8980 = _mm512_add_ps(in1336, in1337);
__m512 tmp8966 = _mm512_sub_ps(in1328, in1329);
__m512 tmp8986 = _mm512_sub_ps(in1336, in1337);
__m512 tmp8965 = _mm512_sub_ps(in1326, in1327);
__m512 tmp8985 = _mm512_sub_ps(in1334, in1335);
__m512 tmp8962 = _mm512_add_ps(in1330, in1331);
__m512 tmp8982 = _mm512_add_ps(in1338, in1339);
__m512 tmp8967 = _mm512_sub_ps(in1330, in1331);
__m512 tmp8987 = _mm512_sub_ps(in1338, in1339);
__m512 tmp8964 = _mm512_fmadd_ps(tmp8966, _mm512_set1_ps(2e+00f), tmp8965);
__m512 tmp8984 = _mm512_fmadd_ps(tmp8986, _mm512_set1_ps(2e+00f), tmp8985);
__m512 tmp8971 = _mm512_fmadd_ps(tmp8966, _mm512_set1_ps(8e+00f), tmp8965);
__m512 tmp8991 = _mm512_fmadd_ps(tmp8986, _mm512_set1_ps(8e+00f), tmp8985);
__m512 tmp8959 = _mm512_add_ps(tmp8960, tmp8961);
__m512 tmp8979 = _mm512_add_ps(tmp8980, tmp8981);
__m512 tmp8963 = _mm512_fmadd_ps(tmp8967, _mm512_set1_ps(1.6e+01f), tmp8964);
__m512 tmp8983 = _mm512_fmadd_ps(tmp8987, _mm512_set1_ps(1.6e+01f), tmp8984);
__m512 tmp8970 = _mm512_fmadd_ps(tmp8967, _mm512_set1_ps(4e+00f), tmp8971);
__m512 tmp8990 = _mm512_fmadd_ps(tmp8987, _mm512_set1_ps(4e+00f), tmp8991);
__m512 tmp8976 = _mm512_add_ps(tmp8967, tmp8965);
__m512 tmp8996 = _mm512_add_ps(tmp8987, tmp8985);
__m512 tmp8969 = _mm512_fmadd_ps(tmp8960, _mm512_set1_ps(4e+00f), tmp8961);
__m512 tmp8989 = _mm512_fmadd_ps(tmp8980, _mm512_set1_ps(4e+00f), tmp8981);
__m512 tmp8973 = _mm512_fmadd_ps(tmp8960, _mm512_set1_ps(1.6e+01f), tmp8961);
__m512 tmp8993 = _mm512_fmadd_ps(tmp8980, _mm512_set1_ps(1.6e+01f), tmp8981);
__m512 tmp8958 = _mm512_add_ps(tmp8959, in1325);
__m512 tmp8978 = _mm512_add_ps(tmp8979, in1333);
__m512 tmp8975 = _mm512_add_ps(tmp8976, in1332);
__m512 tmp8995 = _mm512_add_ps(tmp8996, in1340);
__m512 tmp8957 = _mm512_fmadd_ps(tmp8962, _mm512_set1_ps(3.2e+01f), tmp8958);
__m512 tmp8977 = _mm512_fmadd_ps(tmp8982, _mm512_set1_ps(3.2e+01f), tmp8978);
__m512 tmp8968 = _mm512_fmadd_ps(tmp8962, _mm512_set1_ps(8e+00f), tmp8969);
__m512 tmp8988 = _mm512_fmadd_ps(tmp8982, _mm512_set1_ps(8e+00f), tmp8989);
__m512 tmp8974 = _mm512_fmadd_ps(tmp8966, _mm512_set1_ps(3.2e+01f), tmp8975);
__m512 tmp8994 = _mm512_fmadd_ps(tmp8986, _mm512_set1_ps(3.2e+01f), tmp8995);
__m512 tmp8972 = _mm512_fmadd_ps(tmp8962, _mm512_set1_ps(2e+00f), tmp8973);
__m512 tmp8992 = _mm512_fmadd_ps(tmp8982, _mm512_set1_ps(2e+00f), tmp8993);
// Copy the 12 first-pass results into tmp8945..tmp8956 (first set in
// tmp8945..tmp8950, second set in tmp8951..tmp8956); the transpose below
// reads these names and then overwrites them.
__m512 tmp8945 = tmp8957;
__m512 tmp8951 = tmp8977;
__m512 tmp8946 = tmp8963;
__m512 tmp8952 = tmp8983;
__m512 tmp8947 = tmp8968;
__m512 tmp8953 = tmp8988;
__m512 tmp8948 = tmp8970;
__m512 tmp8954 = tmp8990;
__m512 tmp8949 = tmp8972;
__m512 tmp8955 = tmp8992;
__m512 tmp8950 = tmp8974;
__m512 tmp8956 = tmp8994;
// In-register transpose, stage 1: interleave 32-bit elements of adjacent rows.
__m512 tmp9041 = _mm512_unpacklo_ps(tmp8945, tmp8946);
__m512 tmp9042 = _mm512_unpackhi_ps(tmp8945, tmp8946);
__m512 tmp9043 = _mm512_unpacklo_ps(tmp8947, tmp8948);
__m512 tmp9044 = _mm512_unpackhi_ps(tmp8947, tmp8948);
__m512 tmp9045 = _mm512_unpacklo_ps(tmp8949, tmp8950);
__m512 tmp9046 = _mm512_unpackhi_ps(tmp8949, tmp8950);
__m512 tmp9047 = _mm512_unpacklo_ps(tmp8951, tmp8952);
__m512 tmp9048 = _mm512_unpackhi_ps(tmp8951, tmp8952);
__m512 tmp9049 = _mm512_unpacklo_ps(tmp8953, tmp8954);
__m512 tmp9050 = _mm512_unpackhi_ps(tmp8953, tmp8954);
__m512 tmp9051 = _mm512_unpacklo_ps(tmp8955, tmp8956);
__m512 tmp9052 = _mm512_unpackhi_ps(tmp8955, tmp8956);
// Stage 2: within each 128-bit lane, gather one element index from four rows.
__m512 tmp9053 = _mm512_shuffle_ps(tmp9041, tmp9043, 68);
__m512 tmp9054 = _mm512_shuffle_ps(tmp9041, tmp9043, 238);
__m512 tmp9055 = _mm512_shuffle_ps(tmp9042, tmp9044, 68);
__m512 tmp9056 = _mm512_shuffle_ps(tmp9042, tmp9044, 238);
__m512 tmp9057 = _mm512_shuffle_ps(tmp9045, tmp9047, 68);
__m512 tmp9058 = _mm512_shuffle_ps(tmp9045, tmp9047, 238);
__m512 tmp9059 = _mm512_shuffle_ps(tmp9046, tmp9048, 68);
__m512 tmp9060 = _mm512_shuffle_ps(tmp9046, tmp9048, 238);
__m512 tmp9061 = _mm512_shuffle_ps(tmp9049, tmp9051, 68);
__m512 tmp9062 = _mm512_shuffle_ps(tmp9049, tmp9051, 238);
__m512 tmp9063 = _mm512_shuffle_ps(tmp9050, tmp9052, 68);
__m512 tmp9064 = _mm512_shuffle_ps(tmp9050, tmp9052, 238);
// Stage 3: regroup 128-bit lanes (imm 136 = lanes {a0,a2,b0,b2}, imm 221 =
// lanes {a1,a3,b1,b3}). The last four rows have no partner group, so
// tmp9061..tmp9064 are shuffled with themselves.
__m512 tmp9065 = _mm512_shuffle_f32x4(tmp9053, tmp9057, 136);
__m512 tmp9066 = _mm512_shuffle_f32x4(tmp9053, tmp9057, 221);
__m512 tmp9067 = _mm512_shuffle_f32x4(tmp9054, tmp9058, 136);
__m512 tmp9068 = _mm512_shuffle_f32x4(tmp9054, tmp9058, 221);
__m512 tmp9069 = _mm512_shuffle_f32x4(tmp9055, tmp9059, 136);
__m512 tmp9070 = _mm512_shuffle_f32x4(tmp9055, tmp9059, 221);
__m512 tmp9071 = _mm512_shuffle_f32x4(tmp9056, tmp9060, 136);
__m512 tmp9072 = _mm512_shuffle_f32x4(tmp9056, tmp9060, 221);
__m512 tmp9073 = _mm512_shuffle_f32x4(tmp9061, tmp9061, 136);
__m512 tmp9074 = _mm512_shuffle_f32x4(tmp9061, tmp9061, 221);
__m512 tmp9075 = _mm512_shuffle_f32x4(tmp9062, tmp9062, 136);
__m512 tmp9076 = _mm512_shuffle_f32x4(tmp9062, tmp9062, 221);
__m512 tmp9077 = _mm512_shuffle_f32x4(tmp9063, tmp9063, 136);
__m512 tmp9078 = _mm512_shuffle_f32x4(tmp9063, tmp9063, 221);
__m512 tmp9079 = _mm512_shuffle_f32x4(tmp9064, tmp9064, 136);
__m512 tmp9080 = _mm512_shuffle_f32x4(tmp9064, tmp9064, 221);
// Stage 4: final lane regroup. Afterwards tmp8945..tmp8952 hold element
// columns 0..7 and tmp8953..tmp8956, tmp8997..tmp9000 hold columns 8..15 of
// the 12 pre-transpose rows. Elements 0..11 of each vector are the 12 rows in
// order; elements 12..15 repeat elements 8..11.
tmp8945 = _mm512_shuffle_f32x4(tmp9065, tmp9073, 136);
tmp8953 = _mm512_shuffle_f32x4(tmp9065, tmp9073, 221);
tmp8946 = _mm512_shuffle_f32x4(tmp9067, tmp9075, 136);
tmp8954 = _mm512_shuffle_f32x4(tmp9067, tmp9075, 221);
tmp8947 = _mm512_shuffle_f32x4(tmp9069, tmp9077, 136);
tmp8955 = _mm512_shuffle_f32x4(tmp9069, tmp9077, 221);
tmp8948 = _mm512_shuffle_f32x4(tmp9071, tmp9079, 136);
tmp8956 = _mm512_shuffle_f32x4(tmp9071, tmp9079, 221);
tmp8949 = _mm512_shuffle_f32x4(tmp9066, tmp9074, 136);
__m512 tmp8997 = _mm512_shuffle_f32x4(tmp9066, tmp9074, 221);
tmp8950 = _mm512_shuffle_f32x4(tmp9068, tmp9076, 136);
__m512 tmp8998 = _mm512_shuffle_f32x4(tmp9068, tmp9076, 221);
tmp8951 = _mm512_shuffle_f32x4(tmp9070, tmp9078, 136);
__m512 tmp8999 = _mm512_shuffle_f32x4(tmp9070, tmp9078, 221);
tmp8952 = _mm512_shuffle_f32x4(tmp9072, tmp9080, 136);
__m512 tmp9000 = _mm512_shuffle_f32x4(tmp9072, tmp9080, 221);
// Second transform pass: the same eight-input -> six-result combination as the
// first pass, now with x0..x7 = tmp8945..tmp8952 (first set) and
// tmp8953..tmp8956, tmp8997..tmp9000 (second set, tmp names 20 higher).
__m512 tmp9005 = _mm512_add_ps(tmp8946, tmp8947);
__m512 tmp9025 = _mm512_add_ps(tmp8954, tmp8955);
__m512 tmp9004 = _mm512_add_ps(tmp8948, tmp8949);
__m512 tmp9024 = _mm512_add_ps(tmp8956, tmp8997);
__m512 tmp9010 = _mm512_sub_ps(tmp8948, tmp8949);
__m512 tmp9030 = _mm512_sub_ps(tmp8956, tmp8997);
__m512 tmp9009 = _mm512_sub_ps(tmp8946, tmp8947);
__m512 tmp9029 = _mm512_sub_ps(tmp8954, tmp8955);
__m512 tmp9006 = _mm512_add_ps(tmp8950, tmp8951);
__m512 tmp9026 = _mm512_add_ps(tmp8998, tmp8999);
__m512 tmp9011 = _mm512_sub_ps(tmp8950, tmp8951);
__m512 tmp9031 = _mm512_sub_ps(tmp8998, tmp8999);
__m512 tmp9008 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(2e+00f), tmp9009);
__m512 tmp9028 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(2e+00f), tmp9029);
__m512 tmp9015 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(8e+00f), tmp9009);
__m512 tmp9035 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(8e+00f), tmp9029);
__m512 tmp9003 = _mm512_add_ps(tmp9004, tmp9005);
__m512 tmp9023 = _mm512_add_ps(tmp9024, tmp9025);
__m512 tmp9007 = _mm512_fmadd_ps(tmp9011, _mm512_set1_ps(1.6e+01f), tmp9008);
__m512 tmp9027 = _mm512_fmadd_ps(tmp9031, _mm512_set1_ps(1.6e+01f), tmp9028);
__m512 tmp9014 = _mm512_fmadd_ps(tmp9011, _mm512_set1_ps(4e+00f), tmp9015);
__m512 tmp9034 = _mm512_fmadd_ps(tmp9031, _mm512_set1_ps(4e+00f), tmp9035);
__m512 tmp9020 = _mm512_add_ps(tmp9011, tmp9009);
__m512 tmp9040 = _mm512_add_ps(tmp9031, tmp9029);
__m512 tmp9013 = _mm512_fmadd_ps(tmp9004, _mm512_set1_ps(4e+00f), tmp9005);
__m512 tmp9033 = _mm512_fmadd_ps(tmp9024, _mm512_set1_ps(4e+00f), tmp9025);
__m512 tmp9017 = _mm512_fmadd_ps(tmp9004, _mm512_set1_ps(1.6e+01f), tmp9005);
__m512 tmp9037 = _mm512_fmadd_ps(tmp9024, _mm512_set1_ps(1.6e+01f), tmp9025);
__m512 tmp9002 = _mm512_add_ps(tmp9003, tmp8945);
__m512 tmp9022 = _mm512_add_ps(tmp9023, tmp8953);
__m512 tmp9019 = _mm512_add_ps(tmp9020, tmp8952);
__m512 tmp9039 = _mm512_add_ps(tmp9040, tmp9000);
__m512 tmp9001 = _mm512_fmadd_ps(tmp9006, _mm512_set1_ps(3.2e+01f), tmp9002);
__m512 tmp9021 = _mm512_fmadd_ps(tmp9026, _mm512_set1_ps(3.2e+01f), tmp9022);
__m512 tmp9012 = _mm512_fmadd_ps(tmp9006, _mm512_set1_ps(8e+00f), tmp9013);
__m512 tmp9032 = _mm512_fmadd_ps(tmp9026, _mm512_set1_ps(8e+00f), tmp9033);
__m512 tmp9018 = _mm512_fmadd_ps(tmp9010, _mm512_set1_ps(3.2e+01f), tmp9019);
__m512 tmp9038 = _mm512_fmadd_ps(tmp9030, _mm512_set1_ps(3.2e+01f), tmp9039);
__m512 tmp9016 = _mm512_fmadd_ps(tmp9006, _mm512_set1_ps(2e+00f), tmp9017);
__m512 tmp9036 = _mm512_fmadd_ps(tmp9026, _mm512_set1_ps(2e+00f), tmp9037);
// Name the six second-pass results of each set: out1263..out1268 (first set)
// and out1269..out1274 (second set).
__m512 out1263 = tmp9001;
__m512 out1269 = tmp9021;
__m512 out1264 = tmp9007;
__m512 out1270 = tmp9027;
__m512 out1265 = tmp9012;
__m512 out1271 = tmp9032;
__m512 out1266 = tmp9014;
__m512 out1272 = tmp9034;
__m512 out1267 = tmp9016;
__m512 out1273 = tmp9036;
__m512 out1268 = tmp9018;
__m512 out1274 = tmp9038;
// Clamp every result at zero: out = max(0, out).
out1263 = _mm512_max_ps(_mm512_setzero_ps(), out1263);
out1269 = _mm512_max_ps(_mm512_setzero_ps(), out1269);
out1264 = _mm512_max_ps(_mm512_setzero_ps(), out1264);
out1270 = _mm512_max_ps(_mm512_setzero_ps(), out1270);
out1265 = _mm512_max_ps(_mm512_setzero_ps(), out1265);
out1271 = _mm512_max_ps(_mm512_setzero_ps(), out1271);
out1266 = _mm512_max_ps(_mm512_setzero_ps(), out1266);
out1272 = _mm512_max_ps(_mm512_setzero_ps(), out1272);
out1267 = _mm512_max_ps(_mm512_setzero_ps(), out1267);
out1273 = _mm512_max_ps(_mm512_setzero_ps(), out1273);
out1268 = _mm512_max_ps(_mm512_setzero_ps(), out1268);
out1274 = _mm512_max_ps(_mm512_setzero_ps(), out1274);
// Masked stores: mask 255 writes elements 0..7 only, mask 4095 writes elements
// 0..11 only. The first set goes to constant offsets 96 + 224*r with mask 255
// and the second set to 12608 + 224*r with mask 4095, for r = 0..5.
// NOTE(review): the meaning of the 12608 offset (and whether datPtr13 is a
// byte pointer) is not visible here — confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr13+96+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1263);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1269);
_mm512_mask_storeu_ps(datPtr13+320+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1264);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1270);
_mm512_mask_storeu_ps(datPtr13+544+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1265);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1271);
_mm512_mask_storeu_ps(datPtr13+768+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1266);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1272);
_mm512_mask_storeu_ps(datPtr13+992+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1267);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1273);
_mm512_mask_storeu_ps(datPtr13+1216+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1268);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1274);
// Next tile: load 16 vectors from sfPtr7 and regroup them into two sets of
// eight inputs (in1341..in1348 and in1349..in1356).
// Loads come in four groups whose constant offsets are 25600 apart (512,
// 26112, 51712, 77312). Within a group, a pair is two loads 128 apart and the
// second-set pair sits 64 after the first-set pair.
// _mm512_shuffle_f32x4 with imm 68 keeps 128-bit lanes {a0,a1,b0,b1}; imm 238
// keeps {a2,a3,b2,b3}. Each in* vector is therefore the low (68) or high (238)
// 256-bit half of the first load followed by the same half of the second.
// NOTE(review): the 64-per-vector spacing suggests sfPtr7 is a byte pointer — confirm.
__m512 sf625 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf626 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1341 = _mm512_shuffle_f32x4(sf625, sf626, 68);
__m512 in1342 = _mm512_shuffle_f32x4(sf625, sf626, 238);
__m512 sf627 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf628 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1349 = _mm512_shuffle_f32x4(sf627, sf628, 68);
__m512 in1350 = _mm512_shuffle_f32x4(sf627, sf628, 238);
__m512 sf629 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf630 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1343 = _mm512_shuffle_f32x4(sf629, sf630, 68);
__m512 in1344 = _mm512_shuffle_f32x4(sf629, sf630, 238);
__m512 sf631 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf632 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1351 = _mm512_shuffle_f32x4(sf631, sf632, 68);
__m512 in1352 = _mm512_shuffle_f32x4(sf631, sf632, 238);
__m512 sf633 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf634 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1345 = _mm512_shuffle_f32x4(sf633, sf634, 68);
__m512 in1346 = _mm512_shuffle_f32x4(sf633, sf634, 238);
__m512 sf635 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf636 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1353 = _mm512_shuffle_f32x4(sf635, sf636, 68);
__m512 in1354 = _mm512_shuffle_f32x4(sf635, sf636, 238);
__m512 sf637 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf638 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1347 = _mm512_shuffle_f32x4(sf637, sf638, 68);
__m512 in1348 = _mm512_shuffle_f32x4(sf637, sf638, 238);
__m512 sf639 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k93+768*l32);
__m512 sf640 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k93+768*l32);
__m512 in1355 = _mm512_shuffle_f32x4(sf639, sf640, 68);
__m512 in1356 = _mm512_shuffle_f32x4(sf639, sf640, 238);
// First transform pass, two sets interleaved statement by statement.
// With x0..x7 = in1341..in1348 (the second set uses in1349..in1356 and tmp
// names 20 higher):
//   tmp9093 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   tmp9099 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   tmp9104 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   tmp9106 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   tmp9108 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   tmp9110 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
// NOTE(review): this coefficient pattern looks like a Winograd-style output
// transform — confirm against the code generator.
__m512 tmp9097 = _mm512_add_ps(in1342, in1343);
__m512 tmp9117 = _mm512_add_ps(in1350, in1351);
__m512 tmp9096 = _mm512_add_ps(in1344, in1345);
__m512 tmp9116 = _mm512_add_ps(in1352, in1353);
__m512 tmp9102 = _mm512_sub_ps(in1344, in1345);
__m512 tmp9122 = _mm512_sub_ps(in1352, in1353);
__m512 tmp9101 = _mm512_sub_ps(in1342, in1343);
__m512 tmp9121 = _mm512_sub_ps(in1350, in1351);
__m512 tmp9098 = _mm512_add_ps(in1346, in1347);
__m512 tmp9118 = _mm512_add_ps(in1354, in1355);
__m512 tmp9103 = _mm512_sub_ps(in1346, in1347);
__m512 tmp9123 = _mm512_sub_ps(in1354, in1355);
__m512 tmp9100 = _mm512_fmadd_ps(tmp9102, _mm512_set1_ps(2e+00f), tmp9101);
__m512 tmp9120 = _mm512_fmadd_ps(tmp9122, _mm512_set1_ps(2e+00f), tmp9121);
__m512 tmp9107 = _mm512_fmadd_ps(tmp9102, _mm512_set1_ps(8e+00f), tmp9101);
__m512 tmp9127 = _mm512_fmadd_ps(tmp9122, _mm512_set1_ps(8e+00f), tmp9121);
__m512 tmp9095 = _mm512_add_ps(tmp9096, tmp9097);
__m512 tmp9115 = _mm512_add_ps(tmp9116, tmp9117);
__m512 tmp9099 = _mm512_fmadd_ps(tmp9103, _mm512_set1_ps(1.6e+01f), tmp9100);
__m512 tmp9119 = _mm512_fmadd_ps(tmp9123, _mm512_set1_ps(1.6e+01f), tmp9120);
__m512 tmp9106 = _mm512_fmadd_ps(tmp9103, _mm512_set1_ps(4e+00f), tmp9107);
__m512 tmp9126 = _mm512_fmadd_ps(tmp9123, _mm512_set1_ps(4e+00f), tmp9127);
__m512 tmp9112 = _mm512_add_ps(tmp9103, tmp9101);
__m512 tmp9132 = _mm512_add_ps(tmp9123, tmp9121);
__m512 tmp9105 = _mm512_fmadd_ps(tmp9096, _mm512_set1_ps(4e+00f), tmp9097);
__m512 tmp9125 = _mm512_fmadd_ps(tmp9116, _mm512_set1_ps(4e+00f), tmp9117);
__m512 tmp9109 = _mm512_fmadd_ps(tmp9096, _mm512_set1_ps(1.6e+01f), tmp9097);
__m512 tmp9129 = _mm512_fmadd_ps(tmp9116, _mm512_set1_ps(1.6e+01f), tmp9117);
__m512 tmp9094 = _mm512_add_ps(tmp9095, in1341);
__m512 tmp9114 = _mm512_add_ps(tmp9115, in1349);
__m512 tmp9111 = _mm512_add_ps(tmp9112, in1348);
__m512 tmp9131 = _mm512_add_ps(tmp9132, in1356);
__m512 tmp9093 = _mm512_fmadd_ps(tmp9098, _mm512_set1_ps(3.2e+01f), tmp9094);
__m512 tmp9113 = _mm512_fmadd_ps(tmp9118, _mm512_set1_ps(3.2e+01f), tmp9114);
__m512 tmp9104 = _mm512_fmadd_ps(tmp9098, _mm512_set1_ps(8e+00f), tmp9105);
__m512 tmp9124 = _mm512_fmadd_ps(tmp9118, _mm512_set1_ps(8e+00f), tmp9125);
__m512 tmp9110 = _mm512_fmadd_ps(tmp9102, _mm512_set1_ps(3.2e+01f), tmp9111);
__m512 tmp9130 = _mm512_fmadd_ps(tmp9122, _mm512_set1_ps(3.2e+01f), tmp9131);
__m512 tmp9108 = _mm512_fmadd_ps(tmp9098, _mm512_set1_ps(2e+00f), tmp9109);
__m512 tmp9128 = _mm512_fmadd_ps(tmp9118, _mm512_set1_ps(2e+00f), tmp9129);
// Copy the 12 first-pass results into tmp9081..tmp9092 (first set in
// tmp9081..tmp9086, second set in tmp9087..tmp9092); the transpose below
// reads these names and then overwrites them.
__m512 tmp9081 = tmp9093;
__m512 tmp9087 = tmp9113;
__m512 tmp9082 = tmp9099;
__m512 tmp9088 = tmp9119;
__m512 tmp9083 = tmp9104;
__m512 tmp9089 = tmp9124;
__m512 tmp9084 = tmp9106;
__m512 tmp9090 = tmp9126;
__m512 tmp9085 = tmp9108;
__m512 tmp9091 = tmp9128;
__m512 tmp9086 = tmp9110;
__m512 tmp9092 = tmp9130;
// In-register transpose, stage 1: interleave 32-bit elements of adjacent rows.
__m512 tmp9177 = _mm512_unpacklo_ps(tmp9081, tmp9082);
__m512 tmp9178 = _mm512_unpackhi_ps(tmp9081, tmp9082);
__m512 tmp9179 = _mm512_unpacklo_ps(tmp9083, tmp9084);
__m512 tmp9180 = _mm512_unpackhi_ps(tmp9083, tmp9084);
__m512 tmp9181 = _mm512_unpacklo_ps(tmp9085, tmp9086);
__m512 tmp9182 = _mm512_unpackhi_ps(tmp9085, tmp9086);
__m512 tmp9183 = _mm512_unpacklo_ps(tmp9087, tmp9088);
__m512 tmp9184 = _mm512_unpackhi_ps(tmp9087, tmp9088);
__m512 tmp9185 = _mm512_unpacklo_ps(tmp9089, tmp9090);
__m512 tmp9186 = _mm512_unpackhi_ps(tmp9089, tmp9090);
__m512 tmp9187 = _mm512_unpacklo_ps(tmp9091, tmp9092);
__m512 tmp9188 = _mm512_unpackhi_ps(tmp9091, tmp9092);
// Stage 2: within each 128-bit lane, gather one element index from four rows.
__m512 tmp9189 = _mm512_shuffle_ps(tmp9177, tmp9179, 68);
__m512 tmp9190 = _mm512_shuffle_ps(tmp9177, tmp9179, 238);
__m512 tmp9191 = _mm512_shuffle_ps(tmp9178, tmp9180, 68);
__m512 tmp9192 = _mm512_shuffle_ps(tmp9178, tmp9180, 238);
__m512 tmp9193 = _mm512_shuffle_ps(tmp9181, tmp9183, 68);
__m512 tmp9194 = _mm512_shuffle_ps(tmp9181, tmp9183, 238);
__m512 tmp9195 = _mm512_shuffle_ps(tmp9182, tmp9184, 68);
__m512 tmp9196 = _mm512_shuffle_ps(tmp9182, tmp9184, 238);
__m512 tmp9197 = _mm512_shuffle_ps(tmp9185, tmp9187, 68);
__m512 tmp9198 = _mm512_shuffle_ps(tmp9185, tmp9187, 238);
__m512 tmp9199 = _mm512_shuffle_ps(tmp9186, tmp9188, 68);
__m512 tmp9200 = _mm512_shuffle_ps(tmp9186, tmp9188, 238);
// Stage 3: regroup 128-bit lanes (imm 136 = lanes {a0,a2,b0,b2}, imm 221 =
// lanes {a1,a3,b1,b3}). The last four rows have no partner group, so
// tmp9197..tmp9200 are shuffled with themselves.
__m512 tmp9201 = _mm512_shuffle_f32x4(tmp9189, tmp9193, 136);
__m512 tmp9202 = _mm512_shuffle_f32x4(tmp9189, tmp9193, 221);
__m512 tmp9203 = _mm512_shuffle_f32x4(tmp9190, tmp9194, 136);
__m512 tmp9204 = _mm512_shuffle_f32x4(tmp9190, tmp9194, 221);
__m512 tmp9205 = _mm512_shuffle_f32x4(tmp9191, tmp9195, 136);
__m512 tmp9206 = _mm512_shuffle_f32x4(tmp9191, tmp9195, 221);
__m512 tmp9207 = _mm512_shuffle_f32x4(tmp9192, tmp9196, 136);
__m512 tmp9208 = _mm512_shuffle_f32x4(tmp9192, tmp9196, 221);
__m512 tmp9209 = _mm512_shuffle_f32x4(tmp9197, tmp9197, 136);
__m512 tmp9210 = _mm512_shuffle_f32x4(tmp9197, tmp9197, 221);
__m512 tmp9211 = _mm512_shuffle_f32x4(tmp9198, tmp9198, 136);
__m512 tmp9212 = _mm512_shuffle_f32x4(tmp9198, tmp9198, 221);
__m512 tmp9213 = _mm512_shuffle_f32x4(tmp9199, tmp9199, 136);
__m512 tmp9214 = _mm512_shuffle_f32x4(tmp9199, tmp9199, 221);
__m512 tmp9215 = _mm512_shuffle_f32x4(tmp9200, tmp9200, 136);
__m512 tmp9216 = _mm512_shuffle_f32x4(tmp9200, tmp9200, 221);
// Stage 4: final lane regroup. Afterwards tmp9081..tmp9088 hold element
// columns 0..7 and tmp9089..tmp9092, tmp9133..tmp9136 hold columns 8..15 of
// the 12 pre-transpose rows. Elements 0..11 of each vector are the 12 rows in
// order; elements 12..15 repeat elements 8..11.
tmp9081 = _mm512_shuffle_f32x4(tmp9201, tmp9209, 136);
tmp9089 = _mm512_shuffle_f32x4(tmp9201, tmp9209, 221);
tmp9082 = _mm512_shuffle_f32x4(tmp9203, tmp9211, 136);
tmp9090 = _mm512_shuffle_f32x4(tmp9203, tmp9211, 221);
tmp9083 = _mm512_shuffle_f32x4(tmp9205, tmp9213, 136);
tmp9091 = _mm512_shuffle_f32x4(tmp9205, tmp9213, 221);
tmp9084 = _mm512_shuffle_f32x4(tmp9207, tmp9215, 136);
tmp9092 = _mm512_shuffle_f32x4(tmp9207, tmp9215, 221);
tmp9085 = _mm512_shuffle_f32x4(tmp9202, tmp9210, 136);
__m512 tmp9133 = _mm512_shuffle_f32x4(tmp9202, tmp9210, 221);
tmp9086 = _mm512_shuffle_f32x4(tmp9204, tmp9212, 136);
__m512 tmp9134 = _mm512_shuffle_f32x4(tmp9204, tmp9212, 221);
tmp9087 = _mm512_shuffle_f32x4(tmp9206, tmp9214, 136);
__m512 tmp9135 = _mm512_shuffle_f32x4(tmp9206, tmp9214, 221);
tmp9088 = _mm512_shuffle_f32x4(tmp9208, tmp9216, 136);
__m512 tmp9136 = _mm512_shuffle_f32x4(tmp9208, tmp9216, 221);
// Second transform pass: the same add/sub/fmadd pattern as the first pass,
// now with x0..x7 = tmp9081..tmp9088 (first set) and tmp9089..tmp9092,
// tmp9133..tmp9136 (second set, tmp names 20 higher).
__m512 tmp9141 = _mm512_add_ps(tmp9082, tmp9083);
__m512 tmp9161 = _mm512_add_ps(tmp9090, tmp9091);
__m512 tmp9140 = _mm512_add_ps(tmp9084, tmp9085);
__m512 tmp9160 = _mm512_add_ps(tmp9092, tmp9133);
__m512 tmp9146 = _mm512_sub_ps(tmp9084, tmp9085);
__m512 tmp9166 = _mm512_sub_ps(tmp9092, tmp9133);
__m512 tmp9145 = _mm512_sub_ps(tmp9082, tmp9083);
__m512 tmp9165 = _mm512_sub_ps(tmp9090, tmp9091);
__m512 tmp9142 = _mm512_add_ps(tmp9086, tmp9087);
__m512 tmp9162 = _mm512_add_ps(tmp9134, tmp9135);
__m512 tmp9147 = _mm512_sub_ps(tmp9086, tmp9087);
__m512 tmp9167 = _mm512_sub_ps(tmp9134, tmp9135);
__m512 tmp9144 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(2e+00f), tmp9145);
__m512 tmp9164 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(2e+00f), tmp9165);
__m512 tmp9151 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(8e+00f), tmp9145);
__m512 tmp9171 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(8e+00f), tmp9165);
__m512 tmp9139 = _mm512_add_ps(tmp9140, tmp9141);
__m512 tmp9159 = _mm512_add_ps(tmp9160, tmp9161);
__m512 tmp9143 = _mm512_fmadd_ps(tmp9147, _mm512_set1_ps(1.6e+01f), tmp9144);
__m512 tmp9163 = _mm512_fmadd_ps(tmp9167, _mm512_set1_ps(1.6e+01f), tmp9164);
__m512 tmp9150 = _mm512_fmadd_ps(tmp9147, _mm512_set1_ps(4e+00f), tmp9151);
__m512 tmp9170 = _mm512_fmadd_ps(tmp9167, _mm512_set1_ps(4e+00f), tmp9171);
__m512 tmp9156 = _mm512_add_ps(tmp9147, tmp9145);
__m512 tmp9176 = _mm512_add_ps(tmp9167, tmp9165);
__m512 tmp9149 = _mm512_fmadd_ps(tmp9140, _mm512_set1_ps(4e+00f), tmp9141);
__m512 tmp9169 = _mm512_fmadd_ps(tmp9160, _mm512_set1_ps(4e+00f), tmp9161);
__m512 tmp9153 = _mm512_fmadd_ps(tmp9140, _mm512_set1_ps(1.6e+01f), tmp9141);
__m512 tmp9173 = _mm512_fmadd_ps(tmp9160, _mm512_set1_ps(1.6e+01f), tmp9161);
__m512 tmp9138 = _mm512_add_ps(tmp9139, tmp9081);
__m512 tmp9158 = _mm512_add_ps(tmp9159, tmp9089);
__m512 tmp9155 = _mm512_add_ps(tmp9156, tmp9088);
__m512 tmp9175 = _mm512_add_ps(tmp9176, tmp9136);
__m512 tmp9137 = _mm512_fmadd_ps(tmp9142, _mm512_set1_ps(3.2e+01f), tmp9138);
__m512 tmp9157 = _mm512_fmadd_ps(tmp9162, _mm512_set1_ps(3.2e+01f), tmp9158);
__m512 tmp9148 = _mm512_fmadd_ps(tmp9142, _mm512_set1_ps(8e+00f), tmp9149);
__m512 tmp9168 = _mm512_fmadd_ps(tmp9162, _mm512_set1_ps(8e+00f), tmp9169);
__m512 tmp9154 = _mm512_fmadd_ps(tmp9146, _mm512_set1_ps(3.2e+01f), tmp9155);
__m512 tmp9174 = _mm512_fmadd_ps(tmp9166, _mm512_set1_ps(3.2e+01f), tmp9175);
__m512 tmp9152 = _mm512_fmadd_ps(tmp9142, _mm512_set1_ps(2e+00f), tmp9153);
__m512 tmp9172 = _mm512_fmadd_ps(tmp9162, _mm512_set1_ps(2e+00f), tmp9173);
__m512 out1275 = tmp9137;
__m512 out1281 = tmp9157;
__m512 out1276 = tmp9143;
__m512 out1282 = tmp9163;
__m512 out1277 = tmp9148;
__m512 out1283 = tmp9168;
__m512 out1278 = tmp9150;
__m512 out1284 = tmp9170;
__m512 out1279 = tmp9152;
__m512 out1285 = tmp9172;
__m512 out1280 = tmp9154;
__m512 out1286 = tmp9174;
out1275 = _mm512_max_ps(_mm512_setzero_ps(), out1275);
out1281 = _mm512_max_ps(_mm512_setzero_ps(), out1281);
out1276 = _mm512_max_ps(_mm512_setzero_ps(), out1276);
out1282 = _mm512_max_ps(_mm512_setzero_ps(), out1282);
out1277 = _mm512_max_ps(_mm512_setzero_ps(), out1277);
out1283 = _mm512_max_ps(_mm512_setzero_ps(), out1283);
out1278 = _mm512_max_ps(_mm512_setzero_ps(), out1278);
out1284 = _mm512_max_ps(_mm512_setzero_ps(), out1284);
out1279 = _mm512_max_ps(_mm512_setzero_ps(), out1279);
out1285 = _mm512_max_ps(_mm512_setzero_ps(), out1285);
out1280 = _mm512_max_ps(_mm512_setzero_ps(), out1280);
out1286 = _mm512_max_ps(_mm512_setzero_ps(), out1286);
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1275);
_mm512_mask_storeu_ps(datPtr13+12704+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1281);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1276);
_mm512_mask_storeu_ps(datPtr13+12928+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1282);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1277);
_mm512_mask_storeu_ps(datPtr13+13152+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1283);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1278);
_mm512_mask_storeu_ps(datPtr13+13376+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1284);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1279);
_mm512_mask_storeu_ps(datPtr13+13600+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1285);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 4095, out1280);
_mm512_mask_storeu_ps(datPtr13+13824+50432*i29+224*toH33+4*toW33+50432*k93+25216*l32, 255, out1286);
}
}
++j23;
if (j23 >= 15) break;
rel17 = 3;
}
if (rel17 < 4) {
ptrdiff_t toH34 = base17+12;
ptrdiff_t toW34 = 0;
ptrdiff_t k94 = 1*w46;
for (; k94 != 1; ++k94) {
ptrdiff_t l33 = 0;
for (; l33 != 2; ++l33) {
__m512 sf641 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf642 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1357 = _mm512_shuffle_f32x4(sf641, sf642, 68);
__m512 in1358 = _mm512_shuffle_f32x4(sf641, sf642, 238);
__m512 sf643 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf644 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1365 = _mm512_shuffle_f32x4(sf643, sf644, 68);
__m512 in1366 = _mm512_shuffle_f32x4(sf643, sf644, 238);
__m512 sf645 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf646 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1359 = _mm512_shuffle_f32x4(sf645, sf646, 68);
__m512 in1360 = _mm512_shuffle_f32x4(sf645, sf646, 238);
__m512 sf647 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf648 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1367 = _mm512_shuffle_f32x4(sf647, sf648, 68);
__m512 in1368 = _mm512_shuffle_f32x4(sf647, sf648, 238);
__m512 sf649 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf650 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1361 = _mm512_shuffle_f32x4(sf649, sf650, 68);
__m512 in1362 = _mm512_shuffle_f32x4(sf649, sf650, 238);
__m512 sf651 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf652 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1369 = _mm512_shuffle_f32x4(sf651, sf652, 68);
__m512 in1370 = _mm512_shuffle_f32x4(sf651, sf652, 238);
__m512 sf653 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf654 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1363 = _mm512_shuffle_f32x4(sf653, sf654, 68);
__m512 in1364 = _mm512_shuffle_f32x4(sf653, sf654, 238);
__m512 sf655 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf656 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1371 = _mm512_shuffle_f32x4(sf655, sf656, 68);
__m512 in1372 = _mm512_shuffle_f32x4(sf655, sf656, 238);
__m512 tmp9233 = _mm512_add_ps(in1358, in1359);
__m512 tmp9253 = _mm512_add_ps(in1366, in1367);
__m512 tmp9232 = _mm512_add_ps(in1360, in1361);
__m512 tmp9252 = _mm512_add_ps(in1368, in1369);
__m512 tmp9238 = _mm512_sub_ps(in1360, in1361);
__m512 tmp9258 = _mm512_sub_ps(in1368, in1369);
__m512 tmp9237 = _mm512_sub_ps(in1358, in1359);
__m512 tmp9257 = _mm512_sub_ps(in1366, in1367);
__m512 tmp9234 = _mm512_add_ps(in1362, in1363);
__m512 tmp9254 = _mm512_add_ps(in1370, in1371);
__m512 tmp9239 = _mm512_sub_ps(in1362, in1363);
__m512 tmp9259 = _mm512_sub_ps(in1370, in1371);
__m512 tmp9236 = _mm512_fmadd_ps(tmp9238, _mm512_set1_ps(2e+00f), tmp9237);
__m512 tmp9256 = _mm512_fmadd_ps(tmp9258, _mm512_set1_ps(2e+00f), tmp9257);
__m512 tmp9243 = _mm512_fmadd_ps(tmp9238, _mm512_set1_ps(8e+00f), tmp9237);
__m512 tmp9263 = _mm512_fmadd_ps(tmp9258, _mm512_set1_ps(8e+00f), tmp9257);
__m512 tmp9231 = _mm512_add_ps(tmp9232, tmp9233);
__m512 tmp9251 = _mm512_add_ps(tmp9252, tmp9253);
__m512 tmp9235 = _mm512_fmadd_ps(tmp9239, _mm512_set1_ps(1.6e+01f), tmp9236);
__m512 tmp9255 = _mm512_fmadd_ps(tmp9259, _mm512_set1_ps(1.6e+01f), tmp9256);
__m512 tmp9242 = _mm512_fmadd_ps(tmp9239, _mm512_set1_ps(4e+00f), tmp9243);
__m512 tmp9262 = _mm512_fmadd_ps(tmp9259, _mm512_set1_ps(4e+00f), tmp9263);
__m512 tmp9248 = _mm512_add_ps(tmp9239, tmp9237);
__m512 tmp9268 = _mm512_add_ps(tmp9259, tmp9257);
__m512 tmp9241 = _mm512_fmadd_ps(tmp9232, _mm512_set1_ps(4e+00f), tmp9233);
__m512 tmp9261 = _mm512_fmadd_ps(tmp9252, _mm512_set1_ps(4e+00f), tmp9253);
__m512 tmp9245 = _mm512_fmadd_ps(tmp9232, _mm512_set1_ps(1.6e+01f), tmp9233);
__m512 tmp9265 = _mm512_fmadd_ps(tmp9252, _mm512_set1_ps(1.6e+01f), tmp9253);
__m512 tmp9230 = _mm512_add_ps(tmp9231, in1357);
__m512 tmp9250 = _mm512_add_ps(tmp9251, in1365);
__m512 tmp9247 = _mm512_add_ps(tmp9248, in1364);
__m512 tmp9267 = _mm512_add_ps(tmp9268, in1372);
__m512 tmp9229 = _mm512_fmadd_ps(tmp9234, _mm512_set1_ps(3.2e+01f), tmp9230);
__m512 tmp9249 = _mm512_fmadd_ps(tmp9254, _mm512_set1_ps(3.2e+01f), tmp9250);
__m512 tmp9240 = _mm512_fmadd_ps(tmp9234, _mm512_set1_ps(8e+00f), tmp9241);
__m512 tmp9260 = _mm512_fmadd_ps(tmp9254, _mm512_set1_ps(8e+00f), tmp9261);
__m512 tmp9246 = _mm512_fmadd_ps(tmp9238, _mm512_set1_ps(3.2e+01f), tmp9247);
__m512 tmp9266 = _mm512_fmadd_ps(tmp9258, _mm512_set1_ps(3.2e+01f), tmp9267);
__m512 tmp9244 = _mm512_fmadd_ps(tmp9234, _mm512_set1_ps(2e+00f), tmp9245);
__m512 tmp9264 = _mm512_fmadd_ps(tmp9254, _mm512_set1_ps(2e+00f), tmp9265);
__m512 tmp9217 = tmp9229;
__m512 tmp9223 = tmp9249;
__m512 tmp9218 = tmp9235;
__m512 tmp9224 = tmp9255;
__m512 tmp9219 = tmp9240;
__m512 tmp9225 = tmp9260;
__m512 tmp9220 = tmp9242;
__m512 tmp9226 = tmp9262;
__m512 tmp9221 = tmp9244;
__m512 tmp9227 = tmp9264;
__m512 tmp9222 = tmp9246;
__m512 tmp9228 = tmp9266;
__m512 tmp9313 = _mm512_unpacklo_ps(tmp9217, tmp9218);
__m512 tmp9314 = _mm512_unpackhi_ps(tmp9217, tmp9218);
__m512 tmp9315 = _mm512_unpacklo_ps(tmp9219, tmp9220);
__m512 tmp9316 = _mm512_unpackhi_ps(tmp9219, tmp9220);
__m512 tmp9317 = _mm512_unpacklo_ps(tmp9221, tmp9222);
__m512 tmp9318 = _mm512_unpackhi_ps(tmp9221, tmp9222);
__m512 tmp9319 = _mm512_unpacklo_ps(tmp9223, tmp9224);
__m512 tmp9320 = _mm512_unpackhi_ps(tmp9223, tmp9224);
__m512 tmp9321 = _mm512_unpacklo_ps(tmp9225, tmp9226);
__m512 tmp9322 = _mm512_unpackhi_ps(tmp9225, tmp9226);
__m512 tmp9323 = _mm512_unpacklo_ps(tmp9227, tmp9228);
__m512 tmp9324 = _mm512_unpackhi_ps(tmp9227, tmp9228);
__m512 tmp9325 = _mm512_shuffle_ps(tmp9313, tmp9315, 68);
__m512 tmp9326 = _mm512_shuffle_ps(tmp9313, tmp9315, 238);
__m512 tmp9327 = _mm512_shuffle_ps(tmp9314, tmp9316, 68);
__m512 tmp9328 = _mm512_shuffle_ps(tmp9314, tmp9316, 238);
__m512 tmp9329 = _mm512_shuffle_ps(tmp9317, tmp9319, 68);
__m512 tmp9330 = _mm512_shuffle_ps(tmp9317, tmp9319, 238);
__m512 tmp9331 = _mm512_shuffle_ps(tmp9318, tmp9320, 68);
__m512 tmp9332 = _mm512_shuffle_ps(tmp9318, tmp9320, 238);
__m512 tmp9333 = _mm512_shuffle_ps(tmp9321, tmp9323, 68);
__m512 tmp9334 = _mm512_shuffle_ps(tmp9321, tmp9323, 238);
__m512 tmp9335 = _mm512_shuffle_ps(tmp9322, tmp9324, 68);
__m512 tmp9336 = _mm512_shuffle_ps(tmp9322, tmp9324, 238);
__m512 tmp9337 = _mm512_shuffle_f32x4(tmp9325, tmp9329, 136);
__m512 tmp9338 = _mm512_shuffle_f32x4(tmp9325, tmp9329, 221);
__m512 tmp9339 = _mm512_shuffle_f32x4(tmp9326, tmp9330, 136);
__m512 tmp9340 = _mm512_shuffle_f32x4(tmp9326, tmp9330, 221);
__m512 tmp9341 = _mm512_shuffle_f32x4(tmp9327, tmp9331, 136);
__m512 tmp9342 = _mm512_shuffle_f32x4(tmp9327, tmp9331, 221);
__m512 tmp9343 = _mm512_shuffle_f32x4(tmp9328, tmp9332, 136);
__m512 tmp9344 = _mm512_shuffle_f32x4(tmp9328, tmp9332, 221);
__m512 tmp9345 = _mm512_shuffle_f32x4(tmp9333, tmp9333, 136);
__m512 tmp9346 = _mm512_shuffle_f32x4(tmp9333, tmp9333, 221);
__m512 tmp9347 = _mm512_shuffle_f32x4(tmp9334, tmp9334, 136);
__m512 tmp9348 = _mm512_shuffle_f32x4(tmp9334, tmp9334, 221);
__m512 tmp9349 = _mm512_shuffle_f32x4(tmp9335, tmp9335, 136);
__m512 tmp9350 = _mm512_shuffle_f32x4(tmp9335, tmp9335, 221);
__m512 tmp9351 = _mm512_shuffle_f32x4(tmp9336, tmp9336, 136);
__m512 tmp9352 = _mm512_shuffle_f32x4(tmp9336, tmp9336, 221);
tmp9217 = _mm512_shuffle_f32x4(tmp9337, tmp9345, 136);
tmp9225 = _mm512_shuffle_f32x4(tmp9337, tmp9345, 221);
tmp9218 = _mm512_shuffle_f32x4(tmp9339, tmp9347, 136);
tmp9226 = _mm512_shuffle_f32x4(tmp9339, tmp9347, 221);
tmp9219 = _mm512_shuffle_f32x4(tmp9341, tmp9349, 136);
tmp9227 = _mm512_shuffle_f32x4(tmp9341, tmp9349, 221);
tmp9220 = _mm512_shuffle_f32x4(tmp9343, tmp9351, 136);
tmp9228 = _mm512_shuffle_f32x4(tmp9343, tmp9351, 221);
tmp9221 = _mm512_shuffle_f32x4(tmp9338, tmp9346, 136);
__m512 tmp9269 = _mm512_shuffle_f32x4(tmp9338, tmp9346, 221);
tmp9222 = _mm512_shuffle_f32x4(tmp9340, tmp9348, 136);
__m512 tmp9270 = _mm512_shuffle_f32x4(tmp9340, tmp9348, 221);
tmp9223 = _mm512_shuffle_f32x4(tmp9342, tmp9350, 136);
__m512 tmp9271 = _mm512_shuffle_f32x4(tmp9342, tmp9350, 221);
tmp9224 = _mm512_shuffle_f32x4(tmp9344, tmp9352, 136);
__m512 tmp9272 = _mm512_shuffle_f32x4(tmp9344, tmp9352, 221);
__m512 tmp9277 = _mm512_add_ps(tmp9218, tmp9219);
__m512 tmp9297 = _mm512_add_ps(tmp9226, tmp9227);
__m512 tmp9276 = _mm512_add_ps(tmp9220, tmp9221);
__m512 tmp9296 = _mm512_add_ps(tmp9228, tmp9269);
__m512 tmp9282 = _mm512_sub_ps(tmp9220, tmp9221);
__m512 tmp9302 = _mm512_sub_ps(tmp9228, tmp9269);
__m512 tmp9281 = _mm512_sub_ps(tmp9218, tmp9219);
__m512 tmp9301 = _mm512_sub_ps(tmp9226, tmp9227);
__m512 tmp9278 = _mm512_add_ps(tmp9222, tmp9223);
__m512 tmp9298 = _mm512_add_ps(tmp9270, tmp9271);
__m512 tmp9283 = _mm512_sub_ps(tmp9222, tmp9223);
__m512 tmp9303 = _mm512_sub_ps(tmp9270, tmp9271);
__m512 tmp9280 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(2e+00f), tmp9281);
__m512 tmp9300 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(2e+00f), tmp9301);
__m512 tmp9287 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(8e+00f), tmp9281);
__m512 tmp9307 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(8e+00f), tmp9301);
__m512 tmp9275 = _mm512_add_ps(tmp9276, tmp9277);
__m512 tmp9295 = _mm512_add_ps(tmp9296, tmp9297);
__m512 tmp9279 = _mm512_fmadd_ps(tmp9283, _mm512_set1_ps(1.6e+01f), tmp9280);
__m512 tmp9299 = _mm512_fmadd_ps(tmp9303, _mm512_set1_ps(1.6e+01f), tmp9300);
__m512 tmp9286 = _mm512_fmadd_ps(tmp9283, _mm512_set1_ps(4e+00f), tmp9287);
__m512 tmp9306 = _mm512_fmadd_ps(tmp9303, _mm512_set1_ps(4e+00f), tmp9307);
__m512 tmp9292 = _mm512_add_ps(tmp9283, tmp9281);
__m512 tmp9312 = _mm512_add_ps(tmp9303, tmp9301);
__m512 tmp9285 = _mm512_fmadd_ps(tmp9276, _mm512_set1_ps(4e+00f), tmp9277);
__m512 tmp9305 = _mm512_fmadd_ps(tmp9296, _mm512_set1_ps(4e+00f), tmp9297);
__m512 tmp9289 = _mm512_fmadd_ps(tmp9276, _mm512_set1_ps(1.6e+01f), tmp9277);
__m512 tmp9309 = _mm512_fmadd_ps(tmp9296, _mm512_set1_ps(1.6e+01f), tmp9297);
__m512 tmp9274 = _mm512_add_ps(tmp9275, tmp9217);
__m512 tmp9294 = _mm512_add_ps(tmp9295, tmp9225);
__m512 tmp9291 = _mm512_add_ps(tmp9292, tmp9224);
__m512 tmp9311 = _mm512_add_ps(tmp9312, tmp9272);
__m512 tmp9273 = _mm512_fmadd_ps(tmp9278, _mm512_set1_ps(3.2e+01f), tmp9274);
__m512 tmp9293 = _mm512_fmadd_ps(tmp9298, _mm512_set1_ps(3.2e+01f), tmp9294);
__m512 tmp9284 = _mm512_fmadd_ps(tmp9278, _mm512_set1_ps(8e+00f), tmp9285);
__m512 tmp9304 = _mm512_fmadd_ps(tmp9298, _mm512_set1_ps(8e+00f), tmp9305);
__m512 tmp9290 = _mm512_fmadd_ps(tmp9282, _mm512_set1_ps(3.2e+01f), tmp9291);
__m512 tmp9310 = _mm512_fmadd_ps(tmp9302, _mm512_set1_ps(3.2e+01f), tmp9311);
__m512 tmp9288 = _mm512_fmadd_ps(tmp9278, _mm512_set1_ps(2e+00f), tmp9289);
__m512 tmp9308 = _mm512_fmadd_ps(tmp9298, _mm512_set1_ps(2e+00f), tmp9309);
__m512 out1287 = tmp9273;
__m512 out1293 = tmp9293;
__m512 out1288 = tmp9279;
__m512 out1294 = tmp9299;
__m512 out1289 = tmp9284;
__m512 out1295 = tmp9304;
__m512 out1290 = tmp9286;
__m512 out1296 = tmp9306;
__m512 out1291 = tmp9288;
__m512 out1297 = tmp9308;
__m512 out1292 = tmp9290;
__m512 out1298 = tmp9310;
out1287 = _mm512_max_ps(_mm512_setzero_ps(), out1287);
out1293 = _mm512_max_ps(_mm512_setzero_ps(), out1293);
out1288 = _mm512_max_ps(_mm512_setzero_ps(), out1288);
out1294 = _mm512_max_ps(_mm512_setzero_ps(), out1294);
out1289 = _mm512_max_ps(_mm512_setzero_ps(), out1289);
out1295 = _mm512_max_ps(_mm512_setzero_ps(), out1295);
out1290 = _mm512_max_ps(_mm512_setzero_ps(), out1290);
out1296 = _mm512_max_ps(_mm512_setzero_ps(), out1296);
out1291 = _mm512_max_ps(_mm512_setzero_ps(), out1291);
out1297 = _mm512_max_ps(_mm512_setzero_ps(), out1297);
out1292 = _mm512_max_ps(_mm512_setzero_ps(), out1292);
out1298 = _mm512_max_ps(_mm512_setzero_ps(), out1298);
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1287);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1293);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1288);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1294);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1289);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1295);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1290);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1296);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1291);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1297);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1292);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1298);
__m512 sf657 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf658 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1373 = _mm512_shuffle_f32x4(sf657, sf658, 68);
__m512 in1374 = _mm512_shuffle_f32x4(sf657, sf658, 238);
__m512 sf659 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf660 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1381 = _mm512_shuffle_f32x4(sf659, sf660, 68);
__m512 in1382 = _mm512_shuffle_f32x4(sf659, sf660, 238);
__m512 sf661 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf662 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1375 = _mm512_shuffle_f32x4(sf661, sf662, 68);
__m512 in1376 = _mm512_shuffle_f32x4(sf661, sf662, 238);
__m512 sf663 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf664 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1383 = _mm512_shuffle_f32x4(sf663, sf664, 68);
__m512 in1384 = _mm512_shuffle_f32x4(sf663, sf664, 238);
__m512 sf665 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf666 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1377 = _mm512_shuffle_f32x4(sf665, sf666, 68);
__m512 in1378 = _mm512_shuffle_f32x4(sf665, sf666, 238);
__m512 sf667 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf668 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1385 = _mm512_shuffle_f32x4(sf667, sf668, 68);
__m512 in1386 = _mm512_shuffle_f32x4(sf667, sf668, 238);
__m512 sf669 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf670 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1379 = _mm512_shuffle_f32x4(sf669, sf670, 68);
__m512 in1380 = _mm512_shuffle_f32x4(sf669, sf670, 238);
__m512 sf671 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf672 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1387 = _mm512_shuffle_f32x4(sf671, sf672, 68);
__m512 in1388 = _mm512_shuffle_f32x4(sf671, sf672, 238);
__m512 tmp9369 = _mm512_add_ps(in1374, in1375);
__m512 tmp9389 = _mm512_add_ps(in1382, in1383);
__m512 tmp9368 = _mm512_add_ps(in1376, in1377);
__m512 tmp9388 = _mm512_add_ps(in1384, in1385);
__m512 tmp9374 = _mm512_sub_ps(in1376, in1377);
__m512 tmp9394 = _mm512_sub_ps(in1384, in1385);
__m512 tmp9373 = _mm512_sub_ps(in1374, in1375);
__m512 tmp9393 = _mm512_sub_ps(in1382, in1383);
__m512 tmp9370 = _mm512_add_ps(in1378, in1379);
__m512 tmp9390 = _mm512_add_ps(in1386, in1387);
__m512 tmp9375 = _mm512_sub_ps(in1378, in1379);
__m512 tmp9395 = _mm512_sub_ps(in1386, in1387);
__m512 tmp9372 = _mm512_fmadd_ps(tmp9374, _mm512_set1_ps(2e+00f), tmp9373);
__m512 tmp9392 = _mm512_fmadd_ps(tmp9394, _mm512_set1_ps(2e+00f), tmp9393);
__m512 tmp9379 = _mm512_fmadd_ps(tmp9374, _mm512_set1_ps(8e+00f), tmp9373);
__m512 tmp9399 = _mm512_fmadd_ps(tmp9394, _mm512_set1_ps(8e+00f), tmp9393);
__m512 tmp9367 = _mm512_add_ps(tmp9368, tmp9369);
__m512 tmp9387 = _mm512_add_ps(tmp9388, tmp9389);
__m512 tmp9371 = _mm512_fmadd_ps(tmp9375, _mm512_set1_ps(1.6e+01f), tmp9372);
__m512 tmp9391 = _mm512_fmadd_ps(tmp9395, _mm512_set1_ps(1.6e+01f), tmp9392);
__m512 tmp9378 = _mm512_fmadd_ps(tmp9375, _mm512_set1_ps(4e+00f), tmp9379);
__m512 tmp9398 = _mm512_fmadd_ps(tmp9395, _mm512_set1_ps(4e+00f), tmp9399);
__m512 tmp9384 = _mm512_add_ps(tmp9375, tmp9373);
__m512 tmp9404 = _mm512_add_ps(tmp9395, tmp9393);
__m512 tmp9377 = _mm512_fmadd_ps(tmp9368, _mm512_set1_ps(4e+00f), tmp9369);
__m512 tmp9397 = _mm512_fmadd_ps(tmp9388, _mm512_set1_ps(4e+00f), tmp9389);
__m512 tmp9381 = _mm512_fmadd_ps(tmp9368, _mm512_set1_ps(1.6e+01f), tmp9369);
__m512 tmp9401 = _mm512_fmadd_ps(tmp9388, _mm512_set1_ps(1.6e+01f), tmp9389);
__m512 tmp9366 = _mm512_add_ps(tmp9367, in1373);
__m512 tmp9386 = _mm512_add_ps(tmp9387, in1381);
__m512 tmp9383 = _mm512_add_ps(tmp9384, in1380);
__m512 tmp9403 = _mm512_add_ps(tmp9404, in1388);
__m512 tmp9365 = _mm512_fmadd_ps(tmp9370, _mm512_set1_ps(3.2e+01f), tmp9366);
__m512 tmp9385 = _mm512_fmadd_ps(tmp9390, _mm512_set1_ps(3.2e+01f), tmp9386);
__m512 tmp9376 = _mm512_fmadd_ps(tmp9370, _mm512_set1_ps(8e+00f), tmp9377);
__m512 tmp9396 = _mm512_fmadd_ps(tmp9390, _mm512_set1_ps(8e+00f), tmp9397);
__m512 tmp9382 = _mm512_fmadd_ps(tmp9374, _mm512_set1_ps(3.2e+01f), tmp9383);
__m512 tmp9402 = _mm512_fmadd_ps(tmp9394, _mm512_set1_ps(3.2e+01f), tmp9403);
__m512 tmp9380 = _mm512_fmadd_ps(tmp9370, _mm512_set1_ps(2e+00f), tmp9381);
__m512 tmp9400 = _mm512_fmadd_ps(tmp9390, _mm512_set1_ps(2e+00f), tmp9401);
__m512 tmp9353 = tmp9365;
__m512 tmp9359 = tmp9385;
__m512 tmp9354 = tmp9371;
__m512 tmp9360 = tmp9391;
__m512 tmp9355 = tmp9376;
__m512 tmp9361 = tmp9396;
__m512 tmp9356 = tmp9378;
__m512 tmp9362 = tmp9398;
__m512 tmp9357 = tmp9380;
__m512 tmp9363 = tmp9400;
__m512 tmp9358 = tmp9382;
__m512 tmp9364 = tmp9402;
__m512 tmp9449 = _mm512_unpacklo_ps(tmp9353, tmp9354);
__m512 tmp9450 = _mm512_unpackhi_ps(tmp9353, tmp9354);
__m512 tmp9451 = _mm512_unpacklo_ps(tmp9355, tmp9356);
__m512 tmp9452 = _mm512_unpackhi_ps(tmp9355, tmp9356);
__m512 tmp9453 = _mm512_unpacklo_ps(tmp9357, tmp9358);
__m512 tmp9454 = _mm512_unpackhi_ps(tmp9357, tmp9358);
__m512 tmp9455 = _mm512_unpacklo_ps(tmp9359, tmp9360);
__m512 tmp9456 = _mm512_unpackhi_ps(tmp9359, tmp9360);
__m512 tmp9457 = _mm512_unpacklo_ps(tmp9361, tmp9362);
__m512 tmp9458 = _mm512_unpackhi_ps(tmp9361, tmp9362);
__m512 tmp9459 = _mm512_unpacklo_ps(tmp9363, tmp9364);
__m512 tmp9460 = _mm512_unpackhi_ps(tmp9363, tmp9364);
__m512 tmp9461 = _mm512_shuffle_ps(tmp9449, tmp9451, 68);
__m512 tmp9462 = _mm512_shuffle_ps(tmp9449, tmp9451, 238);
__m512 tmp9463 = _mm512_shuffle_ps(tmp9450, tmp9452, 68);
__m512 tmp9464 = _mm512_shuffle_ps(tmp9450, tmp9452, 238);
__m512 tmp9465 = _mm512_shuffle_ps(tmp9453, tmp9455, 68);
__m512 tmp9466 = _mm512_shuffle_ps(tmp9453, tmp9455, 238);
__m512 tmp9467 = _mm512_shuffle_ps(tmp9454, tmp9456, 68);
__m512 tmp9468 = _mm512_shuffle_ps(tmp9454, tmp9456, 238);
__m512 tmp9469 = _mm512_shuffle_ps(tmp9457, tmp9459, 68);
__m512 tmp9470 = _mm512_shuffle_ps(tmp9457, tmp9459, 238);
__m512 tmp9471 = _mm512_shuffle_ps(tmp9458, tmp9460, 68);
__m512 tmp9472 = _mm512_shuffle_ps(tmp9458, tmp9460, 238);
__m512 tmp9473 = _mm512_shuffle_f32x4(tmp9461, tmp9465, 136);
__m512 tmp9474 = _mm512_shuffle_f32x4(tmp9461, tmp9465, 221);
__m512 tmp9475 = _mm512_shuffle_f32x4(tmp9462, tmp9466, 136);
__m512 tmp9476 = _mm512_shuffle_f32x4(tmp9462, tmp9466, 221);
__m512 tmp9477 = _mm512_shuffle_f32x4(tmp9463, tmp9467, 136);
__m512 tmp9478 = _mm512_shuffle_f32x4(tmp9463, tmp9467, 221);
__m512 tmp9479 = _mm512_shuffle_f32x4(tmp9464, tmp9468, 136);
__m512 tmp9480 = _mm512_shuffle_f32x4(tmp9464, tmp9468, 221);
__m512 tmp9481 = _mm512_shuffle_f32x4(tmp9469, tmp9469, 136);
__m512 tmp9482 = _mm512_shuffle_f32x4(tmp9469, tmp9469, 221);
__m512 tmp9483 = _mm512_shuffle_f32x4(tmp9470, tmp9470, 136);
__m512 tmp9484 = _mm512_shuffle_f32x4(tmp9470, tmp9470, 221);
__m512 tmp9485 = _mm512_shuffle_f32x4(tmp9471, tmp9471, 136);
__m512 tmp9486 = _mm512_shuffle_f32x4(tmp9471, tmp9471, 221);
__m512 tmp9487 = _mm512_shuffle_f32x4(tmp9472, tmp9472, 136);
__m512 tmp9488 = _mm512_shuffle_f32x4(tmp9472, tmp9472, 221);
tmp9353 = _mm512_shuffle_f32x4(tmp9473, tmp9481, 136);
tmp9361 = _mm512_shuffle_f32x4(tmp9473, tmp9481, 221);
tmp9354 = _mm512_shuffle_f32x4(tmp9475, tmp9483, 136);
tmp9362 = _mm512_shuffle_f32x4(tmp9475, tmp9483, 221);
tmp9355 = _mm512_shuffle_f32x4(tmp9477, tmp9485, 136);
tmp9363 = _mm512_shuffle_f32x4(tmp9477, tmp9485, 221);
tmp9356 = _mm512_shuffle_f32x4(tmp9479, tmp9487, 136);
tmp9364 = _mm512_shuffle_f32x4(tmp9479, tmp9487, 221);
tmp9357 = _mm512_shuffle_f32x4(tmp9474, tmp9482, 136);
__m512 tmp9405 = _mm512_shuffle_f32x4(tmp9474, tmp9482, 221);
tmp9358 = _mm512_shuffle_f32x4(tmp9476, tmp9484, 136);
__m512 tmp9406 = _mm512_shuffle_f32x4(tmp9476, tmp9484, 221);
tmp9359 = _mm512_shuffle_f32x4(tmp9478, tmp9486, 136);
__m512 tmp9407 = _mm512_shuffle_f32x4(tmp9478, tmp9486, 221);
tmp9360 = _mm512_shuffle_f32x4(tmp9480, tmp9488, 136);
__m512 tmp9408 = _mm512_shuffle_f32x4(tmp9480, tmp9488, 221);
__m512 tmp9413 = _mm512_add_ps(tmp9354, tmp9355);
__m512 tmp9433 = _mm512_add_ps(tmp9362, tmp9363);
__m512 tmp9412 = _mm512_add_ps(tmp9356, tmp9357);
__m512 tmp9432 = _mm512_add_ps(tmp9364, tmp9405);
__m512 tmp9418 = _mm512_sub_ps(tmp9356, tmp9357);
__m512 tmp9438 = _mm512_sub_ps(tmp9364, tmp9405);
__m512 tmp9417 = _mm512_sub_ps(tmp9354, tmp9355);
__m512 tmp9437 = _mm512_sub_ps(tmp9362, tmp9363);
__m512 tmp9414 = _mm512_add_ps(tmp9358, tmp9359);
__m512 tmp9434 = _mm512_add_ps(tmp9406, tmp9407);
__m512 tmp9419 = _mm512_sub_ps(tmp9358, tmp9359);
__m512 tmp9439 = _mm512_sub_ps(tmp9406, tmp9407);
__m512 tmp9416 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(2e+00f), tmp9417);
__m512 tmp9436 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(2e+00f), tmp9437);
__m512 tmp9423 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(8e+00f), tmp9417);
__m512 tmp9443 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(8e+00f), tmp9437);
__m512 tmp9411 = _mm512_add_ps(tmp9412, tmp9413);
__m512 tmp9431 = _mm512_add_ps(tmp9432, tmp9433);
__m512 tmp9415 = _mm512_fmadd_ps(tmp9419, _mm512_set1_ps(1.6e+01f), tmp9416);
__m512 tmp9435 = _mm512_fmadd_ps(tmp9439, _mm512_set1_ps(1.6e+01f), tmp9436);
__m512 tmp9422 = _mm512_fmadd_ps(tmp9419, _mm512_set1_ps(4e+00f), tmp9423);
__m512 tmp9442 = _mm512_fmadd_ps(tmp9439, _mm512_set1_ps(4e+00f), tmp9443);
__m512 tmp9428 = _mm512_add_ps(tmp9419, tmp9417);
__m512 tmp9448 = _mm512_add_ps(tmp9439, tmp9437);
__m512 tmp9421 = _mm512_fmadd_ps(tmp9412, _mm512_set1_ps(4e+00f), tmp9413);
__m512 tmp9441 = _mm512_fmadd_ps(tmp9432, _mm512_set1_ps(4e+00f), tmp9433);
__m512 tmp9425 = _mm512_fmadd_ps(tmp9412, _mm512_set1_ps(1.6e+01f), tmp9413);
__m512 tmp9445 = _mm512_fmadd_ps(tmp9432, _mm512_set1_ps(1.6e+01f), tmp9433);
__m512 tmp9410 = _mm512_add_ps(tmp9411, tmp9353);
__m512 tmp9430 = _mm512_add_ps(tmp9431, tmp9361);
__m512 tmp9427 = _mm512_add_ps(tmp9428, tmp9360);
__m512 tmp9447 = _mm512_add_ps(tmp9448, tmp9408);
__m512 tmp9409 = _mm512_fmadd_ps(tmp9414, _mm512_set1_ps(3.2e+01f), tmp9410);
__m512 tmp9429 = _mm512_fmadd_ps(tmp9434, _mm512_set1_ps(3.2e+01f), tmp9430);
__m512 tmp9420 = _mm512_fmadd_ps(tmp9414, _mm512_set1_ps(8e+00f), tmp9421);
__m512 tmp9440 = _mm512_fmadd_ps(tmp9434, _mm512_set1_ps(8e+00f), tmp9441);
__m512 tmp9426 = _mm512_fmadd_ps(tmp9418, _mm512_set1_ps(3.2e+01f), tmp9427);
__m512 tmp9446 = _mm512_fmadd_ps(tmp9438, _mm512_set1_ps(3.2e+01f), tmp9447);
__m512 tmp9424 = _mm512_fmadd_ps(tmp9414, _mm512_set1_ps(2e+00f), tmp9425);
__m512 tmp9444 = _mm512_fmadd_ps(tmp9434, _mm512_set1_ps(2e+00f), tmp9445);
__m512 out1299 = tmp9409;
__m512 out1305 = tmp9429;
__m512 out1300 = tmp9415;
__m512 out1306 = tmp9435;
__m512 out1301 = tmp9420;
__m512 out1307 = tmp9440;
__m512 out1302 = tmp9422;
__m512 out1308 = tmp9442;
__m512 out1303 = tmp9424;
__m512 out1309 = tmp9444;
__m512 out1304 = tmp9426;
__m512 out1310 = tmp9446;
out1299 = _mm512_max_ps(_mm512_setzero_ps(), out1299);
out1305 = _mm512_max_ps(_mm512_setzero_ps(), out1305);
out1300 = _mm512_max_ps(_mm512_setzero_ps(), out1300);
out1306 = _mm512_max_ps(_mm512_setzero_ps(), out1306);
out1301 = _mm512_max_ps(_mm512_setzero_ps(), out1301);
out1307 = _mm512_max_ps(_mm512_setzero_ps(), out1307);
out1302 = _mm512_max_ps(_mm512_setzero_ps(), out1302);
out1308 = _mm512_max_ps(_mm512_setzero_ps(), out1308);
out1303 = _mm512_max_ps(_mm512_setzero_ps(), out1303);
out1309 = _mm512_max_ps(_mm512_setzero_ps(), out1309);
out1304 = _mm512_max_ps(_mm512_setzero_ps(), out1304);
out1310 = _mm512_max_ps(_mm512_setzero_ps(), out1310);
_mm512_mask_storeu_ps(datPtr13+96+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1299);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1305);
_mm512_mask_storeu_ps(datPtr13+320+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1300);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1306);
_mm512_mask_storeu_ps(datPtr13+544+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1301);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1307);
_mm512_mask_storeu_ps(datPtr13+768+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1302);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1308);
_mm512_mask_storeu_ps(datPtr13+992+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1303);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1309);
_mm512_mask_storeu_ps(datPtr13+1216+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1304);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1310);
// Second tile pipeline of this l33 iteration: load sixteen vectors from
// sfPtr7, apply an 8-input/6-output linear combination, rearrange lanes,
// apply the same combination again, clamp at zero, and store twelve masked
// rows through datPtr13.
// NOTE(review): the weights 1/2/4/8/16/32 look like a Winograd-style output
// transform for a 3x3 convolution -- confirm against the generator.
//
// Loads: four groups whose base offsets are 25600 apart (512, 26112, 51712,
// 77312), four vectors per group at +0/+64/+128/+192.
// NOTE(review): consecutive vectors being 64 apart suggests the offsets are
// bytes, i.e. sfPtr7 is presumably a char pointer -- verify at its declaration.
// _mm512_shuffle_f32x4 with 68 (0x44) joins the low 256 bits of both operands;
// with 238 (0xEE) it joins the high 256 bits of both operands.
__m512 sf673 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf674 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1389 = _mm512_shuffle_f32x4(sf673, sf674, 68);
__m512 in1390 = _mm512_shuffle_f32x4(sf673, sf674, 238);
__m512 sf675 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf676 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1397 = _mm512_shuffle_f32x4(sf675, sf676, 68);
__m512 in1398 = _mm512_shuffle_f32x4(sf675, sf676, 238);
__m512 sf677 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf678 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1391 = _mm512_shuffle_f32x4(sf677, sf678, 68);
__m512 in1392 = _mm512_shuffle_f32x4(sf677, sf678, 238);
__m512 sf679 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf680 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1399 = _mm512_shuffle_f32x4(sf679, sf680, 68);
__m512 in1400 = _mm512_shuffle_f32x4(sf679, sf680, 238);
__m512 sf681 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf682 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1393 = _mm512_shuffle_f32x4(sf681, sf682, 68);
__m512 in1394 = _mm512_shuffle_f32x4(sf681, sf682, 238);
__m512 sf683 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf684 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1401 = _mm512_shuffle_f32x4(sf683, sf684, 68);
__m512 in1402 = _mm512_shuffle_f32x4(sf683, sf684, 238);
__m512 sf685 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf686 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1395 = _mm512_shuffle_f32x4(sf685, sf686, 68);
__m512 in1396 = _mm512_shuffle_f32x4(sf685, sf686, 238);
__m512 sf687 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k94+768*l33);
__m512 sf688 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k94+768*l33);
__m512 in1403 = _mm512_shuffle_f32x4(sf687, sf688, 68);
__m512 in1404 = _mm512_shuffle_f32x4(sf687, sf688, 238);
// First pass. With x0..x7 = in1389..in1396 (and, interleaved line by line,
// x0..x7 = in1397..in1404 for the second set) the six results are, up to
// floating-point rounding:
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
__m512 tmp9505 = _mm512_add_ps(in1390, in1391);
__m512 tmp9525 = _mm512_add_ps(in1398, in1399);
__m512 tmp9504 = _mm512_add_ps(in1392, in1393);
__m512 tmp9524 = _mm512_add_ps(in1400, in1401);
__m512 tmp9510 = _mm512_sub_ps(in1392, in1393);
__m512 tmp9530 = _mm512_sub_ps(in1400, in1401);
__m512 tmp9509 = _mm512_sub_ps(in1390, in1391);
__m512 tmp9529 = _mm512_sub_ps(in1398, in1399);
__m512 tmp9506 = _mm512_add_ps(in1394, in1395);
__m512 tmp9526 = _mm512_add_ps(in1402, in1403);
__m512 tmp9511 = _mm512_sub_ps(in1394, in1395);
__m512 tmp9531 = _mm512_sub_ps(in1402, in1403);
__m512 tmp9508 = _mm512_fmadd_ps(tmp9510, _mm512_set1_ps(2e+00f), tmp9509);
__m512 tmp9528 = _mm512_fmadd_ps(tmp9530, _mm512_set1_ps(2e+00f), tmp9529);
__m512 tmp9515 = _mm512_fmadd_ps(tmp9510, _mm512_set1_ps(8e+00f), tmp9509);
__m512 tmp9535 = _mm512_fmadd_ps(tmp9530, _mm512_set1_ps(8e+00f), tmp9529);
__m512 tmp9503 = _mm512_add_ps(tmp9504, tmp9505);
__m512 tmp9523 = _mm512_add_ps(tmp9524, tmp9525);
__m512 tmp9507 = _mm512_fmadd_ps(tmp9511, _mm512_set1_ps(1.6e+01f), tmp9508);
__m512 tmp9527 = _mm512_fmadd_ps(tmp9531, _mm512_set1_ps(1.6e+01f), tmp9528);
__m512 tmp9514 = _mm512_fmadd_ps(tmp9511, _mm512_set1_ps(4e+00f), tmp9515);
__m512 tmp9534 = _mm512_fmadd_ps(tmp9531, _mm512_set1_ps(4e+00f), tmp9535);
__m512 tmp9520 = _mm512_add_ps(tmp9511, tmp9509);
__m512 tmp9540 = _mm512_add_ps(tmp9531, tmp9529);
__m512 tmp9513 = _mm512_fmadd_ps(tmp9504, _mm512_set1_ps(4e+00f), tmp9505);
__m512 tmp9533 = _mm512_fmadd_ps(tmp9524, _mm512_set1_ps(4e+00f), tmp9525);
__m512 tmp9517 = _mm512_fmadd_ps(tmp9504, _mm512_set1_ps(1.6e+01f), tmp9505);
__m512 tmp9537 = _mm512_fmadd_ps(tmp9524, _mm512_set1_ps(1.6e+01f), tmp9525);
__m512 tmp9502 = _mm512_add_ps(tmp9503, in1389);
__m512 tmp9522 = _mm512_add_ps(tmp9523, in1397);
__m512 tmp9519 = _mm512_add_ps(tmp9520, in1396);
__m512 tmp9539 = _mm512_add_ps(tmp9540, in1404);
__m512 tmp9501 = _mm512_fmadd_ps(tmp9506, _mm512_set1_ps(3.2e+01f), tmp9502);
__m512 tmp9521 = _mm512_fmadd_ps(tmp9526, _mm512_set1_ps(3.2e+01f), tmp9522);
__m512 tmp9512 = _mm512_fmadd_ps(tmp9506, _mm512_set1_ps(8e+00f), tmp9513);
__m512 tmp9532 = _mm512_fmadd_ps(tmp9526, _mm512_set1_ps(8e+00f), tmp9533);
__m512 tmp9518 = _mm512_fmadd_ps(tmp9510, _mm512_set1_ps(3.2e+01f), tmp9519);
__m512 tmp9538 = _mm512_fmadd_ps(tmp9530, _mm512_set1_ps(3.2e+01f), tmp9539);
__m512 tmp9516 = _mm512_fmadd_ps(tmp9506, _mm512_set1_ps(2e+00f), tmp9517);
__m512 tmp9536 = _mm512_fmadd_ps(tmp9526, _mm512_set1_ps(2e+00f), tmp9537);
// r0..r5 of the first set become tmp9489..tmp9494; r0..r5 of the second set
// become tmp9495..tmp9500.
__m512 tmp9489 = tmp9501;
__m512 tmp9495 = tmp9521;
__m512 tmp9490 = tmp9507;
__m512 tmp9496 = tmp9527;
__m512 tmp9491 = tmp9512;
__m512 tmp9497 = tmp9532;
__m512 tmp9492 = tmp9514;
__m512 tmp9498 = tmp9534;
__m512 tmp9493 = tmp9516;
__m512 tmp9499 = tmp9536;
__m512 tmp9494 = tmp9518;
__m512 tmp9500 = tmp9538;
// Lane rearrangement: unpacklo/unpackhi, then shuffle_ps, then shuffle_f32x4
// turn the twelve vectors into sixteen (tmp9489..tmp9500 are overwritten,
// tmp9541..tmp9544 are new).
// NOTE(review): this looks like a transpose so the second pass combines along
// the other axis -- confirm.
__m512 tmp9585 = _mm512_unpacklo_ps(tmp9489, tmp9490);
__m512 tmp9586 = _mm512_unpackhi_ps(tmp9489, tmp9490);
__m512 tmp9587 = _mm512_unpacklo_ps(tmp9491, tmp9492);
__m512 tmp9588 = _mm512_unpackhi_ps(tmp9491, tmp9492);
__m512 tmp9589 = _mm512_unpacklo_ps(tmp9493, tmp9494);
__m512 tmp9590 = _mm512_unpackhi_ps(tmp9493, tmp9494);
__m512 tmp9591 = _mm512_unpacklo_ps(tmp9495, tmp9496);
__m512 tmp9592 = _mm512_unpackhi_ps(tmp9495, tmp9496);
__m512 tmp9593 = _mm512_unpacklo_ps(tmp9497, tmp9498);
__m512 tmp9594 = _mm512_unpackhi_ps(tmp9497, tmp9498);
__m512 tmp9595 = _mm512_unpacklo_ps(tmp9499, tmp9500);
__m512 tmp9596 = _mm512_unpackhi_ps(tmp9499, tmp9500);
__m512 tmp9597 = _mm512_shuffle_ps(tmp9585, tmp9587, 68);
__m512 tmp9598 = _mm512_shuffle_ps(tmp9585, tmp9587, 238);
__m512 tmp9599 = _mm512_shuffle_ps(tmp9586, tmp9588, 68);
__m512 tmp9600 = _mm512_shuffle_ps(tmp9586, tmp9588, 238);
__m512 tmp9601 = _mm512_shuffle_ps(tmp9589, tmp9591, 68);
__m512 tmp9602 = _mm512_shuffle_ps(tmp9589, tmp9591, 238);
__m512 tmp9603 = _mm512_shuffle_ps(tmp9590, tmp9592, 68);
__m512 tmp9604 = _mm512_shuffle_ps(tmp9590, tmp9592, 238);
__m512 tmp9605 = _mm512_shuffle_ps(tmp9593, tmp9595, 68);
__m512 tmp9606 = _mm512_shuffle_ps(tmp9593, tmp9595, 238);
__m512 tmp9607 = _mm512_shuffle_ps(tmp9594, tmp9596, 68);
__m512 tmp9608 = _mm512_shuffle_ps(tmp9594, tmp9596, 238);
// 136 (0x88) picks 128-bit lanes 0 and 2 of each operand, 221 (0xDD) picks
// lanes 1 and 3. tmp9605..tmp9608 are paired with themselves because the
// third group of four has no partner group.
__m512 tmp9609 = _mm512_shuffle_f32x4(tmp9597, tmp9601, 136);
__m512 tmp9610 = _mm512_shuffle_f32x4(tmp9597, tmp9601, 221);
__m512 tmp9611 = _mm512_shuffle_f32x4(tmp9598, tmp9602, 136);
__m512 tmp9612 = _mm512_shuffle_f32x4(tmp9598, tmp9602, 221);
__m512 tmp9613 = _mm512_shuffle_f32x4(tmp9599, tmp9603, 136);
__m512 tmp9614 = _mm512_shuffle_f32x4(tmp9599, tmp9603, 221);
__m512 tmp9615 = _mm512_shuffle_f32x4(tmp9600, tmp9604, 136);
__m512 tmp9616 = _mm512_shuffle_f32x4(tmp9600, tmp9604, 221);
__m512 tmp9617 = _mm512_shuffle_f32x4(tmp9605, tmp9605, 136);
__m512 tmp9618 = _mm512_shuffle_f32x4(tmp9605, tmp9605, 221);
__m512 tmp9619 = _mm512_shuffle_f32x4(tmp9606, tmp9606, 136);
__m512 tmp9620 = _mm512_shuffle_f32x4(tmp9606, tmp9606, 221);
__m512 tmp9621 = _mm512_shuffle_f32x4(tmp9607, tmp9607, 136);
__m512 tmp9622 = _mm512_shuffle_f32x4(tmp9607, tmp9607, 221);
__m512 tmp9623 = _mm512_shuffle_f32x4(tmp9608, tmp9608, 136);
__m512 tmp9624 = _mm512_shuffle_f32x4(tmp9608, tmp9608, 221);
tmp9489 = _mm512_shuffle_f32x4(tmp9609, tmp9617, 136);
tmp9497 = _mm512_shuffle_f32x4(tmp9609, tmp9617, 221);
tmp9490 = _mm512_shuffle_f32x4(tmp9611, tmp9619, 136);
tmp9498 = _mm512_shuffle_f32x4(tmp9611, tmp9619, 221);
tmp9491 = _mm512_shuffle_f32x4(tmp9613, tmp9621, 136);
tmp9499 = _mm512_shuffle_f32x4(tmp9613, tmp9621, 221);
tmp9492 = _mm512_shuffle_f32x4(tmp9615, tmp9623, 136);
tmp9500 = _mm512_shuffle_f32x4(tmp9615, tmp9623, 221);
tmp9493 = _mm512_shuffle_f32x4(tmp9610, tmp9618, 136);
__m512 tmp9541 = _mm512_shuffle_f32x4(tmp9610, tmp9618, 221);
tmp9494 = _mm512_shuffle_f32x4(tmp9612, tmp9620, 136);
__m512 tmp9542 = _mm512_shuffle_f32x4(tmp9612, tmp9620, 221);
tmp9495 = _mm512_shuffle_f32x4(tmp9614, tmp9622, 136);
__m512 tmp9543 = _mm512_shuffle_f32x4(tmp9614, tmp9622, 221);
tmp9496 = _mm512_shuffle_f32x4(tmp9616, tmp9624, 136);
__m512 tmp9544 = _mm512_shuffle_f32x4(tmp9616, tmp9624, 221);
// Second pass: the same six formulas r0..r5, now with
// x0..x7 = tmp9489..tmp9496 for the first set and
// x0..x7 = tmp9497, tmp9498, tmp9499, tmp9500, tmp9541..tmp9544 for the second.
__m512 tmp9549 = _mm512_add_ps(tmp9490, tmp9491);
__m512 tmp9569 = _mm512_add_ps(tmp9498, tmp9499);
__m512 tmp9548 = _mm512_add_ps(tmp9492, tmp9493);
__m512 tmp9568 = _mm512_add_ps(tmp9500, tmp9541);
__m512 tmp9554 = _mm512_sub_ps(tmp9492, tmp9493);
__m512 tmp9574 = _mm512_sub_ps(tmp9500, tmp9541);
__m512 tmp9553 = _mm512_sub_ps(tmp9490, tmp9491);
__m512 tmp9573 = _mm512_sub_ps(tmp9498, tmp9499);
__m512 tmp9550 = _mm512_add_ps(tmp9494, tmp9495);
__m512 tmp9570 = _mm512_add_ps(tmp9542, tmp9543);
__m512 tmp9555 = _mm512_sub_ps(tmp9494, tmp9495);
__m512 tmp9575 = _mm512_sub_ps(tmp9542, tmp9543);
__m512 tmp9552 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(2e+00f), tmp9553);
__m512 tmp9572 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(2e+00f), tmp9573);
__m512 tmp9559 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(8e+00f), tmp9553);
__m512 tmp9579 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(8e+00f), tmp9573);
__m512 tmp9547 = _mm512_add_ps(tmp9548, tmp9549);
__m512 tmp9567 = _mm512_add_ps(tmp9568, tmp9569);
__m512 tmp9551 = _mm512_fmadd_ps(tmp9555, _mm512_set1_ps(1.6e+01f), tmp9552);
__m512 tmp9571 = _mm512_fmadd_ps(tmp9575, _mm512_set1_ps(1.6e+01f), tmp9572);
__m512 tmp9558 = _mm512_fmadd_ps(tmp9555, _mm512_set1_ps(4e+00f), tmp9559);
__m512 tmp9578 = _mm512_fmadd_ps(tmp9575, _mm512_set1_ps(4e+00f), tmp9579);
__m512 tmp9564 = _mm512_add_ps(tmp9555, tmp9553);
__m512 tmp9584 = _mm512_add_ps(tmp9575, tmp9573);
__m512 tmp9557 = _mm512_fmadd_ps(tmp9548, _mm512_set1_ps(4e+00f), tmp9549);
__m512 tmp9577 = _mm512_fmadd_ps(tmp9568, _mm512_set1_ps(4e+00f), tmp9569);
__m512 tmp9561 = _mm512_fmadd_ps(tmp9548, _mm512_set1_ps(1.6e+01f), tmp9549);
__m512 tmp9581 = _mm512_fmadd_ps(tmp9568, _mm512_set1_ps(1.6e+01f), tmp9569);
__m512 tmp9546 = _mm512_add_ps(tmp9547, tmp9489);
__m512 tmp9566 = _mm512_add_ps(tmp9567, tmp9497);
__m512 tmp9563 = _mm512_add_ps(tmp9564, tmp9496);
__m512 tmp9583 = _mm512_add_ps(tmp9584, tmp9544);
__m512 tmp9545 = _mm512_fmadd_ps(tmp9550, _mm512_set1_ps(3.2e+01f), tmp9546);
__m512 tmp9565 = _mm512_fmadd_ps(tmp9570, _mm512_set1_ps(3.2e+01f), tmp9566);
__m512 tmp9556 = _mm512_fmadd_ps(tmp9550, _mm512_set1_ps(8e+00f), tmp9557);
__m512 tmp9576 = _mm512_fmadd_ps(tmp9570, _mm512_set1_ps(8e+00f), tmp9577);
__m512 tmp9562 = _mm512_fmadd_ps(tmp9554, _mm512_set1_ps(3.2e+01f), tmp9563);
__m512 tmp9582 = _mm512_fmadd_ps(tmp9574, _mm512_set1_ps(3.2e+01f), tmp9583);
__m512 tmp9560 = _mm512_fmadd_ps(tmp9550, _mm512_set1_ps(2e+00f), tmp9561);
__m512 tmp9580 = _mm512_fmadd_ps(tmp9570, _mm512_set1_ps(2e+00f), tmp9581);
// Second-pass r0..r5 become out1311..out1316 (first set) and
// out1317..out1322 (second set).
__m512 out1311 = tmp9545;
__m512 out1317 = tmp9565;
__m512 out1312 = tmp9551;
__m512 out1318 = tmp9571;
__m512 out1313 = tmp9556;
__m512 out1319 = tmp9576;
__m512 out1314 = tmp9558;
__m512 out1320 = tmp9578;
__m512 out1315 = tmp9560;
__m512 out1321 = tmp9580;
__m512 out1316 = tmp9562;
__m512 out1322 = tmp9582;
// Clamp every output at zero via max(0, x), i.e. a ReLU-shaped activation.
out1311 = _mm512_max_ps(_mm512_setzero_ps(), out1311);
out1317 = _mm512_max_ps(_mm512_setzero_ps(), out1317);
out1312 = _mm512_max_ps(_mm512_setzero_ps(), out1312);
out1318 = _mm512_max_ps(_mm512_setzero_ps(), out1318);
out1313 = _mm512_max_ps(_mm512_setzero_ps(), out1313);
out1319 = _mm512_max_ps(_mm512_setzero_ps(), out1319);
out1314 = _mm512_max_ps(_mm512_setzero_ps(), out1314);
out1320 = _mm512_max_ps(_mm512_setzero_ps(), out1320);
out1315 = _mm512_max_ps(_mm512_setzero_ps(), out1315);
out1321 = _mm512_max_ps(_mm512_setzero_ps(), out1321);
out1316 = _mm512_max_ps(_mm512_setzero_ps(), out1316);
out1322 = _mm512_max_ps(_mm512_setzero_ps(), out1322);
// Masked stores: mask 4095 (0x0FFF) writes only the low 12 lanes. Each pair
// lands at offset and offset+48; successive pairs are 224 apart.
// NOTE(review): if the offsets are bytes this is 12+12 floats per row with a
// 56-float row pitch -- confirm against datPtr13's declared type.
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1311);
_mm512_mask_storeu_ps(datPtr13+12704+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1317);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1312);
_mm512_mask_storeu_ps(datPtr13+12928+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1318);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1313);
_mm512_mask_storeu_ps(datPtr13+13152+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1319);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1314);
_mm512_mask_storeu_ps(datPtr13+13376+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1320);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1315);
_mm512_mask_storeu_ps(datPtr13+13600+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1321);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1316);
_mm512_mask_storeu_ps(datPtr13+13824+50432*i29+224*toH34+4*toW34+50432*k94+25216*l33, 4095, out1322);
}
}
++j23;
rel17 = 4;
}
// Position for the next group of stores: toH35 and toW35 feed the 224*toH35
// and 4*toW35 terms of the datPtr13 offsets used in the loop that follows.
// k95 starts at w46 and that loop stops when k95 == 1, so it runs 1-w46 times.
// NOTE(review): presumably w46 is 0 or 1 -- verify where w46 is assigned.
ptrdiff_t toH35 = base17+12;
ptrdiff_t toW35 = 36;
ptrdiff_t k95 = 1*w46;
for (; k95 != 1; ++k95) {
ptrdiff_t l34 = 0;
for (; l34 != 2; ++l34) {
// First tile pipeline of each l34 iteration: load sixteen vectors from
// sfPtr7, apply an 8-input/6-output linear combination, rearrange lanes,
// apply the same combination again, clamp at zero, and store twelve masked
// rows through datPtr13.
// NOTE(review): the weights 1/2/4/8/16/32 look like a Winograd-style output
// transform for a 3x3 convolution -- confirm against the generator.
//
// Loads: four groups whose base offsets are 25600 apart (0, 25600, 51200,
// 76800), four vectors per group at +0/+64/+128/+192.
// NOTE(review): consecutive vectors being 64 apart suggests the offsets are
// bytes, i.e. sfPtr7 is presumably a char pointer -- verify at its declaration.
// _mm512_shuffle_f32x4 with 68 (0x44) joins the low 256 bits of both operands;
// with 238 (0xEE) it joins the high 256 bits of both operands.
__m512 sf689 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf690 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1405 = _mm512_shuffle_f32x4(sf689, sf690, 68);
__m512 in1406 = _mm512_shuffle_f32x4(sf689, sf690, 238);
__m512 sf691 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf692 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1413 = _mm512_shuffle_f32x4(sf691, sf692, 68);
__m512 in1414 = _mm512_shuffle_f32x4(sf691, sf692, 238);
__m512 sf693 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf694 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1407 = _mm512_shuffle_f32x4(sf693, sf694, 68);
__m512 in1408 = _mm512_shuffle_f32x4(sf693, sf694, 238);
__m512 sf695 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf696 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1415 = _mm512_shuffle_f32x4(sf695, sf696, 68);
__m512 in1416 = _mm512_shuffle_f32x4(sf695, sf696, 238);
__m512 sf697 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf698 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1409 = _mm512_shuffle_f32x4(sf697, sf698, 68);
__m512 in1410 = _mm512_shuffle_f32x4(sf697, sf698, 238);
__m512 sf699 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf700 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1417 = _mm512_shuffle_f32x4(sf699, sf700, 68);
__m512 in1418 = _mm512_shuffle_f32x4(sf699, sf700, 238);
__m512 sf701 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf702 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1411 = _mm512_shuffle_f32x4(sf701, sf702, 68);
__m512 in1412 = _mm512_shuffle_f32x4(sf701, sf702, 238);
__m512 sf703 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf704 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1419 = _mm512_shuffle_f32x4(sf703, sf704, 68);
__m512 in1420 = _mm512_shuffle_f32x4(sf703, sf704, 238);
// First pass. With x0..x7 = in1405..in1412 (and, interleaved line by line,
// x0..x7 = in1413..in1420 for the second set) the six results are, up to
// floating-point rounding:
//   r0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)
//   r1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)
//   r2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)
//   r3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)
//   r4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)
//   r5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7
__m512 tmp9641 = _mm512_add_ps(in1406, in1407);
__m512 tmp9661 = _mm512_add_ps(in1414, in1415);
__m512 tmp9640 = _mm512_add_ps(in1408, in1409);
__m512 tmp9660 = _mm512_add_ps(in1416, in1417);
__m512 tmp9646 = _mm512_sub_ps(in1408, in1409);
__m512 tmp9666 = _mm512_sub_ps(in1416, in1417);
__m512 tmp9645 = _mm512_sub_ps(in1406, in1407);
__m512 tmp9665 = _mm512_sub_ps(in1414, in1415);
__m512 tmp9642 = _mm512_add_ps(in1410, in1411);
__m512 tmp9662 = _mm512_add_ps(in1418, in1419);
__m512 tmp9647 = _mm512_sub_ps(in1410, in1411);
__m512 tmp9667 = _mm512_sub_ps(in1418, in1419);
__m512 tmp9644 = _mm512_fmadd_ps(tmp9646, _mm512_set1_ps(2e+00f), tmp9645);
__m512 tmp9664 = _mm512_fmadd_ps(tmp9666, _mm512_set1_ps(2e+00f), tmp9665);
__m512 tmp9651 = _mm512_fmadd_ps(tmp9646, _mm512_set1_ps(8e+00f), tmp9645);
__m512 tmp9671 = _mm512_fmadd_ps(tmp9666, _mm512_set1_ps(8e+00f), tmp9665);
__m512 tmp9639 = _mm512_add_ps(tmp9640, tmp9641);
__m512 tmp9659 = _mm512_add_ps(tmp9660, tmp9661);
__m512 tmp9643 = _mm512_fmadd_ps(tmp9647, _mm512_set1_ps(1.6e+01f), tmp9644);
__m512 tmp9663 = _mm512_fmadd_ps(tmp9667, _mm512_set1_ps(1.6e+01f), tmp9664);
__m512 tmp9650 = _mm512_fmadd_ps(tmp9647, _mm512_set1_ps(4e+00f), tmp9651);
__m512 tmp9670 = _mm512_fmadd_ps(tmp9667, _mm512_set1_ps(4e+00f), tmp9671);
__m512 tmp9656 = _mm512_add_ps(tmp9647, tmp9645);
__m512 tmp9676 = _mm512_add_ps(tmp9667, tmp9665);
__m512 tmp9649 = _mm512_fmadd_ps(tmp9640, _mm512_set1_ps(4e+00f), tmp9641);
__m512 tmp9669 = _mm512_fmadd_ps(tmp9660, _mm512_set1_ps(4e+00f), tmp9661);
__m512 tmp9653 = _mm512_fmadd_ps(tmp9640, _mm512_set1_ps(1.6e+01f), tmp9641);
__m512 tmp9673 = _mm512_fmadd_ps(tmp9660, _mm512_set1_ps(1.6e+01f), tmp9661);
__m512 tmp9638 = _mm512_add_ps(tmp9639, in1405);
__m512 tmp9658 = _mm512_add_ps(tmp9659, in1413);
__m512 tmp9655 = _mm512_add_ps(tmp9656, in1412);
__m512 tmp9675 = _mm512_add_ps(tmp9676, in1420);
__m512 tmp9637 = _mm512_fmadd_ps(tmp9642, _mm512_set1_ps(3.2e+01f), tmp9638);
__m512 tmp9657 = _mm512_fmadd_ps(tmp9662, _mm512_set1_ps(3.2e+01f), tmp9658);
__m512 tmp9648 = _mm512_fmadd_ps(tmp9642, _mm512_set1_ps(8e+00f), tmp9649);
__m512 tmp9668 = _mm512_fmadd_ps(tmp9662, _mm512_set1_ps(8e+00f), tmp9669);
__m512 tmp9654 = _mm512_fmadd_ps(tmp9646, _mm512_set1_ps(3.2e+01f), tmp9655);
__m512 tmp9674 = _mm512_fmadd_ps(tmp9666, _mm512_set1_ps(3.2e+01f), tmp9675);
__m512 tmp9652 = _mm512_fmadd_ps(tmp9642, _mm512_set1_ps(2e+00f), tmp9653);
__m512 tmp9672 = _mm512_fmadd_ps(tmp9662, _mm512_set1_ps(2e+00f), tmp9673);
// r0..r5 of the first set become tmp9625..tmp9630; r0..r5 of the second set
// become tmp9631..tmp9636.
__m512 tmp9625 = tmp9637;
__m512 tmp9631 = tmp9657;
__m512 tmp9626 = tmp9643;
__m512 tmp9632 = tmp9663;
__m512 tmp9627 = tmp9648;
__m512 tmp9633 = tmp9668;
__m512 tmp9628 = tmp9650;
__m512 tmp9634 = tmp9670;
__m512 tmp9629 = tmp9652;
__m512 tmp9635 = tmp9672;
__m512 tmp9630 = tmp9654;
__m512 tmp9636 = tmp9674;
// Lane rearrangement: unpacklo/unpackhi, then shuffle_ps, then shuffle_f32x4
// turn the twelve vectors into sixteen (tmp9625..tmp9636 are overwritten,
// tmp9677..tmp9680 are new).
// NOTE(review): this looks like a transpose so the second pass combines along
// the other axis -- confirm.
__m512 tmp9721 = _mm512_unpacklo_ps(tmp9625, tmp9626);
__m512 tmp9722 = _mm512_unpackhi_ps(tmp9625, tmp9626);
__m512 tmp9723 = _mm512_unpacklo_ps(tmp9627, tmp9628);
__m512 tmp9724 = _mm512_unpackhi_ps(tmp9627, tmp9628);
__m512 tmp9725 = _mm512_unpacklo_ps(tmp9629, tmp9630);
__m512 tmp9726 = _mm512_unpackhi_ps(tmp9629, tmp9630);
__m512 tmp9727 = _mm512_unpacklo_ps(tmp9631, tmp9632);
__m512 tmp9728 = _mm512_unpackhi_ps(tmp9631, tmp9632);
__m512 tmp9729 = _mm512_unpacklo_ps(tmp9633, tmp9634);
__m512 tmp9730 = _mm512_unpackhi_ps(tmp9633, tmp9634);
__m512 tmp9731 = _mm512_unpacklo_ps(tmp9635, tmp9636);
__m512 tmp9732 = _mm512_unpackhi_ps(tmp9635, tmp9636);
__m512 tmp9733 = _mm512_shuffle_ps(tmp9721, tmp9723, 68);
__m512 tmp9734 = _mm512_shuffle_ps(tmp9721, tmp9723, 238);
__m512 tmp9735 = _mm512_shuffle_ps(tmp9722, tmp9724, 68);
__m512 tmp9736 = _mm512_shuffle_ps(tmp9722, tmp9724, 238);
__m512 tmp9737 = _mm512_shuffle_ps(tmp9725, tmp9727, 68);
__m512 tmp9738 = _mm512_shuffle_ps(tmp9725, tmp9727, 238);
__m512 tmp9739 = _mm512_shuffle_ps(tmp9726, tmp9728, 68);
__m512 tmp9740 = _mm512_shuffle_ps(tmp9726, tmp9728, 238);
__m512 tmp9741 = _mm512_shuffle_ps(tmp9729, tmp9731, 68);
__m512 tmp9742 = _mm512_shuffle_ps(tmp9729, tmp9731, 238);
__m512 tmp9743 = _mm512_shuffle_ps(tmp9730, tmp9732, 68);
__m512 tmp9744 = _mm512_shuffle_ps(tmp9730, tmp9732, 238);
// 136 (0x88) picks 128-bit lanes 0 and 2 of each operand, 221 (0xDD) picks
// lanes 1 and 3. tmp9741..tmp9744 are paired with themselves because the
// third group of four has no partner group.
__m512 tmp9745 = _mm512_shuffle_f32x4(tmp9733, tmp9737, 136);
__m512 tmp9746 = _mm512_shuffle_f32x4(tmp9733, tmp9737, 221);
__m512 tmp9747 = _mm512_shuffle_f32x4(tmp9734, tmp9738, 136);
__m512 tmp9748 = _mm512_shuffle_f32x4(tmp9734, tmp9738, 221);
__m512 tmp9749 = _mm512_shuffle_f32x4(tmp9735, tmp9739, 136);
__m512 tmp9750 = _mm512_shuffle_f32x4(tmp9735, tmp9739, 221);
__m512 tmp9751 = _mm512_shuffle_f32x4(tmp9736, tmp9740, 136);
__m512 tmp9752 = _mm512_shuffle_f32x4(tmp9736, tmp9740, 221);
__m512 tmp9753 = _mm512_shuffle_f32x4(tmp9741, tmp9741, 136);
__m512 tmp9754 = _mm512_shuffle_f32x4(tmp9741, tmp9741, 221);
__m512 tmp9755 = _mm512_shuffle_f32x4(tmp9742, tmp9742, 136);
__m512 tmp9756 = _mm512_shuffle_f32x4(tmp9742, tmp9742, 221);
__m512 tmp9757 = _mm512_shuffle_f32x4(tmp9743, tmp9743, 136);
__m512 tmp9758 = _mm512_shuffle_f32x4(tmp9743, tmp9743, 221);
__m512 tmp9759 = _mm512_shuffle_f32x4(tmp9744, tmp9744, 136);
__m512 tmp9760 = _mm512_shuffle_f32x4(tmp9744, tmp9744, 221);
tmp9625 = _mm512_shuffle_f32x4(tmp9745, tmp9753, 136);
tmp9633 = _mm512_shuffle_f32x4(tmp9745, tmp9753, 221);
tmp9626 = _mm512_shuffle_f32x4(tmp9747, tmp9755, 136);
tmp9634 = _mm512_shuffle_f32x4(tmp9747, tmp9755, 221);
tmp9627 = _mm512_shuffle_f32x4(tmp9749, tmp9757, 136);
tmp9635 = _mm512_shuffle_f32x4(tmp9749, tmp9757, 221);
tmp9628 = _mm512_shuffle_f32x4(tmp9751, tmp9759, 136);
tmp9636 = _mm512_shuffle_f32x4(tmp9751, tmp9759, 221);
tmp9629 = _mm512_shuffle_f32x4(tmp9746, tmp9754, 136);
__m512 tmp9677 = _mm512_shuffle_f32x4(tmp9746, tmp9754, 221);
tmp9630 = _mm512_shuffle_f32x4(tmp9748, tmp9756, 136);
__m512 tmp9678 = _mm512_shuffle_f32x4(tmp9748, tmp9756, 221);
tmp9631 = _mm512_shuffle_f32x4(tmp9750, tmp9758, 136);
__m512 tmp9679 = _mm512_shuffle_f32x4(tmp9750, tmp9758, 221);
tmp9632 = _mm512_shuffle_f32x4(tmp9752, tmp9760, 136);
__m512 tmp9680 = _mm512_shuffle_f32x4(tmp9752, tmp9760, 221);
// Second pass: the same six formulas r0..r5, now with
// x0..x7 = tmp9625..tmp9632 for the first set and
// x0..x7 = tmp9633, tmp9634, tmp9635, tmp9636, tmp9677..tmp9680 for the second.
__m512 tmp9685 = _mm512_add_ps(tmp9626, tmp9627);
__m512 tmp9705 = _mm512_add_ps(tmp9634, tmp9635);
__m512 tmp9684 = _mm512_add_ps(tmp9628, tmp9629);
__m512 tmp9704 = _mm512_add_ps(tmp9636, tmp9677);
__m512 tmp9690 = _mm512_sub_ps(tmp9628, tmp9629);
__m512 tmp9710 = _mm512_sub_ps(tmp9636, tmp9677);
__m512 tmp9689 = _mm512_sub_ps(tmp9626, tmp9627);
__m512 tmp9709 = _mm512_sub_ps(tmp9634, tmp9635);
__m512 tmp9686 = _mm512_add_ps(tmp9630, tmp9631);
__m512 tmp9706 = _mm512_add_ps(tmp9678, tmp9679);
__m512 tmp9691 = _mm512_sub_ps(tmp9630, tmp9631);
__m512 tmp9711 = _mm512_sub_ps(tmp9678, tmp9679);
__m512 tmp9688 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(2e+00f), tmp9689);
__m512 tmp9708 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(2e+00f), tmp9709);
__m512 tmp9695 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(8e+00f), tmp9689);
__m512 tmp9715 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(8e+00f), tmp9709);
__m512 tmp9683 = _mm512_add_ps(tmp9684, tmp9685);
__m512 tmp9703 = _mm512_add_ps(tmp9704, tmp9705);
__m512 tmp9687 = _mm512_fmadd_ps(tmp9691, _mm512_set1_ps(1.6e+01f), tmp9688);
__m512 tmp9707 = _mm512_fmadd_ps(tmp9711, _mm512_set1_ps(1.6e+01f), tmp9708);
__m512 tmp9694 = _mm512_fmadd_ps(tmp9691, _mm512_set1_ps(4e+00f), tmp9695);
__m512 tmp9714 = _mm512_fmadd_ps(tmp9711, _mm512_set1_ps(4e+00f), tmp9715);
__m512 tmp9700 = _mm512_add_ps(tmp9691, tmp9689);
__m512 tmp9720 = _mm512_add_ps(tmp9711, tmp9709);
__m512 tmp9693 = _mm512_fmadd_ps(tmp9684, _mm512_set1_ps(4e+00f), tmp9685);
__m512 tmp9713 = _mm512_fmadd_ps(tmp9704, _mm512_set1_ps(4e+00f), tmp9705);
__m512 tmp9697 = _mm512_fmadd_ps(tmp9684, _mm512_set1_ps(1.6e+01f), tmp9685);
__m512 tmp9717 = _mm512_fmadd_ps(tmp9704, _mm512_set1_ps(1.6e+01f), tmp9705);
__m512 tmp9682 = _mm512_add_ps(tmp9683, tmp9625);
__m512 tmp9702 = _mm512_add_ps(tmp9703, tmp9633);
__m512 tmp9699 = _mm512_add_ps(tmp9700, tmp9632);
__m512 tmp9719 = _mm512_add_ps(tmp9720, tmp9680);
__m512 tmp9681 = _mm512_fmadd_ps(tmp9686, _mm512_set1_ps(3.2e+01f), tmp9682);
__m512 tmp9701 = _mm512_fmadd_ps(tmp9706, _mm512_set1_ps(3.2e+01f), tmp9702);
__m512 tmp9692 = _mm512_fmadd_ps(tmp9686, _mm512_set1_ps(8e+00f), tmp9693);
__m512 tmp9712 = _mm512_fmadd_ps(tmp9706, _mm512_set1_ps(8e+00f), tmp9713);
__m512 tmp9698 = _mm512_fmadd_ps(tmp9690, _mm512_set1_ps(3.2e+01f), tmp9699);
__m512 tmp9718 = _mm512_fmadd_ps(tmp9710, _mm512_set1_ps(3.2e+01f), tmp9719);
__m512 tmp9696 = _mm512_fmadd_ps(tmp9686, _mm512_set1_ps(2e+00f), tmp9697);
__m512 tmp9716 = _mm512_fmadd_ps(tmp9706, _mm512_set1_ps(2e+00f), tmp9717);
// Second-pass r0..r5 become out1323..out1328 (first set) and
// out1329..out1334 (second set).
__m512 out1323 = tmp9681;
__m512 out1329 = tmp9701;
__m512 out1324 = tmp9687;
__m512 out1330 = tmp9707;
__m512 out1325 = tmp9692;
__m512 out1331 = tmp9712;
__m512 out1326 = tmp9694;
__m512 out1332 = tmp9714;
__m512 out1327 = tmp9696;
__m512 out1333 = tmp9716;
__m512 out1328 = tmp9698;
__m512 out1334 = tmp9718;
// Clamp every output at zero via max(0, x), i.e. a ReLU-shaped activation.
out1323 = _mm512_max_ps(_mm512_setzero_ps(), out1323);
out1329 = _mm512_max_ps(_mm512_setzero_ps(), out1329);
out1324 = _mm512_max_ps(_mm512_setzero_ps(), out1324);
out1330 = _mm512_max_ps(_mm512_setzero_ps(), out1330);
out1325 = _mm512_max_ps(_mm512_setzero_ps(), out1325);
out1331 = _mm512_max_ps(_mm512_setzero_ps(), out1331);
out1326 = _mm512_max_ps(_mm512_setzero_ps(), out1326);
out1332 = _mm512_max_ps(_mm512_setzero_ps(), out1332);
out1327 = _mm512_max_ps(_mm512_setzero_ps(), out1327);
out1333 = _mm512_max_ps(_mm512_setzero_ps(), out1333);
out1328 = _mm512_max_ps(_mm512_setzero_ps(), out1328);
out1334 = _mm512_max_ps(_mm512_setzero_ps(), out1334);
// Masked stores: mask 4095 (0x0FFF) writes the low 12 lanes, mask 255 (0x00FF)
// the low 8 lanes. Each pair lands at offset and offset+48; successive pairs
// are 224 apart.
// NOTE(review): if the offsets are bytes this is 12+8 = 20 floats per row
// starting at column toW35 = 36, which would end exactly at a 56-float row
// pitch (right-edge tile) -- confirm against datPtr13's declared type.
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1323);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1329);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1324);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1330);
_mm512_mask_storeu_ps(datPtr13+448+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1325);
_mm512_mask_storeu_ps(datPtr13+496+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1331);
_mm512_mask_storeu_ps(datPtr13+672+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1326);
_mm512_mask_storeu_ps(datPtr13+720+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1332);
_mm512_mask_storeu_ps(datPtr13+896+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1327);
_mm512_mask_storeu_ps(datPtr13+944+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1333);
_mm512_mask_storeu_ps(datPtr13+1120+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1328);
_mm512_mask_storeu_ps(datPtr13+1168+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1334);
__m512 sf705 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf706 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1421 = _mm512_shuffle_f32x4(sf705, sf706, 68);
__m512 in1422 = _mm512_shuffle_f32x4(sf705, sf706, 238);
__m512 sf707 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf708 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1429 = _mm512_shuffle_f32x4(sf707, sf708, 68);
__m512 in1430 = _mm512_shuffle_f32x4(sf707, sf708, 238);
__m512 sf709 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf710 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1423 = _mm512_shuffle_f32x4(sf709, sf710, 68);
__m512 in1424 = _mm512_shuffle_f32x4(sf709, sf710, 238);
__m512 sf711 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf712 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1431 = _mm512_shuffle_f32x4(sf711, sf712, 68);
__m512 in1432 = _mm512_shuffle_f32x4(sf711, sf712, 238);
__m512 sf713 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf714 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1425 = _mm512_shuffle_f32x4(sf713, sf714, 68);
__m512 in1426 = _mm512_shuffle_f32x4(sf713, sf714, 238);
__m512 sf715 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf716 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1433 = _mm512_shuffle_f32x4(sf715, sf716, 68);
__m512 in1434 = _mm512_shuffle_f32x4(sf715, sf716, 238);
__m512 sf717 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf718 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1427 = _mm512_shuffle_f32x4(sf717, sf718, 68);
__m512 in1428 = _mm512_shuffle_f32x4(sf717, sf718, 238);
__m512 sf719 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf720 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1435 = _mm512_shuffle_f32x4(sf719, sf720, 68);
__m512 in1436 = _mm512_shuffle_f32x4(sf719, sf720, 238);
__m512 tmp9777 = _mm512_add_ps(in1422, in1423);
__m512 tmp9797 = _mm512_add_ps(in1430, in1431);
__m512 tmp9776 = _mm512_add_ps(in1424, in1425);
__m512 tmp9796 = _mm512_add_ps(in1432, in1433);
__m512 tmp9782 = _mm512_sub_ps(in1424, in1425);
__m512 tmp9802 = _mm512_sub_ps(in1432, in1433);
__m512 tmp9781 = _mm512_sub_ps(in1422, in1423);
__m512 tmp9801 = _mm512_sub_ps(in1430, in1431);
__m512 tmp9778 = _mm512_add_ps(in1426, in1427);
__m512 tmp9798 = _mm512_add_ps(in1434, in1435);
__m512 tmp9783 = _mm512_sub_ps(in1426, in1427);
__m512 tmp9803 = _mm512_sub_ps(in1434, in1435);
__m512 tmp9780 = _mm512_fmadd_ps(tmp9782, _mm512_set1_ps(2e+00f), tmp9781);
__m512 tmp9800 = _mm512_fmadd_ps(tmp9802, _mm512_set1_ps(2e+00f), tmp9801);
__m512 tmp9787 = _mm512_fmadd_ps(tmp9782, _mm512_set1_ps(8e+00f), tmp9781);
__m512 tmp9807 = _mm512_fmadd_ps(tmp9802, _mm512_set1_ps(8e+00f), tmp9801);
__m512 tmp9775 = _mm512_add_ps(tmp9776, tmp9777);
__m512 tmp9795 = _mm512_add_ps(tmp9796, tmp9797);
__m512 tmp9779 = _mm512_fmadd_ps(tmp9783, _mm512_set1_ps(1.6e+01f), tmp9780);
__m512 tmp9799 = _mm512_fmadd_ps(tmp9803, _mm512_set1_ps(1.6e+01f), tmp9800);
__m512 tmp9786 = _mm512_fmadd_ps(tmp9783, _mm512_set1_ps(4e+00f), tmp9787);
__m512 tmp9806 = _mm512_fmadd_ps(tmp9803, _mm512_set1_ps(4e+00f), tmp9807);
__m512 tmp9792 = _mm512_add_ps(tmp9783, tmp9781);
__m512 tmp9812 = _mm512_add_ps(tmp9803, tmp9801);
__m512 tmp9785 = _mm512_fmadd_ps(tmp9776, _mm512_set1_ps(4e+00f), tmp9777);
__m512 tmp9805 = _mm512_fmadd_ps(tmp9796, _mm512_set1_ps(4e+00f), tmp9797);
__m512 tmp9789 = _mm512_fmadd_ps(tmp9776, _mm512_set1_ps(1.6e+01f), tmp9777);
__m512 tmp9809 = _mm512_fmadd_ps(tmp9796, _mm512_set1_ps(1.6e+01f), tmp9797);
__m512 tmp9774 = _mm512_add_ps(tmp9775, in1421);
__m512 tmp9794 = _mm512_add_ps(tmp9795, in1429);
__m512 tmp9791 = _mm512_add_ps(tmp9792, in1428);
__m512 tmp9811 = _mm512_add_ps(tmp9812, in1436);
__m512 tmp9773 = _mm512_fmadd_ps(tmp9778, _mm512_set1_ps(3.2e+01f), tmp9774);
__m512 tmp9793 = _mm512_fmadd_ps(tmp9798, _mm512_set1_ps(3.2e+01f), tmp9794);
__m512 tmp9784 = _mm512_fmadd_ps(tmp9778, _mm512_set1_ps(8e+00f), tmp9785);
__m512 tmp9804 = _mm512_fmadd_ps(tmp9798, _mm512_set1_ps(8e+00f), tmp9805);
__m512 tmp9790 = _mm512_fmadd_ps(tmp9782, _mm512_set1_ps(3.2e+01f), tmp9791);
__m512 tmp9810 = _mm512_fmadd_ps(tmp9802, _mm512_set1_ps(3.2e+01f), tmp9811);
__m512 tmp9788 = _mm512_fmadd_ps(tmp9778, _mm512_set1_ps(2e+00f), tmp9789);
__m512 tmp9808 = _mm512_fmadd_ps(tmp9798, _mm512_set1_ps(2e+00f), tmp9809);
__m512 tmp9761 = tmp9773;
__m512 tmp9767 = tmp9793;
__m512 tmp9762 = tmp9779;
__m512 tmp9768 = tmp9799;
__m512 tmp9763 = tmp9784;
__m512 tmp9769 = tmp9804;
__m512 tmp9764 = tmp9786;
__m512 tmp9770 = tmp9806;
__m512 tmp9765 = tmp9788;
__m512 tmp9771 = tmp9808;
__m512 tmp9766 = tmp9790;
__m512 tmp9772 = tmp9810;
__m512 tmp9857 = _mm512_unpacklo_ps(tmp9761, tmp9762);
__m512 tmp9858 = _mm512_unpackhi_ps(tmp9761, tmp9762);
__m512 tmp9859 = _mm512_unpacklo_ps(tmp9763, tmp9764);
__m512 tmp9860 = _mm512_unpackhi_ps(tmp9763, tmp9764);
__m512 tmp9861 = _mm512_unpacklo_ps(tmp9765, tmp9766);
__m512 tmp9862 = _mm512_unpackhi_ps(tmp9765, tmp9766);
__m512 tmp9863 = _mm512_unpacklo_ps(tmp9767, tmp9768);
__m512 tmp9864 = _mm512_unpackhi_ps(tmp9767, tmp9768);
__m512 tmp9865 = _mm512_unpacklo_ps(tmp9769, tmp9770);
__m512 tmp9866 = _mm512_unpackhi_ps(tmp9769, tmp9770);
__m512 tmp9867 = _mm512_unpacklo_ps(tmp9771, tmp9772);
__m512 tmp9868 = _mm512_unpackhi_ps(tmp9771, tmp9772);
__m512 tmp9869 = _mm512_shuffle_ps(tmp9857, tmp9859, 68);
__m512 tmp9870 = _mm512_shuffle_ps(tmp9857, tmp9859, 238);
__m512 tmp9871 = _mm512_shuffle_ps(tmp9858, tmp9860, 68);
__m512 tmp9872 = _mm512_shuffle_ps(tmp9858, tmp9860, 238);
__m512 tmp9873 = _mm512_shuffle_ps(tmp9861, tmp9863, 68);
__m512 tmp9874 = _mm512_shuffle_ps(tmp9861, tmp9863, 238);
__m512 tmp9875 = _mm512_shuffle_ps(tmp9862, tmp9864, 68);
__m512 tmp9876 = _mm512_shuffle_ps(tmp9862, tmp9864, 238);
__m512 tmp9877 = _mm512_shuffle_ps(tmp9865, tmp9867, 68);
__m512 tmp9878 = _mm512_shuffle_ps(tmp9865, tmp9867, 238);
__m512 tmp9879 = _mm512_shuffle_ps(tmp9866, tmp9868, 68);
__m512 tmp9880 = _mm512_shuffle_ps(tmp9866, tmp9868, 238);
__m512 tmp9881 = _mm512_shuffle_f32x4(tmp9869, tmp9873, 136);
__m512 tmp9882 = _mm512_shuffle_f32x4(tmp9869, tmp9873, 221);
__m512 tmp9883 = _mm512_shuffle_f32x4(tmp9870, tmp9874, 136);
__m512 tmp9884 = _mm512_shuffle_f32x4(tmp9870, tmp9874, 221);
__m512 tmp9885 = _mm512_shuffle_f32x4(tmp9871, tmp9875, 136);
__m512 tmp9886 = _mm512_shuffle_f32x4(tmp9871, tmp9875, 221);
__m512 tmp9887 = _mm512_shuffle_f32x4(tmp9872, tmp9876, 136);
__m512 tmp9888 = _mm512_shuffle_f32x4(tmp9872, tmp9876, 221);
__m512 tmp9889 = _mm512_shuffle_f32x4(tmp9877, tmp9877, 136);
__m512 tmp9890 = _mm512_shuffle_f32x4(tmp9877, tmp9877, 221);
__m512 tmp9891 = _mm512_shuffle_f32x4(tmp9878, tmp9878, 136);
__m512 tmp9892 = _mm512_shuffle_f32x4(tmp9878, tmp9878, 221);
__m512 tmp9893 = _mm512_shuffle_f32x4(tmp9879, tmp9879, 136);
__m512 tmp9894 = _mm512_shuffle_f32x4(tmp9879, tmp9879, 221);
__m512 tmp9895 = _mm512_shuffle_f32x4(tmp9880, tmp9880, 136);
__m512 tmp9896 = _mm512_shuffle_f32x4(tmp9880, tmp9880, 221);
tmp9761 = _mm512_shuffle_f32x4(tmp9881, tmp9889, 136);
tmp9769 = _mm512_shuffle_f32x4(tmp9881, tmp9889, 221);
tmp9762 = _mm512_shuffle_f32x4(tmp9883, tmp9891, 136);
tmp9770 = _mm512_shuffle_f32x4(tmp9883, tmp9891, 221);
tmp9763 = _mm512_shuffle_f32x4(tmp9885, tmp9893, 136);
tmp9771 = _mm512_shuffle_f32x4(tmp9885, tmp9893, 221);
tmp9764 = _mm512_shuffle_f32x4(tmp9887, tmp9895, 136);
tmp9772 = _mm512_shuffle_f32x4(tmp9887, tmp9895, 221);
tmp9765 = _mm512_shuffle_f32x4(tmp9882, tmp9890, 136);
__m512 tmp9813 = _mm512_shuffle_f32x4(tmp9882, tmp9890, 221);
tmp9766 = _mm512_shuffle_f32x4(tmp9884, tmp9892, 136);
__m512 tmp9814 = _mm512_shuffle_f32x4(tmp9884, tmp9892, 221);
tmp9767 = _mm512_shuffle_f32x4(tmp9886, tmp9894, 136);
__m512 tmp9815 = _mm512_shuffle_f32x4(tmp9886, tmp9894, 221);
tmp9768 = _mm512_shuffle_f32x4(tmp9888, tmp9896, 136);
__m512 tmp9816 = _mm512_shuffle_f32x4(tmp9888, tmp9896, 221);
__m512 tmp9821 = _mm512_add_ps(tmp9762, tmp9763);
__m512 tmp9841 = _mm512_add_ps(tmp9770, tmp9771);
__m512 tmp9820 = _mm512_add_ps(tmp9764, tmp9765);
__m512 tmp9840 = _mm512_add_ps(tmp9772, tmp9813);
__m512 tmp9826 = _mm512_sub_ps(tmp9764, tmp9765);
__m512 tmp9846 = _mm512_sub_ps(tmp9772, tmp9813);
__m512 tmp9825 = _mm512_sub_ps(tmp9762, tmp9763);
__m512 tmp9845 = _mm512_sub_ps(tmp9770, tmp9771);
__m512 tmp9822 = _mm512_add_ps(tmp9766, tmp9767);
__m512 tmp9842 = _mm512_add_ps(tmp9814, tmp9815);
__m512 tmp9827 = _mm512_sub_ps(tmp9766, tmp9767);
__m512 tmp9847 = _mm512_sub_ps(tmp9814, tmp9815);
__m512 tmp9824 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(2e+00f), tmp9825);
__m512 tmp9844 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(2e+00f), tmp9845);
__m512 tmp9831 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(8e+00f), tmp9825);
__m512 tmp9851 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(8e+00f), tmp9845);
__m512 tmp9819 = _mm512_add_ps(tmp9820, tmp9821);
__m512 tmp9839 = _mm512_add_ps(tmp9840, tmp9841);
__m512 tmp9823 = _mm512_fmadd_ps(tmp9827, _mm512_set1_ps(1.6e+01f), tmp9824);
__m512 tmp9843 = _mm512_fmadd_ps(tmp9847, _mm512_set1_ps(1.6e+01f), tmp9844);
__m512 tmp9830 = _mm512_fmadd_ps(tmp9827, _mm512_set1_ps(4e+00f), tmp9831);
__m512 tmp9850 = _mm512_fmadd_ps(tmp9847, _mm512_set1_ps(4e+00f), tmp9851);
__m512 tmp9836 = _mm512_add_ps(tmp9827, tmp9825);
__m512 tmp9856 = _mm512_add_ps(tmp9847, tmp9845);
__m512 tmp9829 = _mm512_fmadd_ps(tmp9820, _mm512_set1_ps(4e+00f), tmp9821);
__m512 tmp9849 = _mm512_fmadd_ps(tmp9840, _mm512_set1_ps(4e+00f), tmp9841);
__m512 tmp9833 = _mm512_fmadd_ps(tmp9820, _mm512_set1_ps(1.6e+01f), tmp9821);
__m512 tmp9853 = _mm512_fmadd_ps(tmp9840, _mm512_set1_ps(1.6e+01f), tmp9841);
__m512 tmp9818 = _mm512_add_ps(tmp9819, tmp9761);
__m512 tmp9838 = _mm512_add_ps(tmp9839, tmp9769);
__m512 tmp9835 = _mm512_add_ps(tmp9836, tmp9768);
__m512 tmp9855 = _mm512_add_ps(tmp9856, tmp9816);
__m512 tmp9817 = _mm512_fmadd_ps(tmp9822, _mm512_set1_ps(3.2e+01f), tmp9818);
__m512 tmp9837 = _mm512_fmadd_ps(tmp9842, _mm512_set1_ps(3.2e+01f), tmp9838);
__m512 tmp9828 = _mm512_fmadd_ps(tmp9822, _mm512_set1_ps(8e+00f), tmp9829);
__m512 tmp9848 = _mm512_fmadd_ps(tmp9842, _mm512_set1_ps(8e+00f), tmp9849);
__m512 tmp9834 = _mm512_fmadd_ps(tmp9826, _mm512_set1_ps(3.2e+01f), tmp9835);
__m512 tmp9854 = _mm512_fmadd_ps(tmp9846, _mm512_set1_ps(3.2e+01f), tmp9855);
__m512 tmp9832 = _mm512_fmadd_ps(tmp9822, _mm512_set1_ps(2e+00f), tmp9833);
__m512 tmp9852 = _mm512_fmadd_ps(tmp9842, _mm512_set1_ps(2e+00f), tmp9853);
__m512 out1335 = tmp9817;
__m512 out1341 = tmp9837;
__m512 out1336 = tmp9823;
__m512 out1342 = tmp9843;
__m512 out1337 = tmp9828;
__m512 out1343 = tmp9848;
__m512 out1338 = tmp9830;
__m512 out1344 = tmp9850;
__m512 out1339 = tmp9832;
__m512 out1345 = tmp9852;
__m512 out1340 = tmp9834;
__m512 out1346 = tmp9854;
out1335 = _mm512_max_ps(_mm512_setzero_ps(), out1335);
out1341 = _mm512_max_ps(_mm512_setzero_ps(), out1341);
out1336 = _mm512_max_ps(_mm512_setzero_ps(), out1336);
out1342 = _mm512_max_ps(_mm512_setzero_ps(), out1342);
out1337 = _mm512_max_ps(_mm512_setzero_ps(), out1337);
out1343 = _mm512_max_ps(_mm512_setzero_ps(), out1343);
out1338 = _mm512_max_ps(_mm512_setzero_ps(), out1338);
out1344 = _mm512_max_ps(_mm512_setzero_ps(), out1344);
out1339 = _mm512_max_ps(_mm512_setzero_ps(), out1339);
out1345 = _mm512_max_ps(_mm512_setzero_ps(), out1345);
out1340 = _mm512_max_ps(_mm512_setzero_ps(), out1340);
out1346 = _mm512_max_ps(_mm512_setzero_ps(), out1346);
_mm512_mask_storeu_ps(datPtr13+1200+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1335);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1341);
_mm512_mask_storeu_ps(datPtr13+1424+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1336);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1342);
_mm512_mask_storeu_ps(datPtr13+1648+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1337);
_mm512_mask_storeu_ps(datPtr13+13056+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1343);
_mm512_mask_storeu_ps(datPtr13+1872+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1338);
_mm512_mask_storeu_ps(datPtr13+13280+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1344);
_mm512_mask_storeu_ps(datPtr13+2096+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1339);
_mm512_mask_storeu_ps(datPtr13+13504+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1345);
_mm512_mask_storeu_ps(datPtr13+2320+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1340);
_mm512_mask_storeu_ps(datPtr13+13728+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1346);
// Fetch the inputs for this step: sixteen 512-bit loads from sfPtr7, arranged
// as four groups whose base offsets (512, 26112, 51712, 77312) are 25600
// apart. Within a group, the two loads of each pair sit 128 apart and the two
// pairs are offset from each other by 64.
// Each pair is recombined with _mm512_shuffle_f32x4:
//   imm 68  -> low 256 bits of the first source, then low 256 bits of the second
//   imm 238 -> high 256 bits of the first source, then high 256 bits of the second
// NOTE(review): the offsets look like byte counts (64 per vector), which
// would make sfPtr7 a byte pointer -- confirm at its declaration.
// Group at base offset 512 -> in1437/in1438 and in1445/in1446.
__m512 sf721 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf722 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1437 = _mm512_shuffle_f32x4(sf721, sf722, 68);
__m512 in1438 = _mm512_shuffle_f32x4(sf721, sf722, 238);
__m512 sf723 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf724 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1445 = _mm512_shuffle_f32x4(sf723, sf724, 68);
__m512 in1446 = _mm512_shuffle_f32x4(sf723, sf724, 238);
// Group at base offset 26112 -> in1439/in1440 and in1447/in1448.
__m512 sf725 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf726 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1439 = _mm512_shuffle_f32x4(sf725, sf726, 68);
__m512 in1440 = _mm512_shuffle_f32x4(sf725, sf726, 238);
__m512 sf727 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf728 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1447 = _mm512_shuffle_f32x4(sf727, sf728, 68);
__m512 in1448 = _mm512_shuffle_f32x4(sf727, sf728, 238);
// Group at base offset 51712 -> in1441/in1442 and in1449/in1450.
__m512 sf729 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf730 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1441 = _mm512_shuffle_f32x4(sf729, sf730, 68);
__m512 in1442 = _mm512_shuffle_f32x4(sf729, sf730, 238);
__m512 sf731 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf732 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1449 = _mm512_shuffle_f32x4(sf731, sf732, 68);
__m512 in1450 = _mm512_shuffle_f32x4(sf731, sf732, 238);
// Group at base offset 77312 -> in1443/in1444 and in1451/in1452.
__m512 sf733 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf734 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1443 = _mm512_shuffle_f32x4(sf733, sf734, 68);
__m512 in1444 = _mm512_shuffle_f32x4(sf733, sf734, 238);
__m512 sf735 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k95+768*l34);
__m512 sf736 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k95+768*l34);
__m512 in1451 = _mm512_shuffle_f32x4(sf735, sf736, 68);
__m512 in1452 = _mm512_shuffle_f32x4(sf735, sf736, 238);
// First arithmetic pass: two independent, interleaved chains reduce eight
// input vectors to six. Chain A reads x0..x7 = in1437..in1444, chain B reads
// x0..x7 = in1445..in1452. With
//   s1 = x1+x2, s2 = x3+x4, s3 = x5+x6
//   d1 = x1-x2, d2 = x3-x4, d3 = x5-x6
// each chain produces
//   y0 = x0 + s1 + s2 + 32*s3
//   y1 = d1 + 2*d2 + 16*d3
//   y2 = s1 + 4*s2 + 8*s3
//   y3 = d1 + 8*d2 + 4*d3
//   y4 = s1 + 16*s2 + 2*s3
//   y5 = x7 + d1 + d3 + 32*d2
// built from add/sub and _mm512_fmadd_ps(a, b, c) = a*b + c.
// NOTE(review): the coefficient pattern resembles a Winograd-style 8-to-6
// output transform; nothing in these lines establishes that -- confirm
// against the generator.
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3).
__m512 tmp9913 = _mm512_add_ps(in1438, in1439);
__m512 tmp9933 = _mm512_add_ps(in1446, in1447);
__m512 tmp9912 = _mm512_add_ps(in1440, in1441);
__m512 tmp9932 = _mm512_add_ps(in1448, in1449);
__m512 tmp9918 = _mm512_sub_ps(in1440, in1441);
__m512 tmp9938 = _mm512_sub_ps(in1448, in1449);
__m512 tmp9917 = _mm512_sub_ps(in1438, in1439);
__m512 tmp9937 = _mm512_sub_ps(in1446, in1447);
__m512 tmp9914 = _mm512_add_ps(in1442, in1443);
__m512 tmp9934 = _mm512_add_ps(in1450, in1451);
__m512 tmp9919 = _mm512_sub_ps(in1442, in1443);
__m512 tmp9939 = _mm512_sub_ps(in1450, in1451);
// Partial sums accumulated toward y0..y5.
__m512 tmp9916 = _mm512_fmadd_ps(tmp9918, _mm512_set1_ps(2e+00f), tmp9917);
__m512 tmp9936 = _mm512_fmadd_ps(tmp9938, _mm512_set1_ps(2e+00f), tmp9937);
__m512 tmp9923 = _mm512_fmadd_ps(tmp9918, _mm512_set1_ps(8e+00f), tmp9917);
__m512 tmp9943 = _mm512_fmadd_ps(tmp9938, _mm512_set1_ps(8e+00f), tmp9937);
__m512 tmp9911 = _mm512_add_ps(tmp9912, tmp9913);
__m512 tmp9931 = _mm512_add_ps(tmp9932, tmp9933);
__m512 tmp9915 = _mm512_fmadd_ps(tmp9919, _mm512_set1_ps(1.6e+01f), tmp9916);
__m512 tmp9935 = _mm512_fmadd_ps(tmp9939, _mm512_set1_ps(1.6e+01f), tmp9936);
__m512 tmp9922 = _mm512_fmadd_ps(tmp9919, _mm512_set1_ps(4e+00f), tmp9923);
__m512 tmp9942 = _mm512_fmadd_ps(tmp9939, _mm512_set1_ps(4e+00f), tmp9943);
__m512 tmp9928 = _mm512_add_ps(tmp9919, tmp9917);
__m512 tmp9948 = _mm512_add_ps(tmp9939, tmp9937);
__m512 tmp9921 = _mm512_fmadd_ps(tmp9912, _mm512_set1_ps(4e+00f), tmp9913);
__m512 tmp9941 = _mm512_fmadd_ps(tmp9932, _mm512_set1_ps(4e+00f), tmp9933);
__m512 tmp9925 = _mm512_fmadd_ps(tmp9912, _mm512_set1_ps(1.6e+01f), tmp9913);
__m512 tmp9945 = _mm512_fmadd_ps(tmp9932, _mm512_set1_ps(1.6e+01f), tmp9933);
// x0 enters y0 only; x7 enters y5 only.
__m512 tmp9910 = _mm512_add_ps(tmp9911, in1437);
__m512 tmp9930 = _mm512_add_ps(tmp9931, in1445);
__m512 tmp9927 = _mm512_add_ps(tmp9928, in1444);
__m512 tmp9947 = _mm512_add_ps(tmp9948, in1452);
__m512 tmp9909 = _mm512_fmadd_ps(tmp9914, _mm512_set1_ps(3.2e+01f), tmp9910);
__m512 tmp9929 = _mm512_fmadd_ps(tmp9934, _mm512_set1_ps(3.2e+01f), tmp9930);
__m512 tmp9920 = _mm512_fmadd_ps(tmp9914, _mm512_set1_ps(8e+00f), tmp9921);
__m512 tmp9940 = _mm512_fmadd_ps(tmp9934, _mm512_set1_ps(8e+00f), tmp9941);
__m512 tmp9926 = _mm512_fmadd_ps(tmp9918, _mm512_set1_ps(3.2e+01f), tmp9927);
__m512 tmp9946 = _mm512_fmadd_ps(tmp9938, _mm512_set1_ps(3.2e+01f), tmp9947);
__m512 tmp9924 = _mm512_fmadd_ps(tmp9914, _mm512_set1_ps(2e+00f), tmp9925);
__m512 tmp9944 = _mm512_fmadd_ps(tmp9934, _mm512_set1_ps(2e+00f), tmp9945);
// Copy the twelve results into consecutively numbered temporaries:
// chain A's y0..y5 -> tmp9897..tmp9902, chain B's y0..y5 -> tmp9903..tmp9908.
__m512 tmp9897 = tmp9909;
__m512 tmp9903 = tmp9929;
__m512 tmp9898 = tmp9915;
__m512 tmp9904 = tmp9935;
__m512 tmp9899 = tmp9920;
__m512 tmp9905 = tmp9940;
__m512 tmp9900 = tmp9922;
__m512 tmp9906 = tmp9942;
__m512 tmp9901 = tmp9924;
__m512 tmp9907 = tmp9944;
__m512 tmp9902 = tmp9926;
__m512 tmp9908 = tmp9946;
// In-register transpose of a 12-row x 16-column block of floats.
// Input rows 0..11 are tmp9897..tmp9908 (one row per vector, 16 lanes each).
// Output: sixteen vectors, one per input column; lanes 0..11 hold that column
// for rows 0..11 and lanes 12..15 repeat lanes 8..11.
//   columns 0..7   -> tmp9897..tmp9904 (reassigned in place)
//   columns 8..11  -> tmp9905..tmp9908 (reassigned in place)
//   columns 12..15 -> tmp9949..tmp9952 (newly declared)
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp9993 = _mm512_unpacklo_ps(tmp9897, tmp9898);
__m512 tmp9994 = _mm512_unpackhi_ps(tmp9897, tmp9898);
__m512 tmp9995 = _mm512_unpacklo_ps(tmp9899, tmp9900);
__m512 tmp9996 = _mm512_unpackhi_ps(tmp9899, tmp9900);
__m512 tmp9997 = _mm512_unpacklo_ps(tmp9901, tmp9902);
__m512 tmp9998 = _mm512_unpackhi_ps(tmp9901, tmp9902);
__m512 tmp9999 = _mm512_unpacklo_ps(tmp9903, tmp9904);
__m512 tmp10000 = _mm512_unpackhi_ps(tmp9903, tmp9904);
__m512 tmp10001 = _mm512_unpacklo_ps(tmp9905, tmp9906);
__m512 tmp10002 = _mm512_unpackhi_ps(tmp9905, tmp9906);
__m512 tmp10003 = _mm512_unpacklo_ps(tmp9907, tmp9908);
__m512 tmp10004 = _mm512_unpackhi_ps(tmp9907, tmp9908);
// Stage 2: merge row pairs into groups of four rows. Within each 128-bit
// lane, imm 68 takes elements 0,1 of both sources and imm 238 takes elements
// 2,3, so every lane now holds one column entry from four consecutive rows.
__m512 tmp10005 = _mm512_shuffle_ps(tmp9993, tmp9995, 68);
__m512 tmp10006 = _mm512_shuffle_ps(tmp9993, tmp9995, 238);
__m512 tmp10007 = _mm512_shuffle_ps(tmp9994, tmp9996, 68);
__m512 tmp10008 = _mm512_shuffle_ps(tmp9994, tmp9996, 238);
__m512 tmp10009 = _mm512_shuffle_ps(tmp9997, tmp9999, 68);
__m512 tmp10010 = _mm512_shuffle_ps(tmp9997, tmp9999, 238);
__m512 tmp10011 = _mm512_shuffle_ps(tmp9998, tmp10000, 68);
__m512 tmp10012 = _mm512_shuffle_ps(tmp9998, tmp10000, 238);
__m512 tmp10013 = _mm512_shuffle_ps(tmp10001, tmp10003, 68);
__m512 tmp10014 = _mm512_shuffle_ps(tmp10001, tmp10003, 238);
__m512 tmp10015 = _mm512_shuffle_ps(tmp10002, tmp10004, 68);
__m512 tmp10016 = _mm512_shuffle_ps(tmp10002, tmp10004, 238);
// Stage 3: regroup whole 128-bit lanes. imm 136 picks lanes 0 and 2 of each
// source, imm 221 picks lanes 1 and 3. Rows 0..3 are paired with rows 4..7.
__m512 tmp10017 = _mm512_shuffle_f32x4(tmp10005, tmp10009, 136);
__m512 tmp10018 = _mm512_shuffle_f32x4(tmp10005, tmp10009, 221);
__m512 tmp10019 = _mm512_shuffle_f32x4(tmp10006, tmp10010, 136);
__m512 tmp10020 = _mm512_shuffle_f32x4(tmp10006, tmp10010, 221);
__m512 tmp10021 = _mm512_shuffle_f32x4(tmp10007, tmp10011, 136);
__m512 tmp10022 = _mm512_shuffle_f32x4(tmp10007, tmp10011, 221);
__m512 tmp10023 = _mm512_shuffle_f32x4(tmp10008, tmp10012, 136);
__m512 tmp10024 = _mm512_shuffle_f32x4(tmp10008, tmp10012, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself;
// this is where the duplicated upper lanes of the outputs come from.
__m512 tmp10025 = _mm512_shuffle_f32x4(tmp10013, tmp10013, 136);
__m512 tmp10026 = _mm512_shuffle_f32x4(tmp10013, tmp10013, 221);
__m512 tmp10027 = _mm512_shuffle_f32x4(tmp10014, tmp10014, 136);
__m512 tmp10028 = _mm512_shuffle_f32x4(tmp10014, tmp10014, 221);
__m512 tmp10029 = _mm512_shuffle_f32x4(tmp10015, tmp10015, 136);
__m512 tmp10030 = _mm512_shuffle_f32x4(tmp10015, tmp10015, 221);
__m512 tmp10031 = _mm512_shuffle_f32x4(tmp10016, tmp10016, 136);
__m512 tmp10032 = _mm512_shuffle_f32x4(tmp10016, tmp10016, 221);
// Stage 4: final lane regroup yielding one vector per column.
__m512 tmp9897 = _mm512_shuffle_f32x4(tmp10017, tmp10025, 136);
__m512 tmp9905 = _mm512_shuffle_f32x4(tmp10017, tmp10025, 221);
__m512 tmp9898 = _mm512_shuffle_f32x4(tmp10019, tmp10027, 136);
__m512 tmp9906 = _mm512_shuffle_f32x4(tmp10019, tmp10027, 221);
__m512 tmp9899 = _mm512_shuffle_f32x4(tmp10021, tmp10029, 136);
__m512 tmp9907 = _mm512_shuffle_f32x4(tmp10021, tmp10029, 221);
__m512 tmp9900 = _mm512_shuffle_f32x4(tmp10023, tmp10031, 136);
__m512 tmp9908 = _mm512_shuffle_f32x4(tmp10023, tmp10031, 221);
__m512 tmp9901 = _mm512_shuffle_f32x4(tmp10018, tmp10026, 136);
__m512 tmp9949 = _mm512_shuffle_f32x4(tmp10018, tmp10026, 221);
__m512 tmp9902 = _mm512_shuffle_f32x4(tmp10020, tmp10028, 136);
__m512 tmp9950 = _mm512_shuffle_f32x4(tmp10020, tmp10028, 221);
__m512 tmp9903 = _mm512_shuffle_f32x4(tmp10022, tmp10030, 136);
__m512 tmp9951 = _mm512_shuffle_f32x4(tmp10022, tmp10030, 221);
__m512 tmp9904 = _mm512_shuffle_f32x4(tmp10024, tmp10032, 136);
__m512 tmp9952 = _mm512_shuffle_f32x4(tmp10024, tmp10032, 221);
// Second arithmetic pass: two independent, interleaved chains reduce eight
// vectors to six.
//   chain A: x0..x7 = tmp9897..tmp9904
//   chain B: x0..x7 = tmp9905, tmp9906, tmp9907, tmp9908,
//                     tmp9949, tmp9950, tmp9951, tmp9952
// With s1 = x1+x2, s2 = x3+x4, s3 = x5+x6 and d1 = x1-x2, d2 = x3-x4,
// d3 = x5-x6, each chain produces
//   y0 = x0 + s1 + s2 + 32*s3     (tmp9953 / tmp9973)
//   y1 = d1 + 2*d2 + 16*d3        (tmp9959 / tmp9979)
//   y2 = s1 + 4*s2 + 8*s3         (tmp9964 / tmp9984)
//   y3 = d1 + 8*d2 + 4*d3         (tmp9966 / tmp9986)
//   y4 = s1 + 16*s2 + 2*s3        (tmp9968 / tmp9988)
//   y5 = x7 + d1 + d3 + 32*d2     (tmp9970 / tmp9990)
// _mm512_fmadd_ps(a, b, c) evaluates a*b + c.
// Pairwise sums and differences.
__m512 tmp9957 = _mm512_add_ps(tmp9898, tmp9899);
__m512 tmp9977 = _mm512_add_ps(tmp9906, tmp9907);
__m512 tmp9956 = _mm512_add_ps(tmp9900, tmp9901);
__m512 tmp9976 = _mm512_add_ps(tmp9908, tmp9949);
__m512 tmp9962 = _mm512_sub_ps(tmp9900, tmp9901);
__m512 tmp9982 = _mm512_sub_ps(tmp9908, tmp9949);
__m512 tmp9961 = _mm512_sub_ps(tmp9898, tmp9899);
__m512 tmp9981 = _mm512_sub_ps(tmp9906, tmp9907);
__m512 tmp9958 = _mm512_add_ps(tmp9902, tmp9903);
__m512 tmp9978 = _mm512_add_ps(tmp9950, tmp9951);
__m512 tmp9963 = _mm512_sub_ps(tmp9902, tmp9903);
__m512 tmp9983 = _mm512_sub_ps(tmp9950, tmp9951);
// Partial sums accumulated toward y0..y5.
__m512 tmp9960 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(2e+00f), tmp9961);
__m512 tmp9980 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(2e+00f), tmp9981);
__m512 tmp9967 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(8e+00f), tmp9961);
__m512 tmp9987 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(8e+00f), tmp9981);
__m512 tmp9955 = _mm512_add_ps(tmp9956, tmp9957);
__m512 tmp9975 = _mm512_add_ps(tmp9976, tmp9977);
__m512 tmp9959 = _mm512_fmadd_ps(tmp9963, _mm512_set1_ps(1.6e+01f), tmp9960);
__m512 tmp9979 = _mm512_fmadd_ps(tmp9983, _mm512_set1_ps(1.6e+01f), tmp9980);
__m512 tmp9966 = _mm512_fmadd_ps(tmp9963, _mm512_set1_ps(4e+00f), tmp9967);
__m512 tmp9986 = _mm512_fmadd_ps(tmp9983, _mm512_set1_ps(4e+00f), tmp9987);
__m512 tmp9972 = _mm512_add_ps(tmp9963, tmp9961);
__m512 tmp9992 = _mm512_add_ps(tmp9983, tmp9981);
__m512 tmp9965 = _mm512_fmadd_ps(tmp9956, _mm512_set1_ps(4e+00f), tmp9957);
__m512 tmp9985 = _mm512_fmadd_ps(tmp9976, _mm512_set1_ps(4e+00f), tmp9977);
__m512 tmp9969 = _mm512_fmadd_ps(tmp9956, _mm512_set1_ps(1.6e+01f), tmp9957);
__m512 tmp9989 = _mm512_fmadd_ps(tmp9976, _mm512_set1_ps(1.6e+01f), tmp9977);
// x0 enters y0 only; x7 enters y5 only.
__m512 tmp9954 = _mm512_add_ps(tmp9955, tmp9897);
__m512 tmp9974 = _mm512_add_ps(tmp9975, tmp9905);
__m512 tmp9971 = _mm512_add_ps(tmp9972, tmp9904);
__m512 tmp9991 = _mm512_add_ps(tmp9992, tmp9952);
__m512 tmp9953 = _mm512_fmadd_ps(tmp9958, _mm512_set1_ps(3.2e+01f), tmp9954);
__m512 tmp9973 = _mm512_fmadd_ps(tmp9978, _mm512_set1_ps(3.2e+01f), tmp9974);
__m512 tmp9964 = _mm512_fmadd_ps(tmp9958, _mm512_set1_ps(8e+00f), tmp9965);
__m512 tmp9984 = _mm512_fmadd_ps(tmp9978, _mm512_set1_ps(8e+00f), tmp9985);
__m512 tmp9970 = _mm512_fmadd_ps(tmp9962, _mm512_set1_ps(3.2e+01f), tmp9971);
__m512 tmp9990 = _mm512_fmadd_ps(tmp9982, _mm512_set1_ps(3.2e+01f), tmp9991);
__m512 tmp9968 = _mm512_fmadd_ps(tmp9958, _mm512_set1_ps(2e+00f), tmp9969);
__m512 tmp9988 = _mm512_fmadd_ps(tmp9978, _mm512_set1_ps(2e+00f), tmp9989);
// Finish this step: name the twelve result vectors, clamp them at zero, and
// write them to datPtr13 with masked stores.
// out1347..out1352 take the results of one chain (tmp9953, tmp9959, tmp9964,
// tmp9966, tmp9968, tmp9970); out1353..out1358 take the other chain's.
__m512 out1347 = tmp9953;
__m512 out1353 = tmp9973;
__m512 out1348 = tmp9959;
__m512 out1354 = tmp9979;
__m512 out1349 = tmp9964;
__m512 out1355 = tmp9984;
__m512 out1350 = tmp9966;
__m512 out1356 = tmp9986;
__m512 out1351 = tmp9968;
__m512 out1357 = tmp9988;
__m512 out1352 = tmp9970;
__m512 out1358 = tmp9990;
// Elementwise max against a zero vector (the form a ReLU takes).
out1347 = _mm512_max_ps(_mm512_setzero_ps(), out1347);
out1353 = _mm512_max_ps(_mm512_setzero_ps(), out1353);
out1348 = _mm512_max_ps(_mm512_setzero_ps(), out1348);
out1354 = _mm512_max_ps(_mm512_setzero_ps(), out1354);
out1349 = _mm512_max_ps(_mm512_setzero_ps(), out1349);
out1355 = _mm512_max_ps(_mm512_setzero_ps(), out1355);
out1350 = _mm512_max_ps(_mm512_setzero_ps(), out1350);
out1356 = _mm512_max_ps(_mm512_setzero_ps(), out1356);
out1351 = _mm512_max_ps(_mm512_setzero_ps(), out1351);
out1357 = _mm512_max_ps(_mm512_setzero_ps(), out1357);
out1352 = _mm512_max_ps(_mm512_setzero_ps(), out1352);
out1358 = _mm512_max_ps(_mm512_setzero_ps(), out1358);
// Masked stores: mask 255 writes only lanes 0..7 (out1347..out1352), mask
// 4095 writes only lanes 0..11 (out1353..out1358). Within each family the
// constant offset advances by 224 per vector (12656.. and 13808..).
// NOTE(review): 224 and the 4*toW35 term suggest byte offsets with 4-byte
// floats and 56-float rows -- confirm against datPtr13's declared type.
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1347);
_mm512_mask_storeu_ps(datPtr13+13808+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1353);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1348);
_mm512_mask_storeu_ps(datPtr13+14032+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1354);
_mm512_mask_storeu_ps(datPtr13+13104+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1349);
_mm512_mask_storeu_ps(datPtr13+14256+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1355);
_mm512_mask_storeu_ps(datPtr13+13328+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1350);
_mm512_mask_storeu_ps(datPtr13+14480+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1356);
_mm512_mask_storeu_ps(datPtr13+13552+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1351);
_mm512_mask_storeu_ps(datPtr13+14704+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1357);
_mm512_mask_storeu_ps(datPtr13+13776+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 255, out1352);
_mm512_mask_storeu_ps(datPtr13+14928+50432*i29+224*toH35+4*toW35+50432*k95+25216*l34, 4095, out1358);
}
}
++j23;
}
j23 = 15;
}
ptrdiff_t rel18 = j23-15;
ptrdiff_t base18 = 54;
if (rel18 < 1) {
ptrdiff_t toH36 = base18+0;
ptrdiff_t toW36 = 0;
ptrdiff_t k96 = 1*w46;
for (; k96 != 1; ++k96) {
ptrdiff_t l35 = 0;
for (; l35 != 2; ++l35) {
// Fetch the inputs for this step: sixteen 512-bit loads from sfPtr7, arranged
// as four groups whose base offsets (0, 25600, 51200, 76800) are 25600 apart.
// Within a group, the two loads of each pair sit 128 apart and the two pairs
// are offset from each other by 64.
// Each pair is recombined with _mm512_shuffle_f32x4:
//   imm 68  -> low 256 bits of the first source, then low 256 bits of the second
//   imm 238 -> high 256 bits of the first source, then high 256 bits of the second
// NOTE(review): the offsets look like byte counts (64 per vector), which
// would make sfPtr7 a byte pointer -- confirm at its declaration.
// Group at base offset 0 -> in1453/in1454 and in1461/in1462.
__m512 sf737 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf738 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1453 = _mm512_shuffle_f32x4(sf737, sf738, 68);
__m512 in1454 = _mm512_shuffle_f32x4(sf737, sf738, 238);
__m512 sf739 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf740 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1461 = _mm512_shuffle_f32x4(sf739, sf740, 68);
__m512 in1462 = _mm512_shuffle_f32x4(sf739, sf740, 238);
// Group at base offset 25600 -> in1455/in1456 and in1463/in1464.
__m512 sf741 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf742 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1455 = _mm512_shuffle_f32x4(sf741, sf742, 68);
__m512 in1456 = _mm512_shuffle_f32x4(sf741, sf742, 238);
__m512 sf743 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf744 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1463 = _mm512_shuffle_f32x4(sf743, sf744, 68);
__m512 in1464 = _mm512_shuffle_f32x4(sf743, sf744, 238);
// Group at base offset 51200 -> in1457/in1458 and in1465/in1466.
__m512 sf745 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf746 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1457 = _mm512_shuffle_f32x4(sf745, sf746, 68);
__m512 in1458 = _mm512_shuffle_f32x4(sf745, sf746, 238);
__m512 sf747 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf748 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1465 = _mm512_shuffle_f32x4(sf747, sf748, 68);
__m512 in1466 = _mm512_shuffle_f32x4(sf747, sf748, 238);
// Group at base offset 76800 -> in1459/in1460 and in1467/in1468.
__m512 sf749 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf750 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1459 = _mm512_shuffle_f32x4(sf749, sf750, 68);
__m512 in1460 = _mm512_shuffle_f32x4(sf749, sf750, 238);
__m512 sf751 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf752 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1467 = _mm512_shuffle_f32x4(sf751, sf752, 68);
__m512 in1468 = _mm512_shuffle_f32x4(sf751, sf752, 238);
// First arithmetic pass: two independent, interleaved chains reduce eight
// input vectors to six. Chain A reads x0..x7 = in1453..in1460, chain B reads
// x0..x7 = in1461..in1468. With
//   s1 = x1+x2, s2 = x3+x4, s3 = x5+x6
//   d1 = x1-x2, d2 = x3-x4, d3 = x5-x6
// each chain produces
//   y0 = x0 + s1 + s2 + 32*s3
//   y1 = d1 + 2*d2 + 16*d3
//   y2 = s1 + 4*s2 + 8*s3
//   y3 = d1 + 8*d2 + 4*d3
//   y4 = s1 + 16*s2 + 2*s3
//   y5 = x7 + d1 + d3 + 32*d2
// built from add/sub and _mm512_fmadd_ps(a, b, c) = a*b + c.
// NOTE(review): the coefficient pattern resembles a Winograd-style 8-to-6
// output transform; nothing in these lines establishes that -- confirm
// against the generator.
// Pairwise sums and differences (s1, s2, d2, d1, s3, d3).
__m512 tmp10049 = _mm512_add_ps(in1454, in1455);
__m512 tmp10069 = _mm512_add_ps(in1462, in1463);
__m512 tmp10048 = _mm512_add_ps(in1456, in1457);
__m512 tmp10068 = _mm512_add_ps(in1464, in1465);
__m512 tmp10054 = _mm512_sub_ps(in1456, in1457);
__m512 tmp10074 = _mm512_sub_ps(in1464, in1465);
__m512 tmp10053 = _mm512_sub_ps(in1454, in1455);
__m512 tmp10073 = _mm512_sub_ps(in1462, in1463);
__m512 tmp10050 = _mm512_add_ps(in1458, in1459);
__m512 tmp10070 = _mm512_add_ps(in1466, in1467);
__m512 tmp10055 = _mm512_sub_ps(in1458, in1459);
__m512 tmp10075 = _mm512_sub_ps(in1466, in1467);
// Partial sums accumulated toward y0..y5.
__m512 tmp10052 = _mm512_fmadd_ps(tmp10054, _mm512_set1_ps(2e+00f), tmp10053);
__m512 tmp10072 = _mm512_fmadd_ps(tmp10074, _mm512_set1_ps(2e+00f), tmp10073);
__m512 tmp10059 = _mm512_fmadd_ps(tmp10054, _mm512_set1_ps(8e+00f), tmp10053);
__m512 tmp10079 = _mm512_fmadd_ps(tmp10074, _mm512_set1_ps(8e+00f), tmp10073);
__m512 tmp10047 = _mm512_add_ps(tmp10048, tmp10049);
__m512 tmp10067 = _mm512_add_ps(tmp10068, tmp10069);
__m512 tmp10051 = _mm512_fmadd_ps(tmp10055, _mm512_set1_ps(1.6e+01f), tmp10052);
__m512 tmp10071 = _mm512_fmadd_ps(tmp10075, _mm512_set1_ps(1.6e+01f), tmp10072);
__m512 tmp10058 = _mm512_fmadd_ps(tmp10055, _mm512_set1_ps(4e+00f), tmp10059);
__m512 tmp10078 = _mm512_fmadd_ps(tmp10075, _mm512_set1_ps(4e+00f), tmp10079);
__m512 tmp10064 = _mm512_add_ps(tmp10055, tmp10053);
__m512 tmp10084 = _mm512_add_ps(tmp10075, tmp10073);
__m512 tmp10057 = _mm512_fmadd_ps(tmp10048, _mm512_set1_ps(4e+00f), tmp10049);
__m512 tmp10077 = _mm512_fmadd_ps(tmp10068, _mm512_set1_ps(4e+00f), tmp10069);
__m512 tmp10061 = _mm512_fmadd_ps(tmp10048, _mm512_set1_ps(1.6e+01f), tmp10049);
__m512 tmp10081 = _mm512_fmadd_ps(tmp10068, _mm512_set1_ps(1.6e+01f), tmp10069);
// x0 enters y0 only; x7 enters y5 only.
__m512 tmp10046 = _mm512_add_ps(tmp10047, in1453);
__m512 tmp10066 = _mm512_add_ps(tmp10067, in1461);
__m512 tmp10063 = _mm512_add_ps(tmp10064, in1460);
__m512 tmp10083 = _mm512_add_ps(tmp10084, in1468);
__m512 tmp10045 = _mm512_fmadd_ps(tmp10050, _mm512_set1_ps(3.2e+01f), tmp10046);
__m512 tmp10065 = _mm512_fmadd_ps(tmp10070, _mm512_set1_ps(3.2e+01f), tmp10066);
__m512 tmp10056 = _mm512_fmadd_ps(tmp10050, _mm512_set1_ps(8e+00f), tmp10057);
__m512 tmp10076 = _mm512_fmadd_ps(tmp10070, _mm512_set1_ps(8e+00f), tmp10077);
__m512 tmp10062 = _mm512_fmadd_ps(tmp10054, _mm512_set1_ps(3.2e+01f), tmp10063);
__m512 tmp10082 = _mm512_fmadd_ps(tmp10074, _mm512_set1_ps(3.2e+01f), tmp10083);
__m512 tmp10060 = _mm512_fmadd_ps(tmp10050, _mm512_set1_ps(2e+00f), tmp10061);
__m512 tmp10080 = _mm512_fmadd_ps(tmp10070, _mm512_set1_ps(2e+00f), tmp10081);
// Copy the twelve results into consecutively numbered temporaries:
// chain A's y0..y5 -> tmp10033..tmp10038, chain B's y0..y5 -> tmp10039..tmp10044.
__m512 tmp10033 = tmp10045;
__m512 tmp10039 = tmp10065;
__m512 tmp10034 = tmp10051;
__m512 tmp10040 = tmp10071;
__m512 tmp10035 = tmp10056;
__m512 tmp10041 = tmp10076;
__m512 tmp10036 = tmp10058;
__m512 tmp10042 = tmp10078;
__m512 tmp10037 = tmp10060;
__m512 tmp10043 = tmp10080;
__m512 tmp10038 = tmp10062;
__m512 tmp10044 = tmp10082;
// In-register transpose of a 12-row x 16-column block of floats.
// Input rows 0..11 are tmp10033..tmp10044 (one row per vector, 16 lanes each).
// Output: sixteen vectors, one per input column; lanes 0..11 hold that column
// for rows 0..11 and lanes 12..15 repeat lanes 8..11.
//   columns 0..7   -> tmp10033..tmp10040 (reassigned in place)
//   columns 8..11  -> tmp10041..tmp10044 (reassigned in place)
//   columns 12..15 -> tmp10085..tmp10088 (newly declared)
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp10111 = _mm512_unpacklo_ps(tmp10033, tmp10034);
__m512 tmp10112 = _mm512_unpackhi_ps(tmp10033, tmp10034);
__m512 tmp10113 = _mm512_unpacklo_ps(tmp10035, tmp10036);
__m512 tmp10114 = _mm512_unpackhi_ps(tmp10035, tmp10036);
__m512 tmp10115 = _mm512_unpacklo_ps(tmp10037, tmp10038);
__m512 tmp10116 = _mm512_unpackhi_ps(tmp10037, tmp10038);
__m512 tmp10117 = _mm512_unpacklo_ps(tmp10039, tmp10040);
__m512 tmp10118 = _mm512_unpackhi_ps(tmp10039, tmp10040);
__m512 tmp10119 = _mm512_unpacklo_ps(tmp10041, tmp10042);
__m512 tmp10120 = _mm512_unpackhi_ps(tmp10041, tmp10042);
__m512 tmp10121 = _mm512_unpacklo_ps(tmp10043, tmp10044);
__m512 tmp10122 = _mm512_unpackhi_ps(tmp10043, tmp10044);
// Stage 2: merge row pairs into groups of four rows. Within each 128-bit
// lane, imm 68 takes elements 0,1 of both sources and imm 238 takes elements
// 2,3.
__m512 tmp10123 = _mm512_shuffle_ps(tmp10111, tmp10113, 68);
__m512 tmp10124 = _mm512_shuffle_ps(tmp10111, tmp10113, 238);
__m512 tmp10125 = _mm512_shuffle_ps(tmp10112, tmp10114, 68);
__m512 tmp10126 = _mm512_shuffle_ps(tmp10112, tmp10114, 238);
__m512 tmp10127 = _mm512_shuffle_ps(tmp10115, tmp10117, 68);
__m512 tmp10128 = _mm512_shuffle_ps(tmp10115, tmp10117, 238);
__m512 tmp10129 = _mm512_shuffle_ps(tmp10116, tmp10118, 68);
__m512 tmp10130 = _mm512_shuffle_ps(tmp10116, tmp10118, 238);
__m512 tmp10131 = _mm512_shuffle_ps(tmp10119, tmp10121, 68);
__m512 tmp10132 = _mm512_shuffle_ps(tmp10119, tmp10121, 238);
__m512 tmp10133 = _mm512_shuffle_ps(tmp10120, tmp10122, 68);
__m512 tmp10134 = _mm512_shuffle_ps(tmp10120, tmp10122, 238);
// Stage 3: regroup whole 128-bit lanes. imm 136 picks lanes 0 and 2 of each
// source, imm 221 picks lanes 1 and 3. Rows 0..3 are paired with rows 4..7.
__m512 tmp10135 = _mm512_shuffle_f32x4(tmp10123, tmp10127, 136);
__m512 tmp10136 = _mm512_shuffle_f32x4(tmp10123, tmp10127, 221);
__m512 tmp10137 = _mm512_shuffle_f32x4(tmp10124, tmp10128, 136);
__m512 tmp10138 = _mm512_shuffle_f32x4(tmp10124, tmp10128, 221);
__m512 tmp10139 = _mm512_shuffle_f32x4(tmp10125, tmp10129, 136);
__m512 tmp10140 = _mm512_shuffle_f32x4(tmp10125, tmp10129, 221);
__m512 tmp10141 = _mm512_shuffle_f32x4(tmp10126, tmp10130, 136);
__m512 tmp10142 = _mm512_shuffle_f32x4(tmp10126, tmp10130, 221);
// Rows 8..11 have no partner group, so each vector is shuffled with itself;
// this is where the duplicated upper lanes of the outputs come from.
__m512 tmp10143 = _mm512_shuffle_f32x4(tmp10131, tmp10131, 136);
__m512 tmp10144 = _mm512_shuffle_f32x4(tmp10131, tmp10131, 221);
__m512 tmp10145 = _mm512_shuffle_f32x4(tmp10132, tmp10132, 136);
__m512 tmp10146 = _mm512_shuffle_f32x4(tmp10132, tmp10132, 221);
__m512 tmp10147 = _mm512_shuffle_f32x4(tmp10133, tmp10133, 136);
__m512 tmp10148 = _mm512_shuffle_f32x4(tmp10133, tmp10133, 221);
__m512 tmp10149 = _mm512_shuffle_f32x4(tmp10134, tmp10134, 136);
__m512 tmp10150 = _mm512_shuffle_f32x4(tmp10134, tmp10134, 221);
// Stage 4: final lane regroup yielding one vector per column.
tmp10033 = _mm512_shuffle_f32x4(tmp10135, tmp10143, 136);
tmp10041 = _mm512_shuffle_f32x4(tmp10135, tmp10143, 221);
tmp10034 = _mm512_shuffle_f32x4(tmp10137, tmp10145, 136);
tmp10042 = _mm512_shuffle_f32x4(tmp10137, tmp10145, 221);
tmp10035 = _mm512_shuffle_f32x4(tmp10139, tmp10147, 136);
tmp10043 = _mm512_shuffle_f32x4(tmp10139, tmp10147, 221);
tmp10036 = _mm512_shuffle_f32x4(tmp10141, tmp10149, 136);
tmp10044 = _mm512_shuffle_f32x4(tmp10141, tmp10149, 221);
tmp10037 = _mm512_shuffle_f32x4(tmp10136, tmp10144, 136);
__m512 tmp10085 = _mm512_shuffle_f32x4(tmp10136, tmp10144, 221);
tmp10038 = _mm512_shuffle_f32x4(tmp10138, tmp10146, 136);
__m512 tmp10086 = _mm512_shuffle_f32x4(tmp10138, tmp10146, 221);
tmp10039 = _mm512_shuffle_f32x4(tmp10140, tmp10148, 136);
__m512 tmp10087 = _mm512_shuffle_f32x4(tmp10140, tmp10148, 221);
tmp10040 = _mm512_shuffle_f32x4(tmp10142, tmp10150, 136);
__m512 tmp10088 = _mm512_shuffle_f32x4(tmp10142, tmp10150, 221);
// Columns 7 and 15 are not read by the statements that follow; the void
// casts mark them as deliberately unused.
(void)tmp10040;
(void)tmp10088;
// Reduced second arithmetic pass: two independent, interleaved chains, each
// producing only two result vectors.
//   chain A: x0..x6 = tmp10033..tmp10039
//   chain B: x0..x6 = tmp10041, tmp10042, tmp10043, tmp10044,
//                     tmp10085, tmp10086, tmp10087
// With s1 = x1+x2, s2 = x3+x4, s3 = x5+x6 and d1 = x1-x2, d2 = x3-x4,
// d3 = x5-x6:
//   y0 = x0 + s1 + s2 + 32*s3     (tmp10089 / tmp10100)
//   y1 = d1 + 2*d2 + 16*d3        (tmp10095 / tmp10106)
// _mm512_fmadd_ps(a, b, c) evaluates a*b + c.
__m512 tmp10093 = _mm512_add_ps(tmp10034, tmp10035);
__m512 tmp10104 = _mm512_add_ps(tmp10042, tmp10043);
__m512 tmp10092 = _mm512_add_ps(tmp10036, tmp10037);
__m512 tmp10103 = _mm512_add_ps(tmp10044, tmp10085);
__m512 tmp10098 = _mm512_sub_ps(tmp10036, tmp10037);
__m512 tmp10109 = _mm512_sub_ps(tmp10044, tmp10085);
__m512 tmp10097 = _mm512_sub_ps(tmp10034, tmp10035);
__m512 tmp10108 = _mm512_sub_ps(tmp10042, tmp10043);
__m512 tmp10094 = _mm512_add_ps(tmp10038, tmp10039);
__m512 tmp10105 = _mm512_add_ps(tmp10086, tmp10087);
__m512 tmp10099 = _mm512_sub_ps(tmp10038, tmp10039);
__m512 tmp10110 = _mm512_sub_ps(tmp10086, tmp10087);
__m512 tmp10096 = _mm512_fmadd_ps(tmp10098, _mm512_set1_ps(2e+00f), tmp10097);
__m512 tmp10107 = _mm512_fmadd_ps(tmp10109, _mm512_set1_ps(2e+00f), tmp10108);
__m512 tmp10091 = _mm512_add_ps(tmp10092, tmp10093);
__m512 tmp10102 = _mm512_add_ps(tmp10103, tmp10104);
__m512 tmp10095 = _mm512_fmadd_ps(tmp10099, _mm512_set1_ps(1.6e+01f), tmp10096);
__m512 tmp10106 = _mm512_fmadd_ps(tmp10110, _mm512_set1_ps(1.6e+01f), tmp10107);
__m512 tmp10090 = _mm512_add_ps(tmp10091, tmp10033);
__m512 tmp10101 = _mm512_add_ps(tmp10102, tmp10041);
__m512 tmp10089 = _mm512_fmadd_ps(tmp10094, _mm512_set1_ps(3.2e+01f), tmp10090);
__m512 tmp10100 = _mm512_fmadd_ps(tmp10105, _mm512_set1_ps(3.2e+01f), tmp10101);
// Name the four results (out1359/out1360 from chain A, out1361/out1362 from
// chain B) and take the elementwise max against zero (the form a ReLU takes).
__m512 out1359 = tmp10089;
__m512 out1361 = tmp10100;
__m512 out1360 = tmp10095;
__m512 out1362 = tmp10106;
out1359 = _mm512_max_ps(_mm512_setzero_ps(), out1359);
out1361 = _mm512_max_ps(_mm512_setzero_ps(), out1361);
out1360 = _mm512_max_ps(_mm512_setzero_ps(), out1360);
out1362 = _mm512_max_ps(_mm512_setzero_ps(), out1362);
// Masked stores to datPtr13; mask 4095 writes only lanes 0..11. The y0
// results go to constant offsets 0 and 48, the y1 results to 224 and 272.
// NOTE(review): 48 = 12 lanes * 4 and 224 look like byte offsets for 4-byte
// floats and 56-float rows -- confirm against datPtr13's declared type.
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1359);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1361);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1360);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1362);
__m512 sf753 = _mm512_loadu_ps(sfPtr7+256+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf754 = _mm512_loadu_ps(sfPtr7+384+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1469 = _mm512_shuffle_f32x4(sf753, sf754, 68);
__m512 in1470 = _mm512_shuffle_f32x4(sf753, sf754, 238);
__m512 sf755 = _mm512_loadu_ps(sfPtr7+320+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf756 = _mm512_loadu_ps(sfPtr7+448+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1477 = _mm512_shuffle_f32x4(sf755, sf756, 68);
__m512 in1478 = _mm512_shuffle_f32x4(sf755, sf756, 238);
__m512 sf757 = _mm512_loadu_ps(sfPtr7+25856+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf758 = _mm512_loadu_ps(sfPtr7+25984+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1471 = _mm512_shuffle_f32x4(sf757, sf758, 68);
__m512 in1472 = _mm512_shuffle_f32x4(sf757, sf758, 238);
__m512 sf759 = _mm512_loadu_ps(sfPtr7+25920+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf760 = _mm512_loadu_ps(sfPtr7+26048+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1479 = _mm512_shuffle_f32x4(sf759, sf760, 68);
__m512 in1480 = _mm512_shuffle_f32x4(sf759, sf760, 238);
__m512 sf761 = _mm512_loadu_ps(sfPtr7+51456+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf762 = _mm512_loadu_ps(sfPtr7+51584+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1473 = _mm512_shuffle_f32x4(sf761, sf762, 68);
__m512 in1474 = _mm512_shuffle_f32x4(sf761, sf762, 238);
__m512 sf763 = _mm512_loadu_ps(sfPtr7+51520+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf764 = _mm512_loadu_ps(sfPtr7+51648+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1481 = _mm512_shuffle_f32x4(sf763, sf764, 68);
__m512 in1482 = _mm512_shuffle_f32x4(sf763, sf764, 238);
__m512 sf765 = _mm512_loadu_ps(sfPtr7+77056+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf766 = _mm512_loadu_ps(sfPtr7+77184+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1475 = _mm512_shuffle_f32x4(sf765, sf766, 68);
__m512 in1476 = _mm512_shuffle_f32x4(sf765, sf766, 238);
__m512 sf767 = _mm512_loadu_ps(sfPtr7+77120+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf768 = _mm512_loadu_ps(sfPtr7+77248+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1483 = _mm512_shuffle_f32x4(sf767, sf768, 68);
__m512 in1484 = _mm512_shuffle_f32x4(sf767, sf768, 238);
__m512 tmp10167 = _mm512_add_ps(in1470, in1471);
__m512 tmp10187 = _mm512_add_ps(in1478, in1479);
__m512 tmp10166 = _mm512_add_ps(in1472, in1473);
__m512 tmp10186 = _mm512_add_ps(in1480, in1481);
__m512 tmp10172 = _mm512_sub_ps(in1472, in1473);
__m512 tmp10192 = _mm512_sub_ps(in1480, in1481);
__m512 tmp10171 = _mm512_sub_ps(in1470, in1471);
__m512 tmp10191 = _mm512_sub_ps(in1478, in1479);
__m512 tmp10168 = _mm512_add_ps(in1474, in1475);
__m512 tmp10188 = _mm512_add_ps(in1482, in1483);
__m512 tmp10173 = _mm512_sub_ps(in1474, in1475);
__m512 tmp10193 = _mm512_sub_ps(in1482, in1483);
__m512 tmp10170 = _mm512_fmadd_ps(tmp10172, _mm512_set1_ps(2e+00f), tmp10171);
__m512 tmp10190 = _mm512_fmadd_ps(tmp10192, _mm512_set1_ps(2e+00f), tmp10191);
__m512 tmp10177 = _mm512_fmadd_ps(tmp10172, _mm512_set1_ps(8e+00f), tmp10171);
__m512 tmp10197 = _mm512_fmadd_ps(tmp10192, _mm512_set1_ps(8e+00f), tmp10191);
__m512 tmp10165 = _mm512_add_ps(tmp10166, tmp10167);
__m512 tmp10185 = _mm512_add_ps(tmp10186, tmp10187);
__m512 tmp10169 = _mm512_fmadd_ps(tmp10173, _mm512_set1_ps(1.6e+01f), tmp10170);
__m512 tmp10189 = _mm512_fmadd_ps(tmp10193, _mm512_set1_ps(1.6e+01f), tmp10190);
__m512 tmp10176 = _mm512_fmadd_ps(tmp10173, _mm512_set1_ps(4e+00f), tmp10177);
__m512 tmp10196 = _mm512_fmadd_ps(tmp10193, _mm512_set1_ps(4e+00f), tmp10197);
__m512 tmp10182 = _mm512_add_ps(tmp10173, tmp10171);
__m512 tmp10202 = _mm512_add_ps(tmp10193, tmp10191);
__m512 tmp10175 = _mm512_fmadd_ps(tmp10166, _mm512_set1_ps(4e+00f), tmp10167);
__m512 tmp10195 = _mm512_fmadd_ps(tmp10186, _mm512_set1_ps(4e+00f), tmp10187);
__m512 tmp10179 = _mm512_fmadd_ps(tmp10166, _mm512_set1_ps(1.6e+01f), tmp10167);
__m512 tmp10199 = _mm512_fmadd_ps(tmp10186, _mm512_set1_ps(1.6e+01f), tmp10187);
__m512 tmp10164 = _mm512_add_ps(tmp10165, in1469);
__m512 tmp10184 = _mm512_add_ps(tmp10185, in1477);
__m512 tmp10181 = _mm512_add_ps(tmp10182, in1476);
__m512 tmp10201 = _mm512_add_ps(tmp10202, in1484);
__m512 tmp10163 = _mm512_fmadd_ps(tmp10168, _mm512_set1_ps(3.2e+01f), tmp10164);
__m512 tmp10183 = _mm512_fmadd_ps(tmp10188, _mm512_set1_ps(3.2e+01f), tmp10184);
__m512 tmp10174 = _mm512_fmadd_ps(tmp10168, _mm512_set1_ps(8e+00f), tmp10175);
__m512 tmp10194 = _mm512_fmadd_ps(tmp10188, _mm512_set1_ps(8e+00f), tmp10195);
__m512 tmp10180 = _mm512_fmadd_ps(tmp10172, _mm512_set1_ps(3.2e+01f), tmp10181);
__m512 tmp10200 = _mm512_fmadd_ps(tmp10192, _mm512_set1_ps(3.2e+01f), tmp10201);
__m512 tmp10178 = _mm512_fmadd_ps(tmp10168, _mm512_set1_ps(2e+00f), tmp10179);
__m512 tmp10198 = _mm512_fmadd_ps(tmp10188, _mm512_set1_ps(2e+00f), tmp10199);
__m512 tmp10151 = tmp10163;
__m512 tmp10157 = tmp10183;
__m512 tmp10152 = tmp10169;
__m512 tmp10158 = tmp10189;
__m512 tmp10153 = tmp10174;
__m512 tmp10159 = tmp10194;
__m512 tmp10154 = tmp10176;
__m512 tmp10160 = tmp10196;
__m512 tmp10155 = tmp10178;
__m512 tmp10161 = tmp10198;
__m512 tmp10156 = tmp10180;
__m512 tmp10162 = tmp10200;
__m512 tmp10229 = _mm512_unpacklo_ps(tmp10151, tmp10152);
__m512 tmp10230 = _mm512_unpackhi_ps(tmp10151, tmp10152);
__m512 tmp10231 = _mm512_unpacklo_ps(tmp10153, tmp10154);
__m512 tmp10232 = _mm512_unpackhi_ps(tmp10153, tmp10154);
__m512 tmp10233 = _mm512_unpacklo_ps(tmp10155, tmp10156);
__m512 tmp10234 = _mm512_unpackhi_ps(tmp10155, tmp10156);
__m512 tmp10235 = _mm512_unpacklo_ps(tmp10157, tmp10158);
__m512 tmp10236 = _mm512_unpackhi_ps(tmp10157, tmp10158);
__m512 tmp10237 = _mm512_unpacklo_ps(tmp10159, tmp10160);
__m512 tmp10238 = _mm512_unpackhi_ps(tmp10159, tmp10160);
__m512 tmp10239 = _mm512_unpacklo_ps(tmp10161, tmp10162);
__m512 tmp10240 = _mm512_unpackhi_ps(tmp10161, tmp10162);
__m512 tmp10241 = _mm512_shuffle_ps(tmp10229, tmp10231, 68);
__m512 tmp10242 = _mm512_shuffle_ps(tmp10229, tmp10231, 238);
__m512 tmp10243 = _mm512_shuffle_ps(tmp10230, tmp10232, 68);
__m512 tmp10244 = _mm512_shuffle_ps(tmp10230, tmp10232, 238);
__m512 tmp10245 = _mm512_shuffle_ps(tmp10233, tmp10235, 68);
__m512 tmp10246 = _mm512_shuffle_ps(tmp10233, tmp10235, 238);
__m512 tmp10247 = _mm512_shuffle_ps(tmp10234, tmp10236, 68);
__m512 tmp10248 = _mm512_shuffle_ps(tmp10234, tmp10236, 238);
__m512 tmp10249 = _mm512_shuffle_ps(tmp10237, tmp10239, 68);
__m512 tmp10250 = _mm512_shuffle_ps(tmp10237, tmp10239, 238);
__m512 tmp10251 = _mm512_shuffle_ps(tmp10238, tmp10240, 68);
__m512 tmp10252 = _mm512_shuffle_ps(tmp10238, tmp10240, 238);
__m512 tmp10253 = _mm512_shuffle_f32x4(tmp10241, tmp10245, 136);
__m512 tmp10254 = _mm512_shuffle_f32x4(tmp10241, tmp10245, 221);
__m512 tmp10255 = _mm512_shuffle_f32x4(tmp10242, tmp10246, 136);
__m512 tmp10256 = _mm512_shuffle_f32x4(tmp10242, tmp10246, 221);
__m512 tmp10257 = _mm512_shuffle_f32x4(tmp10243, tmp10247, 136);
__m512 tmp10258 = _mm512_shuffle_f32x4(tmp10243, tmp10247, 221);
__m512 tmp10259 = _mm512_shuffle_f32x4(tmp10244, tmp10248, 136);
__m512 tmp10260 = _mm512_shuffle_f32x4(tmp10244, tmp10248, 221);
__m512 tmp10261 = _mm512_shuffle_f32x4(tmp10249, tmp10249, 136);
__m512 tmp10262 = _mm512_shuffle_f32x4(tmp10249, tmp10249, 221);
__m512 tmp10263 = _mm512_shuffle_f32x4(tmp10250, tmp10250, 136);
__m512 tmp10264 = _mm512_shuffle_f32x4(tmp10250, tmp10250, 221);
__m512 tmp10265 = _mm512_shuffle_f32x4(tmp10251, tmp10251, 136);
__m512 tmp10266 = _mm512_shuffle_f32x4(tmp10251, tmp10251, 221);
__m512 tmp10267 = _mm512_shuffle_f32x4(tmp10252, tmp10252, 136);
__m512 tmp10268 = _mm512_shuffle_f32x4(tmp10252, tmp10252, 221);
tmp10151 = _mm512_shuffle_f32x4(tmp10253, tmp10261, 136);
tmp10159 = _mm512_shuffle_f32x4(tmp10253, tmp10261, 221);
tmp10152 = _mm512_shuffle_f32x4(tmp10255, tmp10263, 136);
tmp10160 = _mm512_shuffle_f32x4(tmp10255, tmp10263, 221);
tmp10153 = _mm512_shuffle_f32x4(tmp10257, tmp10265, 136);
tmp10161 = _mm512_shuffle_f32x4(tmp10257, tmp10265, 221);
tmp10154 = _mm512_shuffle_f32x4(tmp10259, tmp10267, 136);
tmp10162 = _mm512_shuffle_f32x4(tmp10259, tmp10267, 221);
tmp10155 = _mm512_shuffle_f32x4(tmp10254, tmp10262, 136);
__m512 tmp10203 = _mm512_shuffle_f32x4(tmp10254, tmp10262, 221);
tmp10156 = _mm512_shuffle_f32x4(tmp10256, tmp10264, 136);
__m512 tmp10204 = _mm512_shuffle_f32x4(tmp10256, tmp10264, 221);
tmp10157 = _mm512_shuffle_f32x4(tmp10258, tmp10266, 136);
__m512 tmp10205 = _mm512_shuffle_f32x4(tmp10258, tmp10266, 221);
tmp10158 = _mm512_shuffle_f32x4(tmp10260, tmp10268, 136);
__m512 tmp10206 = _mm512_shuffle_f32x4(tmp10260, tmp10268, 221);
(void)tmp10158;
(void)tmp10206;
__m512 tmp10211 = _mm512_add_ps(tmp10152, tmp10153);
__m512 tmp10222 = _mm512_add_ps(tmp10160, tmp10161);
__m512 tmp10210 = _mm512_add_ps(tmp10154, tmp10155);
__m512 tmp10221 = _mm512_add_ps(tmp10162, tmp10203);
__m512 tmp10216 = _mm512_sub_ps(tmp10154, tmp10155);
__m512 tmp10227 = _mm512_sub_ps(tmp10162, tmp10203);
__m512 tmp10215 = _mm512_sub_ps(tmp10152, tmp10153);
__m512 tmp10226 = _mm512_sub_ps(tmp10160, tmp10161);
__m512 tmp10212 = _mm512_add_ps(tmp10156, tmp10157);
__m512 tmp10223 = _mm512_add_ps(tmp10204, tmp10205);
__m512 tmp10217 = _mm512_sub_ps(tmp10156, tmp10157);
__m512 tmp10228 = _mm512_sub_ps(tmp10204, tmp10205);
__m512 tmp10214 = _mm512_fmadd_ps(tmp10216, _mm512_set1_ps(2e+00f), tmp10215);
__m512 tmp10225 = _mm512_fmadd_ps(tmp10227, _mm512_set1_ps(2e+00f), tmp10226);
__m512 tmp10209 = _mm512_add_ps(tmp10210, tmp10211);
__m512 tmp10220 = _mm512_add_ps(tmp10221, tmp10222);
__m512 tmp10213 = _mm512_fmadd_ps(tmp10217, _mm512_set1_ps(1.6e+01f), tmp10214);
__m512 tmp10224 = _mm512_fmadd_ps(tmp10228, _mm512_set1_ps(1.6e+01f), tmp10225);
__m512 tmp10208 = _mm512_add_ps(tmp10209, tmp10151);
__m512 tmp10219 = _mm512_add_ps(tmp10220, tmp10159);
__m512 tmp10207 = _mm512_fmadd_ps(tmp10212, _mm512_set1_ps(3.2e+01f), tmp10208);
__m512 tmp10218 = _mm512_fmadd_ps(tmp10223, _mm512_set1_ps(3.2e+01f), tmp10219);
__m512 out1363 = tmp10207;
__m512 out1365 = tmp10218;
__m512 out1364 = tmp10213;
__m512 out1366 = tmp10224;
out1363 = _mm512_max_ps(_mm512_setzero_ps(), out1363);
out1365 = _mm512_max_ps(_mm512_setzero_ps(), out1365);
out1364 = _mm512_max_ps(_mm512_setzero_ps(), out1364);
out1366 = _mm512_max_ps(_mm512_setzero_ps(), out1366);
_mm512_mask_storeu_ps(datPtr13+96+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1363);
_mm512_mask_storeu_ps(datPtr13+12608+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1365);
_mm512_mask_storeu_ps(datPtr13+320+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1364);
_mm512_mask_storeu_ps(datPtr13+12832+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1366);
__m512 sf769 = _mm512_loadu_ps(sfPtr7+512+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf770 = _mm512_loadu_ps(sfPtr7+640+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1485 = _mm512_shuffle_f32x4(sf769, sf770, 68);
__m512 in1486 = _mm512_shuffle_f32x4(sf769, sf770, 238);
__m512 sf771 = _mm512_loadu_ps(sfPtr7+576+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf772 = _mm512_loadu_ps(sfPtr7+704+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1493 = _mm512_shuffle_f32x4(sf771, sf772, 68);
__m512 in1494 = _mm512_shuffle_f32x4(sf771, sf772, 238);
__m512 sf773 = _mm512_loadu_ps(sfPtr7+26112+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf774 = _mm512_loadu_ps(sfPtr7+26240+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1487 = _mm512_shuffle_f32x4(sf773, sf774, 68);
__m512 in1488 = _mm512_shuffle_f32x4(sf773, sf774, 238);
__m512 sf775 = _mm512_loadu_ps(sfPtr7+26176+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf776 = _mm512_loadu_ps(sfPtr7+26304+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1495 = _mm512_shuffle_f32x4(sf775, sf776, 68);
__m512 in1496 = _mm512_shuffle_f32x4(sf775, sf776, 238);
__m512 sf777 = _mm512_loadu_ps(sfPtr7+51712+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf778 = _mm512_loadu_ps(sfPtr7+51840+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1489 = _mm512_shuffle_f32x4(sf777, sf778, 68);
__m512 in1490 = _mm512_shuffle_f32x4(sf777, sf778, 238);
__m512 sf779 = _mm512_loadu_ps(sfPtr7+51776+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf780 = _mm512_loadu_ps(sfPtr7+51904+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1497 = _mm512_shuffle_f32x4(sf779, sf780, 68);
__m512 in1498 = _mm512_shuffle_f32x4(sf779, sf780, 238);
__m512 sf781 = _mm512_loadu_ps(sfPtr7+77312+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf782 = _mm512_loadu_ps(sfPtr7+77440+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1491 = _mm512_shuffle_f32x4(sf781, sf782, 68);
__m512 in1492 = _mm512_shuffle_f32x4(sf781, sf782, 238);
__m512 sf783 = _mm512_loadu_ps(sfPtr7+77376+102400*i29+1536*j23+1536*k96+768*l35);
__m512 sf784 = _mm512_loadu_ps(sfPtr7+77504+102400*i29+1536*j23+1536*k96+768*l35);
__m512 in1499 = _mm512_shuffle_f32x4(sf783, sf784, 68);
__m512 in1500 = _mm512_shuffle_f32x4(sf783, sf784, 238);
__m512 tmp10285 = _mm512_add_ps(in1486, in1487);
__m512 tmp10305 = _mm512_add_ps(in1494, in1495);
__m512 tmp10284 = _mm512_add_ps(in1488, in1489);
__m512 tmp10304 = _mm512_add_ps(in1496, in1497);
__m512 tmp10290 = _mm512_sub_ps(in1488, in1489);
__m512 tmp10310 = _mm512_sub_ps(in1496, in1497);
__m512 tmp10289 = _mm512_sub_ps(in1486, in1487);
__m512 tmp10309 = _mm512_sub_ps(in1494, in1495);
__m512 tmp10286 = _mm512_add_ps(in1490, in1491);
__m512 tmp10306 = _mm512_add_ps(in1498, in1499);
__m512 tmp10291 = _mm512_sub_ps(in1490, in1491);
__m512 tmp10311 = _mm512_sub_ps(in1498, in1499);
__m512 tmp10288 = _mm512_fmadd_ps(tmp10290, _mm512_set1_ps(2e+00f), tmp10289);
__m512 tmp10308 = _mm512_fmadd_ps(tmp10310, _mm512_set1_ps(2e+00f), tmp10309);
__m512 tmp10295 = _mm512_fmadd_ps(tmp10290, _mm512_set1_ps(8e+00f), tmp10289);
__m512 tmp10315 = _mm512_fmadd_ps(tmp10310, _mm512_set1_ps(8e+00f), tmp10309);
__m512 tmp10283 = _mm512_add_ps(tmp10284, tmp10285);
__m512 tmp10303 = _mm512_add_ps(tmp10304, tmp10305);
__m512 tmp10287 = _mm512_fmadd_ps(tmp10291, _mm512_set1_ps(1.6e+01f), tmp10288);
__m512 tmp10307 = _mm512_fmadd_ps(tmp10311, _mm512_set1_ps(1.6e+01f), tmp10308);
__m512 tmp10294 = _mm512_fmadd_ps(tmp10291, _mm512_set1_ps(4e+00f), tmp10295);
__m512 tmp10314 = _mm512_fmadd_ps(tmp10311, _mm512_set1_ps(4e+00f), tmp10315);
__m512 tmp10300 = _mm512_add_ps(tmp10291, tmp10289);
__m512 tmp10320 = _mm512_add_ps(tmp10311, tmp10309);
__m512 tmp10293 = _mm512_fmadd_ps(tmp10284, _mm512_set1_ps(4e+00f), tmp10285);
__m512 tmp10313 = _mm512_fmadd_ps(tmp10304, _mm512_set1_ps(4e+00f), tmp10305);
__m512 tmp10297 = _mm512_fmadd_ps(tmp10284, _mm512_set1_ps(1.6e+01f), tmp10285);
__m512 tmp10317 = _mm512_fmadd_ps(tmp10304, _mm512_set1_ps(1.6e+01f), tmp10305);
__m512 tmp10282 = _mm512_add_ps(tmp10283, in1485);
__m512 tmp10302 = _mm512_add_ps(tmp10303, in1493);
__m512 tmp10299 = _mm512_add_ps(tmp10300, in1492);
__m512 tmp10319 = _mm512_add_ps(tmp10320, in1500);
__m512 tmp10281 = _mm512_fmadd_ps(tmp10286, _mm512_set1_ps(3.2e+01f), tmp10282);
__m512 tmp10301 = _mm512_fmadd_ps(tmp10306, _mm512_set1_ps(3.2e+01f), tmp10302);
__m512 tmp10292 = _mm512_fmadd_ps(tmp10286, _mm512_set1_ps(8e+00f), tmp10293);
__m512 tmp10312 = _mm512_fmadd_ps(tmp10306, _mm512_set1_ps(8e+00f), tmp10313);
__m512 tmp10298 = _mm512_fmadd_ps(tmp10290, _mm512_set1_ps(3.2e+01f), tmp10299);
__m512 tmp10318 = _mm512_fmadd_ps(tmp10310, _mm512_set1_ps(3.2e+01f), tmp10319);
__m512 tmp10296 = _mm512_fmadd_ps(tmp10286, _mm512_set1_ps(2e+00f), tmp10297);
__m512 tmp10316 = _mm512_fmadd_ps(tmp10306, _mm512_set1_ps(2e+00f), tmp10317);
__m512 tmp10269 = tmp10281;
__m512 tmp10275 = tmp10301;
__m512 tmp10270 = tmp10287;
__m512 tmp10276 = tmp10307;
__m512 tmp10271 = tmp10292;
__m512 tmp10277 = tmp10312;
__m512 tmp10272 = tmp10294;
__m512 tmp10278 = tmp10314;
__m512 tmp10273 = tmp10296;
__m512 tmp10279 = tmp10316;
__m512 tmp10274 = tmp10298;
__m512 tmp10280 = tmp10318;
__m512 tmp10347 = _mm512_unpacklo_ps(tmp10269, tmp10270);
__m512 tmp10348 = _mm512_unpackhi_ps(tmp10269, tmp10270);
__m512 tmp10349 = _mm512_unpacklo_ps(tmp10271, tmp10272);
__m512 tmp10350 = _mm512_unpackhi_ps(tmp10271, tmp10272);
__m512 tmp10351 = _mm512_unpacklo_ps(tmp10273, tmp10274);
__m512 tmp10352 = _mm512_unpackhi_ps(tmp10273, tmp10274);
__m512 tmp10353 = _mm512_unpacklo_ps(tmp10275, tmp10276);
__m512 tmp10354 = _mm512_unpackhi_ps(tmp10275, tmp10276);
__m512 tmp10355 = _mm512_unpacklo_ps(tmp10277, tmp10278);
__m512 tmp10356 = _mm512_unpackhi_ps(tmp10277, tmp10278);
__m512 tmp10357 = _mm512_unpacklo_ps(tmp10279, tmp10280);
__m512 tmp10358 = _mm512_unpackhi_ps(tmp10279, tmp10280);
__m512 tmp10359 = _mm512_shuffle_ps(tmp10347, tmp10349, 68);
__m512 tmp10360 = _mm512_shuffle_ps(tmp10347, tmp10349, 238);
__m512 tmp10361 = _mm512_shuffle_ps(tmp10348, tmp10350, 68);
__m512 tmp10362 = _mm512_shuffle_ps(tmp10348, tmp10350, 238);
__m512 tmp10363 = _mm512_shuffle_ps(tmp10351, tmp10353, 68);
__m512 tmp10364 = _mm512_shuffle_ps(tmp10351, tmp10353, 238);
__m512 tmp10365 = _mm512_shuffle_ps(tmp10352, tmp10354, 68);
__m512 tmp10366 = _mm512_shuffle_ps(tmp10352, tmp10354, 238);
__m512 tmp10367 = _mm512_shuffle_ps(tmp10355, tmp10357, 68);
__m512 tmp10368 = _mm512_shuffle_ps(tmp10355, tmp10357, 238);
__m512 tmp10369 = _mm512_shuffle_ps(tmp10356, tmp10358, 68);
__m512 tmp10370 = _mm512_shuffle_ps(tmp10356, tmp10358, 238);
__m512 tmp10371 = _mm512_shuffle_f32x4(tmp10359, tmp10363, 136);
__m512 tmp10372 = _mm512_shuffle_f32x4(tmp10359, tmp10363, 221);
__m512 tmp10373 = _mm512_shuffle_f32x4(tmp10360, tmp10364, 136);
__m512 tmp10374 = _mm512_shuffle_f32x4(tmp10360, tmp10364, 221);
__m512 tmp10375 = _mm512_shuffle_f32x4(tmp10361, tmp10365, 136);
__m512 tmp10376 = _mm512_shuffle_f32x4(tmp10361, tmp10365, 221);
__m512 tmp10377 = _mm512_shuffle_f32x4(tmp10362, tmp10366, 136);
__m512 tmp10378 = _mm512_shuffle_f32x4(tmp10362, tmp10366, 221);
__m512 tmp10379 = _mm512_shuffle_f32x4(tmp10367, tmp10367, 136);
__m512 tmp10380 = _mm512_shuffle_f32x4(tmp10367, tmp10367, 221);
__m512 tmp10381 = _mm512_shuffle_f32x4(tmp10368, tmp10368, 136);
__m512 tmp10382 = _mm512_shuffle_f32x4(tmp10368, tmp10368, 221);
__m512 tmp10383 = _mm512_shuffle_f32x4(tmp10369, tmp10369, 136);
__m512 tmp10384 = _mm512_shuffle_f32x4(tmp10369, tmp10369, 221);
__m512 tmp10385 = _mm512_shuffle_f32x4(tmp10370, tmp10370, 136);
__m512 tmp10386 = _mm512_shuffle_f32x4(tmp10370, tmp10370, 221);
tmp10269 = _mm512_shuffle_f32x4(tmp10371, tmp10379, 136);
tmp10277 = _mm512_shuffle_f32x4(tmp10371, tmp10379, 221);
tmp10270 = _mm512_shuffle_f32x4(tmp10373, tmp10381, 136);
tmp10278 = _mm512_shuffle_f32x4(tmp10373, tmp10381, 221);
tmp10271 = _mm512_shuffle_f32x4(tmp10375, tmp10383, 136);
tmp10279 = _mm512_shuffle_f32x4(tmp10375, tmp10383, 221);
tmp10272 = _mm512_shuffle_f32x4(tmp10377, tmp10385, 136);
tmp10280 = _mm512_shuffle_f32x4(tmp10377, tmp10385, 221);
tmp10273 = _mm512_shuffle_f32x4(tmp10372, tmp10380, 136);
__m512 tmp10321 = _mm512_shuffle_f32x4(tmp10372, tmp10380, 221);
tmp10274 = _mm512_shuffle_f32x4(tmp10374, tmp10382, 136);
__m512 tmp10322 = _mm512_shuffle_f32x4(tmp10374, tmp10382, 221);
tmp10275 = _mm512_shuffle_f32x4(tmp10376, tmp10384, 136);
__m512 tmp10323 = _mm512_shuffle_f32x4(tmp10376, tmp10384, 221);
tmp10276 = _mm512_shuffle_f32x4(tmp10378, tmp10386, 136);
__m512 tmp10324 = _mm512_shuffle_f32x4(tmp10378, tmp10386, 221);
(void)tmp10276;
(void)tmp10324;
__m512 tmp10329 = _mm512_add_ps(tmp10270, tmp10271);
__m512 tmp10340 = _mm512_add_ps(tmp10278, tmp10279);
__m512 tmp10328 = _mm512_add_ps(tmp10272, tmp10273);
__m512 tmp10339 = _mm512_add_ps(tmp10280, tmp10321);
__m512 tmp10334 = _mm512_sub_ps(tmp10272, tmp10273);
__m512 tmp10345 = _mm512_sub_ps(tmp10280, tmp10321);
__m512 tmp10333 = _mm512_sub_ps(tmp10270, tmp10271);
__m512 tmp10344 = _mm512_sub_ps(tmp10278, tmp10279);
__m512 tmp10330 = _mm512_add_ps(tmp10274, tmp10275);
__m512 tmp10341 = _mm512_add_ps(tmp10322, tmp10323);
__m512 tmp10335 = _mm512_sub_ps(tmp10274, tmp10275);
__m512 tmp10346 = _mm512_sub_ps(tmp10322, tmp10323);
__m512 tmp10332 = _mm512_fmadd_ps(tmp10334, _mm512_set1_ps(2e+00f), tmp10333);
__m512 tmp10343 = _mm512_fmadd_ps(tmp10345, _mm512_set1_ps(2e+00f), tmp10344);
__m512 tmp10327 = _mm512_add_ps(tmp10328, tmp10329);
__m512 tmp10338 = _mm512_add_ps(tmp10339, tmp10340);
__m512 tmp10331 = _mm512_fmadd_ps(tmp10335, _mm512_set1_ps(1.6e+01f), tmp10332);
__m512 tmp10342 = _mm512_fmadd_ps(tmp10346, _mm512_set1_ps(1.6e+01f), tmp10343);
__m512 tmp10326 = _mm512_add_ps(tmp10327, tmp10269);
__m512 tmp10337 = _mm512_add_ps(tmp10338, tmp10277);
__m512 tmp10325 = _mm512_fmadd_ps(tmp10330, _mm512_set1_ps(3.2e+01f), tmp10326);
__m512 tmp10336 = _mm512_fmadd_ps(tmp10341, _mm512_set1_ps(3.2e+01f), tmp10337);
__m512 out1367 = tmp10325;
__m512 out1369 = tmp10336;
__m512 out1368 = tmp10331;
__m512 out1370 = tmp10342;
out1367 = _mm512_max_ps(_mm512_setzero_ps(), out1367);
out1369 = _mm512_max_ps(_mm512_setzero_ps(), out1369);
out1368 = _mm512_max_ps(_mm512_setzero_ps(), out1368);
out1370 = _mm512_max_ps(_mm512_setzero_ps(), out1370);
_mm512_mask_storeu_ps(datPtr13+12656+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1367);
_mm512_mask_storeu_ps(datPtr13+12704+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1369);
_mm512_mask_storeu_ps(datPtr13+12880+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1368);
_mm512_mask_storeu_ps(datPtr13+12928+50432*i29+224*toH36+4*toW36+50432*k96+25216*l35, 4095, out1370);
}
}
++j23;
rel18 = 1;
}
ptrdiff_t toH37 = base18+0;
ptrdiff_t toW37 = 36;
ptrdiff_t k97 = 1*w46;
for (; k97 != 1; ++k97) {
ptrdiff_t l36 = 0;
for (; l36 != 4; ++l36) {
__m512 sf785 = _mm512_loadu_ps(sfPtr7+0+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf786 = _mm512_loadu_ps(sfPtr7+128+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1501 = _mm512_shuffle_f32x4(sf785, sf786, 68);
__m512 in1502 = _mm512_shuffle_f32x4(sf785, sf786, 238);
__m512 sf787 = _mm512_loadu_ps(sfPtr7+64+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf788 = _mm512_loadu_ps(sfPtr7+192+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1509 = _mm512_shuffle_f32x4(sf787, sf788, 68);
__m512 in1510 = _mm512_shuffle_f32x4(sf787, sf788, 238);
__m512 sf789 = _mm512_loadu_ps(sfPtr7+25600+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf790 = _mm512_loadu_ps(sfPtr7+25728+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1503 = _mm512_shuffle_f32x4(sf789, sf790, 68);
__m512 in1504 = _mm512_shuffle_f32x4(sf789, sf790, 238);
__m512 sf791 = _mm512_loadu_ps(sfPtr7+25664+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf792 = _mm512_loadu_ps(sfPtr7+25792+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1511 = _mm512_shuffle_f32x4(sf791, sf792, 68);
__m512 in1512 = _mm512_shuffle_f32x4(sf791, sf792, 238);
__m512 sf793 = _mm512_loadu_ps(sfPtr7+51200+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf794 = _mm512_loadu_ps(sfPtr7+51328+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1505 = _mm512_shuffle_f32x4(sf793, sf794, 68);
__m512 in1506 = _mm512_shuffle_f32x4(sf793, sf794, 238);
__m512 sf795 = _mm512_loadu_ps(sfPtr7+51264+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf796 = _mm512_loadu_ps(sfPtr7+51392+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1513 = _mm512_shuffle_f32x4(sf795, sf796, 68);
__m512 in1514 = _mm512_shuffle_f32x4(sf795, sf796, 238);
__m512 sf797 = _mm512_loadu_ps(sfPtr7+76800+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf798 = _mm512_loadu_ps(sfPtr7+76928+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1507 = _mm512_shuffle_f32x4(sf797, sf798, 68);
__m512 in1508 = _mm512_shuffle_f32x4(sf797, sf798, 238);
__m512 sf799 = _mm512_loadu_ps(sfPtr7+76864+102400*i29+1536*j23+1024*k97+256*l36);
__m512 sf800 = _mm512_loadu_ps(sfPtr7+76992+102400*i29+1536*j23+1024*k97+256*l36);
__m512 in1515 = _mm512_shuffle_f32x4(sf799, sf800, 68);
__m512 in1516 = _mm512_shuffle_f32x4(sf799, sf800, 238);
__m512 tmp10403 = _mm512_add_ps(in1502, in1503);
__m512 tmp10423 = _mm512_add_ps(in1510, in1511);
__m512 tmp10402 = _mm512_add_ps(in1504, in1505);
__m512 tmp10422 = _mm512_add_ps(in1512, in1513);
__m512 tmp10408 = _mm512_sub_ps(in1504, in1505);
__m512 tmp10428 = _mm512_sub_ps(in1512, in1513);
__m512 tmp10407 = _mm512_sub_ps(in1502, in1503);
__m512 tmp10427 = _mm512_sub_ps(in1510, in1511);
__m512 tmp10404 = _mm512_add_ps(in1506, in1507);
__m512 tmp10424 = _mm512_add_ps(in1514, in1515);
__m512 tmp10409 = _mm512_sub_ps(in1506, in1507);
__m512 tmp10429 = _mm512_sub_ps(in1514, in1515);
__m512 tmp10406 = _mm512_fmadd_ps(tmp10408, _mm512_set1_ps(2e+00f), tmp10407);
__m512 tmp10426 = _mm512_fmadd_ps(tmp10428, _mm512_set1_ps(2e+00f), tmp10427);
__m512 tmp10413 = _mm512_fmadd_ps(tmp10408, _mm512_set1_ps(8e+00f), tmp10407);
__m512 tmp10433 = _mm512_fmadd_ps(tmp10428, _mm512_set1_ps(8e+00f), tmp10427);
__m512 tmp10401 = _mm512_add_ps(tmp10402, tmp10403);
__m512 tmp10421 = _mm512_add_ps(tmp10422, tmp10423);
__m512 tmp10405 = _mm512_fmadd_ps(tmp10409, _mm512_set1_ps(1.6e+01f), tmp10406);
__m512 tmp10425 = _mm512_fmadd_ps(tmp10429, _mm512_set1_ps(1.6e+01f), tmp10426);
__m512 tmp10412 = _mm512_fmadd_ps(tmp10409, _mm512_set1_ps(4e+00f), tmp10413);
__m512 tmp10432 = _mm512_fmadd_ps(tmp10429, _mm512_set1_ps(4e+00f), tmp10433);
__m512 tmp10418 = _mm512_add_ps(tmp10409, tmp10407);
__m512 tmp10438 = _mm512_add_ps(tmp10429, tmp10427);
__m512 tmp10411 = _mm512_fmadd_ps(tmp10402, _mm512_set1_ps(4e+00f), tmp10403);
__m512 tmp10431 = _mm512_fmadd_ps(tmp10422, _mm512_set1_ps(4e+00f), tmp10423);
__m512 tmp10415 = _mm512_fmadd_ps(tmp10402, _mm512_set1_ps(1.6e+01f), tmp10403);
__m512 tmp10435 = _mm512_fmadd_ps(tmp10422, _mm512_set1_ps(1.6e+01f), tmp10423);
__m512 tmp10400 = _mm512_add_ps(tmp10401, in1501);
__m512 tmp10420 = _mm512_add_ps(tmp10421, in1509);
__m512 tmp10417 = _mm512_add_ps(tmp10418, in1508);
__m512 tmp10437 = _mm512_add_ps(tmp10438, in1516);
__m512 tmp10399 = _mm512_fmadd_ps(tmp10404, _mm512_set1_ps(3.2e+01f), tmp10400);
__m512 tmp10419 = _mm512_fmadd_ps(tmp10424, _mm512_set1_ps(3.2e+01f), tmp10420);
__m512 tmp10410 = _mm512_fmadd_ps(tmp10404, _mm512_set1_ps(8e+00f), tmp10411);
__m512 tmp10430 = _mm512_fmadd_ps(tmp10424, _mm512_set1_ps(8e+00f), tmp10431);
__m512 tmp10416 = _mm512_fmadd_ps(tmp10408, _mm512_set1_ps(3.2e+01f), tmp10417);
__m512 tmp10436 = _mm512_fmadd_ps(tmp10428, _mm512_set1_ps(3.2e+01f), tmp10437);
__m512 tmp10414 = _mm512_fmadd_ps(tmp10404, _mm512_set1_ps(2e+00f), tmp10415);
__m512 tmp10434 = _mm512_fmadd_ps(tmp10424, _mm512_set1_ps(2e+00f), tmp10435);
__m512 tmp10387 = tmp10399;
__m512 tmp10393 = tmp10419;
__m512 tmp10388 = tmp10405;
__m512 tmp10394 = tmp10425;
__m512 tmp10389 = tmp10410;
__m512 tmp10395 = tmp10430;
__m512 tmp10390 = tmp10412;
__m512 tmp10396 = tmp10432;
__m512 tmp10391 = tmp10414;
__m512 tmp10397 = tmp10434;
__m512 tmp10392 = tmp10416;
__m512 tmp10398 = tmp10436;
__m512 tmp10465 = _mm512_unpacklo_ps(tmp10387, tmp10388);
__m512 tmp10466 = _mm512_unpackhi_ps(tmp10387, tmp10388);
__m512 tmp10467 = _mm512_unpacklo_ps(tmp10389, tmp10390);
__m512 tmp10468 = _mm512_unpackhi_ps(tmp10389, tmp10390);
__m512 tmp10469 = _mm512_unpacklo_ps(tmp10391, tmp10392);
__m512 tmp10470 = _mm512_unpackhi_ps(tmp10391, tmp10392);
__m512 tmp10471 = _mm512_unpacklo_ps(tmp10393, tmp10394);
__m512 tmp10472 = _mm512_unpackhi_ps(tmp10393, tmp10394);
__m512 tmp10473 = _mm512_unpacklo_ps(tmp10395, tmp10396);
__m512 tmp10474 = _mm512_unpackhi_ps(tmp10395, tmp10396);
__m512 tmp10475 = _mm512_unpacklo_ps(tmp10397, tmp10398);
__m512 tmp10476 = _mm512_unpackhi_ps(tmp10397, tmp10398);
__m512 tmp10477 = _mm512_shuffle_ps(tmp10465, tmp10467, 68);
__m512 tmp10478 = _mm512_shuffle_ps(tmp10465, tmp10467, 238);
__m512 tmp10479 = _mm512_shuffle_ps(tmp10466, tmp10468, 68);
__m512 tmp10480 = _mm512_shuffle_ps(tmp10466, tmp10468, 238);
__m512 tmp10481 = _mm512_shuffle_ps(tmp10469, tmp10471, 68);
__m512 tmp10482 = _mm512_shuffle_ps(tmp10469, tmp10471, 238);
__m512 tmp10483 = _mm512_shuffle_ps(tmp10470, tmp10472, 68);
__m512 tmp10484 = _mm512_shuffle_ps(tmp10470, tmp10472, 238);
__m512 tmp10485 = _mm512_shuffle_ps(tmp10473, tmp10475, 68);
__m512 tmp10486 = _mm512_shuffle_ps(tmp10473, tmp10475, 238);
__m512 tmp10487 = _mm512_shuffle_ps(tmp10474, tmp10476, 68);
__m512 tmp10488 = _mm512_shuffle_ps(tmp10474, tmp10476, 238);
__m512 tmp10489 = _mm512_shuffle_f32x4(tmp10477, tmp10481, 136);
__m512 tmp10490 = _mm512_shuffle_f32x4(tmp10477, tmp10481, 221);
__m512 tmp10491 = _mm512_shuffle_f32x4(tmp10478, tmp10482, 136);
__m512 tmp10492 = _mm512_shuffle_f32x4(tmp10478, tmp10482, 221);
__m512 tmp10493 = _mm512_shuffle_f32x4(tmp10479, tmp10483, 136);
__m512 tmp10494 = _mm512_shuffle_f32x4(tmp10479, tmp10483, 221);
__m512 tmp10495 = _mm512_shuffle_f32x4(tmp10480, tmp10484, 136);
__m512 tmp10496 = _mm512_shuffle_f32x4(tmp10480, tmp10484, 221);
__m512 tmp10497 = _mm512_shuffle_f32x4(tmp10485, tmp10485, 136);
__m512 tmp10498 = _mm512_shuffle_f32x4(tmp10485, tmp10485, 221);
__m512 tmp10499 = _mm512_shuffle_f32x4(tmp10486, tmp10486, 136);
__m512 tmp10500 = _mm512_shuffle_f32x4(tmp10486, tmp10486, 221);
__m512 tmp10501 = _mm512_shuffle_f32x4(tmp10487, tmp10487, 136);
__m512 tmp10502 = _mm512_shuffle_f32x4(tmp10487, tmp10487, 221);
__m512 tmp10503 = _mm512_shuffle_f32x4(tmp10488, tmp10488, 136);
__m512 tmp10504 = _mm512_shuffle_f32x4(tmp10488, tmp10488, 221);
tmp10387 = _mm512_shuffle_f32x4(tmp10489, tmp10497, 136);
tmp10395 = _mm512_shuffle_f32x4(tmp10489, tmp10497, 221);
tmp10388 = _mm512_shuffle_f32x4(tmp10491, tmp10499, 136);
tmp10396 = _mm512_shuffle_f32x4(tmp10491, tmp10499, 221);
tmp10389 = _mm512_shuffle_f32x4(tmp10493, tmp10501, 136);
tmp10397 = _mm512_shuffle_f32x4(tmp10493, tmp10501, 221);
tmp10390 = _mm512_shuffle_f32x4(tmp10495, tmp10503, 136);
tmp10398 = _mm512_shuffle_f32x4(tmp10495, tmp10503, 221);
tmp10391 = _mm512_shuffle_f32x4(tmp10490, tmp10498, 136);
__m512 tmp10439 = _mm512_shuffle_f32x4(tmp10490, tmp10498, 221);
tmp10392 = _mm512_shuffle_f32x4(tmp10492, tmp10500, 136);
__m512 tmp10440 = _mm512_shuffle_f32x4(tmp10492, tmp10500, 221);
tmp10393 = _mm512_shuffle_f32x4(tmp10494, tmp10502, 136);
__m512 tmp10441 = _mm512_shuffle_f32x4(tmp10494, tmp10502, 221);
tmp10394 = _mm512_shuffle_f32x4(tmp10496, tmp10504, 136);
__m512 tmp10442 = _mm512_shuffle_f32x4(tmp10496, tmp10504, 221);
(void)tmp10394;
(void)tmp10442;
__m512 tmp10447 = _mm512_add_ps(tmp10388, tmp10389);
__m512 tmp10458 = _mm512_add_ps(tmp10396, tmp10397);
__m512 tmp10446 = _mm512_add_ps(tmp10390, tmp10391);
__m512 tmp10457 = _mm512_add_ps(tmp10398, tmp10439);
__m512 tmp10452 = _mm512_sub_ps(tmp10390, tmp10391);
__m512 tmp10463 = _mm512_sub_ps(tmp10398, tmp10439);
__m512 tmp10451 = _mm512_sub_ps(tmp10388, tmp10389);
__m512 tmp10462 = _mm512_sub_ps(tmp10396, tmp10397);
__m512 tmp10448 = _mm512_add_ps(tmp10392, tmp10393);
__m512 tmp10459 = _mm512_add_ps(tmp10440, tmp10441);
__m512 tmp10453 = _mm512_sub_ps(tmp10392, tmp10393);
__m512 tmp10464 = _mm512_sub_ps(tmp10440, tmp10441);
__m512 tmp10450 = _mm512_fmadd_ps(tmp10452, _mm512_set1_ps(2e+00f), tmp10451);
__m512 tmp10461 = _mm512_fmadd_ps(tmp10463, _mm512_set1_ps(2e+00f), tmp10462);
__m512 tmp10445 = _mm512_add_ps(tmp10446, tmp10447);
__m512 tmp10456 = _mm512_add_ps(tmp10457, tmp10458);
__m512 tmp10449 = _mm512_fmadd_ps(tmp10453, _mm512_set1_ps(1.6e+01f), tmp10450);
__m512 tmp10460 = _mm512_fmadd_ps(tmp10464, _mm512_set1_ps(1.6e+01f), tmp10461);
__m512 tmp10444 = _mm512_add_ps(tmp10445, tmp10387);
__m512 tmp10455 = _mm512_add_ps(tmp10456, tmp10395);
__m512 tmp10443 = _mm512_fmadd_ps(tmp10448, _mm512_set1_ps(3.2e+01f), tmp10444);
__m512 tmp10454 = _mm512_fmadd_ps(tmp10459, _mm512_set1_ps(3.2e+01f), tmp10455);
__m512 out1371 = tmp10443;
__m512 out1373 = tmp10454;
__m512 out1372 = tmp10449;
__m512 out1374 = tmp10460;
out1371 = _mm512_max_ps(_mm512_setzero_ps(), out1371);
out1373 = _mm512_max_ps(_mm512_setzero_ps(), out1373);
out1372 = _mm512_max_ps(_mm512_setzero_ps(), out1372);
out1374 = _mm512_max_ps(_mm512_setzero_ps(), out1374);
_mm512_mask_storeu_ps(datPtr13+0+50432*i29+224*toH37+4*toW37+50432*k97+12608*l36, 4095, out1371);
_mm512_mask_storeu_ps(datPtr13+48+50432*i29+224*toH37+4*toW37+50432*k97+12608*l36, 255, out1373);
_mm512_mask_storeu_ps(datPtr13+224+50432*i29+224*toH37+4*toW37+50432*k97+12608*l36, 4095, out1372);
_mm512_mask_storeu_ps(datPtr13+272+50432*i29+224*toH37+4*toW37+50432*k97+12608*l36, 255, out1374);
}
}
++j23;
}
}

// Packages ResNeXt50ThreeConsumeSums2Callee1 as a threader task and submits
// it to the given team via ResNeXt50ThreaderDo1.
//
// team35:    team handle passed through unchanged to ResNeXt50ThreaderDo1.
// tensors43: pointer array stored in the task's any1 field; it is not
//            dereferenced here.
static void ResNeXt50ThreeConsumeSums2(ResNeXt50ThreaderTeam1* team35, char** tensors43) {
    ResNeXt50ThreaderTask1 work;
    // Three entries of hull1 are filled in, matching nd1 = 3.
    // NOTE(review): presumably nd1 is the number of task-grid dimensions and
    // hull1[d] the extent along dimension d (1 x 1 x 16 here) -- confirm
    // against the threader implementation.
    work.nd1 = 3;
    work.hull1[2] = 16;
    work.hull1[1] = 1;
    work.hull1[0] = 1;
    // Payload and entry point that the threader receives with the task.
    work.any1 = tensors43;
    work.callee1 = ResNeXt50ThreeConsumeSums2Callee1;
    ResNeXt50ThreaderDo1(team35, &work);
}

// Thread-task body for the filter-arrangement step.
//
// For the slice selected by pt45[1] this routine:
//   1. reads raw float32 filter weights (nine floats per filter slice),
//   2. scales each filter by a per-filter multiplier read from bnPtr14,
//   3. applies a fixed two-pass linear transform that expands every group of
//      three values into eight (NOTE(review): the constants look like a
//      Winograd-style filter transform -- confirm against the generator),
//   4. converts the result to half precision and stores it in the packed
//      filter area of the destination buffer,
//   5. writes a per-filter bias computed as bias*mul+add.
//
// task80: task descriptor; task80->any1 is the char** tensor table:
//           [0] source weights, [1] source biases,
//           [2] interleaved (multiplier, addend) float pairs
//               (presumably folded batch-norm parameters -- TODO confirm),
//           [3] destination: the first 1024 bytes receive the biases, the
//               packed half-precision filters follow.
// pt45:   task coordinates; only pt45[1] is read here.
static void ResNeXt50ThreeArrangeFilts3Callee1(ResNeXt50ThreaderTask1* task80, int64_t* pt45) {
char** tensors78 = task80->any1;
// b71 and e23 are fixed at 0 in this variant, so the guards that test them
// below (j41 < 2 and !e23) always pass.
ptrdiff_t b71 = 0;
ptrdiff_t g25 = pt45[1];
ptrdiff_t e23 = 0;
// Bias output lives at the start of tensors78[3]; packed filters start 1024
// bytes in.
char*restrict bfPtr11 = tensors78[3]+1024*e23;
char*restrict wfPtr11 = tensors78[3]+1024+12976128*e23;
char*restrict wtPtr13 = tensors78[0]+14256*e23;
char*restrict biasPtr13 = tensors78[1];
char*restrict bnPtr14 = tensors78[2];
// Each task covers eight consecutive i48 values: 8*g25 .. 8*g25+7.
ptrdiff_t i48 = 8*g25;
ptrdiff_t ii34 = i48+7;
for (; i48 <= ii34; ++i48) {
// Two halves per i48; each half handles the four filters with index
// 8*i48+4*j41+{0,1,2,3}.
ptrdiff_t j41 = 2*b71;
if (j41 < 2) {
for (; j41 != 2; ++j41) {
ptrdiff_t k129 = 0+1*j41;
ptrdiff_t cut17 = 0;
// Broadcast the multiplier of each of the four filters. The parameters are
// stored as float pairs, so the multiplier of filter c is at float index 2*c.
__m512 postMul39 = _mm512_set1_ps(((float*)bnPtr14+(ptrdiff_t)2*(0+8*i48+4*j41))[0]);
__m512 postMul40 = _mm512_set1_ps(((float*)bnPtr14+(ptrdiff_t)2*(1+8*i48+4*j41))[0]);
__m512 postMul41 = _mm512_set1_ps(((float*)bnPtr14+(ptrdiff_t)2*(2+8*i48+4*j41))[0]);
__m512 postMul42 = _mm512_set1_ps(((float*)bnPtr14+(ptrdiff_t)2*(3+8*i48+4*j41))[0]);
// Eight 36-byte slices per filter (288 bytes per filter in total).
ptrdiff_t s45 = 0;
for (; s45 != 8; ++s45) {
// Load nine floats (mask 511 = low nine lanes) of slice s45 for each of the
// four filters; the remaining lanes are zeroed.
__m512 wt459 = _mm512_maskz_loadu_ps(511, wtPtr13+0+2304*i48+1152*j41+36*s45);
__m512 wt460 = _mm512_maskz_loadu_ps(511, wtPtr13+288+2304*i48+1152*j41+36*s45);
__m512 wt461 = _mm512_maskz_loadu_ps(511, wtPtr13+576+2304*i48+1152*j41+36*s45);
__m512 wt462 = _mm512_maskz_loadu_ps(511, wtPtr13+864+2304*i48+1152*j41+36*s45);
// Fold the per-filter multiplier into the weights.
wt459 = _mm512_mul_ps(wt459, postMul39);
wt460 = _mm512_mul_ps(wt460, postMul40);
wt461 = _mm512_mul_ps(wt461, postMul41);
wt462 = _mm512_mul_ps(wt462, postMul42);
// Two-source permutes (indices 0-15 pick from the first operand, 16-31 from
// the second) that interleave three-float groups of the two operands.
__m512i pm173 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm174 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp11081 = _mm512_permutex2var_ps(wt459, pm173, wt461);
__m512 tmp11082 = _mm512_permutex2var_ps(wt460, pm173, wt462);
__m512 tmp11083 = _mm512_permutex2var_ps(wt459, pm174, wt461);
__m512 tmp11084 = _mm512_permutex2var_ps(wt460, pm174, wt462);
__m512 in1517 = _mm512_permutex2var_ps(tmp11081, pm173, tmp11082);
__m512 in1518 = _mm512_permutex2var_ps(tmp11081, pm174, tmp11082);
__m512 in1519 = _mm512_permutex2var_ps(tmp11083, pm173, tmp11084);
// First pass: expand the three inputs (in1517, in1518, in1519) into eight
// linear combinations. in1517 and in1519 pass through unchanged; the other six
// live in tmp11088, tmp11086, tmp11089, tmp11087, tmp11085 and tmp11090.
// Statement order matters because tmp11085..tmp11087 are updated in place.
__m512 tmp11085 = _mm512_fmadd_ps(in1517, _mm512_set1_ps(4e+00f), in1519);
__m512 tmp11086 = _mm512_add_ps(in1517, in1519);
__m512 tmp11087 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(4e+00f), in1517);
__m512 tmp11088 = _mm512_add_ps(in1518, tmp11086);
__m512 tmp11089 = _mm512_fmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp11087);
tmp11087 = _mm512_fnmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp11087);
__m512 tmp11090 = _mm512_fnmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp11085);
tmp11085 = _mm512_fmadd_ps(in1518, _mm512_set1_ps(2e+00f), tmp11085);
tmp11086 = _mm512_sub_ps(tmp11086, in1518);
// Lane rearrangement between the two passes, built from unpack, in-lane
// shuffle and 128-bit-lane shuffle steps.
// NOTE(review): this looks like a transpose of the eight first-pass results so
// the second pass runs along the other filter axis -- confirm.
__m512 tmp11107 = _mm512_unpacklo_ps(in1517, tmp11088);
__m512 tmp11108 = _mm512_unpackhi_ps(in1517, tmp11088);
__m512 tmp11109 = _mm512_unpacklo_ps(tmp11086, tmp11089);
__m512 tmp11110 = _mm512_unpackhi_ps(tmp11086, tmp11089);
__m512 tmp11111 = _mm512_unpacklo_ps(tmp11087, tmp11085);
__m512 tmp11112 = _mm512_unpackhi_ps(tmp11087, tmp11085);
__m512 tmp11113 = _mm512_unpacklo_ps(tmp11090, in1519);
__m512 tmp11114 = _mm512_unpackhi_ps(tmp11090, in1519);
__m512 tmp11115 = _mm512_shuffle_ps(tmp11107, tmp11109, 68);
__m512 tmp11116 = _mm512_shuffle_ps(tmp11107, tmp11109, 238);
__m512 tmp11117 = _mm512_shuffle_ps(tmp11108, tmp11110, 68);
__m512 tmp11118 = _mm512_shuffle_ps(tmp11108, tmp11110, 238);
__m512 tmp11119 = _mm512_shuffle_ps(tmp11111, tmp11113, 68);
__m512 tmp11120 = _mm512_shuffle_ps(tmp11111, tmp11113, 238);
__m512 tmp11121 = _mm512_shuffle_ps(tmp11112, tmp11114, 68);
__m512 tmp11122 = _mm512_shuffle_ps(tmp11112, tmp11114, 238);
__m512 tmp11123 = _mm512_shuffle_f32x4(tmp11115, tmp11119, 136);
__m512 tmp11124 = _mm512_shuffle_f32x4(tmp11115, tmp11119, 221);
__m512 tmp11125 = _mm512_shuffle_f32x4(tmp11116, tmp11120, 136);
__m512 tmp11126 = _mm512_shuffle_f32x4(tmp11116, tmp11120, 221);
__m512 tmp11127 = _mm512_shuffle_f32x4(tmp11117, tmp11121, 136);
__m512 tmp11128 = _mm512_shuffle_f32x4(tmp11117, tmp11121, 221);
__m512 tmp11129 = _mm512_shuffle_f32x4(tmp11118, tmp11122, 136);
__m512 tmp11130 = _mm512_shuffle_f32x4(tmp11118, tmp11122, 221);
// The first-pass variables are reused from here on as holders for the
// rearranged data; their earlier contents are no longer needed.
in1517 = _mm512_shuffle_f32x4(tmp11123, tmp11123, 136);
__m512 tmp11091 = _mm512_shuffle_f32x4(tmp11123, tmp11123, 221);
tmp11088 = _mm512_shuffle_f32x4(tmp11125, tmp11125, 136);
__m512 tmp11092 = _mm512_shuffle_f32x4(tmp11125, tmp11125, 221);
tmp11086 = _mm512_shuffle_f32x4(tmp11127, tmp11127, 136);
__m512 tmp11093 = _mm512_shuffle_f32x4(tmp11127, tmp11127, 221);
tmp11089 = _mm512_shuffle_f32x4(tmp11129, tmp11129, 136);
__m512 tmp11094 = _mm512_shuffle_f32x4(tmp11129, tmp11129, 221);
tmp11087 = _mm512_shuffle_f32x4(tmp11124, tmp11124, 136);
tmp11085 = _mm512_shuffle_f32x4(tmp11126, tmp11126, 136);
tmp11090 = _mm512_shuffle_f32x4(tmp11128, tmp11128, 136);
in1519 = _mm512_shuffle_f32x4(tmp11130, tmp11130, 136);
// Pair up the pieces: immediate 68 places the low 256 bits of the first
// operand next to the low 256 bits of the second.
in1517 = _mm512_shuffle_f32x4(in1517, tmp11089, 68);
tmp11088 = _mm512_shuffle_f32x4(tmp11088, tmp11087, 68);
tmp11086 = _mm512_shuffle_f32x4(tmp11086, tmp11085, 68);
tmp11090 = _mm512_shuffle_f32x4(tmp11090, tmp11092, 68);
in1519 = _mm512_shuffle_f32x4(in1519, tmp11093, 68);
tmp11091 = _mm512_shuffle_f32x4(tmp11091, tmp11094, 68);
// Second pass: the same 3 -> 8 expansion as the first pass, applied to two
// independent input triples at once:
//   (in1517,   tmp11088, tmp11086) -> tmp11095..tmp11100 plus in1517, tmp11086
//   (tmp11090, in1519,   tmp11091) -> tmp11101..tmp11106 plus tmp11090, tmp11091
__m512 tmp11095 = _mm512_fmadd_ps(in1517, _mm512_set1_ps(4e+00f), tmp11086);
__m512 tmp11101 = _mm512_fmadd_ps(tmp11090, _mm512_set1_ps(4e+00f), tmp11091);
__m512 tmp11096 = _mm512_add_ps(in1517, tmp11086);
__m512 tmp11102 = _mm512_add_ps(tmp11090, tmp11091);
__m512 tmp11097 = _mm512_fmadd_ps(tmp11086, _mm512_set1_ps(4e+00f), in1517);
__m512 tmp11103 = _mm512_fmadd_ps(tmp11091, _mm512_set1_ps(4e+00f), tmp11090);
__m512 tmp11098 = _mm512_add_ps(tmp11088, tmp11096);
__m512 tmp11104 = _mm512_add_ps(in1519, tmp11102);
__m512 tmp11099 = _mm512_fmadd_ps(tmp11088, _mm512_set1_ps(2e+00f), tmp11097);
__m512 tmp11105 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp11103);
tmp11097 = _mm512_fnmadd_ps(tmp11088, _mm512_set1_ps(2e+00f), tmp11097);
tmp11103 = _mm512_fnmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp11103);
__m512 tmp11100 = _mm512_fnmadd_ps(tmp11088, _mm512_set1_ps(2e+00f), tmp11095);
__m512 tmp11106 = _mm512_fnmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp11101);
tmp11095 = _mm512_fmadd_ps(tmp11088, _mm512_set1_ps(2e+00f), tmp11095);
tmp11101 = _mm512_fmadd_ps(in1519, _mm512_set1_ps(2e+00f), tmp11101);
tmp11096 = _mm512_sub_ps(tmp11096, tmp11088);
tmp11102 = _mm512_sub_ps(tmp11102, in1519);
// Per-lane scaling of all sixteen results. Each constant vector repeats an
// eight-lane pattern in both 256-bit halves.
// NOTE(review): the values look like products of two per-axis normalisation
// factors (1, -2/9, 1/90, 1/180) -- confirm.
in1517 = _mm512_mul_ps(in1517, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp11098 = _mm512_mul_ps(tmp11098, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp11096 = _mm512_mul_ps(tmp11096, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp11099 = _mm512_mul_ps(tmp11099, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp11097 = _mm512_mul_ps(tmp11097, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp11095 = _mm512_mul_ps(tmp11095, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp11100 = _mm512_mul_ps(tmp11100, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp11086 = _mm512_mul_ps(tmp11086, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp11090 = _mm512_mul_ps(tmp11090, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp11104 = _mm512_mul_ps(tmp11104, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp11102 = _mm512_mul_ps(tmp11102, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp11105 = _mm512_mul_ps(tmp11105, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp11103 = _mm512_mul_ps(tmp11103, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp11101 = _mm512_mul_ps(tmp11101, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp11106 = _mm512_mul_ps(tmp11106, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp11091 = _mm512_mul_ps(tmp11091, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup into sixteen output vectors: immediate 68 concatenates the low
// 256-bit halves of the two operands, 238 concatenates their high halves.
__m512 out1375 = _mm512_shuffle_f32x4(in1517, tmp11098, 68);
__m512 out1379 = _mm512_shuffle_f32x4(in1517, tmp11098, 238);
__m512 out1376 = _mm512_shuffle_f32x4(tmp11096, tmp11099, 68);
__m512 out1380 = _mm512_shuffle_f32x4(tmp11096, tmp11099, 238);
__m512 out1377 = _mm512_shuffle_f32x4(tmp11097, tmp11095, 68);
__m512 out1381 = _mm512_shuffle_f32x4(tmp11097, tmp11095, 238);
__m512 out1378 = _mm512_shuffle_f32x4(tmp11100, tmp11086, 68);
__m512 out1382 = _mm512_shuffle_f32x4(tmp11100, tmp11086, 238);
__m512 out1383 = _mm512_shuffle_f32x4(tmp11090, tmp11104, 68);
__m512 out1387 = _mm512_shuffle_f32x4(tmp11090, tmp11104, 238);
__m512 out1384 = _mm512_shuffle_f32x4(tmp11102, tmp11105, 68);
__m512 out1388 = _mm512_shuffle_f32x4(tmp11102, tmp11105, 238);
__m512 out1385 = _mm512_shuffle_f32x4(tmp11103, tmp11101, 68);
__m512 out1389 = _mm512_shuffle_f32x4(tmp11103, tmp11101, 238);
__m512 out1386 = _mm512_shuffle_f32x4(tmp11106, tmp11091, 68);
__m512 out1390 = _mm512_shuffle_f32x4(tmp11106, tmp11091, 238);
// Byte offsets of four consecutive 32-byte slots. With cut17 == 0 they
// evaluate to 0, 32, 64 and 96.
ptrdiff_t off9 = 32*cut17;
ptrdiff_t off10 = (size_t)(cut17+1)/4*1024+(size_t)(cut17+1)%4*32;
ptrdiff_t off11 = (size_t)(cut17+2)/4*1024+(size_t)(cut17+2)%4*32;
ptrdiff_t off12 = (size_t)(cut17+3)/4*1024+(size_t)(cut17+3)%4*32;
// Convert each vector of 16 floats to 16 half-precision values (round to
// nearest, exceptions suppressed). The 256-bit result is cast to a 512-bit
// register only so the masked 512-bit store below can be used.
__m512i wf113 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1375, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf114 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1379, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf115 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1383, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf116 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1387, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf117 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1376, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf118 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1380, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf119 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1384, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf120 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1388, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf121 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1377, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf122 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1381, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf123 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1385, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf124 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1389, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf125 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1378, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf126 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1382, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf127 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1386, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf128 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1390, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 over 32-bit elements writes only the low 32 bytes, i.e. exactly the
// sixteen half-precision values; the undefined upper half left by the cast is
// never stored. Four 2048-byte sections each receive four 32-byte slots.
_mm512_mask_storeu_epi32(wfPtr11+0+8192*i48+1024*k129+off9+128*s45, 255, wf113);
_mm512_mask_storeu_epi32(wfPtr11+0+8192*i48+1024*k129+off10+128*s45, 255, wf114);
_mm512_mask_storeu_epi32(wfPtr11+0+8192*i48+1024*k129+off11+128*s45, 255, wf115);
_mm512_mask_storeu_epi32(wfPtr11+0+8192*i48+1024*k129+off12+128*s45, 255, wf116);
_mm512_mask_storeu_epi32(wfPtr11+2048+8192*i48+1024*k129+off9+128*s45, 255, wf117);
_mm512_mask_storeu_epi32(wfPtr11+2048+8192*i48+1024*k129+off10+128*s45, 255, wf118);
_mm512_mask_storeu_epi32(wfPtr11+2048+8192*i48+1024*k129+off11+128*s45, 255, wf119);
_mm512_mask_storeu_epi32(wfPtr11+2048+8192*i48+1024*k129+off12+128*s45, 255, wf120);
_mm512_mask_storeu_epi32(wfPtr11+4096+8192*i48+1024*k129+off9+128*s45, 255, wf121);
_mm512_mask_storeu_epi32(wfPtr11+4096+8192*i48+1024*k129+off10+128*s45, 255, wf122);
_mm512_mask_storeu_epi32(wfPtr11+4096+8192*i48+1024*k129+off11+128*s45, 255, wf123);
_mm512_mask_storeu_epi32(wfPtr11+4096+8192*i48+1024*k129+off12+128*s45, 255, wf124);
_mm512_mask_storeu_epi32(wfPtr11+6144+8192*i48+1024*k129+off9+128*s45, 255, wf125);
_mm512_mask_storeu_epi32(wfPtr11+6144+8192*i48+1024*k129+off10+128*s45, 255, wf126);
_mm512_mask_storeu_epi32(wfPtr11+6144+8192*i48+1024*k129+off11+128*s45, 255, wf127);
_mm512_mask_storeu_epi32(wfPtr11+6144+8192*i48+1024*k129+off12+128*s45, 255, wf128);
}
// Bias for the four filters of this half: out = bias*mul + add.
__m512 bias5 = _mm512_setzero_ps();
if (!e23) {
// Four source biases (mask 15 = low four lanes).
bias5 = _mm512_maskz_loadu_ps(15, biasPtr13-0+32*i48+16*j41);
// Even-index gather yields the multipliers, odd-index gather the addends.
__m512i pmMul27 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd27 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Eight floats = four (multiplier, addend) pairs, eight bytes per filter.
__m512 mas10 = _mm512_maskz_loadu_ps(255, bnPtr14+(ptrdiff_t)8*(0+8*i48+4*j41));
__m512 postMul43 = _mm512_permutexvar_ps(pmMul27, mas10);
__m512 postAdd27 = _mm512_permutexvar_ps(pmAdd27, mas10);
bias5 = _mm512_fmadd_ps(bias5, postMul43, postAdd27);
}
// Store the four resulting biases (16 bytes) into the bias area.
_mm512_mask_storeu_ps(bfPtr11-0+32*i48+16*j41, 15, bias5);
}
}
}
}

// Runs ResNeXt50ThreeArrangeFilts3Callee1 on the given threader team.
//
// team52:    threader team handed straight to ResNeXt50ThreaderDo1.
// tensors77: tensor pointer table, passed to the callee through task.any1.
//
// The task is declared with three dimensions whose hull values are 1, 4, 1
// (presumably the per-dimension extents the threader iterates over).
static void ResNeXt50ThreeArrangeFilts3(ResNeXt50ThreaderTeam1* team52, char** tensors77) {
    ResNeXt50ThreaderTask1 job;

    // Shape of the task grid.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 4;
    job.hull1[0] = 1;

    // Payload and entry point.
    job.any1 = tensors77;
    job.callee1 = ResNeXt50ThreeArrangeFilts3Callee1;

    ResNeXt50ThreaderDo1(team52, &job);
}

static void ResNeXt50ThreeArrangeDats3Callee1(ResNeXt50ThreaderTask1* task82, int64_t* pt46) {
char** tensors80 = task82->any1;
ptrdiff_t s46 = 0;
ptrdiff_t c39 = 0;
ptrdiff_t g26 = pt46[2];
ptrdiff_t e24 = 0;
char*restrict datPtr25 = tensors80[0]-116+1241856*e24;
char*restrict dfPtr11 = tensors80[1]+81100800*e24;
ptrdiff_t i49 = 3*g26;
ptrdiff_t ii35 = i49+(g26 < 9 ? 2 : 4);
for (; i49 <= ii35; ++i49) {
ptrdiff_t j42 = 5*c39;
ptrdiff_t rel21 = j42-0;
ptrdiff_t base21 = 0;
if (rel21 < 2) {
if (rel21 < 1) {
ptrdiff_t h43 = base21+0;
ptrdiff_t w56 = 0;
ptrdiff_t k130 = 0;
for (; k130 != 4; ++k130) {
__m512 dat2033 = _mm512_maskz_loadu_ps(8191, datPtr25+116+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2034 = _mm512_maskz_loadu_ps(16383, datPtr25+160+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512i pm175 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1520 = _mm512_permutexvar_ps(pm175, dat2033);
__m512i pm176 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1527 = _mm512_permutexvar_ps(pm176, dat2034);
__m512 dat2035 = _mm512_maskz_loadu_ps(8191, datPtr25+228+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2036 = _mm512_maskz_loadu_ps(16383, datPtr25+272+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1521 = _mm512_permutexvar_ps(pm175, dat2035);
__m512 in1528 = _mm512_permutexvar_ps(pm176, dat2036);
__m512 dat2037 = _mm512_maskz_loadu_ps(8191, datPtr25+340+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2038 = _mm512_maskz_loadu_ps(16383, datPtr25+384+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1522 = _mm512_permutexvar_ps(pm175, dat2037);
__m512 in1529 = _mm512_permutexvar_ps(pm176, dat2038);
__m512 dat2039 = _mm512_maskz_loadu_ps(8191, datPtr25+452+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2040 = _mm512_maskz_loadu_ps(16383, datPtr25+496+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1523 = _mm512_permutexvar_ps(pm175, dat2039);
__m512 in1530 = _mm512_permutexvar_ps(pm176, dat2040);
__m512 dat2041 = _mm512_maskz_loadu_ps(8191, datPtr25+564+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2042 = _mm512_maskz_loadu_ps(16383, datPtr25+608+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1524 = _mm512_permutexvar_ps(pm175, dat2041);
__m512 in1531 = _mm512_permutexvar_ps(pm176, dat2042);
__m512 dat2043 = _mm512_maskz_loadu_ps(8191, datPtr25+676+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2044 = _mm512_maskz_loadu_ps(16383, datPtr25+720+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1525 = _mm512_permutexvar_ps(pm175, dat2043);
__m512 in1532 = _mm512_permutexvar_ps(pm176, dat2044);
__m512 dat2045 = _mm512_maskz_loadu_ps(8191, datPtr25+788+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2046 = _mm512_maskz_loadu_ps(16383, datPtr25+832+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1526 = _mm512_permutexvar_ps(pm175, dat2045);
__m512 in1533 = _mm512_permutexvar_ps(pm176, dat2046);
__m512 tmp11131 = _mm512_add_ps(in1520, in1524);
__m512 tmp11136 = _mm512_add_ps(in1527, in1531);
__m512 tmp11132 = _mm512_sub_ps(in1523, in1521);
__m512 tmp11137 = _mm512_sub_ps(in1530, in1528);
__m512 tmp11133 = _mm512_add_ps(in1521, in1525);
__m512 tmp11138 = _mm512_add_ps(in1528, in1532);
__m512 tmp11134 = _mm512_sub_ps(_mm512_setzero_ps(), in1525);
__m512 tmp11139 = _mm512_sub_ps(_mm512_setzero_ps(), in1532);
tmp11131 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-4.25e+00f), tmp11131);
tmp11136 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-4.25e+00f), tmp11136);
tmp11133 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-4.25e+00f), tmp11133);
tmp11138 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-4.25e+00f), tmp11138);
tmp11134 = _mm512_fmadd_ps(tmp11132, _mm512_set1_ps(5.25e+00f), tmp11134);
tmp11139 = _mm512_fmadd_ps(tmp11137, _mm512_set1_ps(5.25e+00f), tmp11139);
tmp11132 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(2.5e-01f), in1525);
tmp11137 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(2.5e-01f), in1532);
in1521 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(4e+00f), in1525);
in1528 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(4e+00f), in1532);
__m512 tmp11135 = _mm512_sub_ps(tmp11133, tmp11131);
__m512 tmp11140 = _mm512_sub_ps(tmp11138, tmp11136);
tmp11133 = _mm512_add_ps(tmp11131, tmp11133);
tmp11138 = _mm512_add_ps(tmp11136, tmp11138);
tmp11131 = _mm512_fmadd_ps(in1520, _mm512_set1_ps(2.5e-01f), in1524);
tmp11136 = _mm512_fmadd_ps(in1527, _mm512_set1_ps(2.5e-01f), in1531);
tmp11132 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-1.25e+00f), tmp11132);
tmp11137 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-1.25e+00f), tmp11137);
in1523 = _mm512_fmadd_ps(in1523, _mm512_set1_ps(-5e+00f), in1521);
in1530 = _mm512_fmadd_ps(in1530, _mm512_set1_ps(-5e+00f), in1528);
tmp11131 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-1.25e+00f), tmp11131);
tmp11136 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-1.25e+00f), tmp11136);
in1525 = _mm512_fmadd_ps(tmp11131, _mm512_set1_ps(2e+00f), tmp11132);
in1532 = _mm512_fmadd_ps(tmp11136, _mm512_set1_ps(2e+00f), tmp11137);
tmp11132 = _mm512_fnmadd_ps(tmp11131, _mm512_set1_ps(2e+00f), tmp11132);
tmp11137 = _mm512_fnmadd_ps(tmp11136, _mm512_set1_ps(2e+00f), tmp11137);
tmp11131 = _mm512_fmadd_ps(in1524, _mm512_set1_ps(2.5e-01f), in1520);
tmp11136 = _mm512_fmadd_ps(in1531, _mm512_set1_ps(2.5e-01f), in1527);
in1520 = _mm512_sub_ps(in1526, in1520);
in1527 = _mm512_sub_ps(in1533, in1527);
tmp11131 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(-1.25e+00f), tmp11131);
tmp11136 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(-1.25e+00f), tmp11136);
in1522 = _mm512_sub_ps(in1522, in1524);
in1529 = _mm512_sub_ps(in1529, in1531);
in1522 = _mm512_fmadd_ps(in1522, _mm512_set1_ps(5.25e+00f), in1520);
in1529 = _mm512_fmadd_ps(in1529, _mm512_set1_ps(5.25e+00f), in1527);
in1521 = _mm512_fmadd_ps(tmp11131, _mm512_set1_ps(2e+00f), in1523);
in1528 = _mm512_fmadd_ps(tmp11136, _mm512_set1_ps(2e+00f), in1530);
in1523 = _mm512_fnmadd_ps(tmp11131, _mm512_set1_ps(2e+00f), in1523);
in1530 = _mm512_fnmadd_ps(tmp11136, _mm512_set1_ps(2e+00f), in1530);
__m512 tmp11149 = _mm512_unpacklo_ps(tmp11134, tmp11133);
__m512 tmp11150 = _mm512_unpackhi_ps(tmp11134, tmp11133);
__m512 tmp11151 = _mm512_unpacklo_ps(tmp11135, in1525);
__m512 tmp11152 = _mm512_unpackhi_ps(tmp11135, in1525);
__m512 tmp11153 = _mm512_unpacklo_ps(tmp11132, in1521);
__m512 tmp11154 = _mm512_unpackhi_ps(tmp11132, in1521);
__m512 tmp11155 = _mm512_unpacklo_ps(in1523, in1522);
__m512 tmp11156 = _mm512_unpackhi_ps(in1523, in1522);
__m512 tmp11157 = _mm512_unpacklo_ps(tmp11139, tmp11138);
__m512 tmp11158 = _mm512_unpackhi_ps(tmp11139, tmp11138);
__m512 tmp11159 = _mm512_unpacklo_ps(tmp11140, in1532);
__m512 tmp11160 = _mm512_unpackhi_ps(tmp11140, in1532);
__m512 tmp11161 = _mm512_unpacklo_ps(tmp11137, in1528);
__m512 tmp11162 = _mm512_unpackhi_ps(tmp11137, in1528);
__m512 tmp11163 = _mm512_unpacklo_ps(in1530, in1529);
__m512 tmp11164 = _mm512_unpackhi_ps(in1530, in1529);
__m512 tmp11165 = _mm512_shuffle_ps(tmp11149, tmp11151, 68);
__m512 tmp11166 = _mm512_shuffle_ps(tmp11149, tmp11151, 238);
__m512 tmp11167 = _mm512_shuffle_ps(tmp11150, tmp11152, 68);
__m512 tmp11168 = _mm512_shuffle_ps(tmp11150, tmp11152, 238);
__m512 tmp11169 = _mm512_shuffle_ps(tmp11153, tmp11155, 68);
__m512 tmp11170 = _mm512_shuffle_ps(tmp11153, tmp11155, 238);
__m512 tmp11171 = _mm512_shuffle_ps(tmp11154, tmp11156, 68);
__m512 tmp11172 = _mm512_shuffle_ps(tmp11154, tmp11156, 238);
__m512 tmp11173 = _mm512_shuffle_ps(tmp11157, tmp11159, 68);
__m512 tmp11174 = _mm512_shuffle_ps(tmp11157, tmp11159, 238);
__m512 tmp11175 = _mm512_shuffle_ps(tmp11158, tmp11160, 68);
__m512 tmp11176 = _mm512_shuffle_ps(tmp11158, tmp11160, 238);
__m512 tmp11177 = _mm512_shuffle_ps(tmp11161, tmp11163, 68);
__m512 tmp11178 = _mm512_shuffle_ps(tmp11161, tmp11163, 238);
__m512 tmp11179 = _mm512_shuffle_ps(tmp11162, tmp11164, 68);
__m512 tmp11180 = _mm512_shuffle_ps(tmp11162, tmp11164, 238);
__m512 tmp11181 = _mm512_shuffle_f32x4(tmp11165, tmp11169, 136);
__m512 tmp11182 = _mm512_shuffle_f32x4(tmp11165, tmp11169, 221);
__m512 tmp11183 = _mm512_shuffle_f32x4(tmp11166, tmp11170, 136);
__m512 tmp11184 = _mm512_shuffle_f32x4(tmp11166, tmp11170, 221);
__m512 tmp11185 = _mm512_shuffle_f32x4(tmp11167, tmp11171, 136);
__m512 tmp11186 = _mm512_shuffle_f32x4(tmp11167, tmp11171, 221);
__m512 tmp11187 = _mm512_shuffle_f32x4(tmp11168, tmp11172, 136);
__m512 tmp11188 = _mm512_shuffle_f32x4(tmp11168, tmp11172, 221);
__m512 tmp11189 = _mm512_shuffle_f32x4(tmp11173, tmp11177, 136);
__m512 tmp11190 = _mm512_shuffle_f32x4(tmp11173, tmp11177, 221);
__m512 tmp11191 = _mm512_shuffle_f32x4(tmp11174, tmp11178, 136);
__m512 tmp11192 = _mm512_shuffle_f32x4(tmp11174, tmp11178, 221);
__m512 tmp11193 = _mm512_shuffle_f32x4(tmp11175, tmp11179, 136);
__m512 tmp11194 = _mm512_shuffle_f32x4(tmp11175, tmp11179, 221);
__m512 tmp11195 = _mm512_shuffle_f32x4(tmp11176, tmp11180, 136);
__m512 tmp11196 = _mm512_shuffle_f32x4(tmp11176, tmp11180, 221);
tmp11134 = _mm512_shuffle_f32x4(tmp11181, tmp11189, 136);
tmp11139 = _mm512_shuffle_f32x4(tmp11181, tmp11189, 221);
tmp11133 = _mm512_shuffle_f32x4(tmp11183, tmp11191, 136);
tmp11138 = _mm512_shuffle_f32x4(tmp11183, tmp11191, 221);
tmp11135 = _mm512_shuffle_f32x4(tmp11185, tmp11193, 136);
tmp11140 = _mm512_shuffle_f32x4(tmp11185, tmp11193, 221);
in1525 = _mm512_shuffle_f32x4(tmp11187, tmp11195, 136);
in1532 = _mm512_shuffle_f32x4(tmp11187, tmp11195, 221);
tmp11132 = _mm512_shuffle_f32x4(tmp11182, tmp11190, 136);
tmp11137 = _mm512_shuffle_f32x4(tmp11182, tmp11190, 221);
in1521 = _mm512_shuffle_f32x4(tmp11184, tmp11192, 136);
in1528 = _mm512_shuffle_f32x4(tmp11184, tmp11192, 221);
in1523 = _mm512_shuffle_f32x4(tmp11186, tmp11194, 136);
in1530 = _mm512_shuffle_f32x4(tmp11186, tmp11194, 221);
in1522 = _mm512_shuffle_f32x4(tmp11188, tmp11196, 136);
in1529 = _mm512_shuffle_f32x4(tmp11188, tmp11196, 221);
__m512 tmp11141 = _mm512_add_ps(tmp11133, in1521);
__m512 tmp11145 = _mm512_add_ps(tmp11138, in1528);
__m512 tmp11142 = _mm512_sub_ps(tmp11132, tmp11135);
__m512 tmp11146 = _mm512_sub_ps(tmp11137, tmp11140);
__m512 tmp11143 = _mm512_add_ps(tmp11135, in1523);
__m512 tmp11147 = _mm512_add_ps(tmp11140, in1530);
tmp11134 = _mm512_sub_ps(tmp11134, in1523);
tmp11139 = _mm512_sub_ps(tmp11139, in1530);
tmp11141 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-4.25e+00f), tmp11141);
tmp11145 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-4.25e+00f), tmp11145);
tmp11143 = _mm512_fmadd_ps(tmp11132, _mm512_set1_ps(-4.25e+00f), tmp11143);
tmp11147 = _mm512_fmadd_ps(tmp11137, _mm512_set1_ps(-4.25e+00f), tmp11147);
tmp11134 = _mm512_fmadd_ps(tmp11142, _mm512_set1_ps(5.25e+00f), tmp11134);
tmp11139 = _mm512_fmadd_ps(tmp11146, _mm512_set1_ps(5.25e+00f), tmp11139);
tmp11142 = _mm512_fmadd_ps(tmp11135, _mm512_set1_ps(2.5e-01f), in1523);
tmp11146 = _mm512_fmadd_ps(tmp11140, _mm512_set1_ps(2.5e-01f), in1530);
tmp11135 = _mm512_fmadd_ps(tmp11135, _mm512_set1_ps(4e+00f), in1523);
tmp11140 = _mm512_fmadd_ps(tmp11140, _mm512_set1_ps(4e+00f), in1530);
__m512 tmp11144 = _mm512_sub_ps(tmp11143, tmp11141);
__m512 tmp11148 = _mm512_sub_ps(tmp11147, tmp11145);
tmp11143 = _mm512_add_ps(tmp11141, tmp11143);
tmp11147 = _mm512_add_ps(tmp11145, tmp11147);
tmp11141 = _mm512_fmadd_ps(tmp11133, _mm512_set1_ps(2.5e-01f), in1521);
tmp11145 = _mm512_fmadd_ps(tmp11138, _mm512_set1_ps(2.5e-01f), in1528);
tmp11142 = _mm512_fmadd_ps(tmp11132, _mm512_set1_ps(-1.25e+00f), tmp11142);
tmp11146 = _mm512_fmadd_ps(tmp11137, _mm512_set1_ps(-1.25e+00f), tmp11146);
tmp11132 = _mm512_fmadd_ps(tmp11132, _mm512_set1_ps(-5e+00f), tmp11135);
tmp11137 = _mm512_fmadd_ps(tmp11137, _mm512_set1_ps(-5e+00f), tmp11140);
tmp11141 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-1.25e+00f), tmp11141);
tmp11145 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-1.25e+00f), tmp11145);
in1523 = _mm512_fmadd_ps(tmp11141, _mm512_set1_ps(2e+00f), tmp11142);
in1530 = _mm512_fmadd_ps(tmp11145, _mm512_set1_ps(2e+00f), tmp11146);
tmp11142 = _mm512_fnmadd_ps(tmp11141, _mm512_set1_ps(2e+00f), tmp11142);
tmp11146 = _mm512_fnmadd_ps(tmp11145, _mm512_set1_ps(2e+00f), tmp11146);
tmp11141 = _mm512_fmadd_ps(in1521, _mm512_set1_ps(2.5e-01f), tmp11133);
tmp11145 = _mm512_fmadd_ps(in1528, _mm512_set1_ps(2.5e-01f), tmp11138);
tmp11133 = _mm512_sub_ps(in1522, tmp11133);
tmp11138 = _mm512_sub_ps(in1529, tmp11138);
tmp11141 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(-1.25e+00f), tmp11141);
tmp11145 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(-1.25e+00f), tmp11145);
in1525 = _mm512_sub_ps(in1525, in1521);
in1532 = _mm512_sub_ps(in1532, in1528);
in1525 = _mm512_fmadd_ps(in1525, _mm512_set1_ps(5.25e+00f), tmp11133);
in1532 = _mm512_fmadd_ps(in1532, _mm512_set1_ps(5.25e+00f), tmp11138);
tmp11135 = _mm512_fmadd_ps(tmp11141, _mm512_set1_ps(2e+00f), tmp11132);
tmp11140 = _mm512_fmadd_ps(tmp11145, _mm512_set1_ps(2e+00f), tmp11137);
tmp11132 = _mm512_fnmadd_ps(tmp11141, _mm512_set1_ps(2e+00f), tmp11132);
tmp11137 = _mm512_fnmadd_ps(tmp11145, _mm512_set1_ps(2e+00f), tmp11137);
__m512 out1391 = _mm512_shuffle_f32x4(tmp11134, tmp11143, 68);
__m512 out1399 = _mm512_shuffle_f32x4(tmp11134, tmp11143, 238);
__m512 out1392 = _mm512_shuffle_f32x4(tmp11144, in1523, 68);
__m512 out1400 = _mm512_shuffle_f32x4(tmp11144, in1523, 238);
__m512 out1393 = _mm512_shuffle_f32x4(tmp11142, tmp11135, 68);
__m512 out1401 = _mm512_shuffle_f32x4(tmp11142, tmp11135, 238);
__m512 out1394 = _mm512_shuffle_f32x4(tmp11132, in1525, 68);
__m512 out1402 = _mm512_shuffle_f32x4(tmp11132, in1525, 238);
__m512 out1395 = _mm512_shuffle_f32x4(tmp11139, tmp11147, 68);
__m512 out1403 = _mm512_shuffle_f32x4(tmp11139, tmp11147, 238);
__m512 out1396 = _mm512_shuffle_f32x4(tmp11148, in1530, 68);
__m512 out1404 = _mm512_shuffle_f32x4(tmp11148, in1530, 238);
__m512 out1397 = _mm512_shuffle_f32x4(tmp11146, tmp11140, 68);
__m512 out1405 = _mm512_shuffle_f32x4(tmp11146, tmp11140, 238);
__m512 out1398 = _mm512_shuffle_f32x4(tmp11137, in1532, 68);
__m512 out1406 = _mm512_shuffle_f32x4(tmp11137, in1532, 238);
_mm512_storeu_ps(dfPtr11+0+51200*i49+3072*j42+3072*s46+768*k130, out1391);
_mm512_storeu_ps(dfPtr11+128+51200*i49+3072*j42+3072*s46+768*k130, out1399);
_mm512_storeu_ps(dfPtr11+64+51200*i49+3072*j42+3072*s46+768*k130, out1395);
_mm512_storeu_ps(dfPtr11+192+51200*i49+3072*j42+3072*s46+768*k130, out1403);
_mm512_storeu_ps(dfPtr11+12800+51200*i49+3072*j42+3072*s46+768*k130, out1392);
_mm512_storeu_ps(dfPtr11+12928+51200*i49+3072*j42+3072*s46+768*k130, out1400);
_mm512_storeu_ps(dfPtr11+12864+51200*i49+3072*j42+3072*s46+768*k130, out1396);
_mm512_storeu_ps(dfPtr11+12992+51200*i49+3072*j42+3072*s46+768*k130, out1404);
_mm512_storeu_ps(dfPtr11+25600+51200*i49+3072*j42+3072*s46+768*k130, out1393);
_mm512_storeu_ps(dfPtr11+25728+51200*i49+3072*j42+3072*s46+768*k130, out1401);
_mm512_storeu_ps(dfPtr11+25664+51200*i49+3072*j42+3072*s46+768*k130, out1397);
_mm512_storeu_ps(dfPtr11+25792+51200*i49+3072*j42+3072*s46+768*k130, out1405);
_mm512_storeu_ps(dfPtr11+38400+51200*i49+3072*j42+3072*s46+768*k130, out1394);
_mm512_storeu_ps(dfPtr11+38528+51200*i49+3072*j42+3072*s46+768*k130, out1402);
_mm512_storeu_ps(dfPtr11+38464+51200*i49+3072*j42+3072*s46+768*k130, out1398);
_mm512_storeu_ps(dfPtr11+38592+51200*i49+3072*j42+3072*s46+768*k130, out1406);
// Gather stage: build the fifteen input registers in1534..in1548 from datPtr25.
// Every load is a zero-masking load, so lanes outside the mask are 0.0f.
// Each successive row (in1535 -> in1536 -> ... -> in1541, and likewise
// in1542 -> ... -> in1548) reads from addresses exactly 112 higher than the
// previous row.
// NOTE(review): offsets look like byte offsets (4*w56, 112 per row) - confirm
// against the declaration of datPtr25, which is not visible here.
__m512 dat2047 = _mm512_maskz_loadu_ps(127, datPtr25+676+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
// pm177: result lanes 9..15 take source lanes 0..6; result lanes 0..8 all pick
// source lane 15, which mask 127 (lanes 0..6) left at zero.
__m512i pm177 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1534 = _mm512_permutexvar_ps(pm177, dat2047);
__m512 dat2048 = _mm512_maskz_loadu_ps(31, datPtr25+208+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2049 = _mm512_maskz_loadu_ps(127, datPtr25+788+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2050 = _mm512_maskz_loadu_ps(8191, datPtr25+3252+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
// pm178 (two-source permute): lanes 0..4 take the first source's lanes 0..4,
// lanes 5..8 pick the first source's lane 15 (zero under mask 31), and
// lanes 9..15 take the second source's lanes 0..6 (indices 16..22).
__m512i pm178 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1535 = _mm512_permutex2var_ps(dat2048, pm178, dat2049);
// pm179: lane 0 picks source lane 15 (zero under mask 8191, i.e. lanes 0..12),
// lanes 1..7 take source lanes 0..6, lanes 8..15 take source lanes 5..12.
__m512i pm179 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1542 = _mm512_permutexvar_ps(pm179, dat2050);
__m512 dat2051 = _mm512_maskz_loadu_ps(31, datPtr25+320+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2052 = _mm512_maskz_loadu_ps(127, datPtr25+900+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2053 = _mm512_maskz_loadu_ps(8191, datPtr25+3364+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1536 = _mm512_permutex2var_ps(dat2051, pm178, dat2052);
__m512 in1543 = _mm512_permutexvar_ps(pm179, dat2053);
__m512 dat2054 = _mm512_maskz_loadu_ps(31, datPtr25+432+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2055 = _mm512_maskz_loadu_ps(127, datPtr25+1012+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2056 = _mm512_maskz_loadu_ps(8191, datPtr25+3476+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1537 = _mm512_permutex2var_ps(dat2054, pm178, dat2055);
__m512 in1544 = _mm512_permutexvar_ps(pm179, dat2056);
__m512 dat2057 = _mm512_maskz_loadu_ps(31, datPtr25+544+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2058 = _mm512_maskz_loadu_ps(127, datPtr25+1124+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2059 = _mm512_maskz_loadu_ps(8191, datPtr25+3588+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1538 = _mm512_permutex2var_ps(dat2057, pm178, dat2058);
__m512 in1545 = _mm512_permutexvar_ps(pm179, dat2059);
__m512 dat2060 = _mm512_maskz_loadu_ps(31, datPtr25+656+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2061 = _mm512_maskz_loadu_ps(127, datPtr25+1236+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2062 = _mm512_maskz_loadu_ps(8191, datPtr25+3700+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1539 = _mm512_permutex2var_ps(dat2060, pm178, dat2061);
__m512 in1546 = _mm512_permutexvar_ps(pm179, dat2062);
__m512 dat2063 = _mm512_maskz_loadu_ps(31, datPtr25+768+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2064 = _mm512_maskz_loadu_ps(127, datPtr25+1348+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2065 = _mm512_maskz_loadu_ps(8191, datPtr25+3812+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1540 = _mm512_permutex2var_ps(dat2063, pm178, dat2064);
__m512 in1547 = _mm512_permutexvar_ps(pm179, dat2065);
__m512 dat2066 = _mm512_maskz_loadu_ps(31, datPtr25+880+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2067 = _mm512_maskz_loadu_ps(127, datPtr25+1460+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2068 = _mm512_maskz_loadu_ps(8191, datPtr25+3924+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1541 = _mm512_permutex2var_ps(dat2066, pm178, dat2067);
__m512 in1548 = _mm512_permutexvar_ps(pm179, dat2068);
// First 8-point transform pass, applied lane-wise to two stacks of registers.
// The statements for the two stacks are interleaved line by line.
//   Stack A inputs d0..d7 = in1534, in1535, in1536, in1537, in1538, in1539, in1540, in1541.
//   Stack B inputs d1..d7 = in1542 .. in1548, with d0 treated as zero
//   (tmp11204 is seeded with 0 - in1547 instead of d0 - d6).
// Expanding the adds and FMAs below gives (up to floating-point rounding):
//   t0 =  d0 - 5.25*d2 + 5.25*d4 - d6
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7
// Results t0..t7 end up in:
//   Stack A: in1534, tmp11199, tmp11200, in1540, tmp11198, in1536, in1538, in1537
//   Stack B: tmp11204, tmp11203, tmp11205, in1547, tmp11202, in1543, in1545, in1544
// Input variables are overwritten in place, so statement order matters.
// NOTE(review): the coefficients look like the Winograd F(6x6, 3x3) input
// transform - confirm against the generator.
__m512 tmp11197 = _mm512_add_ps(in1535, in1539);
__m512 tmp11201 = _mm512_add_ps(in1542, in1546);
__m512 tmp11198 = _mm512_sub_ps(in1538, in1536);
__m512 tmp11202 = _mm512_sub_ps(in1545, in1543);
__m512 tmp11199 = _mm512_add_ps(in1536, in1540);
__m512 tmp11203 = _mm512_add_ps(in1543, in1547);
in1534 = _mm512_sub_ps(in1534, in1540);
// Stack B has no d0 register: start t0 from 0 - d6.
__m512 tmp11204 = _mm512_sub_ps(_mm512_setzero_ps(), in1547);
tmp11197 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-4.25e+00f), tmp11197);
tmp11201 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-4.25e+00f), tmp11201);
tmp11199 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-4.25e+00f), tmp11199);
tmp11203 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-4.25e+00f), tmp11203);
// t0 complete (in1534 / tmp11204).
in1534 = _mm512_fmadd_ps(tmp11198, _mm512_set1_ps(5.25e+00f), in1534);
tmp11204 = _mm512_fmadd_ps(tmp11202, _mm512_set1_ps(5.25e+00f), tmp11204);
tmp11198 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(2.5e-01f), in1540);
tmp11202 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(2.5e-01f), in1547);
in1536 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(4e+00f), in1540);
in1543 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(4e+00f), in1547);
// t2 (difference) and t1 (sum) of the two partial sums.
__m512 tmp11200 = _mm512_sub_ps(tmp11199, tmp11197);
__m512 tmp11205 = _mm512_sub_ps(tmp11203, tmp11201);
tmp11199 = _mm512_add_ps(tmp11197, tmp11199);
tmp11203 = _mm512_add_ps(tmp11201, tmp11203);
tmp11197 = _mm512_fmadd_ps(in1535, _mm512_set1_ps(2.5e-01f), in1539);
tmp11201 = _mm512_fmadd_ps(in1542, _mm512_set1_ps(2.5e-01f), in1546);
tmp11198 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-1.25e+00f), tmp11198);
tmp11202 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-1.25e+00f), tmp11202);
in1538 = _mm512_fmadd_ps(in1538, _mm512_set1_ps(-5e+00f), in1536);
in1545 = _mm512_fmadd_ps(in1545, _mm512_set1_ps(-5e+00f), in1543);
tmp11197 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-1.25e+00f), tmp11197);
tmp11201 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-1.25e+00f), tmp11201);
// t3 (in1540 / in1547) and t4 (tmp11198 / tmp11202).
in1540 = _mm512_fmadd_ps(tmp11197, _mm512_set1_ps(2e+00f), tmp11198);
in1547 = _mm512_fmadd_ps(tmp11201, _mm512_set1_ps(2e+00f), tmp11202);
tmp11198 = _mm512_fnmadd_ps(tmp11197, _mm512_set1_ps(2e+00f), tmp11198);
tmp11202 = _mm512_fnmadd_ps(tmp11201, _mm512_set1_ps(2e+00f), tmp11202);
tmp11197 = _mm512_fmadd_ps(in1539, _mm512_set1_ps(2.5e-01f), in1535);
tmp11201 = _mm512_fmadd_ps(in1546, _mm512_set1_ps(2.5e-01f), in1542);
in1535 = _mm512_sub_ps(in1541, in1535);
in1542 = _mm512_sub_ps(in1548, in1542);
tmp11197 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(-1.25e+00f), tmp11197);
tmp11201 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(-1.25e+00f), tmp11201);
in1537 = _mm512_sub_ps(in1537, in1539);
in1544 = _mm512_sub_ps(in1544, in1546);
// t7 (in1537 / in1544).
in1537 = _mm512_fmadd_ps(in1537, _mm512_set1_ps(5.25e+00f), in1535);
in1544 = _mm512_fmadd_ps(in1544, _mm512_set1_ps(5.25e+00f), in1542);
// t5 (in1536 / in1543) and t6 (in1538 / in1545).
in1536 = _mm512_fmadd_ps(tmp11197, _mm512_set1_ps(2e+00f), in1538);
in1543 = _mm512_fmadd_ps(tmp11201, _mm512_set1_ps(2e+00f), in1545);
in1538 = _mm512_fnmadd_ps(tmp11197, _mm512_set1_ps(2e+00f), in1538);
in1545 = _mm512_fnmadd_ps(tmp11201, _mm512_set1_ps(2e+00f), in1545);
// 16x16 transpose of sixteen registers, done as a four-stage shuffle network.
// Rows r0..r15 on entry, in order:
//   in1534, tmp11199, tmp11200, in1540, tmp11198, in1536, in1538, in1537,
//   tmp11204, tmp11203, tmp11205, in1547, tmp11202, in1543, in1545, in1544
// On exit the same variables, in the same order, hold columns 0..15:
// lane j of the k-th variable is lane k of the former r_j.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs within each
// 128-bit lane (unpacklo gives elements 0,1; unpackhi gives elements 2,3).
__m512 tmp11214 = _mm512_unpacklo_ps(in1534, tmp11199);
__m512 tmp11215 = _mm512_unpackhi_ps(in1534, tmp11199);
__m512 tmp11216 = _mm512_unpacklo_ps(tmp11200, in1540);
__m512 tmp11217 = _mm512_unpackhi_ps(tmp11200, in1540);
__m512 tmp11218 = _mm512_unpacklo_ps(tmp11198, in1536);
__m512 tmp11219 = _mm512_unpackhi_ps(tmp11198, in1536);
__m512 tmp11220 = _mm512_unpacklo_ps(in1538, in1537);
__m512 tmp11221 = _mm512_unpackhi_ps(in1538, in1537);
__m512 tmp11222 = _mm512_unpacklo_ps(tmp11204, tmp11203);
__m512 tmp11223 = _mm512_unpackhi_ps(tmp11204, tmp11203);
__m512 tmp11224 = _mm512_unpacklo_ps(tmp11205, in1547);
__m512 tmp11225 = _mm512_unpackhi_ps(tmp11205, in1547);
__m512 tmp11226 = _mm512_unpacklo_ps(tmp11202, in1543);
__m512 tmp11227 = _mm512_unpackhi_ps(tmp11202, in1543);
__m512 tmp11228 = _mm512_unpacklo_ps(in1545, in1544);
__m512 tmp11229 = _mm512_unpackhi_ps(in1545, in1544);
// Stage 2: combine 64-bit pairs so each 128-bit lane holds one column of four
// consecutive rows (68 takes the low pair of each source, 238 the high pair).
__m512 tmp11230 = _mm512_shuffle_ps(tmp11214, tmp11216, 68);
__m512 tmp11231 = _mm512_shuffle_ps(tmp11214, tmp11216, 238);
__m512 tmp11232 = _mm512_shuffle_ps(tmp11215, tmp11217, 68);
__m512 tmp11233 = _mm512_shuffle_ps(tmp11215, tmp11217, 238);
__m512 tmp11234 = _mm512_shuffle_ps(tmp11218, tmp11220, 68);
__m512 tmp11235 = _mm512_shuffle_ps(tmp11218, tmp11220, 238);
__m512 tmp11236 = _mm512_shuffle_ps(tmp11219, tmp11221, 68);
__m512 tmp11237 = _mm512_shuffle_ps(tmp11219, tmp11221, 238);
__m512 tmp11238 = _mm512_shuffle_ps(tmp11222, tmp11224, 68);
__m512 tmp11239 = _mm512_shuffle_ps(tmp11222, tmp11224, 238);
__m512 tmp11240 = _mm512_shuffle_ps(tmp11223, tmp11225, 68);
__m512 tmp11241 = _mm512_shuffle_ps(tmp11223, tmp11225, 238);
__m512 tmp11242 = _mm512_shuffle_ps(tmp11226, tmp11228, 68);
__m512 tmp11243 = _mm512_shuffle_ps(tmp11226, tmp11228, 238);
__m512 tmp11244 = _mm512_shuffle_ps(tmp11227, tmp11229, 68);
__m512 tmp11245 = _mm512_shuffle_ps(tmp11227, tmp11229, 238);
// Stage 3: regroup whole 128-bit lanes (136 picks lanes 0 and 2 of each
// source, 221 picks lanes 1 and 3), pairing rows 0-3 with rows 4-7 and
// rows 8-11 with rows 12-15.
__m512 tmp11246 = _mm512_shuffle_f32x4(tmp11230, tmp11234, 136);
__m512 tmp11247 = _mm512_shuffle_f32x4(tmp11230, tmp11234, 221);
__m512 tmp11248 = _mm512_shuffle_f32x4(tmp11231, tmp11235, 136);
__m512 tmp11249 = _mm512_shuffle_f32x4(tmp11231, tmp11235, 221);
__m512 tmp11250 = _mm512_shuffle_f32x4(tmp11232, tmp11236, 136);
__m512 tmp11251 = _mm512_shuffle_f32x4(tmp11232, tmp11236, 221);
__m512 tmp11252 = _mm512_shuffle_f32x4(tmp11233, tmp11237, 136);
__m512 tmp11253 = _mm512_shuffle_f32x4(tmp11233, tmp11237, 221);
__m512 tmp11254 = _mm512_shuffle_f32x4(tmp11238, tmp11242, 136);
__m512 tmp11255 = _mm512_shuffle_f32x4(tmp11238, tmp11242, 221);
__m512 tmp11256 = _mm512_shuffle_f32x4(tmp11239, tmp11243, 136);
__m512 tmp11257 = _mm512_shuffle_f32x4(tmp11239, tmp11243, 221);
__m512 tmp11258 = _mm512_shuffle_f32x4(tmp11240, tmp11244, 136);
__m512 tmp11259 = _mm512_shuffle_f32x4(tmp11240, tmp11244, 221);
__m512 tmp11260 = _mm512_shuffle_f32x4(tmp11241, tmp11245, 136);
__m512 tmp11261 = _mm512_shuffle_f32x4(tmp11241, tmp11245, 221);
// Stage 4: merge the rows 0-7 half with the rows 8-15 half. Each pair of
// statements yields columns c (imm 136) and c+8 (imm 221); the pairs below
// produce c = 0, 1, 2, 3, 4, 5, 6, 7 in that order.
in1534 = _mm512_shuffle_f32x4(tmp11246, tmp11254, 136);
tmp11204 = _mm512_shuffle_f32x4(tmp11246, tmp11254, 221);
tmp11199 = _mm512_shuffle_f32x4(tmp11248, tmp11256, 136);
tmp11203 = _mm512_shuffle_f32x4(tmp11248, tmp11256, 221);
tmp11200 = _mm512_shuffle_f32x4(tmp11250, tmp11258, 136);
tmp11205 = _mm512_shuffle_f32x4(tmp11250, tmp11258, 221);
in1540 = _mm512_shuffle_f32x4(tmp11252, tmp11260, 136);
in1547 = _mm512_shuffle_f32x4(tmp11252, tmp11260, 221);
tmp11198 = _mm512_shuffle_f32x4(tmp11247, tmp11255, 136);
tmp11202 = _mm512_shuffle_f32x4(tmp11247, tmp11255, 221);
in1536 = _mm512_shuffle_f32x4(tmp11249, tmp11257, 136);
in1543 = _mm512_shuffle_f32x4(tmp11249, tmp11257, 221);
in1538 = _mm512_shuffle_f32x4(tmp11251, tmp11259, 136);
in1545 = _mm512_shuffle_f32x4(tmp11251, tmp11259, 221);
in1537 = _mm512_shuffle_f32x4(tmp11253, tmp11261, 136);
in1544 = _mm512_shuffle_f32x4(tmp11253, tmp11261, 221);
// Second 8-point transform pass over the sixteen registers prepared by the
// shuffle network above. Statements for the two stacks are interleaved.
//   Stack 1 inputs d0..d7 = in1534, tmp11199, tmp11200, in1540, tmp11198, in1536, in1538, in1537.
//   Stack 2 inputs d0..d7 = tmp11204, tmp11203, tmp11205, in1547, tmp11202, in1543, in1545, in1544.
// Expanding the adds and FMAs below gives (up to floating-point rounding):
//   t0 =  d0 - 5.25*d2 + 5.25*d4 - d6
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7
// Results t0..t7 end up in:
//   Stack 1: in1534, tmp11208, tmp11209, in1538, tmp11207, tmp11200, tmp11198, in1540
//   Stack 2: tmp11204, tmp11212, tmp11213, in1545, tmp11211, tmp11205, tmp11202, in1547
// Input variables are overwritten in place, so statement order matters.
__m512 tmp11206 = _mm512_add_ps(tmp11199, in1536);
__m512 tmp11210 = _mm512_add_ps(tmp11203, in1543);
__m512 tmp11207 = _mm512_sub_ps(tmp11198, tmp11200);
__m512 tmp11211 = _mm512_sub_ps(tmp11202, tmp11205);
__m512 tmp11208 = _mm512_add_ps(tmp11200, in1538);
__m512 tmp11212 = _mm512_add_ps(tmp11205, in1545);
in1534 = _mm512_sub_ps(in1534, in1538);
tmp11204 = _mm512_sub_ps(tmp11204, in1545);
tmp11206 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-4.25e+00f), tmp11206);
tmp11210 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-4.25e+00f), tmp11210);
tmp11208 = _mm512_fmadd_ps(tmp11198, _mm512_set1_ps(-4.25e+00f), tmp11208);
tmp11212 = _mm512_fmadd_ps(tmp11202, _mm512_set1_ps(-4.25e+00f), tmp11212);
// t0 complete (in1534 / tmp11204).
in1534 = _mm512_fmadd_ps(tmp11207, _mm512_set1_ps(5.25e+00f), in1534);
tmp11204 = _mm512_fmadd_ps(tmp11211, _mm512_set1_ps(5.25e+00f), tmp11204);
tmp11207 = _mm512_fmadd_ps(tmp11200, _mm512_set1_ps(2.5e-01f), in1538);
tmp11211 = _mm512_fmadd_ps(tmp11205, _mm512_set1_ps(2.5e-01f), in1545);
tmp11200 = _mm512_fmadd_ps(tmp11200, _mm512_set1_ps(4e+00f), in1538);
tmp11205 = _mm512_fmadd_ps(tmp11205, _mm512_set1_ps(4e+00f), in1545);
// t2 (difference) and t1 (sum) of the two partial sums.
__m512 tmp11209 = _mm512_sub_ps(tmp11208, tmp11206);
__m512 tmp11213 = _mm512_sub_ps(tmp11212, tmp11210);
tmp11208 = _mm512_add_ps(tmp11206, tmp11208);
tmp11212 = _mm512_add_ps(tmp11210, tmp11212);
tmp11206 = _mm512_fmadd_ps(tmp11199, _mm512_set1_ps(2.5e-01f), in1536);
tmp11210 = _mm512_fmadd_ps(tmp11203, _mm512_set1_ps(2.5e-01f), in1543);
tmp11207 = _mm512_fmadd_ps(tmp11198, _mm512_set1_ps(-1.25e+00f), tmp11207);
tmp11211 = _mm512_fmadd_ps(tmp11202, _mm512_set1_ps(-1.25e+00f), tmp11211);
tmp11198 = _mm512_fmadd_ps(tmp11198, _mm512_set1_ps(-5e+00f), tmp11200);
tmp11202 = _mm512_fmadd_ps(tmp11202, _mm512_set1_ps(-5e+00f), tmp11205);
tmp11206 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-1.25e+00f), tmp11206);
tmp11210 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-1.25e+00f), tmp11210);
// t3 (in1538 / in1545) and t4 (tmp11207 / tmp11211).
in1538 = _mm512_fmadd_ps(tmp11206, _mm512_set1_ps(2e+00f), tmp11207);
in1545 = _mm512_fmadd_ps(tmp11210, _mm512_set1_ps(2e+00f), tmp11211);
tmp11207 = _mm512_fnmadd_ps(tmp11206, _mm512_set1_ps(2e+00f), tmp11207);
tmp11211 = _mm512_fnmadd_ps(tmp11210, _mm512_set1_ps(2e+00f), tmp11211);
tmp11206 = _mm512_fmadd_ps(in1536, _mm512_set1_ps(2.5e-01f), tmp11199);
tmp11210 = _mm512_fmadd_ps(in1543, _mm512_set1_ps(2.5e-01f), tmp11203);
tmp11199 = _mm512_sub_ps(in1537, tmp11199);
tmp11203 = _mm512_sub_ps(in1544, tmp11203);
tmp11206 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(-1.25e+00f), tmp11206);
tmp11210 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(-1.25e+00f), tmp11210);
in1540 = _mm512_sub_ps(in1540, in1536);
in1547 = _mm512_sub_ps(in1547, in1543);
// t7 (in1540 / in1547).
in1540 = _mm512_fmadd_ps(in1540, _mm512_set1_ps(5.25e+00f), tmp11199);
in1547 = _mm512_fmadd_ps(in1547, _mm512_set1_ps(5.25e+00f), tmp11203);
// t5 (tmp11200 / tmp11205) and t6 (tmp11198 / tmp11202).
tmp11200 = _mm512_fmadd_ps(tmp11206, _mm512_set1_ps(2e+00f), tmp11198);
tmp11205 = _mm512_fmadd_ps(tmp11210, _mm512_set1_ps(2e+00f), tmp11202);
tmp11198 = _mm512_fnmadd_ps(tmp11206, _mm512_set1_ps(2e+00f), tmp11198);
tmp11202 = _mm512_fnmadd_ps(tmp11210, _mm512_set1_ps(2e+00f), tmp11202);
// Pack pairs of result registers into output vectors. With immediate 68 the
// result is lanes 0..7 of the first operand followed by lanes 0..7 of the
// second operand; with 238 it is lanes 8..15 of the first followed by
// lanes 8..15 of the second.
__m512 out1407 = _mm512_shuffle_f32x4(in1534, tmp11208, 68);
__m512 out1415 = _mm512_shuffle_f32x4(in1534, tmp11208, 238);
__m512 out1408 = _mm512_shuffle_f32x4(tmp11209, in1538, 68);
__m512 out1416 = _mm512_shuffle_f32x4(tmp11209, in1538, 238);
__m512 out1409 = _mm512_shuffle_f32x4(tmp11207, tmp11200, 68);
__m512 out1417 = _mm512_shuffle_f32x4(tmp11207, tmp11200, 238);
__m512 out1410 = _mm512_shuffle_f32x4(tmp11198, in1540, 68);
__m512 out1418 = _mm512_shuffle_f32x4(tmp11198, in1540, 238);
__m512 out1411 = _mm512_shuffle_f32x4(tmp11204, tmp11212, 68);
__m512 out1419 = _mm512_shuffle_f32x4(tmp11204, tmp11212, 238);
__m512 out1412 = _mm512_shuffle_f32x4(tmp11213, in1545, 68);
__m512 out1420 = _mm512_shuffle_f32x4(tmp11213, in1545, 238);
__m512 out1413 = _mm512_shuffle_f32x4(tmp11211, tmp11205, 68);
__m512 out1421 = _mm512_shuffle_f32x4(tmp11211, tmp11205, 238);
__m512 out1414 = _mm512_shuffle_f32x4(tmp11202, in1547, 68);
__m512 out1422 = _mm512_shuffle_f32x4(tmp11202, in1547, 238);
// Store the sixteen vectors to dfPtr11 as four groups whose base offsets
// (+256, +13056, +25856, +38656) are 12800 apart. Within a group the four
// vectors occupy offsets +0, +64, +128, +192 from the group base (the
// statements are issued in the order +0, +128, +64, +192).
// NOTE(review): offsets look like byte offsets (64 = one __m512) - confirm
// against the declaration of dfPtr11, which is not visible here.
_mm512_storeu_ps(dfPtr11+256+51200*i49+3072*j42+3072*s46+768*k130, out1407);
_mm512_storeu_ps(dfPtr11+384+51200*i49+3072*j42+3072*s46+768*k130, out1415);
_mm512_storeu_ps(dfPtr11+320+51200*i49+3072*j42+3072*s46+768*k130, out1411);
_mm512_storeu_ps(dfPtr11+448+51200*i49+3072*j42+3072*s46+768*k130, out1419);
_mm512_storeu_ps(dfPtr11+13056+51200*i49+3072*j42+3072*s46+768*k130, out1408);
_mm512_storeu_ps(dfPtr11+13184+51200*i49+3072*j42+3072*s46+768*k130, out1416);
_mm512_storeu_ps(dfPtr11+13120+51200*i49+3072*j42+3072*s46+768*k130, out1412);
_mm512_storeu_ps(dfPtr11+13248+51200*i49+3072*j42+3072*s46+768*k130, out1420);
_mm512_storeu_ps(dfPtr11+25856+51200*i49+3072*j42+3072*s46+768*k130, out1409);
_mm512_storeu_ps(dfPtr11+25984+51200*i49+3072*j42+3072*s46+768*k130, out1417);
_mm512_storeu_ps(dfPtr11+25920+51200*i49+3072*j42+3072*s46+768*k130, out1413);
_mm512_storeu_ps(dfPtr11+26048+51200*i49+3072*j42+3072*s46+768*k130, out1421);
_mm512_storeu_ps(dfPtr11+38656+51200*i49+3072*j42+3072*s46+768*k130, out1410);
_mm512_storeu_ps(dfPtr11+38784+51200*i49+3072*j42+3072*s46+768*k130, out1418);
_mm512_storeu_ps(dfPtr11+38720+51200*i49+3072*j42+3072*s46+768*k130, out1414);
_mm512_storeu_ps(dfPtr11+38848+51200*i49+3072*j42+3072*s46+768*k130, out1422);
// Gather stage: build the fifteen input registers in1549..in1563 from datPtr25.
// Every load is a zero-masking load, so lanes outside the mask are 0.0f.
// Each successive row (in1549 -> in1550 -> ... -> in1555, and likewise
// in1557 -> ... -> in1563) reads from addresses exactly 112 higher than the
// previous row.
// NOTE(review): offsets look like byte offsets (4*w56, 112 per row) - confirm
// against the declaration of datPtr25, which is not visible here.
__m512 dat2069 = _mm512_maskz_loadu_ps(127, datPtr25+3812+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
// pm180: result lanes 9..15 take source lanes 0..6; result lanes 0..8 all pick
// source lane 15, which mask 127 (lanes 0..6) left at zero.
__m512i pm180 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1556 = _mm512_permutexvar_ps(pm180, dat2069);
__m512 dat2070 = _mm512_maskz_loadu_ps(16383, datPtr25+3296+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2071 = _mm512_maskz_loadu_ps(31, datPtr25+3344+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2072 = _mm512_maskz_loadu_ps(127, datPtr25+3924+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
// pm181: lanes 0..7 take source lanes 0..7 and lanes 8..15 take source
// lanes 6..13, so source lanes 6 and 7 appear in both halves.
__m512i pm181 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1549 = _mm512_permutexvar_ps(pm181, dat2070);
// pm182 (two-source permute): lanes 0..4 take the first source's lanes 0..4,
// lanes 5..8 pick the first source's lane 15 (zero under mask 31), and
// lanes 9..15 take the second source's lanes 0..6 (indices 16..22).
__m512i pm182 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1557 = _mm512_permutex2var_ps(dat2071, pm182, dat2072);
__m512 dat2073 = _mm512_maskz_loadu_ps(16383, datPtr25+3408+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2074 = _mm512_maskz_loadu_ps(31, datPtr25+3456+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2075 = _mm512_maskz_loadu_ps(127, datPtr25+4036+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1550 = _mm512_permutexvar_ps(pm181, dat2073);
__m512 in1558 = _mm512_permutex2var_ps(dat2074, pm182, dat2075);
__m512 dat2076 = _mm512_maskz_loadu_ps(16383, datPtr25+3520+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2077 = _mm512_maskz_loadu_ps(31, datPtr25+3568+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2078 = _mm512_maskz_loadu_ps(127, datPtr25+4148+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1551 = _mm512_permutexvar_ps(pm181, dat2076);
__m512 in1559 = _mm512_permutex2var_ps(dat2077, pm182, dat2078);
__m512 dat2079 = _mm512_maskz_loadu_ps(16383, datPtr25+3632+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2080 = _mm512_maskz_loadu_ps(31, datPtr25+3680+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2081 = _mm512_maskz_loadu_ps(127, datPtr25+4260+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1552 = _mm512_permutexvar_ps(pm181, dat2079);
__m512 in1560 = _mm512_permutex2var_ps(dat2080, pm182, dat2081);
__m512 dat2082 = _mm512_maskz_loadu_ps(16383, datPtr25+3744+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2083 = _mm512_maskz_loadu_ps(31, datPtr25+3792+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2084 = _mm512_maskz_loadu_ps(127, datPtr25+4372+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1553 = _mm512_permutexvar_ps(pm181, dat2082);
__m512 in1561 = _mm512_permutex2var_ps(dat2083, pm182, dat2084);
__m512 dat2085 = _mm512_maskz_loadu_ps(16383, datPtr25+3856+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2086 = _mm512_maskz_loadu_ps(31, datPtr25+3904+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2087 = _mm512_maskz_loadu_ps(127, datPtr25+4484+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1554 = _mm512_permutexvar_ps(pm181, dat2085);
__m512 in1562 = _mm512_permutex2var_ps(dat2086, pm182, dat2087);
__m512 dat2088 = _mm512_maskz_loadu_ps(16383, datPtr25+3968+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2089 = _mm512_maskz_loadu_ps(31, datPtr25+4016+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 dat2090 = _mm512_maskz_loadu_ps(127, datPtr25+4596+25088*i49+112*h43+4*w56+25088*s46+6272*k130);
__m512 in1555 = _mm512_permutexvar_ps(pm181, dat2088);
__m512 in1563 = _mm512_permutex2var_ps(dat2089, pm182, dat2090);
// First 8-point transform pass, applied lane-wise to two stacks of registers.
// The statements for the two stacks are interleaved line by line.
//   Stack A inputs d1..d7 = in1549 .. in1555, with d0 treated as zero
//   (tmp11265 is seeded with 0 - in1554 instead of d0 - d6).
//   Stack B inputs d0..d7 = in1556, in1557, in1558, in1559, in1560, in1561, in1562, in1563.
// Expanding the adds and FMAs below gives (up to floating-point rounding):
//   t0 =  d0 - 5.25*d2 + 5.25*d4 - d6
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7
// Results t0..t7 end up in:
//   Stack A: tmp11265, tmp11264, tmp11266, in1554, tmp11263, in1550, in1552, in1551
//   Stack B: in1556, tmp11269, tmp11270, in1562, tmp11268, in1558, in1560, in1559
// Input variables are overwritten in place, so statement order matters.
// NOTE(review): the coefficients look like the Winograd F(6x6, 3x3) input
// transform - confirm against the generator.
__m512 tmp11262 = _mm512_add_ps(in1549, in1553);
__m512 tmp11267 = _mm512_add_ps(in1557, in1561);
__m512 tmp11263 = _mm512_sub_ps(in1552, in1550);
__m512 tmp11268 = _mm512_sub_ps(in1560, in1558);
__m512 tmp11264 = _mm512_add_ps(in1550, in1554);
__m512 tmp11269 = _mm512_add_ps(in1558, in1562);
// Stack A has no d0 register: start t0 from 0 - d6.
__m512 tmp11265 = _mm512_sub_ps(_mm512_setzero_ps(), in1554);
in1556 = _mm512_sub_ps(in1556, in1562);
tmp11262 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-4.25e+00f), tmp11262);
tmp11267 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-4.25e+00f), tmp11267);
tmp11264 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-4.25e+00f), tmp11264);
tmp11269 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-4.25e+00f), tmp11269);
// t0 complete (tmp11265 / in1556).
tmp11265 = _mm512_fmadd_ps(tmp11263, _mm512_set1_ps(5.25e+00f), tmp11265);
in1556 = _mm512_fmadd_ps(tmp11268, _mm512_set1_ps(5.25e+00f), in1556);
tmp11263 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(2.5e-01f), in1554);
tmp11268 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(2.5e-01f), in1562);
in1550 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(4e+00f), in1554);
in1558 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(4e+00f), in1562);
// t2 (difference) and t1 (sum) of the two partial sums.
__m512 tmp11266 = _mm512_sub_ps(tmp11264, tmp11262);
__m512 tmp11270 = _mm512_sub_ps(tmp11269, tmp11267);
tmp11264 = _mm512_add_ps(tmp11262, tmp11264);
tmp11269 = _mm512_add_ps(tmp11267, tmp11269);
tmp11262 = _mm512_fmadd_ps(in1549, _mm512_set1_ps(2.5e-01f), in1553);
tmp11267 = _mm512_fmadd_ps(in1557, _mm512_set1_ps(2.5e-01f), in1561);
tmp11263 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-1.25e+00f), tmp11263);
tmp11268 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-1.25e+00f), tmp11268);
in1552 = _mm512_fmadd_ps(in1552, _mm512_set1_ps(-5e+00f), in1550);
in1560 = _mm512_fmadd_ps(in1560, _mm512_set1_ps(-5e+00f), in1558);
tmp11262 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-1.25e+00f), tmp11262);
tmp11267 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-1.25e+00f), tmp11267);
// t3 (in1554 / in1562) and t4 (tmp11263 / tmp11268).
in1554 = _mm512_fmadd_ps(tmp11262, _mm512_set1_ps(2e+00f), tmp11263);
in1562 = _mm512_fmadd_ps(tmp11267, _mm512_set1_ps(2e+00f), tmp11268);
tmp11263 = _mm512_fnmadd_ps(tmp11262, _mm512_set1_ps(2e+00f), tmp11263);
tmp11268 = _mm512_fnmadd_ps(tmp11267, _mm512_set1_ps(2e+00f), tmp11268);
tmp11262 = _mm512_fmadd_ps(in1553, _mm512_set1_ps(2.5e-01f), in1549);
tmp11267 = _mm512_fmadd_ps(in1561, _mm512_set1_ps(2.5e-01f), in1557);
in1549 = _mm512_sub_ps(in1555, in1549);
in1557 = _mm512_sub_ps(in1563, in1557);
tmp11262 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(-1.25e+00f), tmp11262);
tmp11267 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(-1.25e+00f), tmp11267);
in1551 = _mm512_sub_ps(in1551, in1553);
in1559 = _mm512_sub_ps(in1559, in1561);
// t7 (in1551 / in1559).
in1551 = _mm512_fmadd_ps(in1551, _mm512_set1_ps(5.25e+00f), in1549);
in1559 = _mm512_fmadd_ps(in1559, _mm512_set1_ps(5.25e+00f), in1557);
// t5 (in1550 / in1558) and t6 (in1552 / in1560).
in1550 = _mm512_fmadd_ps(tmp11262, _mm512_set1_ps(2e+00f), in1552);
in1558 = _mm512_fmadd_ps(tmp11267, _mm512_set1_ps(2e+00f), in1560);
in1552 = _mm512_fnmadd_ps(tmp11262, _mm512_set1_ps(2e+00f), in1552);
in1560 = _mm512_fnmadd_ps(tmp11267, _mm512_set1_ps(2e+00f), in1560);
// 16x16 transpose of sixteen registers, done as a four-stage shuffle network.
// Rows r0..r15 on entry, in order:
//   tmp11265, tmp11264, tmp11266, in1554, tmp11263, in1550, in1552, in1551,
//   in1556, tmp11269, tmp11270, in1562, tmp11268, in1558, in1560, in1559
// On exit the same variables, in the same order, hold columns 0..15:
// lane j of the k-th variable is lane k of the former r_j.
//
// Stage 1: interleave 32-bit elements of adjacent row pairs within each
// 128-bit lane (unpacklo gives elements 0,1; unpackhi gives elements 2,3).
__m512 tmp11279 = _mm512_unpacklo_ps(tmp11265, tmp11264);
__m512 tmp11280 = _mm512_unpackhi_ps(tmp11265, tmp11264);
__m512 tmp11281 = _mm512_unpacklo_ps(tmp11266, in1554);
__m512 tmp11282 = _mm512_unpackhi_ps(tmp11266, in1554);
__m512 tmp11283 = _mm512_unpacklo_ps(tmp11263, in1550);
__m512 tmp11284 = _mm512_unpackhi_ps(tmp11263, in1550);
__m512 tmp11285 = _mm512_unpacklo_ps(in1552, in1551);
__m512 tmp11286 = _mm512_unpackhi_ps(in1552, in1551);
__m512 tmp11287 = _mm512_unpacklo_ps(in1556, tmp11269);
__m512 tmp11288 = _mm512_unpackhi_ps(in1556, tmp11269);
__m512 tmp11289 = _mm512_unpacklo_ps(tmp11270, in1562);
__m512 tmp11290 = _mm512_unpackhi_ps(tmp11270, in1562);
__m512 tmp11291 = _mm512_unpacklo_ps(tmp11268, in1558);
__m512 tmp11292 = _mm512_unpackhi_ps(tmp11268, in1558);
__m512 tmp11293 = _mm512_unpacklo_ps(in1560, in1559);
__m512 tmp11294 = _mm512_unpackhi_ps(in1560, in1559);
// Stage 2: combine 64-bit pairs so each 128-bit lane holds one column of four
// consecutive rows (68 takes the low pair of each source, 238 the high pair).
__m512 tmp11295 = _mm512_shuffle_ps(tmp11279, tmp11281, 68);
__m512 tmp11296 = _mm512_shuffle_ps(tmp11279, tmp11281, 238);
__m512 tmp11297 = _mm512_shuffle_ps(tmp11280, tmp11282, 68);
__m512 tmp11298 = _mm512_shuffle_ps(tmp11280, tmp11282, 238);
__m512 tmp11299 = _mm512_shuffle_ps(tmp11283, tmp11285, 68);
__m512 tmp11300 = _mm512_shuffle_ps(tmp11283, tmp11285, 238);
__m512 tmp11301 = _mm512_shuffle_ps(tmp11284, tmp11286, 68);
__m512 tmp11302 = _mm512_shuffle_ps(tmp11284, tmp11286, 238);
__m512 tmp11303 = _mm512_shuffle_ps(tmp11287, tmp11289, 68);
__m512 tmp11304 = _mm512_shuffle_ps(tmp11287, tmp11289, 238);
__m512 tmp11305 = _mm512_shuffle_ps(tmp11288, tmp11290, 68);
__m512 tmp11306 = _mm512_shuffle_ps(tmp11288, tmp11290, 238);
__m512 tmp11307 = _mm512_shuffle_ps(tmp11291, tmp11293, 68);
__m512 tmp11308 = _mm512_shuffle_ps(tmp11291, tmp11293, 238);
__m512 tmp11309 = _mm512_shuffle_ps(tmp11292, tmp11294, 68);
__m512 tmp11310 = _mm512_shuffle_ps(tmp11292, tmp11294, 238);
// Stage 3: regroup whole 128-bit lanes (136 picks lanes 0 and 2 of each
// source, 221 picks lanes 1 and 3), pairing rows 0-3 with rows 4-7 and
// rows 8-11 with rows 12-15.
__m512 tmp11311 = _mm512_shuffle_f32x4(tmp11295, tmp11299, 136);
__m512 tmp11312 = _mm512_shuffle_f32x4(tmp11295, tmp11299, 221);
__m512 tmp11313 = _mm512_shuffle_f32x4(tmp11296, tmp11300, 136);
__m512 tmp11314 = _mm512_shuffle_f32x4(tmp11296, tmp11300, 221);
__m512 tmp11315 = _mm512_shuffle_f32x4(tmp11297, tmp11301, 136);
__m512 tmp11316 = _mm512_shuffle_f32x4(tmp11297, tmp11301, 221);
__m512 tmp11317 = _mm512_shuffle_f32x4(tmp11298, tmp11302, 136);
__m512 tmp11318 = _mm512_shuffle_f32x4(tmp11298, tmp11302, 221);
__m512 tmp11319 = _mm512_shuffle_f32x4(tmp11303, tmp11307, 136);
__m512 tmp11320 = _mm512_shuffle_f32x4(tmp11303, tmp11307, 221);
__m512 tmp11321 = _mm512_shuffle_f32x4(tmp11304, tmp11308, 136);
__m512 tmp11322 = _mm512_shuffle_f32x4(tmp11304, tmp11308, 221);
__m512 tmp11323 = _mm512_shuffle_f32x4(tmp11305, tmp11309, 136);
__m512 tmp11324 = _mm512_shuffle_f32x4(tmp11305, tmp11309, 221);
__m512 tmp11325 = _mm512_shuffle_f32x4(tmp11306, tmp11310, 136);
__m512 tmp11326 = _mm512_shuffle_f32x4(tmp11306, tmp11310, 221);
// Stage 4: merge the rows 0-7 half with the rows 8-15 half. Each pair of
// statements yields columns c (imm 136) and c+8 (imm 221); the pairs below
// produce c = 0, 1, 2, 3, 4, 5, 6, 7 in that order.
tmp11265 = _mm512_shuffle_f32x4(tmp11311, tmp11319, 136);
in1556 = _mm512_shuffle_f32x4(tmp11311, tmp11319, 221);
tmp11264 = _mm512_shuffle_f32x4(tmp11313, tmp11321, 136);
tmp11269 = _mm512_shuffle_f32x4(tmp11313, tmp11321, 221);
tmp11266 = _mm512_shuffle_f32x4(tmp11315, tmp11323, 136);
tmp11270 = _mm512_shuffle_f32x4(tmp11315, tmp11323, 221);
in1554 = _mm512_shuffle_f32x4(tmp11317, tmp11325, 136);
in1562 = _mm512_shuffle_f32x4(tmp11317, tmp11325, 221);
tmp11263 = _mm512_shuffle_f32x4(tmp11312, tmp11320, 136);
tmp11268 = _mm512_shuffle_f32x4(tmp11312, tmp11320, 221);
in1550 = _mm512_shuffle_f32x4(tmp11314, tmp11322, 136);
in1558 = _mm512_shuffle_f32x4(tmp11314, tmp11322, 221);
in1552 = _mm512_shuffle_f32x4(tmp11316, tmp11324, 136);
in1560 = _mm512_shuffle_f32x4(tmp11316, tmp11324, 221);
in1551 = _mm512_shuffle_f32x4(tmp11318, tmp11326, 136);
in1559 = _mm512_shuffle_f32x4(tmp11318, tmp11326, 221);
// Second 8-point transform pass over the sixteen registers prepared by the
// shuffle network above. Statements for the two stacks are interleaved.
//   Stack 1 inputs d0..d7 = tmp11265, tmp11264, tmp11266, in1554, tmp11263, in1550, in1552, in1551.
//   Stack 2 inputs d0..d7 = in1556, tmp11269, tmp11270, in1562, tmp11268, in1558, in1560, in1559.
// Expanding the adds and FMAs below gives (up to floating-point rounding):
//   t0 =  d0 - 5.25*d2 + 5.25*d4 - d6
//   t1 =  d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
//   t2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
//   t3 =  0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
//   t4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
//   t5 =  2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
//   t6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
//   t7 = -d1 + 5.25*d3 - 5.25*d5 + d7
// Results t0..t7 end up in:
//   Stack 1: tmp11265, tmp11273, tmp11274, in1552, tmp11272, tmp11266, tmp11263, in1554
//   Stack 2: in1556, tmp11277, tmp11278, in1560, tmp11276, tmp11270, tmp11268, in1562
// Input variables are overwritten in place, so statement order matters.
__m512 tmp11271 = _mm512_add_ps(tmp11264, in1550);
__m512 tmp11275 = _mm512_add_ps(tmp11269, in1558);
__m512 tmp11272 = _mm512_sub_ps(tmp11263, tmp11266);
__m512 tmp11276 = _mm512_sub_ps(tmp11268, tmp11270);
__m512 tmp11273 = _mm512_add_ps(tmp11266, in1552);
__m512 tmp11277 = _mm512_add_ps(tmp11270, in1560);
tmp11265 = _mm512_sub_ps(tmp11265, in1552);
in1556 = _mm512_sub_ps(in1556, in1560);
tmp11271 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-4.25e+00f), tmp11271);
tmp11275 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-4.25e+00f), tmp11275);
tmp11273 = _mm512_fmadd_ps(tmp11263, _mm512_set1_ps(-4.25e+00f), tmp11273);
tmp11277 = _mm512_fmadd_ps(tmp11268, _mm512_set1_ps(-4.25e+00f), tmp11277);
// t0 complete (tmp11265 / in1556).
tmp11265 = _mm512_fmadd_ps(tmp11272, _mm512_set1_ps(5.25e+00f), tmp11265);
in1556 = _mm512_fmadd_ps(tmp11276, _mm512_set1_ps(5.25e+00f), in1556);
tmp11272 = _mm512_fmadd_ps(tmp11266, _mm512_set1_ps(2.5e-01f), in1552);
tmp11276 = _mm512_fmadd_ps(tmp11270, _mm512_set1_ps(2.5e-01f), in1560);
tmp11266 = _mm512_fmadd_ps(tmp11266, _mm512_set1_ps(4e+00f), in1552);
tmp11270 = _mm512_fmadd_ps(tmp11270, _mm512_set1_ps(4e+00f), in1560);
// t2 (difference) and t1 (sum) of the two partial sums.
__m512 tmp11274 = _mm512_sub_ps(tmp11273, tmp11271);
__m512 tmp11278 = _mm512_sub_ps(tmp11277, tmp11275);
tmp11273 = _mm512_add_ps(tmp11271, tmp11273);
tmp11277 = _mm512_add_ps(tmp11275, tmp11277);
tmp11271 = _mm512_fmadd_ps(tmp11264, _mm512_set1_ps(2.5e-01f), in1550);
tmp11275 = _mm512_fmadd_ps(tmp11269, _mm512_set1_ps(2.5e-01f), in1558);
tmp11272 = _mm512_fmadd_ps(tmp11263, _mm512_set1_ps(-1.25e+00f), tmp11272);
tmp11276 = _mm512_fmadd_ps(tmp11268, _mm512_set1_ps(-1.25e+00f), tmp11276);
tmp11263 = _mm512_fmadd_ps(tmp11263, _mm512_set1_ps(-5e+00f), tmp11266);
tmp11268 = _mm512_fmadd_ps(tmp11268, _mm512_set1_ps(-5e+00f), tmp11270);
tmp11271 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-1.25e+00f), tmp11271);
tmp11275 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-1.25e+00f), tmp11275);
// t3 (in1552 / in1560) and t4 (tmp11272 / tmp11276).
in1552 = _mm512_fmadd_ps(tmp11271, _mm512_set1_ps(2e+00f), tmp11272);
in1560 = _mm512_fmadd_ps(tmp11275, _mm512_set1_ps(2e+00f), tmp11276);
tmp11272 = _mm512_fnmadd_ps(tmp11271, _mm512_set1_ps(2e+00f), tmp11272);
tmp11276 = _mm512_fnmadd_ps(tmp11275, _mm512_set1_ps(2e+00f), tmp11276);
tmp11271 = _mm512_fmadd_ps(in1550, _mm512_set1_ps(2.5e-01f), tmp11264);
tmp11275 = _mm512_fmadd_ps(in1558, _mm512_set1_ps(2.5e-01f), tmp11269);
tmp11264 = _mm512_sub_ps(in1551, tmp11264);
tmp11269 = _mm512_sub_ps(in1559, tmp11269);
tmp11271 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(-1.25e+00f), tmp11271);
tmp11275 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(-1.25e+00f), tmp11275);
in1554 = _mm512_sub_ps(in1554, in1550);
in1562 = _mm512_sub_ps(in1562, in1558);
// t7 (in1554 / in1562).
in1554 = _mm512_fmadd_ps(in1554, _mm512_set1_ps(5.25e+00f), tmp11264);
in1562 = _mm512_fmadd_ps(in1562, _mm512_set1_ps(5.25e+00f), tmp11269);
// t5 (tmp11266 / tmp11270) and t6 (tmp11263 / tmp11268).
tmp11266 = _mm512_fmadd_ps(tmp11271, _mm512_set1_ps(2e+00f), tmp11263);
tmp11270 = _mm512_fmadd_ps(tmp11275, _mm512_set1_ps(2e+00f), tmp11268);
tmp11263 = _mm512_fnmadd_ps(tmp11271, _mm512_set1_ps(2e+00f), tmp11263);
tmp11268 = _mm512_fnmadd_ps(tmp11275, _mm512_set1_ps(2e+00f), tmp11268);
// Pack pairs of result registers into output vectors. With immediate 68 the
// result is lanes 0..7 of the first operand followed by lanes 0..7 of the
// second operand; with 238 it is lanes 8..15 of the first followed by
// lanes 8..15 of the second.
__m512 out1423 = _mm512_shuffle_f32x4(tmp11265, tmp11273, 68);
__m512 out1431 = _mm512_shuffle_f32x4(tmp11265, tmp11273, 238);
__m512 out1424 = _mm512_shuffle_f32x4(tmp11274, in1552, 68);
__m512 out1432 = _mm512_shuffle_f32x4(tmp11274, in1552, 238);
__m512 out1425 = _mm512_shuffle_f32x4(tmp11272, tmp11266, 68);
__m512 out1433 = _mm512_shuffle_f32x4(tmp11272, tmp11266, 238);
__m512 out1426 = _mm512_shuffle_f32x4(tmp11263, in1554, 68);
__m512 out1434 = _mm512_shuffle_f32x4(tmp11263, in1554, 238);
__m512 out1427 = _mm512_shuffle_f32x4(in1556, tmp11277, 68);
__m512 out1435 = _mm512_shuffle_f32x4(in1556, tmp11277, 238);
__m512 out1428 = _mm512_shuffle_f32x4(tmp11278, in1560, 68);
__m512 out1436 = _mm512_shuffle_f32x4(tmp11278, in1560, 238);
__m512 out1429 = _mm512_shuffle_f32x4(tmp11276, tmp11270, 68);
__m512 out1437 = _mm512_shuffle_f32x4(tmp11276, tmp11270, 238);
__m512 out1430 = _mm512_shuffle_f32x4(tmp11268, in1562, 68);
__m512 out1438 = _mm512_shuffle_f32x4(tmp11268, in1562, 238);
// Store the sixteen vectors to dfPtr11 as four groups whose base offsets
// (+512, +13312, +26112, +38912) are 12800 apart. Within a group the four
// vectors occupy offsets +0, +64, +128, +192 from the group base (the
// statements are issued in the order +0, +128, +64, +192).
// NOTE(review): offsets look like byte offsets (64 = one __m512) - confirm
// against the declaration of dfPtr11, which is not visible here.
_mm512_storeu_ps(dfPtr11+512+51200*i49+3072*j42+3072*s46+768*k130, out1423);
_mm512_storeu_ps(dfPtr11+640+51200*i49+3072*j42+3072*s46+768*k130, out1431);
_mm512_storeu_ps(dfPtr11+576+51200*i49+3072*j42+3072*s46+768*k130, out1427);
_mm512_storeu_ps(dfPtr11+704+51200*i49+3072*j42+3072*s46+768*k130, out1435);
_mm512_storeu_ps(dfPtr11+13312+51200*i49+3072*j42+3072*s46+768*k130, out1424);
_mm512_storeu_ps(dfPtr11+13440+51200*i49+3072*j42+3072*s46+768*k130, out1432);
_mm512_storeu_ps(dfPtr11+13376+51200*i49+3072*j42+3072*s46+768*k130, out1428);
_mm512_storeu_ps(dfPtr11+13504+51200*i49+3072*j42+3072*s46+768*k130, out1436);
_mm512_storeu_ps(dfPtr11+26112+51200*i49+3072*j42+3072*s46+768*k130, out1425);
_mm512_storeu_ps(dfPtr11+26240+51200*i49+3072*j42+3072*s46+768*k130, out1433);
_mm512_storeu_ps(dfPtr11+26176+51200*i49+3072*j42+3072*s46+768*k130, out1429);
_mm512_storeu_ps(dfPtr11+26304+51200*i49+3072*j42+3072*s46+768*k130, out1437);
_mm512_storeu_ps(dfPtr11+38912+51200*i49+3072*j42+3072*s46+768*k130, out1426);
_mm512_storeu_ps(dfPtr11+39040+51200*i49+3072*j42+3072*s46+768*k130, out1434);
_mm512_storeu_ps(dfPtr11+38976+51200*i49+3072*j42+3072*s46+768*k130, out1430);
_mm512_storeu_ps(dfPtr11+39104+51200*i49+3072*j42+3072*s46+768*k130, out1438);
}
++j42;
rel21 = 1;
}
ptrdiff_t h44 = base21+6;
ptrdiff_t w57 = 6;
ptrdiff_t k131 = 0;
for (; k131 != 4; ++k131) {
// Group 1, load phase: eight pairs of masked loads whose datPtr25 offsets
// advance by 112 per pair (0, 112, ..., 784 and 48, 160, ..., 832).
// Mask 16383 keeps lanes 0..13 and mask 2047 keeps lanes 0..10; the remaining
// lanes are zeroed by the maskz load.
// NOTE(review): the offsets look like byte offsets (112 = 28 floats per row),
// but the type of datPtr25 is not visible here -- confirm.
__m512 dat2091 = _mm512_maskz_loadu_ps(16383, datPtr25+0+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2092 = _mm512_maskz_loadu_ps(2047, datPtr25+48+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
// pm183: lanes 0..7 take source elements 0..7, lanes 8..15 take elements 6..13,
// so the two 8-lane halves overlap by two elements.
__m512i pm183 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1564 = _mm512_permutexvar_ps(pm183, dat2091);
// pm184: lanes 0..7 take elements 0..7, lanes 8..12 take elements 6..10, and
// lanes 13..15 take element 15, which mask 2047 has already zeroed.
__m512i pm184 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1572 = _mm512_permutexvar_ps(pm184, dat2092);
__m512 dat2093 = _mm512_maskz_loadu_ps(16383, datPtr25+112+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2094 = _mm512_maskz_loadu_ps(2047, datPtr25+160+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1565 = _mm512_permutexvar_ps(pm183, dat2093);
__m512 in1573 = _mm512_permutexvar_ps(pm184, dat2094);
__m512 dat2095 = _mm512_maskz_loadu_ps(16383, datPtr25+224+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2096 = _mm512_maskz_loadu_ps(2047, datPtr25+272+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1566 = _mm512_permutexvar_ps(pm183, dat2095);
__m512 in1574 = _mm512_permutexvar_ps(pm184, dat2096);
__m512 dat2097 = _mm512_maskz_loadu_ps(16383, datPtr25+336+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2098 = _mm512_maskz_loadu_ps(2047, datPtr25+384+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1567 = _mm512_permutexvar_ps(pm183, dat2097);
__m512 in1575 = _mm512_permutexvar_ps(pm184, dat2098);
__m512 dat2099 = _mm512_maskz_loadu_ps(16383, datPtr25+448+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2100 = _mm512_maskz_loadu_ps(2047, datPtr25+496+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1568 = _mm512_permutexvar_ps(pm183, dat2099);
__m512 in1576 = _mm512_permutexvar_ps(pm184, dat2100);
__m512 dat2101 = _mm512_maskz_loadu_ps(16383, datPtr25+560+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2102 = _mm512_maskz_loadu_ps(2047, datPtr25+608+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1569 = _mm512_permutexvar_ps(pm183, dat2101);
__m512 in1577 = _mm512_permutexvar_ps(pm184, dat2102);
__m512 dat2103 = _mm512_maskz_loadu_ps(16383, datPtr25+672+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2104 = _mm512_maskz_loadu_ps(2047, datPtr25+720+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1570 = _mm512_permutexvar_ps(pm183, dat2103);
__m512 in1578 = _mm512_permutexvar_ps(pm184, dat2104);
__m512 dat2105 = _mm512_maskz_loadu_ps(16383, datPtr25+784+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2106 = _mm512_maskz_loadu_ps(2047, datPtr25+832+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1571 = _mm512_permutexvar_ps(pm183, dat2105);
__m512 in1579 = _mm512_permutexvar_ps(pm184, dat2106);
// Group 1, first pass: lane-wise linear combination of the eight vectors
// a0..a7 = in1564..in1571; each statement is repeated for the second set
// in1572..in1579. Ignoring fused-multiply-add rounding, the results are:
//   in1564   = a0 - a6 + 5.25*(a4 - a2)
//   tmp11329 = (a2 + a6 - 4.25*a4) + (a1 + a5 - 4.25*a3)
//   tmp11330 = (a2 + a6 - 4.25*a4) - (a1 + a5 - 4.25*a3)
//   in1570   = (0.25*a2 - 1.25*a4 + a6) + 2*(0.25*a1 - 1.25*a3 + a5)
//   tmp11328 = (0.25*a2 - 1.25*a4 + a6) - 2*(0.25*a1 - 1.25*a3 + a5)
//   in1566   = (4*a2 - 5*a4 + a6) + 2*(a1 - 1.25*a3 + 0.25*a5)
//   in1568   = (4*a2 - 5*a4 + a6) - 2*(a1 - 1.25*a3 + 0.25*a5)
//   in1567   = a7 - a1 + 5.25*(a3 - a5)
// NOTE(review): these coefficients match the 8x8 input transform of Winograd
// F(6x6, 3x3) -- presumably that is what this is; confirm against the generator.
// Inputs are overwritten in place, so the statement order matters.
__m512 tmp11327 = _mm512_add_ps(in1565, in1569);
__m512 tmp11331 = _mm512_add_ps(in1573, in1577);
__m512 tmp11328 = _mm512_sub_ps(in1568, in1566);
__m512 tmp11332 = _mm512_sub_ps(in1576, in1574);
__m512 tmp11329 = _mm512_add_ps(in1566, in1570);
__m512 tmp11333 = _mm512_add_ps(in1574, in1578);
in1564 = _mm512_sub_ps(in1564, in1570);
in1572 = _mm512_sub_ps(in1572, in1578);
tmp11327 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-4.25e+00f), tmp11327);
tmp11331 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-4.25e+00f), tmp11331);
tmp11329 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-4.25e+00f), tmp11329);
tmp11333 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-4.25e+00f), tmp11333);
in1564 = _mm512_fmadd_ps(tmp11328, _mm512_set1_ps(5.25e+00f), in1564);
in1572 = _mm512_fmadd_ps(tmp11332, _mm512_set1_ps(5.25e+00f), in1572);
tmp11328 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(2.5e-01f), in1570);
tmp11332 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(2.5e-01f), in1578);
in1566 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(4e+00f), in1570);
in1574 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(4e+00f), in1578);
__m512 tmp11330 = _mm512_sub_ps(tmp11329, tmp11327);
__m512 tmp11334 = _mm512_sub_ps(tmp11333, tmp11331);
tmp11329 = _mm512_add_ps(tmp11327, tmp11329);
tmp11333 = _mm512_add_ps(tmp11331, tmp11333);
tmp11327 = _mm512_fmadd_ps(in1565, _mm512_set1_ps(2.5e-01f), in1569);
tmp11331 = _mm512_fmadd_ps(in1573, _mm512_set1_ps(2.5e-01f), in1577);
tmp11328 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-1.25e+00f), tmp11328);
tmp11332 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-1.25e+00f), tmp11332);
in1568 = _mm512_fmadd_ps(in1568, _mm512_set1_ps(-5e+00f), in1566);
in1576 = _mm512_fmadd_ps(in1576, _mm512_set1_ps(-5e+00f), in1574);
tmp11327 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-1.25e+00f), tmp11327);
tmp11331 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-1.25e+00f), tmp11331);
in1570 = _mm512_fmadd_ps(tmp11327, _mm512_set1_ps(2e+00f), tmp11328);
in1578 = _mm512_fmadd_ps(tmp11331, _mm512_set1_ps(2e+00f), tmp11332);
tmp11328 = _mm512_fnmadd_ps(tmp11327, _mm512_set1_ps(2e+00f), tmp11328);
tmp11332 = _mm512_fnmadd_ps(tmp11331, _mm512_set1_ps(2e+00f), tmp11332);
tmp11327 = _mm512_fmadd_ps(in1569, _mm512_set1_ps(2.5e-01f), in1565);
tmp11331 = _mm512_fmadd_ps(in1577, _mm512_set1_ps(2.5e-01f), in1573);
in1565 = _mm512_sub_ps(in1571, in1565);
in1573 = _mm512_sub_ps(in1579, in1573);
tmp11327 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(-1.25e+00f), tmp11327);
tmp11331 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(-1.25e+00f), tmp11331);
in1567 = _mm512_sub_ps(in1567, in1569);
in1575 = _mm512_sub_ps(in1575, in1577);
in1567 = _mm512_fmadd_ps(in1567, _mm512_set1_ps(5.25e+00f), in1565);
in1575 = _mm512_fmadd_ps(in1575, _mm512_set1_ps(5.25e+00f), in1573);
in1566 = _mm512_fmadd_ps(tmp11327, _mm512_set1_ps(2e+00f), in1568);
in1574 = _mm512_fmadd_ps(tmp11331, _mm512_set1_ps(2e+00f), in1576);
in1568 = _mm512_fnmadd_ps(tmp11327, _mm512_set1_ps(2e+00f), in1568);
in1576 = _mm512_fnmadd_ps(tmp11331, _mm512_set1_ps(2e+00f), in1576);
// Group 1, interleave network over the sixteen first-pass results.
// Stage 1: 32-bit unpacklo/unpackhi of adjacent result pairs.
// Stage 2: _mm512_shuffle_ps with imm 68 (elements 0,1 of each operand per
//          128-bit lane) and 238 (elements 2,3 of each operand).
// Stage 3 and 4: _mm512_shuffle_f32x4 with imm 136 (128-bit lanes 0,2 of each
//          operand) and 221 (lanes 1,3 of each operand).
// The results are written back into the first-pass variable names.
// NOTE(review): this looks like an 8x8 transpose of each 8-lane half so the
// next pass can combine along the other axis -- confirm.
__m512 tmp11343 = _mm512_unpacklo_ps(in1564, tmp11329);
__m512 tmp11344 = _mm512_unpackhi_ps(in1564, tmp11329);
__m512 tmp11345 = _mm512_unpacklo_ps(tmp11330, in1570);
__m512 tmp11346 = _mm512_unpackhi_ps(tmp11330, in1570);
__m512 tmp11347 = _mm512_unpacklo_ps(tmp11328, in1566);
__m512 tmp11348 = _mm512_unpackhi_ps(tmp11328, in1566);
__m512 tmp11349 = _mm512_unpacklo_ps(in1568, in1567);
__m512 tmp11350 = _mm512_unpackhi_ps(in1568, in1567);
__m512 tmp11351 = _mm512_unpacklo_ps(in1572, tmp11333);
__m512 tmp11352 = _mm512_unpackhi_ps(in1572, tmp11333);
__m512 tmp11353 = _mm512_unpacklo_ps(tmp11334, in1578);
__m512 tmp11354 = _mm512_unpackhi_ps(tmp11334, in1578);
__m512 tmp11355 = _mm512_unpacklo_ps(tmp11332, in1574);
__m512 tmp11356 = _mm512_unpackhi_ps(tmp11332, in1574);
__m512 tmp11357 = _mm512_unpacklo_ps(in1576, in1575);
__m512 tmp11358 = _mm512_unpackhi_ps(in1576, in1575);
__m512 tmp11359 = _mm512_shuffle_ps(tmp11343, tmp11345, 68);
__m512 tmp11360 = _mm512_shuffle_ps(tmp11343, tmp11345, 238);
__m512 tmp11361 = _mm512_shuffle_ps(tmp11344, tmp11346, 68);
__m512 tmp11362 = _mm512_shuffle_ps(tmp11344, tmp11346, 238);
__m512 tmp11363 = _mm512_shuffle_ps(tmp11347, tmp11349, 68);
__m512 tmp11364 = _mm512_shuffle_ps(tmp11347, tmp11349, 238);
__m512 tmp11365 = _mm512_shuffle_ps(tmp11348, tmp11350, 68);
__m512 tmp11366 = _mm512_shuffle_ps(tmp11348, tmp11350, 238);
__m512 tmp11367 = _mm512_shuffle_ps(tmp11351, tmp11353, 68);
__m512 tmp11368 = _mm512_shuffle_ps(tmp11351, tmp11353, 238);
__m512 tmp11369 = _mm512_shuffle_ps(tmp11352, tmp11354, 68);
__m512 tmp11370 = _mm512_shuffle_ps(tmp11352, tmp11354, 238);
__m512 tmp11371 = _mm512_shuffle_ps(tmp11355, tmp11357, 68);
__m512 tmp11372 = _mm512_shuffle_ps(tmp11355, tmp11357, 238);
__m512 tmp11373 = _mm512_shuffle_ps(tmp11356, tmp11358, 68);
__m512 tmp11374 = _mm512_shuffle_ps(tmp11356, tmp11358, 238);
__m512 tmp11375 = _mm512_shuffle_f32x4(tmp11359, tmp11363, 136);
__m512 tmp11376 = _mm512_shuffle_f32x4(tmp11359, tmp11363, 221);
__m512 tmp11377 = _mm512_shuffle_f32x4(tmp11360, tmp11364, 136);
__m512 tmp11378 = _mm512_shuffle_f32x4(tmp11360, tmp11364, 221);
__m512 tmp11379 = _mm512_shuffle_f32x4(tmp11361, tmp11365, 136);
__m512 tmp11380 = _mm512_shuffle_f32x4(tmp11361, tmp11365, 221);
__m512 tmp11381 = _mm512_shuffle_f32x4(tmp11362, tmp11366, 136);
__m512 tmp11382 = _mm512_shuffle_f32x4(tmp11362, tmp11366, 221);
__m512 tmp11383 = _mm512_shuffle_f32x4(tmp11367, tmp11371, 136);
__m512 tmp11384 = _mm512_shuffle_f32x4(tmp11367, tmp11371, 221);
__m512 tmp11385 = _mm512_shuffle_f32x4(tmp11368, tmp11372, 136);
__m512 tmp11386 = _mm512_shuffle_f32x4(tmp11368, tmp11372, 221);
__m512 tmp11387 = _mm512_shuffle_f32x4(tmp11369, tmp11373, 136);
__m512 tmp11388 = _mm512_shuffle_f32x4(tmp11369, tmp11373, 221);
__m512 tmp11389 = _mm512_shuffle_f32x4(tmp11370, tmp11374, 136);
__m512 tmp11390 = _mm512_shuffle_f32x4(tmp11370, tmp11374, 221);
in1564 = _mm512_shuffle_f32x4(tmp11375, tmp11383, 136);
in1572 = _mm512_shuffle_f32x4(tmp11375, tmp11383, 221);
tmp11329 = _mm512_shuffle_f32x4(tmp11377, tmp11385, 136);
tmp11333 = _mm512_shuffle_f32x4(tmp11377, tmp11385, 221);
tmp11330 = _mm512_shuffle_f32x4(tmp11379, tmp11387, 136);
tmp11334 = _mm512_shuffle_f32x4(tmp11379, tmp11387, 221);
in1570 = _mm512_shuffle_f32x4(tmp11381, tmp11389, 136);
in1578 = _mm512_shuffle_f32x4(tmp11381, tmp11389, 221);
tmp11328 = _mm512_shuffle_f32x4(tmp11376, tmp11384, 136);
tmp11332 = _mm512_shuffle_f32x4(tmp11376, tmp11384, 221);
in1566 = _mm512_shuffle_f32x4(tmp11378, tmp11386, 136);
in1574 = _mm512_shuffle_f32x4(tmp11378, tmp11386, 221);
in1568 = _mm512_shuffle_f32x4(tmp11380, tmp11388, 136);
in1576 = _mm512_shuffle_f32x4(tmp11380, tmp11388, 221);
in1567 = _mm512_shuffle_f32x4(tmp11382, tmp11390, 136);
in1575 = _mm512_shuffle_f32x4(tmp11382, tmp11390, 221);
// Group 1, second pass: the same lane-wise linear combination as the first
// pass, now applied to the rearranged vectors
//   b0..b7 = in1564, tmp11329, tmp11330, in1570, tmp11328, in1566, in1568, in1567
// (and in parallel to in1572, tmp11333, tmp11334, in1578, tmp11332, in1574,
// in1576, in1575). Ignoring fused-multiply-add rounding, the results are:
//   in1564   = b0 - b6 + 5.25*(b4 - b2)
//   tmp11337 = (b2 + b6 - 4.25*b4) + (b1 + b5 - 4.25*b3)
//   tmp11338 = (b2 + b6 - 4.25*b4) - (b1 + b5 - 4.25*b3)
//   in1568   = (0.25*b2 - 1.25*b4 + b6) + 2*(0.25*b1 - 1.25*b3 + b5)
//   tmp11336 = (0.25*b2 - 1.25*b4 + b6) - 2*(0.25*b1 - 1.25*b3 + b5)
//   tmp11330 = (4*b2 - 5*b4 + b6) + 2*(b1 - 1.25*b3 + 0.25*b5)
//   tmp11328 = (4*b2 - 5*b4 + b6) - 2*(b1 - 1.25*b3 + 0.25*b5)
//   in1570   = b7 - b1 + 5.25*(b3 - b5)
// Inputs are overwritten in place, so the statement order matters.
__m512 tmp11335 = _mm512_add_ps(tmp11329, in1566);
__m512 tmp11339 = _mm512_add_ps(tmp11333, in1574);
__m512 tmp11336 = _mm512_sub_ps(tmp11328, tmp11330);
__m512 tmp11340 = _mm512_sub_ps(tmp11332, tmp11334);
__m512 tmp11337 = _mm512_add_ps(tmp11330, in1568);
__m512 tmp11341 = _mm512_add_ps(tmp11334, in1576);
in1564 = _mm512_sub_ps(in1564, in1568);
in1572 = _mm512_sub_ps(in1572, in1576);
tmp11335 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-4.25e+00f), tmp11335);
tmp11339 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-4.25e+00f), tmp11339);
tmp11337 = _mm512_fmadd_ps(tmp11328, _mm512_set1_ps(-4.25e+00f), tmp11337);
tmp11341 = _mm512_fmadd_ps(tmp11332, _mm512_set1_ps(-4.25e+00f), tmp11341);
in1564 = _mm512_fmadd_ps(tmp11336, _mm512_set1_ps(5.25e+00f), in1564);
in1572 = _mm512_fmadd_ps(tmp11340, _mm512_set1_ps(5.25e+00f), in1572);
tmp11336 = _mm512_fmadd_ps(tmp11330, _mm512_set1_ps(2.5e-01f), in1568);
tmp11340 = _mm512_fmadd_ps(tmp11334, _mm512_set1_ps(2.5e-01f), in1576);
tmp11330 = _mm512_fmadd_ps(tmp11330, _mm512_set1_ps(4e+00f), in1568);
tmp11334 = _mm512_fmadd_ps(tmp11334, _mm512_set1_ps(4e+00f), in1576);
__m512 tmp11338 = _mm512_sub_ps(tmp11337, tmp11335);
__m512 tmp11342 = _mm512_sub_ps(tmp11341, tmp11339);
tmp11337 = _mm512_add_ps(tmp11335, tmp11337);
tmp11341 = _mm512_add_ps(tmp11339, tmp11341);
tmp11335 = _mm512_fmadd_ps(tmp11329, _mm512_set1_ps(2.5e-01f), in1566);
tmp11339 = _mm512_fmadd_ps(tmp11333, _mm512_set1_ps(2.5e-01f), in1574);
tmp11336 = _mm512_fmadd_ps(tmp11328, _mm512_set1_ps(-1.25e+00f), tmp11336);
tmp11340 = _mm512_fmadd_ps(tmp11332, _mm512_set1_ps(-1.25e+00f), tmp11340);
tmp11328 = _mm512_fmadd_ps(tmp11328, _mm512_set1_ps(-5e+00f), tmp11330);
tmp11332 = _mm512_fmadd_ps(tmp11332, _mm512_set1_ps(-5e+00f), tmp11334);
tmp11335 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-1.25e+00f), tmp11335);
tmp11339 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-1.25e+00f), tmp11339);
in1568 = _mm512_fmadd_ps(tmp11335, _mm512_set1_ps(2e+00f), tmp11336);
in1576 = _mm512_fmadd_ps(tmp11339, _mm512_set1_ps(2e+00f), tmp11340);
tmp11336 = _mm512_fnmadd_ps(tmp11335, _mm512_set1_ps(2e+00f), tmp11336);
tmp11340 = _mm512_fnmadd_ps(tmp11339, _mm512_set1_ps(2e+00f), tmp11340);
tmp11335 = _mm512_fmadd_ps(in1566, _mm512_set1_ps(2.5e-01f), tmp11329);
tmp11339 = _mm512_fmadd_ps(in1574, _mm512_set1_ps(2.5e-01f), tmp11333);
tmp11329 = _mm512_sub_ps(in1567, tmp11329);
tmp11333 = _mm512_sub_ps(in1575, tmp11333);
tmp11335 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(-1.25e+00f), tmp11335);
tmp11339 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(-1.25e+00f), tmp11339);
in1570 = _mm512_sub_ps(in1570, in1566);
in1578 = _mm512_sub_ps(in1578, in1574);
in1570 = _mm512_fmadd_ps(in1570, _mm512_set1_ps(5.25e+00f), tmp11329);
in1578 = _mm512_fmadd_ps(in1578, _mm512_set1_ps(5.25e+00f), tmp11333);
tmp11330 = _mm512_fmadd_ps(tmp11335, _mm512_set1_ps(2e+00f), tmp11328);
tmp11334 = _mm512_fmadd_ps(tmp11339, _mm512_set1_ps(2e+00f), tmp11332);
tmp11328 = _mm512_fnmadd_ps(tmp11335, _mm512_set1_ps(2e+00f), tmp11328);
tmp11332 = _mm512_fnmadd_ps(tmp11339, _mm512_set1_ps(2e+00f), tmp11332);
// Group 1, pack and store: pair up the sixteen second-pass results.
// imm 68 concatenates 128-bit lanes 0,1 of both operands (their low 256 bits);
// imm 238 concatenates lanes 2,3 of both operands (their high 256 bits).
__m512 out1439 = _mm512_shuffle_f32x4(in1564, tmp11337, 68);
__m512 out1447 = _mm512_shuffle_f32x4(in1564, tmp11337, 238);
__m512 out1440 = _mm512_shuffle_f32x4(tmp11338, in1568, 68);
__m512 out1448 = _mm512_shuffle_f32x4(tmp11338, in1568, 238);
__m512 out1441 = _mm512_shuffle_f32x4(tmp11336, tmp11330, 68);
__m512 out1449 = _mm512_shuffle_f32x4(tmp11336, tmp11330, 238);
__m512 out1442 = _mm512_shuffle_f32x4(tmp11328, in1570, 68);
__m512 out1450 = _mm512_shuffle_f32x4(tmp11328, in1570, 238);
__m512 out1443 = _mm512_shuffle_f32x4(in1572, tmp11341, 68);
__m512 out1451 = _mm512_shuffle_f32x4(in1572, tmp11341, 238);
__m512 out1444 = _mm512_shuffle_f32x4(tmp11342, in1576, 68);
__m512 out1452 = _mm512_shuffle_f32x4(tmp11342, in1576, 238);
__m512 out1445 = _mm512_shuffle_f32x4(tmp11340, tmp11334, 68);
__m512 out1453 = _mm512_shuffle_f32x4(tmp11340, tmp11334, 238);
__m512 out1446 = _mm512_shuffle_f32x4(tmp11332, in1578, 68);
__m512 out1454 = _mm512_shuffle_f32x4(tmp11332, in1578, 238);
// Sixteen unaligned stores to dfPtr11 at offsets {0, 64, 128, 192} plus
// {0, 12800, 25600, 38400}. Within each group of four the stores are issued
// in the order +0, +128, +64, +192, not in ascending address order.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- the type
// of dfPtr11 is not visible here; confirm.
_mm512_storeu_ps(dfPtr11+0+51200*i49+3072*j42+3072*s46+768*k131, out1439);
_mm512_storeu_ps(dfPtr11+128+51200*i49+3072*j42+3072*s46+768*k131, out1447);
_mm512_storeu_ps(dfPtr11+64+51200*i49+3072*j42+3072*s46+768*k131, out1443);
_mm512_storeu_ps(dfPtr11+192+51200*i49+3072*j42+3072*s46+768*k131, out1451);
_mm512_storeu_ps(dfPtr11+12800+51200*i49+3072*j42+3072*s46+768*k131, out1440);
_mm512_storeu_ps(dfPtr11+12928+51200*i49+3072*j42+3072*s46+768*k131, out1448);
_mm512_storeu_ps(dfPtr11+12864+51200*i49+3072*j42+3072*s46+768*k131, out1444);
_mm512_storeu_ps(dfPtr11+12992+51200*i49+3072*j42+3072*s46+768*k131, out1452);
_mm512_storeu_ps(dfPtr11+25600+51200*i49+3072*j42+3072*s46+768*k131, out1441);
_mm512_storeu_ps(dfPtr11+25728+51200*i49+3072*j42+3072*s46+768*k131, out1449);
_mm512_storeu_ps(dfPtr11+25664+51200*i49+3072*j42+3072*s46+768*k131, out1445);
_mm512_storeu_ps(dfPtr11+25792+51200*i49+3072*j42+3072*s46+768*k131, out1453);
_mm512_storeu_ps(dfPtr11+38400+51200*i49+3072*j42+3072*s46+768*k131, out1442);
_mm512_storeu_ps(dfPtr11+38528+51200*i49+3072*j42+3072*s46+768*k131, out1450);
_mm512_storeu_ps(dfPtr11+38464+51200*i49+3072*j42+3072*s46+768*k131, out1446);
_mm512_storeu_ps(dfPtr11+38592+51200*i49+3072*j42+3072*s46+768*k131, out1454);
// Group 2, load phase: eight pairs of masked loads whose datPtr25 offsets
// advance by 112 per pair (652, 764, ..., 1436 and 3136, 3248, ..., 3920).
// Mask 8191 keeps lanes 0..12 and mask 16383 keeps lanes 0..13; the remaining
// lanes are zeroed by the maskz load.
// NOTE(review): 652 = 6*112 - 20 and 3136 = 28*112, so the two streams appear
// to start at different rows/planes of the source -- confirm against the
// generator; the layout of datPtr25 is not visible here.
__m512 dat2107 = _mm512_maskz_loadu_ps(8191, datPtr25+652+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2108 = _mm512_maskz_loadu_ps(16383, datPtr25+3136+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
// pm185: lane 0 takes element 15 (zeroed by mask 8191), lanes 1..7 take
// elements 0..6, lanes 8..15 take elements 5..12.
__m512i pm185 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1580 = _mm512_permutexvar_ps(pm185, dat2107);
// pm186: lanes 0..7 take elements 0..7, lanes 8..15 take elements 6..13.
__m512i pm186 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1588 = _mm512_permutexvar_ps(pm186, dat2108);
__m512 dat2109 = _mm512_maskz_loadu_ps(8191, datPtr25+764+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2110 = _mm512_maskz_loadu_ps(16383, datPtr25+3248+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1581 = _mm512_permutexvar_ps(pm185, dat2109);
__m512 in1589 = _mm512_permutexvar_ps(pm186, dat2110);
__m512 dat2111 = _mm512_maskz_loadu_ps(8191, datPtr25+876+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2112 = _mm512_maskz_loadu_ps(16383, datPtr25+3360+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1582 = _mm512_permutexvar_ps(pm185, dat2111);
__m512 in1590 = _mm512_permutexvar_ps(pm186, dat2112);
__m512 dat2113 = _mm512_maskz_loadu_ps(8191, datPtr25+988+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2114 = _mm512_maskz_loadu_ps(16383, datPtr25+3472+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1583 = _mm512_permutexvar_ps(pm185, dat2113);
__m512 in1591 = _mm512_permutexvar_ps(pm186, dat2114);
__m512 dat2115 = _mm512_maskz_loadu_ps(8191, datPtr25+1100+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2116 = _mm512_maskz_loadu_ps(16383, datPtr25+3584+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1584 = _mm512_permutexvar_ps(pm185, dat2115);
__m512 in1592 = _mm512_permutexvar_ps(pm186, dat2116);
__m512 dat2117 = _mm512_maskz_loadu_ps(8191, datPtr25+1212+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2118 = _mm512_maskz_loadu_ps(16383, datPtr25+3696+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1585 = _mm512_permutexvar_ps(pm185, dat2117);
__m512 in1593 = _mm512_permutexvar_ps(pm186, dat2118);
__m512 dat2119 = _mm512_maskz_loadu_ps(8191, datPtr25+1324+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2120 = _mm512_maskz_loadu_ps(16383, datPtr25+3808+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1586 = _mm512_permutexvar_ps(pm185, dat2119);
__m512 in1594 = _mm512_permutexvar_ps(pm186, dat2120);
__m512 dat2121 = _mm512_maskz_loadu_ps(8191, datPtr25+1436+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2122 = _mm512_maskz_loadu_ps(16383, datPtr25+3920+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1587 = _mm512_permutexvar_ps(pm185, dat2121);
__m512 in1595 = _mm512_permutexvar_ps(pm186, dat2122);
// Group 2, first pass: lane-wise linear combination of the eight vectors
// a0..a7 = in1580..in1587; each statement is repeated for the second set
// in1588..in1595. Ignoring fused-multiply-add rounding, the results are:
//   in1580   = a0 - a6 + 5.25*(a4 - a2)
//   tmp11393 = (a2 + a6 - 4.25*a4) + (a1 + a5 - 4.25*a3)
//   tmp11394 = (a2 + a6 - 4.25*a4) - (a1 + a5 - 4.25*a3)
//   in1586   = (0.25*a2 - 1.25*a4 + a6) + 2*(0.25*a1 - 1.25*a3 + a5)
//   tmp11392 = (0.25*a2 - 1.25*a4 + a6) - 2*(0.25*a1 - 1.25*a3 + a5)
//   in1582   = (4*a2 - 5*a4 + a6) + 2*(a1 - 1.25*a3 + 0.25*a5)
//   in1584   = (4*a2 - 5*a4 + a6) - 2*(a1 - 1.25*a3 + 0.25*a5)
//   in1583   = a7 - a1 + 5.25*(a3 - a5)
// NOTE(review): these coefficients match the 8x8 input transform of Winograd
// F(6x6, 3x3) -- presumably that is what this is; confirm against the generator.
// Inputs are overwritten in place, so the statement order matters.
__m512 tmp11391 = _mm512_add_ps(in1581, in1585);
__m512 tmp11395 = _mm512_add_ps(in1589, in1593);
__m512 tmp11392 = _mm512_sub_ps(in1584, in1582);
__m512 tmp11396 = _mm512_sub_ps(in1592, in1590);
__m512 tmp11393 = _mm512_add_ps(in1582, in1586);
__m512 tmp11397 = _mm512_add_ps(in1590, in1594);
in1580 = _mm512_sub_ps(in1580, in1586);
in1588 = _mm512_sub_ps(in1588, in1594);
tmp11391 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-4.25e+00f), tmp11391);
tmp11395 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-4.25e+00f), tmp11395);
tmp11393 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-4.25e+00f), tmp11393);
tmp11397 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-4.25e+00f), tmp11397);
in1580 = _mm512_fmadd_ps(tmp11392, _mm512_set1_ps(5.25e+00f), in1580);
in1588 = _mm512_fmadd_ps(tmp11396, _mm512_set1_ps(5.25e+00f), in1588);
tmp11392 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(2.5e-01f), in1586);
tmp11396 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(2.5e-01f), in1594);
in1582 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(4e+00f), in1586);
in1590 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(4e+00f), in1594);
__m512 tmp11394 = _mm512_sub_ps(tmp11393, tmp11391);
__m512 tmp11398 = _mm512_sub_ps(tmp11397, tmp11395);
tmp11393 = _mm512_add_ps(tmp11391, tmp11393);
tmp11397 = _mm512_add_ps(tmp11395, tmp11397);
tmp11391 = _mm512_fmadd_ps(in1581, _mm512_set1_ps(2.5e-01f), in1585);
tmp11395 = _mm512_fmadd_ps(in1589, _mm512_set1_ps(2.5e-01f), in1593);
tmp11392 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-1.25e+00f), tmp11392);
tmp11396 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-1.25e+00f), tmp11396);
in1584 = _mm512_fmadd_ps(in1584, _mm512_set1_ps(-5e+00f), in1582);
in1592 = _mm512_fmadd_ps(in1592, _mm512_set1_ps(-5e+00f), in1590);
tmp11391 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-1.25e+00f), tmp11391);
tmp11395 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-1.25e+00f), tmp11395);
in1586 = _mm512_fmadd_ps(tmp11391, _mm512_set1_ps(2e+00f), tmp11392);
in1594 = _mm512_fmadd_ps(tmp11395, _mm512_set1_ps(2e+00f), tmp11396);
tmp11392 = _mm512_fnmadd_ps(tmp11391, _mm512_set1_ps(2e+00f), tmp11392);
tmp11396 = _mm512_fnmadd_ps(tmp11395, _mm512_set1_ps(2e+00f), tmp11396);
tmp11391 = _mm512_fmadd_ps(in1585, _mm512_set1_ps(2.5e-01f), in1581);
tmp11395 = _mm512_fmadd_ps(in1593, _mm512_set1_ps(2.5e-01f), in1589);
in1581 = _mm512_sub_ps(in1587, in1581);
in1589 = _mm512_sub_ps(in1595, in1589);
tmp11391 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(-1.25e+00f), tmp11391);
tmp11395 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(-1.25e+00f), tmp11395);
in1583 = _mm512_sub_ps(in1583, in1585);
in1591 = _mm512_sub_ps(in1591, in1593);
in1583 = _mm512_fmadd_ps(in1583, _mm512_set1_ps(5.25e+00f), in1581);
in1591 = _mm512_fmadd_ps(in1591, _mm512_set1_ps(5.25e+00f), in1589);
in1582 = _mm512_fmadd_ps(tmp11391, _mm512_set1_ps(2e+00f), in1584);
in1590 = _mm512_fmadd_ps(tmp11395, _mm512_set1_ps(2e+00f), in1592);
in1584 = _mm512_fnmadd_ps(tmp11391, _mm512_set1_ps(2e+00f), in1584);
in1592 = _mm512_fnmadd_ps(tmp11395, _mm512_set1_ps(2e+00f), in1592);
// Group 2, interleave network over the sixteen first-pass results.
// Stage 1: 32-bit unpacklo/unpackhi of adjacent result pairs.
// Stage 2: _mm512_shuffle_ps with imm 68 (elements 0,1 of each operand per
//          128-bit lane) and 238 (elements 2,3 of each operand).
// Stage 3 and 4: _mm512_shuffle_f32x4 with imm 136 (128-bit lanes 0,2 of each
//          operand) and 221 (lanes 1,3 of each operand).
// The results are written back into the first-pass variable names.
// NOTE(review): this looks like an 8x8 transpose of each 8-lane half so the
// next pass can combine along the other axis -- confirm.
__m512 tmp11407 = _mm512_unpacklo_ps(in1580, tmp11393);
__m512 tmp11408 = _mm512_unpackhi_ps(in1580, tmp11393);
__m512 tmp11409 = _mm512_unpacklo_ps(tmp11394, in1586);
__m512 tmp11410 = _mm512_unpackhi_ps(tmp11394, in1586);
__m512 tmp11411 = _mm512_unpacklo_ps(tmp11392, in1582);
__m512 tmp11412 = _mm512_unpackhi_ps(tmp11392, in1582);
__m512 tmp11413 = _mm512_unpacklo_ps(in1584, in1583);
__m512 tmp11414 = _mm512_unpackhi_ps(in1584, in1583);
__m512 tmp11415 = _mm512_unpacklo_ps(in1588, tmp11397);
__m512 tmp11416 = _mm512_unpackhi_ps(in1588, tmp11397);
__m512 tmp11417 = _mm512_unpacklo_ps(tmp11398, in1594);
__m512 tmp11418 = _mm512_unpackhi_ps(tmp11398, in1594);
__m512 tmp11419 = _mm512_unpacklo_ps(tmp11396, in1590);
__m512 tmp11420 = _mm512_unpackhi_ps(tmp11396, in1590);
__m512 tmp11421 = _mm512_unpacklo_ps(in1592, in1591);
__m512 tmp11422 = _mm512_unpackhi_ps(in1592, in1591);
__m512 tmp11423 = _mm512_shuffle_ps(tmp11407, tmp11409, 68);
__m512 tmp11424 = _mm512_shuffle_ps(tmp11407, tmp11409, 238);
__m512 tmp11425 = _mm512_shuffle_ps(tmp11408, tmp11410, 68);
__m512 tmp11426 = _mm512_shuffle_ps(tmp11408, tmp11410, 238);
__m512 tmp11427 = _mm512_shuffle_ps(tmp11411, tmp11413, 68);
__m512 tmp11428 = _mm512_shuffle_ps(tmp11411, tmp11413, 238);
__m512 tmp11429 = _mm512_shuffle_ps(tmp11412, tmp11414, 68);
__m512 tmp11430 = _mm512_shuffle_ps(tmp11412, tmp11414, 238);
__m512 tmp11431 = _mm512_shuffle_ps(tmp11415, tmp11417, 68);
__m512 tmp11432 = _mm512_shuffle_ps(tmp11415, tmp11417, 238);
__m512 tmp11433 = _mm512_shuffle_ps(tmp11416, tmp11418, 68);
__m512 tmp11434 = _mm512_shuffle_ps(tmp11416, tmp11418, 238);
__m512 tmp11435 = _mm512_shuffle_ps(tmp11419, tmp11421, 68);
__m512 tmp11436 = _mm512_shuffle_ps(tmp11419, tmp11421, 238);
__m512 tmp11437 = _mm512_shuffle_ps(tmp11420, tmp11422, 68);
__m512 tmp11438 = _mm512_shuffle_ps(tmp11420, tmp11422, 238);
__m512 tmp11439 = _mm512_shuffle_f32x4(tmp11423, tmp11427, 136);
__m512 tmp11440 = _mm512_shuffle_f32x4(tmp11423, tmp11427, 221);
__m512 tmp11441 = _mm512_shuffle_f32x4(tmp11424, tmp11428, 136);
__m512 tmp11442 = _mm512_shuffle_f32x4(tmp11424, tmp11428, 221);
__m512 tmp11443 = _mm512_shuffle_f32x4(tmp11425, tmp11429, 136);
__m512 tmp11444 = _mm512_shuffle_f32x4(tmp11425, tmp11429, 221);
__m512 tmp11445 = _mm512_shuffle_f32x4(tmp11426, tmp11430, 136);
__m512 tmp11446 = _mm512_shuffle_f32x4(tmp11426, tmp11430, 221);
__m512 tmp11447 = _mm512_shuffle_f32x4(tmp11431, tmp11435, 136);
__m512 tmp11448 = _mm512_shuffle_f32x4(tmp11431, tmp11435, 221);
__m512 tmp11449 = _mm512_shuffle_f32x4(tmp11432, tmp11436, 136);
__m512 tmp11450 = _mm512_shuffle_f32x4(tmp11432, tmp11436, 221);
__m512 tmp11451 = _mm512_shuffle_f32x4(tmp11433, tmp11437, 136);
__m512 tmp11452 = _mm512_shuffle_f32x4(tmp11433, tmp11437, 221);
__m512 tmp11453 = _mm512_shuffle_f32x4(tmp11434, tmp11438, 136);
__m512 tmp11454 = _mm512_shuffle_f32x4(tmp11434, tmp11438, 221);
in1580 = _mm512_shuffle_f32x4(tmp11439, tmp11447, 136);
in1588 = _mm512_shuffle_f32x4(tmp11439, tmp11447, 221);
tmp11393 = _mm512_shuffle_f32x4(tmp11441, tmp11449, 136);
tmp11397 = _mm512_shuffle_f32x4(tmp11441, tmp11449, 221);
tmp11394 = _mm512_shuffle_f32x4(tmp11443, tmp11451, 136);
tmp11398 = _mm512_shuffle_f32x4(tmp11443, tmp11451, 221);
in1586 = _mm512_shuffle_f32x4(tmp11445, tmp11453, 136);
in1594 = _mm512_shuffle_f32x4(tmp11445, tmp11453, 221);
tmp11392 = _mm512_shuffle_f32x4(tmp11440, tmp11448, 136);
tmp11396 = _mm512_shuffle_f32x4(tmp11440, tmp11448, 221);
in1582 = _mm512_shuffle_f32x4(tmp11442, tmp11450, 136);
in1590 = _mm512_shuffle_f32x4(tmp11442, tmp11450, 221);
in1584 = _mm512_shuffle_f32x4(tmp11444, tmp11452, 136);
in1592 = _mm512_shuffle_f32x4(tmp11444, tmp11452, 221);
in1583 = _mm512_shuffle_f32x4(tmp11446, tmp11454, 136);
in1591 = _mm512_shuffle_f32x4(tmp11446, tmp11454, 221);
// Group 2, second pass: the same lane-wise linear combination as the first
// pass, now applied to the rearranged vectors
//   b0..b7 = in1580, tmp11393, tmp11394, in1586, tmp11392, in1582, in1584, in1583
// (and in parallel to in1588, tmp11397, tmp11398, in1594, tmp11396, in1590,
// in1592, in1591). Ignoring fused-multiply-add rounding, the results are:
//   in1580   = b0 - b6 + 5.25*(b4 - b2)
//   tmp11401 = (b2 + b6 - 4.25*b4) + (b1 + b5 - 4.25*b3)
//   tmp11402 = (b2 + b6 - 4.25*b4) - (b1 + b5 - 4.25*b3)
//   in1584   = (0.25*b2 - 1.25*b4 + b6) + 2*(0.25*b1 - 1.25*b3 + b5)
//   tmp11400 = (0.25*b2 - 1.25*b4 + b6) - 2*(0.25*b1 - 1.25*b3 + b5)
//   tmp11394 = (4*b2 - 5*b4 + b6) + 2*(b1 - 1.25*b3 + 0.25*b5)
//   tmp11392 = (4*b2 - 5*b4 + b6) - 2*(b1 - 1.25*b3 + 0.25*b5)
//   in1586   = b7 - b1 + 5.25*(b3 - b5)
// Inputs are overwritten in place, so the statement order matters.
__m512 tmp11399 = _mm512_add_ps(tmp11393, in1582);
__m512 tmp11403 = _mm512_add_ps(tmp11397, in1590);
__m512 tmp11400 = _mm512_sub_ps(tmp11392, tmp11394);
__m512 tmp11404 = _mm512_sub_ps(tmp11396, tmp11398);
__m512 tmp11401 = _mm512_add_ps(tmp11394, in1584);
__m512 tmp11405 = _mm512_add_ps(tmp11398, in1592);
in1580 = _mm512_sub_ps(in1580, in1584);
in1588 = _mm512_sub_ps(in1588, in1592);
tmp11399 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-4.25e+00f), tmp11399);
tmp11403 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-4.25e+00f), tmp11403);
tmp11401 = _mm512_fmadd_ps(tmp11392, _mm512_set1_ps(-4.25e+00f), tmp11401);
tmp11405 = _mm512_fmadd_ps(tmp11396, _mm512_set1_ps(-4.25e+00f), tmp11405);
in1580 = _mm512_fmadd_ps(tmp11400, _mm512_set1_ps(5.25e+00f), in1580);
in1588 = _mm512_fmadd_ps(tmp11404, _mm512_set1_ps(5.25e+00f), in1588);
tmp11400 = _mm512_fmadd_ps(tmp11394, _mm512_set1_ps(2.5e-01f), in1584);
tmp11404 = _mm512_fmadd_ps(tmp11398, _mm512_set1_ps(2.5e-01f), in1592);
tmp11394 = _mm512_fmadd_ps(tmp11394, _mm512_set1_ps(4e+00f), in1584);
tmp11398 = _mm512_fmadd_ps(tmp11398, _mm512_set1_ps(4e+00f), in1592);
__m512 tmp11402 = _mm512_sub_ps(tmp11401, tmp11399);
__m512 tmp11406 = _mm512_sub_ps(tmp11405, tmp11403);
tmp11401 = _mm512_add_ps(tmp11399, tmp11401);
tmp11405 = _mm512_add_ps(tmp11403, tmp11405);
tmp11399 = _mm512_fmadd_ps(tmp11393, _mm512_set1_ps(2.5e-01f), in1582);
tmp11403 = _mm512_fmadd_ps(tmp11397, _mm512_set1_ps(2.5e-01f), in1590);
tmp11400 = _mm512_fmadd_ps(tmp11392, _mm512_set1_ps(-1.25e+00f), tmp11400);
tmp11404 = _mm512_fmadd_ps(tmp11396, _mm512_set1_ps(-1.25e+00f), tmp11404);
tmp11392 = _mm512_fmadd_ps(tmp11392, _mm512_set1_ps(-5e+00f), tmp11394);
tmp11396 = _mm512_fmadd_ps(tmp11396, _mm512_set1_ps(-5e+00f), tmp11398);
tmp11399 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-1.25e+00f), tmp11399);
tmp11403 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-1.25e+00f), tmp11403);
in1584 = _mm512_fmadd_ps(tmp11399, _mm512_set1_ps(2e+00f), tmp11400);
in1592 = _mm512_fmadd_ps(tmp11403, _mm512_set1_ps(2e+00f), tmp11404);
tmp11400 = _mm512_fnmadd_ps(tmp11399, _mm512_set1_ps(2e+00f), tmp11400);
tmp11404 = _mm512_fnmadd_ps(tmp11403, _mm512_set1_ps(2e+00f), tmp11404);
tmp11399 = _mm512_fmadd_ps(in1582, _mm512_set1_ps(2.5e-01f), tmp11393);
tmp11403 = _mm512_fmadd_ps(in1590, _mm512_set1_ps(2.5e-01f), tmp11397);
tmp11393 = _mm512_sub_ps(in1583, tmp11393);
tmp11397 = _mm512_sub_ps(in1591, tmp11397);
tmp11399 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(-1.25e+00f), tmp11399);
tmp11403 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(-1.25e+00f), tmp11403);
in1586 = _mm512_sub_ps(in1586, in1582);
in1594 = _mm512_sub_ps(in1594, in1590);
in1586 = _mm512_fmadd_ps(in1586, _mm512_set1_ps(5.25e+00f), tmp11393);
in1594 = _mm512_fmadd_ps(in1594, _mm512_set1_ps(5.25e+00f), tmp11397);
tmp11394 = _mm512_fmadd_ps(tmp11399, _mm512_set1_ps(2e+00f), tmp11392);
tmp11398 = _mm512_fmadd_ps(tmp11403, _mm512_set1_ps(2e+00f), tmp11396);
tmp11392 = _mm512_fnmadd_ps(tmp11399, _mm512_set1_ps(2e+00f), tmp11392);
tmp11396 = _mm512_fnmadd_ps(tmp11403, _mm512_set1_ps(2e+00f), tmp11396);
// Group 2, pack and store: pair up the sixteen second-pass results.
// imm 68 concatenates 128-bit lanes 0,1 of both operands (their low 256 bits);
// imm 238 concatenates lanes 2,3 of both operands (their high 256 bits).
__m512 out1455 = _mm512_shuffle_f32x4(in1580, tmp11401, 68);
__m512 out1463 = _mm512_shuffle_f32x4(in1580, tmp11401, 238);
__m512 out1456 = _mm512_shuffle_f32x4(tmp11402, in1584, 68);
__m512 out1464 = _mm512_shuffle_f32x4(tmp11402, in1584, 238);
__m512 out1457 = _mm512_shuffle_f32x4(tmp11400, tmp11394, 68);
__m512 out1465 = _mm512_shuffle_f32x4(tmp11400, tmp11394, 238);
__m512 out1458 = _mm512_shuffle_f32x4(tmp11392, in1586, 68);
__m512 out1466 = _mm512_shuffle_f32x4(tmp11392, in1586, 238);
__m512 out1459 = _mm512_shuffle_f32x4(in1588, tmp11405, 68);
__m512 out1467 = _mm512_shuffle_f32x4(in1588, tmp11405, 238);
__m512 out1460 = _mm512_shuffle_f32x4(tmp11406, in1592, 68);
__m512 out1468 = _mm512_shuffle_f32x4(tmp11406, in1592, 238);
__m512 out1461 = _mm512_shuffle_f32x4(tmp11404, tmp11398, 68);
__m512 out1469 = _mm512_shuffle_f32x4(tmp11404, tmp11398, 238);
__m512 out1462 = _mm512_shuffle_f32x4(tmp11396, in1594, 68);
__m512 out1470 = _mm512_shuffle_f32x4(tmp11396, in1594, 238);
// Sixteen unaligned stores to dfPtr11 at offsets {256, 320, 384, 448} plus
// {0, 12800, 25600, 38400}. Within each group of four the stores are issued
// in the order +256, +384, +320, +448, not in ascending address order.
// NOTE(review): offsets are presumably in bytes (64 = one __m512) -- the type
// of dfPtr11 is not visible here; confirm.
_mm512_storeu_ps(dfPtr11+256+51200*i49+3072*j42+3072*s46+768*k131, out1455);
_mm512_storeu_ps(dfPtr11+384+51200*i49+3072*j42+3072*s46+768*k131, out1463);
_mm512_storeu_ps(dfPtr11+320+51200*i49+3072*j42+3072*s46+768*k131, out1459);
_mm512_storeu_ps(dfPtr11+448+51200*i49+3072*j42+3072*s46+768*k131, out1467);
_mm512_storeu_ps(dfPtr11+13056+51200*i49+3072*j42+3072*s46+768*k131, out1456);
_mm512_storeu_ps(dfPtr11+13184+51200*i49+3072*j42+3072*s46+768*k131, out1464);
_mm512_storeu_ps(dfPtr11+13120+51200*i49+3072*j42+3072*s46+768*k131, out1460);
_mm512_storeu_ps(dfPtr11+13248+51200*i49+3072*j42+3072*s46+768*k131, out1468);
_mm512_storeu_ps(dfPtr11+25856+51200*i49+3072*j42+3072*s46+768*k131, out1457);
_mm512_storeu_ps(dfPtr11+25984+51200*i49+3072*j42+3072*s46+768*k131, out1465);
_mm512_storeu_ps(dfPtr11+25920+51200*i49+3072*j42+3072*s46+768*k131, out1461);
_mm512_storeu_ps(dfPtr11+26048+51200*i49+3072*j42+3072*s46+768*k131, out1469);
_mm512_storeu_ps(dfPtr11+38656+51200*i49+3072*j42+3072*s46+768*k131, out1458);
_mm512_storeu_ps(dfPtr11+38784+51200*i49+3072*j42+3072*s46+768*k131, out1466);
_mm512_storeu_ps(dfPtr11+38720+51200*i49+3072*j42+3072*s46+768*k131, out1462);
_mm512_storeu_ps(dfPtr11+38848+51200*i49+3072*j42+3072*s46+768*k131, out1470);
// Group 3, load phase: eight pairs of masked loads whose datPtr25 offsets
// advance by 112 per pair (3184, 3296, ..., 3968 and 3788, 3900, ..., 4572).
// Mask 2047 keeps lanes 0..10 and mask 8191 keeps lanes 0..12; the remaining
// lanes are zeroed by the maskz load.
// NOTE(review): 3184 = 3136 + 48 and 3788 = 3136 + 652, i.e. the offsets used
// earlier in this loop body shifted by 3136 -- presumably the next plane of
// the source; confirm against the generator.
__m512 dat2123 = _mm512_maskz_loadu_ps(2047, datPtr25+3184+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2124 = _mm512_maskz_loadu_ps(8191, datPtr25+3788+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
// pm187: lanes 0..7 take elements 0..7, lanes 8..12 take elements 6..10, and
// lanes 13..15 take element 15, which mask 2047 has already zeroed.
__m512i pm187 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1596 = _mm512_permutexvar_ps(pm187, dat2123);
// pm188: lane 0 takes element 15 (zeroed by mask 8191), lanes 1..7 take
// elements 0..6, lanes 8..15 take elements 5..12.
__m512i pm188 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1604 = _mm512_permutexvar_ps(pm188, dat2124);
__m512 dat2125 = _mm512_maskz_loadu_ps(2047, datPtr25+3296+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2126 = _mm512_maskz_loadu_ps(8191, datPtr25+3900+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1597 = _mm512_permutexvar_ps(pm187, dat2125);
__m512 in1605 = _mm512_permutexvar_ps(pm188, dat2126);
__m512 dat2127 = _mm512_maskz_loadu_ps(2047, datPtr25+3408+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2128 = _mm512_maskz_loadu_ps(8191, datPtr25+4012+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1598 = _mm512_permutexvar_ps(pm187, dat2127);
__m512 in1606 = _mm512_permutexvar_ps(pm188, dat2128);
__m512 dat2129 = _mm512_maskz_loadu_ps(2047, datPtr25+3520+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2130 = _mm512_maskz_loadu_ps(8191, datPtr25+4124+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1599 = _mm512_permutexvar_ps(pm187, dat2129);
__m512 in1607 = _mm512_permutexvar_ps(pm188, dat2130);
__m512 dat2131 = _mm512_maskz_loadu_ps(2047, datPtr25+3632+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2132 = _mm512_maskz_loadu_ps(8191, datPtr25+4236+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1600 = _mm512_permutexvar_ps(pm187, dat2131);
__m512 in1608 = _mm512_permutexvar_ps(pm188, dat2132);
__m512 dat2133 = _mm512_maskz_loadu_ps(2047, datPtr25+3744+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2134 = _mm512_maskz_loadu_ps(8191, datPtr25+4348+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1601 = _mm512_permutexvar_ps(pm187, dat2133);
__m512 in1609 = _mm512_permutexvar_ps(pm188, dat2134);
__m512 dat2135 = _mm512_maskz_loadu_ps(2047, datPtr25+3856+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2136 = _mm512_maskz_loadu_ps(8191, datPtr25+4460+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1602 = _mm512_permutexvar_ps(pm187, dat2135);
__m512 in1610 = _mm512_permutexvar_ps(pm188, dat2136);
__m512 dat2137 = _mm512_maskz_loadu_ps(2047, datPtr25+3968+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 dat2138 = _mm512_maskz_loadu_ps(8191, datPtr25+4572+25088*i49+112*h44+4*w57+25088*s46+6272*k131);
__m512 in1603 = _mm512_permutexvar_ps(pm187, dat2137);
__m512 in1611 = _mm512_permutexvar_ps(pm188, dat2138);
__m512 tmp11455 = _mm512_add_ps(in1597, in1601);
__m512 tmp11459 = _mm512_add_ps(in1605, in1609);
__m512 tmp11456 = _mm512_sub_ps(in1600, in1598);
__m512 tmp11460 = _mm512_sub_ps(in1608, in1606);
__m512 tmp11457 = _mm512_add_ps(in1598, in1602);
__m512 tmp11461 = _mm512_add_ps(in1606, in1610);
in1596 = _mm512_sub_ps(in1596, in1602);
in1604 = _mm512_sub_ps(in1604, in1610);
tmp11455 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-4.25e+00f), tmp11455);
tmp11459 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-4.25e+00f), tmp11459);
tmp11457 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-4.25e+00f), tmp11457);
tmp11461 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-4.25e+00f), tmp11461);
in1596 = _mm512_fmadd_ps(tmp11456, _mm512_set1_ps(5.25e+00f), in1596);
in1604 = _mm512_fmadd_ps(tmp11460, _mm512_set1_ps(5.25e+00f), in1604);
tmp11456 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(2.5e-01f), in1602);
tmp11460 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(2.5e-01f), in1610);
in1598 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(4e+00f), in1602);
in1606 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(4e+00f), in1610);
__m512 tmp11458 = _mm512_sub_ps(tmp11457, tmp11455);
__m512 tmp11462 = _mm512_sub_ps(tmp11461, tmp11459);
tmp11457 = _mm512_add_ps(tmp11455, tmp11457);
tmp11461 = _mm512_add_ps(tmp11459, tmp11461);
tmp11455 = _mm512_fmadd_ps(in1597, _mm512_set1_ps(2.5e-01f), in1601);
tmp11459 = _mm512_fmadd_ps(in1605, _mm512_set1_ps(2.5e-01f), in1609);
tmp11456 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-1.25e+00f), tmp11456);
tmp11460 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-1.25e+00f), tmp11460);
in1600 = _mm512_fmadd_ps(in1600, _mm512_set1_ps(-5e+00f), in1598);
in1608 = _mm512_fmadd_ps(in1608, _mm512_set1_ps(-5e+00f), in1606);
tmp11455 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-1.25e+00f), tmp11455);
tmp11459 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-1.25e+00f), tmp11459);
in1602 = _mm512_fmadd_ps(tmp11455, _mm512_set1_ps(2e+00f), tmp11456);
in1610 = _mm512_fmadd_ps(tmp11459, _mm512_set1_ps(2e+00f), tmp11460);
tmp11456 = _mm512_fnmadd_ps(tmp11455, _mm512_set1_ps(2e+00f), tmp11456);
tmp11460 = _mm512_fnmadd_ps(tmp11459, _mm512_set1_ps(2e+00f), tmp11460);
tmp11455 = _mm512_fmadd_ps(in1601, _mm512_set1_ps(2.5e-01f), in1597);
tmp11459 = _mm512_fmadd_ps(in1609, _mm512_set1_ps(2.5e-01f), in1605);
in1597 = _mm512_sub_ps(in1603, in1597);
in1605 = _mm512_sub_ps(in1611, in1605);
tmp11455 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(-1.25e+00f), tmp11455);
tmp11459 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(-1.25e+00f), tmp11459);
in1599 = _mm512_sub_ps(in1599, in1601);
in1607 = _mm512_sub_ps(in1607, in1609);
in1599 = _mm512_fmadd_ps(in1599, _mm512_set1_ps(5.25e+00f), in1597);
in1607 = _mm512_fmadd_ps(in1607, _mm512_set1_ps(5.25e+00f), in1605);
in1598 = _mm512_fmadd_ps(tmp11455, _mm512_set1_ps(2e+00f), in1600);
in1606 = _mm512_fmadd_ps(tmp11459, _mm512_set1_ps(2e+00f), in1608);
in1600 = _mm512_fnmadd_ps(tmp11455, _mm512_set1_ps(2e+00f), in1600);
in1608 = _mm512_fnmadd_ps(tmp11459, _mm512_set1_ps(2e+00f), in1608);
__m512 tmp11471 = _mm512_unpacklo_ps(in1596, tmp11457);
__m512 tmp11472 = _mm512_unpackhi_ps(in1596, tmp11457);
__m512 tmp11473 = _mm512_unpacklo_ps(tmp11458, in1602);
__m512 tmp11474 = _mm512_unpackhi_ps(tmp11458, in1602);
__m512 tmp11475 = _mm512_unpacklo_ps(tmp11456, in1598);
__m512 tmp11476 = _mm512_unpackhi_ps(tmp11456, in1598);
__m512 tmp11477 = _mm512_unpacklo_ps(in1600, in1599);
__m512 tmp11478 = _mm512_unpackhi_ps(in1600, in1599);
__m512 tmp11479 = _mm512_unpacklo_ps(in1604, tmp11461);
__m512 tmp11480 = _mm512_unpackhi_ps(in1604, tmp11461);
__m512 tmp11481 = _mm512_unpacklo_ps(tmp11462, in1610);
__m512 tmp11482 = _mm512_unpackhi_ps(tmp11462, in1610);
__m512 tmp11483 = _mm512_unpacklo_ps(tmp11460, in1606);
__m512 tmp11484 = _mm512_unpackhi_ps(tmp11460, in1606);
__m512 tmp11485 = _mm512_unpacklo_ps(in1608, in1607);
__m512 tmp11486 = _mm512_unpackhi_ps(in1608, in1607);
__m512 tmp11487 = _mm512_shuffle_ps(tmp11471, tmp11473, 68);
__m512 tmp11488 = _mm512_shuffle_ps(tmp11471, tmp11473, 238);
__m512 tmp11489 = _mm512_shuffle_ps(tmp11472, tmp11474, 68);
__m512 tmp11490 = _mm512_shuffle_ps(tmp11472, tmp11474, 238);
__m512 tmp11491 = _mm512_shuffle_ps(tmp11475, tmp11477, 68);
__m512 tmp11492 = _mm512_shuffle_ps(tmp11475, tmp11477, 238);
__m512 tmp11493 = _mm512_shuffle_ps(tmp11476, tmp11478, 68);
__m512 tmp11494 = _mm512_shuffle_ps(tmp11476, tmp11478, 238);
__m512 tmp11495 = _mm512_shuffle_ps(tmp11479, tmp11481, 68);
__m512 tmp11496 = _mm512_shuffle_ps(tmp11479, tmp11481, 238);
__m512 tmp11497 = _mm512_shuffle_ps(tmp11480, tmp11482, 68);
__m512 tmp11498 = _mm512_shuffle_ps(tmp11480, tmp11482, 238);
__m512 tmp11499 = _mm512_shuffle_ps(tmp11483, tmp11485, 68);
__m512 tmp11500 = _mm512_shuffle_ps(tmp11483, tmp11485, 238);
__m512 tmp11501 = _mm512_shuffle_ps(tmp11484, tmp11486, 68);
__m512 tmp11502 = _mm512_shuffle_ps(tmp11484, tmp11486, 238);
__m512 tmp11503 = _mm512_shuffle_f32x4(tmp11487, tmp11491, 136);
__m512 tmp11504 = _mm512_shuffle_f32x4(tmp11487, tmp11491, 221);
__m512 tmp11505 = _mm512_shuffle_f32x4(tmp11488, tmp11492, 136);
__m512 tmp11506 = _mm512_shuffle_f32x4(tmp11488, tmp11492, 221);
__m512 tmp11507 = _mm512_shuffle_f32x4(tmp11489, tmp11493, 136);
__m512 tmp11508 = _mm512_shuffle_f32x4(tmp11489, tmp11493, 221);
__m512 tmp11509 = _mm512_shuffle_f32x4(tmp11490, tmp11494, 136);
__m512 tmp11510 = _mm512_shuffle_f32x4(tmp11490, tmp11494, 221);
__m512 tmp11511 = _mm512_shuffle_f32x4(tmp11495, tmp11499, 136);
__m512 tmp11512 = _mm512_shuffle_f32x4(tmp11495, tmp11499, 221);
__m512 tmp11513 = _mm512_shuffle_f32x4(tmp11496, tmp11500, 136);
__m512 tmp11514 = _mm512_shuffle_f32x4(tmp11496, tmp11500, 221);
__m512 tmp11515 = _mm512_shuffle_f32x4(tmp11497, tmp11501, 136);
__m512 tmp11516 = _mm512_shuffle_f32x4(tmp11497, tmp11501, 221);
__m512 tmp11517 = _mm512_shuffle_f32x4(tmp11498, tmp11502, 136);
__m512 tmp11518 = _mm512_shuffle_f32x4(tmp11498, tmp11502, 221);
in1596 = _mm512_shuffle_f32x4(tmp11503, tmp11511, 136);
in1604 = _mm512_shuffle_f32x4(tmp11503, tmp11511, 221);
tmp11457 = _mm512_shuffle_f32x4(tmp11505, tmp11513, 136);
tmp11461 = _mm512_shuffle_f32x4(tmp11505, tmp11513, 221);
tmp11458 = _mm512_shuffle_f32x4(tmp11507, tmp11515, 136);
tmp11462 = _mm512_shuffle_f32x4(tmp11507, tmp11515, 221);
in1602 = _mm512_shuffle_f32x4(tmp11509, tmp11517, 136);
in1610 = _mm512_shuffle_f32x4(tmp11509, tmp11517, 221);
tmp11456 = _mm512_shuffle_f32x4(tmp11504, tmp11512, 136);
tmp11460 = _mm512_shuffle_f32x4(tmp11504, tmp11512, 221);
in1598 = _mm512_shuffle_f32x4(tmp11506, tmp11514, 136);
in1606 = _mm512_shuffle_f32x4(tmp11506, tmp11514, 221);
in1600 = _mm512_shuffle_f32x4(tmp11508, tmp11516, 136);
in1608 = _mm512_shuffle_f32x4(tmp11508, tmp11516, 221);
in1599 = _mm512_shuffle_f32x4(tmp11510, tmp11518, 136);
in1607 = _mm512_shuffle_f32x4(tmp11510, tmp11518, 221);
__m512 tmp11463 = _mm512_add_ps(tmp11457, in1598);
__m512 tmp11467 = _mm512_add_ps(tmp11461, in1606);
__m512 tmp11464 = _mm512_sub_ps(tmp11456, tmp11458);
__m512 tmp11468 = _mm512_sub_ps(tmp11460, tmp11462);
__m512 tmp11465 = _mm512_add_ps(tmp11458, in1600);
__m512 tmp11469 = _mm512_add_ps(tmp11462, in1608);
in1596 = _mm512_sub_ps(in1596, in1600);
in1604 = _mm512_sub_ps(in1604, in1608);
tmp11463 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-4.25e+00f), tmp11463);
tmp11467 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-4.25e+00f), tmp11467);
tmp11465 = _mm512_fmadd_ps(tmp11456, _mm512_set1_ps(-4.25e+00f), tmp11465);
tmp11469 = _mm512_fmadd_ps(tmp11460, _mm512_set1_ps(-4.25e+00f), tmp11469);
in1596 = _mm512_fmadd_ps(tmp11464, _mm512_set1_ps(5.25e+00f), in1596);
in1604 = _mm512_fmadd_ps(tmp11468, _mm512_set1_ps(5.25e+00f), in1604);
tmp11464 = _mm512_fmadd_ps(tmp11458, _mm512_set1_ps(2.5e-01f), in1600);
tmp11468 = _mm512_fmadd_ps(tmp11462, _mm512_set1_ps(2.5e-01f), in1608);
tmp11458 = _mm512_fmadd_ps(tmp11458, _mm512_set1_ps(4e+00f), in1600);
tmp11462 = _mm512_fmadd_ps(tmp11462, _mm512_set1_ps(4e+00f), in1608);
__m512 tmp11466 = _mm512_sub_ps(tmp11465, tmp11463);
__m512 tmp11470 = _mm512_sub_ps(tmp11469, tmp11467);
tmp11465 = _mm512_add_ps(tmp11463, tmp11465);
tmp11469 = _mm512_add_ps(tmp11467, tmp11469);
tmp11463 = _mm512_fmadd_ps(tmp11457, _mm512_set1_ps(2.5e-01f), in1598);
tmp11467 = _mm512_fmadd_ps(tmp11461, _mm512_set1_ps(2.5e-01f), in1606);
tmp11464 = _mm512_fmadd_ps(tmp11456, _mm512_set1_ps(-1.25e+00f), tmp11464);
tmp11468 = _mm512_fmadd_ps(tmp11460, _mm512_set1_ps(-1.25e+00f), tmp11468);
tmp11456 = _mm512_fmadd_ps(tmp11456, _mm512_set1_ps(-5e+00f), tmp11458);
tmp11460 = _mm512_fmadd_ps(tmp11460, _mm512_set1_ps(-5e+00f), tmp11462);
tmp11463 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-1.25e+00f), tmp11463);
tmp11467 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-1.25e+00f), tmp11467);
in1600 = _mm512_fmadd_ps(tmp11463, _mm512_set1_ps(2e+00f), tmp11464);
in1608 = _mm512_fmadd_ps(tmp11467, _mm512_set1_ps(2e+00f), tmp11468);
tmp11464 = _mm512_fnmadd_ps(tmp11463, _mm512_set1_ps(2e+00f), tmp11464);
tmp11468 = _mm512_fnmadd_ps(tmp11467, _mm512_set1_ps(2e+00f), tmp11468);
tmp11463 = _mm512_fmadd_ps(in1598, _mm512_set1_ps(2.5e-01f), tmp11457);
tmp11467 = _mm512_fmadd_ps(in1606, _mm512_set1_ps(2.5e-01f), tmp11461);
tmp11457 = _mm512_sub_ps(in1599, tmp11457);
tmp11461 = _mm512_sub_ps(in1607, tmp11461);
tmp11463 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(-1.25e+00f), tmp11463);
tmp11467 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(-1.25e+00f), tmp11467);
in1602 = _mm512_sub_ps(in1602, in1598);
in1610 = _mm512_sub_ps(in1610, in1606);
in1602 = _mm512_fmadd_ps(in1602, _mm512_set1_ps(5.25e+00f), tmp11457);
in1610 = _mm512_fmadd_ps(in1610, _mm512_set1_ps(5.25e+00f), tmp11461);
tmp11458 = _mm512_fmadd_ps(tmp11463, _mm512_set1_ps(2e+00f), tmp11456);
tmp11462 = _mm512_fmadd_ps(tmp11467, _mm512_set1_ps(2e+00f), tmp11460);
tmp11456 = _mm512_fnmadd_ps(tmp11463, _mm512_set1_ps(2e+00f), tmp11456);
tmp11460 = _mm512_fnmadd_ps(tmp11467, _mm512_set1_ps(2e+00f), tmp11460);
__m512 out1471 = _mm512_shuffle_f32x4(in1596, tmp11465, 68);
__m512 out1479 = _mm512_shuffle_f32x4(in1596, tmp11465, 238);
__m512 out1472 = _mm512_shuffle_f32x4(tmp11466, in1600, 68);
__m512 out1480 = _mm512_shuffle_f32x4(tmp11466, in1600, 238);
__m512 out1473 = _mm512_shuffle_f32x4(tmp11464, tmp11458, 68);
__m512 out1481 = _mm512_shuffle_f32x4(tmp11464, tmp11458, 238);
__m512 out1474 = _mm512_shuffle_f32x4(tmp11456, in1602, 68);
__m512 out1482 = _mm512_shuffle_f32x4(tmp11456, in1602, 238);
__m512 out1475 = _mm512_shuffle_f32x4(in1604, tmp11469, 68);
__m512 out1483 = _mm512_shuffle_f32x4(in1604, tmp11469, 238);
__m512 out1476 = _mm512_shuffle_f32x4(tmp11470, in1608, 68);
__m512 out1484 = _mm512_shuffle_f32x4(tmp11470, in1608, 238);
__m512 out1477 = _mm512_shuffle_f32x4(tmp11468, tmp11462, 68);
__m512 out1485 = _mm512_shuffle_f32x4(tmp11468, tmp11462, 238);
__m512 out1478 = _mm512_shuffle_f32x4(tmp11460, in1610, 68);
__m512 out1486 = _mm512_shuffle_f32x4(tmp11460, in1610, 238);
_mm512_storeu_ps(dfPtr11+512+51200*i49+3072*j42+3072*s46+768*k131, out1471);
_mm512_storeu_ps(dfPtr11+640+51200*i49+3072*j42+3072*s46+768*k131, out1479);
_mm512_storeu_ps(dfPtr11+576+51200*i49+3072*j42+3072*s46+768*k131, out1475);
_mm512_storeu_ps(dfPtr11+704+51200*i49+3072*j42+3072*s46+768*k131, out1483);
_mm512_storeu_ps(dfPtr11+13312+51200*i49+3072*j42+3072*s46+768*k131, out1472);
_mm512_storeu_ps(dfPtr11+13440+51200*i49+3072*j42+3072*s46+768*k131, out1480);
_mm512_storeu_ps(dfPtr11+13376+51200*i49+3072*j42+3072*s46+768*k131, out1476);
_mm512_storeu_ps(dfPtr11+13504+51200*i49+3072*j42+3072*s46+768*k131, out1484);
_mm512_storeu_ps(dfPtr11+26112+51200*i49+3072*j42+3072*s46+768*k131, out1473);
_mm512_storeu_ps(dfPtr11+26240+51200*i49+3072*j42+3072*s46+768*k131, out1481);
_mm512_storeu_ps(dfPtr11+26176+51200*i49+3072*j42+3072*s46+768*k131, out1477);
_mm512_storeu_ps(dfPtr11+26304+51200*i49+3072*j42+3072*s46+768*k131, out1485);
_mm512_storeu_ps(dfPtr11+38912+51200*i49+3072*j42+3072*s46+768*k131, out1474);
_mm512_storeu_ps(dfPtr11+39040+51200*i49+3072*j42+3072*s46+768*k131, out1482);
_mm512_storeu_ps(dfPtr11+38976+51200*i49+3072*j42+3072*s46+768*k131, out1478);
_mm512_storeu_ps(dfPtr11+39104+51200*i49+3072*j42+3072*s46+768*k131, out1486);
}
++j42;
rel21 = 2;
}
// Branch taken while rel21 is still below 3.
// NOTE(review): what rel21 counts is not visible in this span; confirm against the enclosing code.
if (rel21 < 3) {
// h45 and w58 enter every source address below as 112*h45 + 4*w58.
// NOTE(review): these look like byte offsets (112 = 28 floats per row, 4 = one float),
// which would require datPtr25 to be a byte pointer; its declaration is not visible here. TODO confirm.
ptrdiff_t h45 = base21+12;
ptrdiff_t w58 = 12;
// k132 runs over 0..3; each step moves the source address by 6272.
ptrdiff_t k132 = 0;
for (; k132 != 4; ++k132) {
// Row 0: three zero-masked loads.
//   mask 16383 keeps lanes 0..13, mask 31 keeps lanes 0..4, mask 127 keeps lanes 0..6;
//   every other lane is set to zero by the maskz load.
__m512 dat2139 = _mm512_maskz_loadu_ps(16383, datPtr25+0+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2140 = _mm512_maskz_loadu_ps(31, datPtr25+48+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2141 = _mm512_maskz_loadu_ps(127, datPtr25+628+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
// pm189: lanes 0..7 take elements 0..7 and lanes 8..15 take elements 6..13, i.e. two
// 8-element windows of the same load that overlap by two elements.
__m512i pm189 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1612 = _mm512_permutexvar_ps(pm189, dat2139);
// pm190 (two-source permute: indices 0..15 pick from the first operand, 16..31 from the last):
//   lanes 0..4  <- dat2140[0..4]
//   lanes 5..8  <- dat2140[15], which the mask-31 load zeroed (acts as zero fill)
//   lanes 9..15 <- dat2141[0..6]
__m512i pm190 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1620 = _mm512_permutex2var_ps(dat2140, pm190, dat2141);
// Rows 1..7 repeat the same three loads and two permutes with every base offset advanced
// by 112 per row, producing in1613..in1619 (pm189 path) and in1621..in1627 (pm190 path).
__m512 dat2142 = _mm512_maskz_loadu_ps(16383, datPtr25+112+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2143 = _mm512_maskz_loadu_ps(31, datPtr25+160+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2144 = _mm512_maskz_loadu_ps(127, datPtr25+740+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1613 = _mm512_permutexvar_ps(pm189, dat2142);
__m512 in1621 = _mm512_permutex2var_ps(dat2143, pm190, dat2144);
__m512 dat2145 = _mm512_maskz_loadu_ps(16383, datPtr25+224+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2146 = _mm512_maskz_loadu_ps(31, datPtr25+272+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2147 = _mm512_maskz_loadu_ps(127, datPtr25+852+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1614 = _mm512_permutexvar_ps(pm189, dat2145);
__m512 in1622 = _mm512_permutex2var_ps(dat2146, pm190, dat2147);
__m512 dat2148 = _mm512_maskz_loadu_ps(16383, datPtr25+336+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2149 = _mm512_maskz_loadu_ps(31, datPtr25+384+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2150 = _mm512_maskz_loadu_ps(127, datPtr25+964+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1615 = _mm512_permutexvar_ps(pm189, dat2148);
__m512 in1623 = _mm512_permutex2var_ps(dat2149, pm190, dat2150);
__m512 dat2151 = _mm512_maskz_loadu_ps(16383, datPtr25+448+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2152 = _mm512_maskz_loadu_ps(31, datPtr25+496+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2153 = _mm512_maskz_loadu_ps(127, datPtr25+1076+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1616 = _mm512_permutexvar_ps(pm189, dat2151);
__m512 in1624 = _mm512_permutex2var_ps(dat2152, pm190, dat2153);
__m512 dat2154 = _mm512_maskz_loadu_ps(16383, datPtr25+560+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2155 = _mm512_maskz_loadu_ps(31, datPtr25+608+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2156 = _mm512_maskz_loadu_ps(127, datPtr25+1188+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1617 = _mm512_permutexvar_ps(pm189, dat2154);
__m512 in1625 = _mm512_permutex2var_ps(dat2155, pm190, dat2156);
__m512 dat2157 = _mm512_maskz_loadu_ps(16383, datPtr25+672+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2158 = _mm512_maskz_loadu_ps(31, datPtr25+720+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2159 = _mm512_maskz_loadu_ps(127, datPtr25+1300+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1618 = _mm512_permutexvar_ps(pm189, dat2157);
__m512 in1626 = _mm512_permutex2var_ps(dat2158, pm190, dat2159);
__m512 dat2160 = _mm512_maskz_loadu_ps(16383, datPtr25+784+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2161 = _mm512_maskz_loadu_ps(31, datPtr25+832+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2162 = _mm512_maskz_loadu_ps(127, datPtr25+1412+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1619 = _mm512_permutexvar_ps(pm189, dat2160);
__m512 in1627 = _mm512_permutex2var_ps(dat2161, pm190, dat2162);
// First 8-point transform pass: linear combinations taken ACROSS the eight row vectors,
// lane by lane. Statements come in pairs: the first of each pair works on rows
// in1612..in1619, the second applies the identical formula to rows in1620..in1627.
// Writing r0..r7 for the eight inputs of one set, the results are:
//   in1612   = r0 - r6 + 5.25*(r4 - r2)
//   tmp11521 = (r1 + r5 - 4.25*r3) + (r2 + r6 - 4.25*r4)
//   tmp11522 = (r2 + r6 - 4.25*r4) - (r1 + r5 - 4.25*r3)
//   in1618   = (0.25*r2 + r6 - 1.25*r4) + 2*(0.25*r1 + r5 - 1.25*r3)
//   tmp11520 = (0.25*r2 + r6 - 1.25*r4) - 2*(0.25*r1 + r5 - 1.25*r3)
//   in1614   = (4*r2 + r6 - 5*r4) + 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1616   = (4*r2 + r6 - 5*r4) - 2*(r1 + 0.25*r5 - 1.25*r3)
//   in1615   = r7 - r1 + 5.25*(r3 - r5)
// NOTE(review): these coefficients match the 8-point input transform commonly used for
// Winograd F(6,3) convolution; that reading is inferred from the constants only. TODO confirm.
// Input registers are overwritten in place, so statement order matters.
__m512 tmp11519 = _mm512_add_ps(in1613, in1617);
__m512 tmp11523 = _mm512_add_ps(in1621, in1625);
__m512 tmp11520 = _mm512_sub_ps(in1616, in1614);
__m512 tmp11524 = _mm512_sub_ps(in1624, in1622);
__m512 tmp11521 = _mm512_add_ps(in1614, in1618);
__m512 tmp11525 = _mm512_add_ps(in1622, in1626);
in1612 = _mm512_sub_ps(in1612, in1618);
in1620 = _mm512_sub_ps(in1620, in1626);
tmp11519 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-4.25e+00f), tmp11519);
tmp11523 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-4.25e+00f), tmp11523);
tmp11521 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-4.25e+00f), tmp11521);
tmp11525 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-4.25e+00f), tmp11525);
// in1612 / in1620 are final after this pair.
in1612 = _mm512_fmadd_ps(tmp11520, _mm512_set1_ps(5.25e+00f), in1612);
in1620 = _mm512_fmadd_ps(tmp11524, _mm512_set1_ps(5.25e+00f), in1620);
tmp11520 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(2.5e-01f), in1618);
tmp11524 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(2.5e-01f), in1626);
in1614 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(4e+00f), in1618);
in1622 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(4e+00f), in1626);
// Difference and sum of the two -4.25 partial results.
__m512 tmp11522 = _mm512_sub_ps(tmp11521, tmp11519);
__m512 tmp11526 = _mm512_sub_ps(tmp11525, tmp11523);
tmp11521 = _mm512_add_ps(tmp11519, tmp11521);
tmp11525 = _mm512_add_ps(tmp11523, tmp11525);
// tmp11519 / tmp11523 are recycled as scratch from here on.
tmp11519 = _mm512_fmadd_ps(in1613, _mm512_set1_ps(2.5e-01f), in1617);
tmp11523 = _mm512_fmadd_ps(in1621, _mm512_set1_ps(2.5e-01f), in1625);
tmp11520 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-1.25e+00f), tmp11520);
tmp11524 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-1.25e+00f), tmp11524);
in1616 = _mm512_fmadd_ps(in1616, _mm512_set1_ps(-5e+00f), in1614);
in1624 = _mm512_fmadd_ps(in1624, _mm512_set1_ps(-5e+00f), in1622);
tmp11519 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-1.25e+00f), tmp11519);
tmp11523 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-1.25e+00f), tmp11523);
// fmadd / fnmadd on the same operands yield the +2x and -2x variants.
in1618 = _mm512_fmadd_ps(tmp11519, _mm512_set1_ps(2e+00f), tmp11520);
in1626 = _mm512_fmadd_ps(tmp11523, _mm512_set1_ps(2e+00f), tmp11524);
tmp11520 = _mm512_fnmadd_ps(tmp11519, _mm512_set1_ps(2e+00f), tmp11520);
tmp11524 = _mm512_fnmadd_ps(tmp11523, _mm512_set1_ps(2e+00f), tmp11524);
tmp11519 = _mm512_fmadd_ps(in1617, _mm512_set1_ps(2.5e-01f), in1613);
tmp11523 = _mm512_fmadd_ps(in1625, _mm512_set1_ps(2.5e-01f), in1621);
// Row 1 is no longer needed in its original form; reuse its register for r7 - r1.
in1613 = _mm512_sub_ps(in1619, in1613);
in1621 = _mm512_sub_ps(in1627, in1621);
tmp11519 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(-1.25e+00f), tmp11519);
tmp11523 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(-1.25e+00f), tmp11523);
in1615 = _mm512_sub_ps(in1615, in1617);
in1623 = _mm512_sub_ps(in1623, in1625);
in1615 = _mm512_fmadd_ps(in1615, _mm512_set1_ps(5.25e+00f), in1613);
in1623 = _mm512_fmadd_ps(in1623, _mm512_set1_ps(5.25e+00f), in1621);
in1614 = _mm512_fmadd_ps(tmp11519, _mm512_set1_ps(2e+00f), in1616);
in1622 = _mm512_fmadd_ps(tmp11523, _mm512_set1_ps(2e+00f), in1624);
in1616 = _mm512_fnmadd_ps(tmp11519, _mm512_set1_ps(2e+00f), in1616);
in1624 = _mm512_fnmadd_ps(tmp11523, _mm512_set1_ps(2e+00f), in1624);
// Transpose stage: rearranges the sixteen pass-1 result vectors so that the second pass
// can combine elements that were previously side by side within a single vector.
// Input order (first set):  in1612, tmp11521, tmp11522, in1618, tmp11520, in1614, in1616, in1615
// Input order (second set): in1620, tmp11525, tmp11526, in1626, tmp11524, in1622, in1624, in1623
//
// Step 1 - interleave pairs of vectors inside each 128-bit lane.
__m512 tmp11535 = _mm512_unpacklo_ps(in1612, tmp11521);
__m512 tmp11536 = _mm512_unpackhi_ps(in1612, tmp11521);
__m512 tmp11537 = _mm512_unpacklo_ps(tmp11522, in1618);
__m512 tmp11538 = _mm512_unpackhi_ps(tmp11522, in1618);
__m512 tmp11539 = _mm512_unpacklo_ps(tmp11520, in1614);
__m512 tmp11540 = _mm512_unpackhi_ps(tmp11520, in1614);
__m512 tmp11541 = _mm512_unpacklo_ps(in1616, in1615);
__m512 tmp11542 = _mm512_unpackhi_ps(in1616, in1615);
__m512 tmp11543 = _mm512_unpacklo_ps(in1620, tmp11525);
__m512 tmp11544 = _mm512_unpackhi_ps(in1620, tmp11525);
__m512 tmp11545 = _mm512_unpacklo_ps(tmp11526, in1626);
__m512 tmp11546 = _mm512_unpackhi_ps(tmp11526, in1626);
__m512 tmp11547 = _mm512_unpacklo_ps(tmp11524, in1622);
__m512 tmp11548 = _mm512_unpackhi_ps(tmp11524, in1622);
__m512 tmp11549 = _mm512_unpacklo_ps(in1624, in1623);
__m512 tmp11550 = _mm512_unpackhi_ps(in1624, in1623);
// Step 2 - immediate 68 (0x44) takes the low two floats of each operand's 128-bit lane,
// 238 (0xEE) takes the high two; each 128-bit lane then holds one element position
// gathered from four consecutive pass-1 results.
__m512 tmp11551 = _mm512_shuffle_ps(tmp11535, tmp11537, 68);
__m512 tmp11552 = _mm512_shuffle_ps(tmp11535, tmp11537, 238);
__m512 tmp11553 = _mm512_shuffle_ps(tmp11536, tmp11538, 68);
__m512 tmp11554 = _mm512_shuffle_ps(tmp11536, tmp11538, 238);
__m512 tmp11555 = _mm512_shuffle_ps(tmp11539, tmp11541, 68);
__m512 tmp11556 = _mm512_shuffle_ps(tmp11539, tmp11541, 238);
__m512 tmp11557 = _mm512_shuffle_ps(tmp11540, tmp11542, 68);
__m512 tmp11558 = _mm512_shuffle_ps(tmp11540, tmp11542, 238);
__m512 tmp11559 = _mm512_shuffle_ps(tmp11543, tmp11545, 68);
__m512 tmp11560 = _mm512_shuffle_ps(tmp11543, tmp11545, 238);
__m512 tmp11561 = _mm512_shuffle_ps(tmp11544, tmp11546, 68);
__m512 tmp11562 = _mm512_shuffle_ps(tmp11544, tmp11546, 238);
__m512 tmp11563 = _mm512_shuffle_ps(tmp11547, tmp11549, 68);
__m512 tmp11564 = _mm512_shuffle_ps(tmp11547, tmp11549, 238);
__m512 tmp11565 = _mm512_shuffle_ps(tmp11548, tmp11550, 68);
__m512 tmp11566 = _mm512_shuffle_ps(tmp11548, tmp11550, 238);
// Step 3 - whole 128-bit lane shuffles: immediate 136 (0x88) picks lanes 0 and 2 of each
// operand, 221 (0xDD) picks lanes 1 and 3. This merges the results-0..3 and results-4..7 groups.
__m512 tmp11567 = _mm512_shuffle_f32x4(tmp11551, tmp11555, 136);
__m512 tmp11568 = _mm512_shuffle_f32x4(tmp11551, tmp11555, 221);
__m512 tmp11569 = _mm512_shuffle_f32x4(tmp11552, tmp11556, 136);
__m512 tmp11570 = _mm512_shuffle_f32x4(tmp11552, tmp11556, 221);
__m512 tmp11571 = _mm512_shuffle_f32x4(tmp11553, tmp11557, 136);
__m512 tmp11572 = _mm512_shuffle_f32x4(tmp11553, tmp11557, 221);
__m512 tmp11573 = _mm512_shuffle_f32x4(tmp11554, tmp11558, 136);
__m512 tmp11574 = _mm512_shuffle_f32x4(tmp11554, tmp11558, 221);
__m512 tmp11575 = _mm512_shuffle_f32x4(tmp11559, tmp11563, 136);
__m512 tmp11576 = _mm512_shuffle_f32x4(tmp11559, tmp11563, 221);
__m512 tmp11577 = _mm512_shuffle_f32x4(tmp11560, tmp11564, 136);
__m512 tmp11578 = _mm512_shuffle_f32x4(tmp11560, tmp11564, 221);
__m512 tmp11579 = _mm512_shuffle_f32x4(tmp11561, tmp11565, 136);
__m512 tmp11580 = _mm512_shuffle_f32x4(tmp11561, tmp11565, 221);
__m512 tmp11581 = _mm512_shuffle_f32x4(tmp11562, tmp11566, 136);
__m512 tmp11582 = _mm512_shuffle_f32x4(tmp11562, tmp11566, 221);
// Step 4 - same lane shuffles again, now pairing first-set and second-set intermediates.
// Results are written back into the pass-1 register names so the second pass can reuse them.
// NOTE(review): the net effect appears to be an 8x8 transpose per 8-lane half, with each
// output's low half coming from the first set and its high half from the second set;
// derived by tracing the immediates, not from documentation. TODO confirm.
in1612 = _mm512_shuffle_f32x4(tmp11567, tmp11575, 136);
in1620 = _mm512_shuffle_f32x4(tmp11567, tmp11575, 221);
tmp11521 = _mm512_shuffle_f32x4(tmp11569, tmp11577, 136);
tmp11525 = _mm512_shuffle_f32x4(tmp11569, tmp11577, 221);
tmp11522 = _mm512_shuffle_f32x4(tmp11571, tmp11579, 136);
tmp11526 = _mm512_shuffle_f32x4(tmp11571, tmp11579, 221);
in1618 = _mm512_shuffle_f32x4(tmp11573, tmp11581, 136);
in1626 = _mm512_shuffle_f32x4(tmp11573, tmp11581, 221);
tmp11520 = _mm512_shuffle_f32x4(tmp11568, tmp11576, 136);
tmp11524 = _mm512_shuffle_f32x4(tmp11568, tmp11576, 221);
in1614 = _mm512_shuffle_f32x4(tmp11570, tmp11578, 136);
in1622 = _mm512_shuffle_f32x4(tmp11570, tmp11578, 221);
in1616 = _mm512_shuffle_f32x4(tmp11572, tmp11580, 136);
in1624 = _mm512_shuffle_f32x4(tmp11572, tmp11580, 221);
in1615 = _mm512_shuffle_f32x4(tmp11574, tmp11582, 136);
in1623 = _mm512_shuffle_f32x4(tmp11574, tmp11582, 221);
// Second 8-point transform pass: the same coefficient pattern as the first pass
// (5.25, -4.25, 0.25, 4, -1.25, -5, +/-2), now applied to the transposed vectors.
// Statements again come in pairs (first set, then second set).
// Writing c0..c7 for the inputs of one set, taken in the order
//   in1612, tmp11521, tmp11522, in1618, tmp11520, in1614, in1616, in1615
// the results are:
//   in1612   = c0 - c6 + 5.25*(c4 - c2)
//   tmp11529 = (c1 + c5 - 4.25*c3) + (c2 + c6 - 4.25*c4)
//   tmp11530 = (c2 + c6 - 4.25*c4) - (c1 + c5 - 4.25*c3)
//   in1616   = (0.25*c2 + c6 - 1.25*c4) + 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp11528 = (0.25*c2 + c6 - 1.25*c4) - 2*(0.25*c1 + c5 - 1.25*c3)
//   tmp11522 = (4*c2 + c6 - 5*c4) + 2*(c1 + 0.25*c5 - 1.25*c3)
//   tmp11520 = (4*c2 + c6 - 5*c4) - 2*(c1 + 0.25*c5 - 1.25*c3)
//   in1618   = c7 - c1 + 5.25*(c3 - c5)
// Registers are overwritten in place, so statement order matters.
__m512 tmp11527 = _mm512_add_ps(tmp11521, in1614);
__m512 tmp11531 = _mm512_add_ps(tmp11525, in1622);
__m512 tmp11528 = _mm512_sub_ps(tmp11520, tmp11522);
__m512 tmp11532 = _mm512_sub_ps(tmp11524, tmp11526);
__m512 tmp11529 = _mm512_add_ps(tmp11522, in1616);
__m512 tmp11533 = _mm512_add_ps(tmp11526, in1624);
in1612 = _mm512_sub_ps(in1612, in1616);
in1620 = _mm512_sub_ps(in1620, in1624);
tmp11527 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-4.25e+00f), tmp11527);
tmp11531 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-4.25e+00f), tmp11531);
tmp11529 = _mm512_fmadd_ps(tmp11520, _mm512_set1_ps(-4.25e+00f), tmp11529);
tmp11533 = _mm512_fmadd_ps(tmp11524, _mm512_set1_ps(-4.25e+00f), tmp11533);
// in1612 / in1620 are final after this pair.
in1612 = _mm512_fmadd_ps(tmp11528, _mm512_set1_ps(5.25e+00f), in1612);
in1620 = _mm512_fmadd_ps(tmp11532, _mm512_set1_ps(5.25e+00f), in1620);
tmp11528 = _mm512_fmadd_ps(tmp11522, _mm512_set1_ps(2.5e-01f), in1616);
tmp11532 = _mm512_fmadd_ps(tmp11526, _mm512_set1_ps(2.5e-01f), in1624);
tmp11522 = _mm512_fmadd_ps(tmp11522, _mm512_set1_ps(4e+00f), in1616);
tmp11526 = _mm512_fmadd_ps(tmp11526, _mm512_set1_ps(4e+00f), in1624);
// Difference and sum of the two -4.25 partial results.
__m512 tmp11530 = _mm512_sub_ps(tmp11529, tmp11527);
__m512 tmp11534 = _mm512_sub_ps(tmp11533, tmp11531);
tmp11529 = _mm512_add_ps(tmp11527, tmp11529);
tmp11533 = _mm512_add_ps(tmp11531, tmp11533);
// tmp11527 / tmp11531 are recycled as scratch from here on.
tmp11527 = _mm512_fmadd_ps(tmp11521, _mm512_set1_ps(2.5e-01f), in1614);
tmp11531 = _mm512_fmadd_ps(tmp11525, _mm512_set1_ps(2.5e-01f), in1622);
tmp11528 = _mm512_fmadd_ps(tmp11520, _mm512_set1_ps(-1.25e+00f), tmp11528);
tmp11532 = _mm512_fmadd_ps(tmp11524, _mm512_set1_ps(-1.25e+00f), tmp11532);
tmp11520 = _mm512_fmadd_ps(tmp11520, _mm512_set1_ps(-5e+00f), tmp11522);
tmp11524 = _mm512_fmadd_ps(tmp11524, _mm512_set1_ps(-5e+00f), tmp11526);
tmp11527 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-1.25e+00f), tmp11527);
tmp11531 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-1.25e+00f), tmp11531);
// fmadd / fnmadd on the same operands yield the +2x and -2x variants.
in1616 = _mm512_fmadd_ps(tmp11527, _mm512_set1_ps(2e+00f), tmp11528);
in1624 = _mm512_fmadd_ps(tmp11531, _mm512_set1_ps(2e+00f), tmp11532);
tmp11528 = _mm512_fnmadd_ps(tmp11527, _mm512_set1_ps(2e+00f), tmp11528);
tmp11532 = _mm512_fnmadd_ps(tmp11531, _mm512_set1_ps(2e+00f), tmp11532);
tmp11527 = _mm512_fmadd_ps(in1614, _mm512_set1_ps(2.5e-01f), tmp11521);
tmp11531 = _mm512_fmadd_ps(in1622, _mm512_set1_ps(2.5e-01f), tmp11525);
// c1 is no longer needed in its original form; reuse its register for c7 - c1.
tmp11521 = _mm512_sub_ps(in1615, tmp11521);
tmp11525 = _mm512_sub_ps(in1623, tmp11525);
tmp11527 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(-1.25e+00f), tmp11527);
tmp11531 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(-1.25e+00f), tmp11531);
in1618 = _mm512_sub_ps(in1618, in1614);
in1626 = _mm512_sub_ps(in1626, in1622);
in1618 = _mm512_fmadd_ps(in1618, _mm512_set1_ps(5.25e+00f), tmp11521);
in1626 = _mm512_fmadd_ps(in1626, _mm512_set1_ps(5.25e+00f), tmp11525);
tmp11522 = _mm512_fmadd_ps(tmp11527, _mm512_set1_ps(2e+00f), tmp11520);
tmp11526 = _mm512_fmadd_ps(tmp11531, _mm512_set1_ps(2e+00f), tmp11524);
tmp11520 = _mm512_fnmadd_ps(tmp11527, _mm512_set1_ps(2e+00f), tmp11520);
tmp11524 = _mm512_fnmadd_ps(tmp11531, _mm512_set1_ps(2e+00f), tmp11524);
// Regroup the sixteen pass-2 results by 256-bit halves before storing.
// Immediate 68 (0x44) builds {low 256 bits of operand 1, low 256 bits of operand 2};
// immediate 238 (0xEE) builds {high 256 bits of operand 1, high 256 bits of operand 2}.
// Each consecutive pair of pass-2 results is therefore split into a "low halves" vector
// (out1487..out1494) and a "high halves" vector (out1495..out1502).
__m512 out1487 = _mm512_shuffle_f32x4(in1612, tmp11529, 68);
__m512 out1495 = _mm512_shuffle_f32x4(in1612, tmp11529, 238);
__m512 out1488 = _mm512_shuffle_f32x4(tmp11530, in1616, 68);
__m512 out1496 = _mm512_shuffle_f32x4(tmp11530, in1616, 238);
__m512 out1489 = _mm512_shuffle_f32x4(tmp11528, tmp11522, 68);
__m512 out1497 = _mm512_shuffle_f32x4(tmp11528, tmp11522, 238);
__m512 out1490 = _mm512_shuffle_f32x4(tmp11520, in1618, 68);
__m512 out1498 = _mm512_shuffle_f32x4(tmp11520, in1618, 238);
__m512 out1491 = _mm512_shuffle_f32x4(in1620, tmp11533, 68);
__m512 out1499 = _mm512_shuffle_f32x4(in1620, tmp11533, 238);
__m512 out1492 = _mm512_shuffle_f32x4(tmp11534, in1624, 68);
__m512 out1500 = _mm512_shuffle_f32x4(tmp11534, in1624, 238);
__m512 out1493 = _mm512_shuffle_f32x4(tmp11532, tmp11526, 68);
__m512 out1501 = _mm512_shuffle_f32x4(tmp11532, tmp11526, 238);
__m512 out1494 = _mm512_shuffle_f32x4(tmp11524, in1626, 68);
__m512 out1502 = _mm512_shuffle_f32x4(tmp11524, in1626, 238);
// Sixteen unaligned 512-bit stores to dfPtr11. They form four groups whose base offsets
// are 12800 apart (0, 12800, 25600, 38400); inside a group the four vectors land at
// +0, +128, +64, +192 in that statement order. The j42 and k132 terms select the
// destination slot (strides 3072 and 768).
// NOTE(review): offsets are presumably bytes (64 = one 512-bit vector) if dfPtr11 is a
// byte pointer; its declaration is not visible in this span. TODO confirm.
_mm512_storeu_ps(dfPtr11+0+51200*i49+3072*j42+3072*s46+768*k132, out1487);
_mm512_storeu_ps(dfPtr11+128+51200*i49+3072*j42+3072*s46+768*k132, out1495);
_mm512_storeu_ps(dfPtr11+64+51200*i49+3072*j42+3072*s46+768*k132, out1491);
_mm512_storeu_ps(dfPtr11+192+51200*i49+3072*j42+3072*s46+768*k132, out1499);
_mm512_storeu_ps(dfPtr11+12800+51200*i49+3072*j42+3072*s46+768*k132, out1488);
_mm512_storeu_ps(dfPtr11+12928+51200*i49+3072*j42+3072*s46+768*k132, out1496);
_mm512_storeu_ps(dfPtr11+12864+51200*i49+3072*j42+3072*s46+768*k132, out1492);
_mm512_storeu_ps(dfPtr11+12992+51200*i49+3072*j42+3072*s46+768*k132, out1500);
_mm512_storeu_ps(dfPtr11+25600+51200*i49+3072*j42+3072*s46+768*k132, out1489);
_mm512_storeu_ps(dfPtr11+25728+51200*i49+3072*j42+3072*s46+768*k132, out1497);
_mm512_storeu_ps(dfPtr11+25664+51200*i49+3072*j42+3072*s46+768*k132, out1493);
_mm512_storeu_ps(dfPtr11+25792+51200*i49+3072*j42+3072*s46+768*k132, out1501);
_mm512_storeu_ps(dfPtr11+38400+51200*i49+3072*j42+3072*s46+768*k132, out1490);
_mm512_storeu_ps(dfPtr11+38528+51200*i49+3072*j42+3072*s46+768*k132, out1498);
_mm512_storeu_ps(dfPtr11+38464+51200*i49+3072*j42+3072*s46+768*k132, out1494);
_mm512_storeu_ps(dfPtr11+38592+51200*i49+3072*j42+3072*s46+768*k132, out1502);
__m512 dat2163 = _mm512_maskz_loadu_ps(16383, datPtr25+648+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2164 = _mm512_maskz_loadu_ps(16383, datPtr25+3136+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512i pm191 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1628 = _mm512_permutexvar_ps(pm191, dat2163);
__m512 in1636 = _mm512_permutexvar_ps(pm191, dat2164);
__m512 dat2165 = _mm512_maskz_loadu_ps(16383, datPtr25+760+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2166 = _mm512_maskz_loadu_ps(16383, datPtr25+3248+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1629 = _mm512_permutexvar_ps(pm191, dat2165);
__m512 in1637 = _mm512_permutexvar_ps(pm191, dat2166);
__m512 dat2167 = _mm512_maskz_loadu_ps(16383, datPtr25+872+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2168 = _mm512_maskz_loadu_ps(16383, datPtr25+3360+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1630 = _mm512_permutexvar_ps(pm191, dat2167);
__m512 in1638 = _mm512_permutexvar_ps(pm191, dat2168);
__m512 dat2169 = _mm512_maskz_loadu_ps(16383, datPtr25+984+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2170 = _mm512_maskz_loadu_ps(16383, datPtr25+3472+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1631 = _mm512_permutexvar_ps(pm191, dat2169);
__m512 in1639 = _mm512_permutexvar_ps(pm191, dat2170);
__m512 dat2171 = _mm512_maskz_loadu_ps(16383, datPtr25+1096+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2172 = _mm512_maskz_loadu_ps(16383, datPtr25+3584+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1632 = _mm512_permutexvar_ps(pm191, dat2171);
__m512 in1640 = _mm512_permutexvar_ps(pm191, dat2172);
__m512 dat2173 = _mm512_maskz_loadu_ps(16383, datPtr25+1208+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2174 = _mm512_maskz_loadu_ps(16383, datPtr25+3696+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1633 = _mm512_permutexvar_ps(pm191, dat2173);
__m512 in1641 = _mm512_permutexvar_ps(pm191, dat2174);
__m512 dat2175 = _mm512_maskz_loadu_ps(16383, datPtr25+1320+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2176 = _mm512_maskz_loadu_ps(16383, datPtr25+3808+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1634 = _mm512_permutexvar_ps(pm191, dat2175);
__m512 in1642 = _mm512_permutexvar_ps(pm191, dat2176);
__m512 dat2177 = _mm512_maskz_loadu_ps(16383, datPtr25+1432+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2178 = _mm512_maskz_loadu_ps(16383, datPtr25+3920+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1635 = _mm512_permutexvar_ps(pm191, dat2177);
__m512 in1643 = _mm512_permutexvar_ps(pm191, dat2178);
// First 8-point pass, applied to two independent register sets whose
// statements are interleaved line by line:
//   set A: in1628..in1635 with temporaries tmp11583..tmp11586
//   set B: in1636..in1643 with temporaries tmp11587..tmp11590
// Writing x0..x7 for the values held by in1628..in1635 on entry, the pass
// leaves (mathematically, ignoring FMA rounding):
//   in1628   = x0 - x6 + 5.25*(x4 - x2)
//   tmp11585 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp11586 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in1634   = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)
//   tmp11584 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)
//   in1630   = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1632   = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1631   = x7 - x1 + 5.25*(x3 - x5)
// Set B receives the same formulas with in1636..in1643 in place of x0..x7.
// NOTE(review): these coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) -- confirm against the generator.
__m512 tmp11583 = _mm512_add_ps(in1629, in1633);
__m512 tmp11587 = _mm512_add_ps(in1637, in1641);
__m512 tmp11584 = _mm512_sub_ps(in1632, in1630);
__m512 tmp11588 = _mm512_sub_ps(in1640, in1638);
__m512 tmp11585 = _mm512_add_ps(in1630, in1634);
__m512 tmp11589 = _mm512_add_ps(in1638, in1642);
in1628 = _mm512_sub_ps(in1628, in1634);
in1636 = _mm512_sub_ps(in1636, in1642);
tmp11583 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-4.25e+00f), tmp11583);
tmp11587 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-4.25e+00f), tmp11587);
tmp11585 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-4.25e+00f), tmp11585);
tmp11589 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-4.25e+00f), tmp11589);
in1628 = _mm512_fmadd_ps(tmp11584, _mm512_set1_ps(5.25e+00f), in1628);
in1636 = _mm512_fmadd_ps(tmp11588, _mm512_set1_ps(5.25e+00f), in1636);
// tmp11584/tmp11588 are reused from here on for the 0.25*x2 + x6 partial sum.
tmp11584 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(2.5e-01f), in1634);
tmp11588 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(2.5e-01f), in1642);
in1630 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(4e+00f), in1634);
in1638 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(4e+00f), in1642);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11586 = _mm512_sub_ps(tmp11585, tmp11583);
__m512 tmp11590 = _mm512_sub_ps(tmp11589, tmp11587);
tmp11585 = _mm512_add_ps(tmp11583, tmp11585);
tmp11589 = _mm512_add_ps(tmp11587, tmp11589);
tmp11583 = _mm512_fmadd_ps(in1629, _mm512_set1_ps(2.5e-01f), in1633);
tmp11587 = _mm512_fmadd_ps(in1637, _mm512_set1_ps(2.5e-01f), in1641);
tmp11584 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-1.25e+00f), tmp11584);
tmp11588 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-1.25e+00f), tmp11588);
in1632 = _mm512_fmadd_ps(in1632, _mm512_set1_ps(-5e+00f), in1630);
in1640 = _mm512_fmadd_ps(in1640, _mm512_set1_ps(-5e+00f), in1638);
tmp11583 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-1.25e+00f), tmp11583);
tmp11587 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-1.25e+00f), tmp11587);
// fmadd/fnmadd pair: same two operands combined with +2x and -2x.
in1634 = _mm512_fmadd_ps(tmp11583, _mm512_set1_ps(2e+00f), tmp11584);
in1642 = _mm512_fmadd_ps(tmp11587, _mm512_set1_ps(2e+00f), tmp11588);
tmp11584 = _mm512_fnmadd_ps(tmp11583, _mm512_set1_ps(2e+00f), tmp11584);
tmp11588 = _mm512_fnmadd_ps(tmp11587, _mm512_set1_ps(2e+00f), tmp11588);
tmp11583 = _mm512_fmadd_ps(in1633, _mm512_set1_ps(2.5e-01f), in1629);
tmp11587 = _mm512_fmadd_ps(in1641, _mm512_set1_ps(2.5e-01f), in1637);
// in1629/in1637 are overwritten with x7 - x1 only after their last use above.
in1629 = _mm512_sub_ps(in1635, in1629);
in1637 = _mm512_sub_ps(in1643, in1637);
tmp11583 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(-1.25e+00f), tmp11583);
tmp11587 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(-1.25e+00f), tmp11587);
in1631 = _mm512_sub_ps(in1631, in1633);
in1639 = _mm512_sub_ps(in1639, in1641);
in1631 = _mm512_fmadd_ps(in1631, _mm512_set1_ps(5.25e+00f), in1629);
in1639 = _mm512_fmadd_ps(in1639, _mm512_set1_ps(5.25e+00f), in1637);
in1630 = _mm512_fmadd_ps(tmp11583, _mm512_set1_ps(2e+00f), in1632);
in1638 = _mm512_fmadd_ps(tmp11587, _mm512_set1_ps(2e+00f), in1640);
in1632 = _mm512_fnmadd_ps(tmp11583, _mm512_set1_ps(2e+00f), in1632);
in1640 = _mm512_fnmadd_ps(tmp11587, _mm512_set1_ps(2e+00f), in1640);
// Shuffle network over sixteen vectors: in1628, tmp11585, tmp11586, in1634,
// tmp11584, in1630, in1632, in1631 and the matching eight of the second
// register set (in1636, tmp11589, tmp11590, in1642, tmp11588, in1638, in1640,
// in1639). The results are written back into those same variables.
// NOTE(review): taken together the stages look like a transpose of 8x8
// sub-blocks, so that the following pass runs along the other axis -- confirm.
//
// Stage 1: interleave the 32-bit elements of consecutive vector pairs within
// each 128-bit lane (unpacklo = low two elements, unpackhi = high two).
__m512 tmp11599 = _mm512_unpacklo_ps(in1628, tmp11585);
__m512 tmp11600 = _mm512_unpackhi_ps(in1628, tmp11585);
__m512 tmp11601 = _mm512_unpacklo_ps(tmp11586, in1634);
__m512 tmp11602 = _mm512_unpackhi_ps(tmp11586, in1634);
__m512 tmp11603 = _mm512_unpacklo_ps(tmp11584, in1630);
__m512 tmp11604 = _mm512_unpackhi_ps(tmp11584, in1630);
__m512 tmp11605 = _mm512_unpacklo_ps(in1632, in1631);
__m512 tmp11606 = _mm512_unpackhi_ps(in1632, in1631);
__m512 tmp11607 = _mm512_unpacklo_ps(in1636, tmp11589);
__m512 tmp11608 = _mm512_unpackhi_ps(in1636, tmp11589);
__m512 tmp11609 = _mm512_unpacklo_ps(tmp11590, in1642);
__m512 tmp11610 = _mm512_unpackhi_ps(tmp11590, in1642);
__m512 tmp11611 = _mm512_unpacklo_ps(tmp11588, in1638);
__m512 tmp11612 = _mm512_unpackhi_ps(tmp11588, in1638);
__m512 tmp11613 = _mm512_unpacklo_ps(in1640, in1639);
__m512 tmp11614 = _mm512_unpackhi_ps(in1640, in1639);
// Stage 2: per 128-bit lane, immediate 68 (0x44) takes elements 0,1 of each
// operand and 238 (0xEE) takes elements 2,3 of each operand.
__m512 tmp11615 = _mm512_shuffle_ps(tmp11599, tmp11601, 68);
__m512 tmp11616 = _mm512_shuffle_ps(tmp11599, tmp11601, 238);
__m512 tmp11617 = _mm512_shuffle_ps(tmp11600, tmp11602, 68);
__m512 tmp11618 = _mm512_shuffle_ps(tmp11600, tmp11602, 238);
__m512 tmp11619 = _mm512_shuffle_ps(tmp11603, tmp11605, 68);
__m512 tmp11620 = _mm512_shuffle_ps(tmp11603, tmp11605, 238);
__m512 tmp11621 = _mm512_shuffle_ps(tmp11604, tmp11606, 68);
__m512 tmp11622 = _mm512_shuffle_ps(tmp11604, tmp11606, 238);
__m512 tmp11623 = _mm512_shuffle_ps(tmp11607, tmp11609, 68);
__m512 tmp11624 = _mm512_shuffle_ps(tmp11607, tmp11609, 238);
__m512 tmp11625 = _mm512_shuffle_ps(tmp11608, tmp11610, 68);
__m512 tmp11626 = _mm512_shuffle_ps(tmp11608, tmp11610, 238);
__m512 tmp11627 = _mm512_shuffle_ps(tmp11611, tmp11613, 68);
__m512 tmp11628 = _mm512_shuffle_ps(tmp11611, tmp11613, 238);
__m512 tmp11629 = _mm512_shuffle_ps(tmp11612, tmp11614, 68);
__m512 tmp11630 = _mm512_shuffle_ps(tmp11612, tmp11614, 238);
// Stage 3: whole 128-bit lanes. Immediate 136 (0x88) gathers lanes 0 and 2 of
// each operand; 221 (0xDD) gathers lanes 1 and 3 of each operand.
__m512 tmp11631 = _mm512_shuffle_f32x4(tmp11615, tmp11619, 136);
__m512 tmp11632 = _mm512_shuffle_f32x4(tmp11615, tmp11619, 221);
__m512 tmp11633 = _mm512_shuffle_f32x4(tmp11616, tmp11620, 136);
__m512 tmp11634 = _mm512_shuffle_f32x4(tmp11616, tmp11620, 221);
__m512 tmp11635 = _mm512_shuffle_f32x4(tmp11617, tmp11621, 136);
__m512 tmp11636 = _mm512_shuffle_f32x4(tmp11617, tmp11621, 221);
__m512 tmp11637 = _mm512_shuffle_f32x4(tmp11618, tmp11622, 136);
__m512 tmp11638 = _mm512_shuffle_f32x4(tmp11618, tmp11622, 221);
__m512 tmp11639 = _mm512_shuffle_f32x4(tmp11623, tmp11627, 136);
__m512 tmp11640 = _mm512_shuffle_f32x4(tmp11623, tmp11627, 221);
__m512 tmp11641 = _mm512_shuffle_f32x4(tmp11624, tmp11628, 136);
__m512 tmp11642 = _mm512_shuffle_f32x4(tmp11624, tmp11628, 221);
__m512 tmp11643 = _mm512_shuffle_f32x4(tmp11625, tmp11629, 136);
__m512 tmp11644 = _mm512_shuffle_f32x4(tmp11625, tmp11629, 221);
__m512 tmp11645 = _mm512_shuffle_f32x4(tmp11626, tmp11630, 136);
__m512 tmp11646 = _mm512_shuffle_f32x4(tmp11626, tmp11630, 221);
// Stage 4: same two lane selections, now combining a vector derived from the
// first register set with one derived from the second; the outputs reuse the
// in*/tmp* names consumed at the top of this block.
in1628 = _mm512_shuffle_f32x4(tmp11631, tmp11639, 136);
in1636 = _mm512_shuffle_f32x4(tmp11631, tmp11639, 221);
tmp11585 = _mm512_shuffle_f32x4(tmp11633, tmp11641, 136);
tmp11589 = _mm512_shuffle_f32x4(tmp11633, tmp11641, 221);
tmp11586 = _mm512_shuffle_f32x4(tmp11635, tmp11643, 136);
tmp11590 = _mm512_shuffle_f32x4(tmp11635, tmp11643, 221);
in1634 = _mm512_shuffle_f32x4(tmp11637, tmp11645, 136);
in1642 = _mm512_shuffle_f32x4(tmp11637, tmp11645, 221);
tmp11584 = _mm512_shuffle_f32x4(tmp11632, tmp11640, 136);
tmp11588 = _mm512_shuffle_f32x4(tmp11632, tmp11640, 221);
in1630 = _mm512_shuffle_f32x4(tmp11634, tmp11642, 136);
in1638 = _mm512_shuffle_f32x4(tmp11634, tmp11642, 221);
in1632 = _mm512_shuffle_f32x4(tmp11636, tmp11644, 136);
in1640 = _mm512_shuffle_f32x4(tmp11636, tmp11644, 221);
in1631 = _mm512_shuffle_f32x4(tmp11638, tmp11646, 136);
in1639 = _mm512_shuffle_f32x4(tmp11638, tmp11646, 221);
// Second 8-point pass, again interleaving two independent register sets:
//   set A inputs z0..z7 = in1628, tmp11585, tmp11586, in1634, tmp11584,
//                         in1630, in1632, in1631 (temporaries tmp11591..94)
//   set B inputs        = in1636, tmp11589, tmp11590, in1642, tmp11588,
//                         in1638, in1640, in1639 (temporaries tmp11595..98)
// Results for set A (mathematically, ignoring FMA rounding):
//   in1628   = z0 - z6 + 5.25*(z4 - z2)
//   tmp11593 = (z2 + z6 - 4.25*z4) + (z1 + z5 - 4.25*z3)
//   tmp11594 = (z2 + z6 - 4.25*z4) - (z1 + z5 - 4.25*z3)
//   in1632   = (0.25*z2 - 1.25*z4 + z6) + 2*(0.25*z1 - 1.25*z3 + z5)
//   tmp11592 = (0.25*z2 - 1.25*z4 + z6) - 2*(0.25*z1 - 1.25*z3 + z5)
//   tmp11586 = (4*z2 - 5*z4 + z6) + 2*(z1 - 1.25*z3 + 0.25*z5)
//   tmp11584 = (4*z2 - 5*z4 + z6) - 2*(z1 - 1.25*z3 + 0.25*z5)
//   in1634   = z7 - z1 + 5.25*(z3 - z5)
// Set B follows the same formulas on its own inputs.
__m512 tmp11591 = _mm512_add_ps(tmp11585, in1630);
__m512 tmp11595 = _mm512_add_ps(tmp11589, in1638);
__m512 tmp11592 = _mm512_sub_ps(tmp11584, tmp11586);
__m512 tmp11596 = _mm512_sub_ps(tmp11588, tmp11590);
__m512 tmp11593 = _mm512_add_ps(tmp11586, in1632);
__m512 tmp11597 = _mm512_add_ps(tmp11590, in1640);
in1628 = _mm512_sub_ps(in1628, in1632);
in1636 = _mm512_sub_ps(in1636, in1640);
tmp11591 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-4.25e+00f), tmp11591);
tmp11595 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-4.25e+00f), tmp11595);
tmp11593 = _mm512_fmadd_ps(tmp11584, _mm512_set1_ps(-4.25e+00f), tmp11593);
tmp11597 = _mm512_fmadd_ps(tmp11588, _mm512_set1_ps(-4.25e+00f), tmp11597);
in1628 = _mm512_fmadd_ps(tmp11592, _mm512_set1_ps(5.25e+00f), in1628);
in1636 = _mm512_fmadd_ps(tmp11596, _mm512_set1_ps(5.25e+00f), in1636);
// tmp11592/tmp11596 are reused from here on for the 0.25*z2 + z6 partial sum.
tmp11592 = _mm512_fmadd_ps(tmp11586, _mm512_set1_ps(2.5e-01f), in1632);
tmp11596 = _mm512_fmadd_ps(tmp11590, _mm512_set1_ps(2.5e-01f), in1640);
tmp11586 = _mm512_fmadd_ps(tmp11586, _mm512_set1_ps(4e+00f), in1632);
tmp11590 = _mm512_fmadd_ps(tmp11590, _mm512_set1_ps(4e+00f), in1640);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11594 = _mm512_sub_ps(tmp11593, tmp11591);
__m512 tmp11598 = _mm512_sub_ps(tmp11597, tmp11595);
tmp11593 = _mm512_add_ps(tmp11591, tmp11593);
tmp11597 = _mm512_add_ps(tmp11595, tmp11597);
tmp11591 = _mm512_fmadd_ps(tmp11585, _mm512_set1_ps(2.5e-01f), in1630);
tmp11595 = _mm512_fmadd_ps(tmp11589, _mm512_set1_ps(2.5e-01f), in1638);
tmp11592 = _mm512_fmadd_ps(tmp11584, _mm512_set1_ps(-1.25e+00f), tmp11592);
tmp11596 = _mm512_fmadd_ps(tmp11588, _mm512_set1_ps(-1.25e+00f), tmp11596);
tmp11584 = _mm512_fmadd_ps(tmp11584, _mm512_set1_ps(-5e+00f), tmp11586);
tmp11588 = _mm512_fmadd_ps(tmp11588, _mm512_set1_ps(-5e+00f), tmp11590);
tmp11591 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-1.25e+00f), tmp11591);
tmp11595 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-1.25e+00f), tmp11595);
// fmadd/fnmadd pair: same two operands combined with +2x and -2x.
in1632 = _mm512_fmadd_ps(tmp11591, _mm512_set1_ps(2e+00f), tmp11592);
in1640 = _mm512_fmadd_ps(tmp11595, _mm512_set1_ps(2e+00f), tmp11596);
tmp11592 = _mm512_fnmadd_ps(tmp11591, _mm512_set1_ps(2e+00f), tmp11592);
tmp11596 = _mm512_fnmadd_ps(tmp11595, _mm512_set1_ps(2e+00f), tmp11596);
tmp11591 = _mm512_fmadd_ps(in1630, _mm512_set1_ps(2.5e-01f), tmp11585);
tmp11595 = _mm512_fmadd_ps(in1638, _mm512_set1_ps(2.5e-01f), tmp11589);
// tmp11585/tmp11589 are overwritten with z7 - z1 only after their last use above.
tmp11585 = _mm512_sub_ps(in1631, tmp11585);
tmp11589 = _mm512_sub_ps(in1639, tmp11589);
tmp11591 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(-1.25e+00f), tmp11591);
tmp11595 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(-1.25e+00f), tmp11595);
in1634 = _mm512_sub_ps(in1634, in1630);
in1642 = _mm512_sub_ps(in1642, in1638);
in1634 = _mm512_fmadd_ps(in1634, _mm512_set1_ps(5.25e+00f), tmp11585);
in1642 = _mm512_fmadd_ps(in1642, _mm512_set1_ps(5.25e+00f), tmp11589);
tmp11586 = _mm512_fmadd_ps(tmp11591, _mm512_set1_ps(2e+00f), tmp11584);
tmp11590 = _mm512_fmadd_ps(tmp11595, _mm512_set1_ps(2e+00f), tmp11588);
tmp11584 = _mm512_fnmadd_ps(tmp11591, _mm512_set1_ps(2e+00f), tmp11584);
tmp11588 = _mm512_fnmadd_ps(tmp11595, _mm512_set1_ps(2e+00f), tmp11588);
// Pack the sixteen second-pass results into output vectors and store them.
// Each pair of results is combined twice: immediate 68 (0x44) keeps 128-bit
// lanes 0,1 of both operands (their low halves) and 238 (0xEE) keeps lanes 2,3
// (their high halves).
__m512 out1503 = _mm512_shuffle_f32x4(in1628, tmp11593, 68);
__m512 out1511 = _mm512_shuffle_f32x4(in1628, tmp11593, 238);
__m512 out1504 = _mm512_shuffle_f32x4(tmp11594, in1632, 68);
__m512 out1512 = _mm512_shuffle_f32x4(tmp11594, in1632, 238);
__m512 out1505 = _mm512_shuffle_f32x4(tmp11592, tmp11586, 68);
__m512 out1513 = _mm512_shuffle_f32x4(tmp11592, tmp11586, 238);
__m512 out1506 = _mm512_shuffle_f32x4(tmp11584, in1634, 68);
__m512 out1514 = _mm512_shuffle_f32x4(tmp11584, in1634, 238);
__m512 out1507 = _mm512_shuffle_f32x4(in1636, tmp11597, 68);
__m512 out1515 = _mm512_shuffle_f32x4(in1636, tmp11597, 238);
__m512 out1508 = _mm512_shuffle_f32x4(tmp11598, in1640, 68);
__m512 out1516 = _mm512_shuffle_f32x4(tmp11598, in1640, 238);
__m512 out1509 = _mm512_shuffle_f32x4(tmp11596, tmp11590, 68);
__m512 out1517 = _mm512_shuffle_f32x4(tmp11596, tmp11590, 238);
__m512 out1510 = _mm512_shuffle_f32x4(tmp11588, in1642, 68);
__m512 out1518 = _mm512_shuffle_f32x4(tmp11588, in1642, 238);
// Unaligned stores into dfPtr11. Four groups of four stores start at constant
// offsets 256, 13056, 25856 and 38656 (12800 apart); inside a group the
// offsets are +0, +128, +64, +192. The i49/j42/s46/k132 terms are shared by
// all sixteen stores.
// NOTE(review): offsets are in units of dfPtr11's pointee type, which is
// declared outside this excerpt -- presumably bytes (64 = one __m512); confirm.
_mm512_storeu_ps(dfPtr11+256+51200*i49+3072*j42+3072*s46+768*k132, out1503);
_mm512_storeu_ps(dfPtr11+384+51200*i49+3072*j42+3072*s46+768*k132, out1511);
_mm512_storeu_ps(dfPtr11+320+51200*i49+3072*j42+3072*s46+768*k132, out1507);
_mm512_storeu_ps(dfPtr11+448+51200*i49+3072*j42+3072*s46+768*k132, out1515);
_mm512_storeu_ps(dfPtr11+13056+51200*i49+3072*j42+3072*s46+768*k132, out1504);
_mm512_storeu_ps(dfPtr11+13184+51200*i49+3072*j42+3072*s46+768*k132, out1512);
_mm512_storeu_ps(dfPtr11+13120+51200*i49+3072*j42+3072*s46+768*k132, out1508);
_mm512_storeu_ps(dfPtr11+13248+51200*i49+3072*j42+3072*s46+768*k132, out1516);
_mm512_storeu_ps(dfPtr11+25856+51200*i49+3072*j42+3072*s46+768*k132, out1505);
_mm512_storeu_ps(dfPtr11+25984+51200*i49+3072*j42+3072*s46+768*k132, out1513);
_mm512_storeu_ps(dfPtr11+25920+51200*i49+3072*j42+3072*s46+768*k132, out1509);
_mm512_storeu_ps(dfPtr11+26048+51200*i49+3072*j42+3072*s46+768*k132, out1517);
_mm512_storeu_ps(dfPtr11+38656+51200*i49+3072*j42+3072*s46+768*k132, out1506);
_mm512_storeu_ps(dfPtr11+38784+51200*i49+3072*j42+3072*s46+768*k132, out1514);
_mm512_storeu_ps(dfPtr11+38720+51200*i49+3072*j42+3072*s46+768*k132, out1510);
_mm512_storeu_ps(dfPtr11+38848+51200*i49+3072*j42+3072*s46+768*k132, out1518);
// Load stage: build eight rows for each of two register sets, in1644..in1651
// and in1652..in1659. Every row comes from three zero-masked unaligned loads:
// mask 31 keeps lanes 0-4, mask 8191 keeps lanes 0-12, mask 255 keeps lanes
// 0-7; all other lanes are zeroed. From one row to the next each of the three
// load offsets advances by 112.
// NOTE(review): offsets are in units of datPtr25's pointee type, which is
// declared outside this excerpt -- confirm.
__m512 dat2179 = _mm512_maskz_loadu_ps(31, datPtr25+3184+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2180 = _mm512_maskz_loadu_ps(8191, datPtr25+3764+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2181 = _mm512_maskz_loadu_ps(255, datPtr25+3808+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
// pm192 (arguments are listed from lane 15 down to lane 0): result lanes 0-4
// take lanes 0-4 of the first source; lanes 5-8 use index 15, i.e. lane 15 of
// the first source, which the mask-31 load zeroed, so they become 0.0f; lanes
// 9-15 use indices 16-22, i.e. lanes 0-6 of the second source.
__m512i pm192 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1644 = _mm512_permutex2var_ps(dat2179, pm192, dat2180);
// pm193: result lanes 0-7 take lanes 5-12 of the first source; lanes 8-15 use
// indices 16-23, i.e. lanes 0-7 of the second source.
__m512i pm193 = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in1652 = _mm512_permutex2var_ps(dat2180, pm193, dat2181);
// Remaining seven rows: same three loads and the same two permutes.
__m512 dat2182 = _mm512_maskz_loadu_ps(31, datPtr25+3296+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2183 = _mm512_maskz_loadu_ps(8191, datPtr25+3876+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2184 = _mm512_maskz_loadu_ps(255, datPtr25+3920+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1645 = _mm512_permutex2var_ps(dat2182, pm192, dat2183);
__m512 in1653 = _mm512_permutex2var_ps(dat2183, pm193, dat2184);
__m512 dat2185 = _mm512_maskz_loadu_ps(31, datPtr25+3408+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2186 = _mm512_maskz_loadu_ps(8191, datPtr25+3988+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2187 = _mm512_maskz_loadu_ps(255, datPtr25+4032+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1646 = _mm512_permutex2var_ps(dat2185, pm192, dat2186);
__m512 in1654 = _mm512_permutex2var_ps(dat2186, pm193, dat2187);
__m512 dat2188 = _mm512_maskz_loadu_ps(31, datPtr25+3520+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2189 = _mm512_maskz_loadu_ps(8191, datPtr25+4100+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2190 = _mm512_maskz_loadu_ps(255, datPtr25+4144+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1647 = _mm512_permutex2var_ps(dat2188, pm192, dat2189);
__m512 in1655 = _mm512_permutex2var_ps(dat2189, pm193, dat2190);
__m512 dat2191 = _mm512_maskz_loadu_ps(31, datPtr25+3632+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2192 = _mm512_maskz_loadu_ps(8191, datPtr25+4212+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2193 = _mm512_maskz_loadu_ps(255, datPtr25+4256+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1648 = _mm512_permutex2var_ps(dat2191, pm192, dat2192);
__m512 in1656 = _mm512_permutex2var_ps(dat2192, pm193, dat2193);
__m512 dat2194 = _mm512_maskz_loadu_ps(31, datPtr25+3744+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2195 = _mm512_maskz_loadu_ps(8191, datPtr25+4324+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2196 = _mm512_maskz_loadu_ps(255, datPtr25+4368+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1649 = _mm512_permutex2var_ps(dat2194, pm192, dat2195);
__m512 in1657 = _mm512_permutex2var_ps(dat2195, pm193, dat2196);
__m512 dat2197 = _mm512_maskz_loadu_ps(31, datPtr25+3856+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2198 = _mm512_maskz_loadu_ps(8191, datPtr25+4436+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2199 = _mm512_maskz_loadu_ps(255, datPtr25+4480+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1650 = _mm512_permutex2var_ps(dat2197, pm192, dat2198);
__m512 in1658 = _mm512_permutex2var_ps(dat2198, pm193, dat2199);
__m512 dat2200 = _mm512_maskz_loadu_ps(31, datPtr25+3968+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2201 = _mm512_maskz_loadu_ps(8191, datPtr25+4548+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 dat2202 = _mm512_maskz_loadu_ps(255, datPtr25+4592+25088*i49+112*h45+4*w58+25088*s46+6272*k132);
__m512 in1651 = _mm512_permutex2var_ps(dat2200, pm192, dat2201);
__m512 in1659 = _mm512_permutex2var_ps(dat2201, pm193, dat2202);
// First 8-point pass, applied to two independent register sets whose
// statements are interleaved line by line:
//   set A: in1644..in1651 with temporaries tmp11647..tmp11650
//   set B: in1652..in1659 with temporaries tmp11651..tmp11654
// Writing x0..x7 for the values held by in1644..in1651 on entry, the pass
// leaves (mathematically, ignoring FMA rounding):
//   in1644   = x0 - x6 + 5.25*(x4 - x2)
//   tmp11649 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp11650 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in1650   = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)
//   tmp11648 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)
//   in1646   = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1648   = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1647   = x7 - x1 + 5.25*(x3 - x5)
// Set B receives the same formulas with in1652..in1659 in place of x0..x7.
// NOTE(review): these coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3) -- confirm against the generator.
__m512 tmp11647 = _mm512_add_ps(in1645, in1649);
__m512 tmp11651 = _mm512_add_ps(in1653, in1657);
__m512 tmp11648 = _mm512_sub_ps(in1648, in1646);
__m512 tmp11652 = _mm512_sub_ps(in1656, in1654);
__m512 tmp11649 = _mm512_add_ps(in1646, in1650);
__m512 tmp11653 = _mm512_add_ps(in1654, in1658);
in1644 = _mm512_sub_ps(in1644, in1650);
in1652 = _mm512_sub_ps(in1652, in1658);
tmp11647 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-4.25e+00f), tmp11647);
tmp11651 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-4.25e+00f), tmp11651);
tmp11649 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-4.25e+00f), tmp11649);
tmp11653 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-4.25e+00f), tmp11653);
in1644 = _mm512_fmadd_ps(tmp11648, _mm512_set1_ps(5.25e+00f), in1644);
in1652 = _mm512_fmadd_ps(tmp11652, _mm512_set1_ps(5.25e+00f), in1652);
// tmp11648/tmp11652 are reused from here on for the 0.25*x2 + x6 partial sum.
tmp11648 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(2.5e-01f), in1650);
tmp11652 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(2.5e-01f), in1658);
in1646 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(4e+00f), in1650);
in1654 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(4e+00f), in1658);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11650 = _mm512_sub_ps(tmp11649, tmp11647);
__m512 tmp11654 = _mm512_sub_ps(tmp11653, tmp11651);
tmp11649 = _mm512_add_ps(tmp11647, tmp11649);
tmp11653 = _mm512_add_ps(tmp11651, tmp11653);
tmp11647 = _mm512_fmadd_ps(in1645, _mm512_set1_ps(2.5e-01f), in1649);
tmp11651 = _mm512_fmadd_ps(in1653, _mm512_set1_ps(2.5e-01f), in1657);
tmp11648 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-1.25e+00f), tmp11648);
tmp11652 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-1.25e+00f), tmp11652);
in1648 = _mm512_fmadd_ps(in1648, _mm512_set1_ps(-5e+00f), in1646);
in1656 = _mm512_fmadd_ps(in1656, _mm512_set1_ps(-5e+00f), in1654);
tmp11647 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-1.25e+00f), tmp11647);
tmp11651 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-1.25e+00f), tmp11651);
// fmadd/fnmadd pair: same two operands combined with +2x and -2x.
in1650 = _mm512_fmadd_ps(tmp11647, _mm512_set1_ps(2e+00f), tmp11648);
in1658 = _mm512_fmadd_ps(tmp11651, _mm512_set1_ps(2e+00f), tmp11652);
tmp11648 = _mm512_fnmadd_ps(tmp11647, _mm512_set1_ps(2e+00f), tmp11648);
tmp11652 = _mm512_fnmadd_ps(tmp11651, _mm512_set1_ps(2e+00f), tmp11652);
tmp11647 = _mm512_fmadd_ps(in1649, _mm512_set1_ps(2.5e-01f), in1645);
tmp11651 = _mm512_fmadd_ps(in1657, _mm512_set1_ps(2.5e-01f), in1653);
// in1645/in1653 are overwritten with x7 - x1 only after their last use above.
in1645 = _mm512_sub_ps(in1651, in1645);
in1653 = _mm512_sub_ps(in1659, in1653);
tmp11647 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(-1.25e+00f), tmp11647);
tmp11651 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(-1.25e+00f), tmp11651);
in1647 = _mm512_sub_ps(in1647, in1649);
in1655 = _mm512_sub_ps(in1655, in1657);
in1647 = _mm512_fmadd_ps(in1647, _mm512_set1_ps(5.25e+00f), in1645);
in1655 = _mm512_fmadd_ps(in1655, _mm512_set1_ps(5.25e+00f), in1653);
in1646 = _mm512_fmadd_ps(tmp11647, _mm512_set1_ps(2e+00f), in1648);
in1654 = _mm512_fmadd_ps(tmp11651, _mm512_set1_ps(2e+00f), in1656);
in1648 = _mm512_fnmadd_ps(tmp11647, _mm512_set1_ps(2e+00f), in1648);
in1656 = _mm512_fnmadd_ps(tmp11651, _mm512_set1_ps(2e+00f), in1656);
// Shuffle network over sixteen vectors: in1644, tmp11649, tmp11650, in1650,
// tmp11648, in1646, in1648, in1647 and the matching eight of the second
// register set (in1652, tmp11653, tmp11654, in1658, tmp11652, in1654, in1656,
// in1655). The results are written back into those same variables.
// NOTE(review): taken together the stages look like a transpose of 8x8
// sub-blocks, so that the following pass runs along the other axis -- confirm.
//
// Stage 1: interleave the 32-bit elements of consecutive vector pairs within
// each 128-bit lane (unpacklo = low two elements, unpackhi = high two).
__m512 tmp11663 = _mm512_unpacklo_ps(in1644, tmp11649);
__m512 tmp11664 = _mm512_unpackhi_ps(in1644, tmp11649);
__m512 tmp11665 = _mm512_unpacklo_ps(tmp11650, in1650);
__m512 tmp11666 = _mm512_unpackhi_ps(tmp11650, in1650);
__m512 tmp11667 = _mm512_unpacklo_ps(tmp11648, in1646);
__m512 tmp11668 = _mm512_unpackhi_ps(tmp11648, in1646);
__m512 tmp11669 = _mm512_unpacklo_ps(in1648, in1647);
__m512 tmp11670 = _mm512_unpackhi_ps(in1648, in1647);
__m512 tmp11671 = _mm512_unpacklo_ps(in1652, tmp11653);
__m512 tmp11672 = _mm512_unpackhi_ps(in1652, tmp11653);
__m512 tmp11673 = _mm512_unpacklo_ps(tmp11654, in1658);
__m512 tmp11674 = _mm512_unpackhi_ps(tmp11654, in1658);
__m512 tmp11675 = _mm512_unpacklo_ps(tmp11652, in1654);
__m512 tmp11676 = _mm512_unpackhi_ps(tmp11652, in1654);
__m512 tmp11677 = _mm512_unpacklo_ps(in1656, in1655);
__m512 tmp11678 = _mm512_unpackhi_ps(in1656, in1655);
// Stage 2: per 128-bit lane, immediate 68 (0x44) takes elements 0,1 of each
// operand and 238 (0xEE) takes elements 2,3 of each operand.
__m512 tmp11679 = _mm512_shuffle_ps(tmp11663, tmp11665, 68);
__m512 tmp11680 = _mm512_shuffle_ps(tmp11663, tmp11665, 238);
__m512 tmp11681 = _mm512_shuffle_ps(tmp11664, tmp11666, 68);
__m512 tmp11682 = _mm512_shuffle_ps(tmp11664, tmp11666, 238);
__m512 tmp11683 = _mm512_shuffle_ps(tmp11667, tmp11669, 68);
__m512 tmp11684 = _mm512_shuffle_ps(tmp11667, tmp11669, 238);
__m512 tmp11685 = _mm512_shuffle_ps(tmp11668, tmp11670, 68);
__m512 tmp11686 = _mm512_shuffle_ps(tmp11668, tmp11670, 238);
__m512 tmp11687 = _mm512_shuffle_ps(tmp11671, tmp11673, 68);
__m512 tmp11688 = _mm512_shuffle_ps(tmp11671, tmp11673, 238);
__m512 tmp11689 = _mm512_shuffle_ps(tmp11672, tmp11674, 68);
__m512 tmp11690 = _mm512_shuffle_ps(tmp11672, tmp11674, 238);
__m512 tmp11691 = _mm512_shuffle_ps(tmp11675, tmp11677, 68);
__m512 tmp11692 = _mm512_shuffle_ps(tmp11675, tmp11677, 238);
__m512 tmp11693 = _mm512_shuffle_ps(tmp11676, tmp11678, 68);
__m512 tmp11694 = _mm512_shuffle_ps(tmp11676, tmp11678, 238);
// Stage 3: whole 128-bit lanes. Immediate 136 (0x88) gathers lanes 0 and 2 of
// each operand; 221 (0xDD) gathers lanes 1 and 3 of each operand.
__m512 tmp11695 = _mm512_shuffle_f32x4(tmp11679, tmp11683, 136);
__m512 tmp11696 = _mm512_shuffle_f32x4(tmp11679, tmp11683, 221);
__m512 tmp11697 = _mm512_shuffle_f32x4(tmp11680, tmp11684, 136);
__m512 tmp11698 = _mm512_shuffle_f32x4(tmp11680, tmp11684, 221);
__m512 tmp11699 = _mm512_shuffle_f32x4(tmp11681, tmp11685, 136);
__m512 tmp11700 = _mm512_shuffle_f32x4(tmp11681, tmp11685, 221);
__m512 tmp11701 = _mm512_shuffle_f32x4(tmp11682, tmp11686, 136);
__m512 tmp11702 = _mm512_shuffle_f32x4(tmp11682, tmp11686, 221);
__m512 tmp11703 = _mm512_shuffle_f32x4(tmp11687, tmp11691, 136);
__m512 tmp11704 = _mm512_shuffle_f32x4(tmp11687, tmp11691, 221);
__m512 tmp11705 = _mm512_shuffle_f32x4(tmp11688, tmp11692, 136);
__m512 tmp11706 = _mm512_shuffle_f32x4(tmp11688, tmp11692, 221);
__m512 tmp11707 = _mm512_shuffle_f32x4(tmp11689, tmp11693, 136);
__m512 tmp11708 = _mm512_shuffle_f32x4(tmp11689, tmp11693, 221);
__m512 tmp11709 = _mm512_shuffle_f32x4(tmp11690, tmp11694, 136);
__m512 tmp11710 = _mm512_shuffle_f32x4(tmp11690, tmp11694, 221);
// Stage 4: same two lane selections, now combining a vector derived from the
// first register set with one derived from the second; the outputs reuse the
// in*/tmp* names consumed at the top of this block.
in1644 = _mm512_shuffle_f32x4(tmp11695, tmp11703, 136);
in1652 = _mm512_shuffle_f32x4(tmp11695, tmp11703, 221);
tmp11649 = _mm512_shuffle_f32x4(tmp11697, tmp11705, 136);
tmp11653 = _mm512_shuffle_f32x4(tmp11697, tmp11705, 221);
tmp11650 = _mm512_shuffle_f32x4(tmp11699, tmp11707, 136);
tmp11654 = _mm512_shuffle_f32x4(tmp11699, tmp11707, 221);
in1650 = _mm512_shuffle_f32x4(tmp11701, tmp11709, 136);
in1658 = _mm512_shuffle_f32x4(tmp11701, tmp11709, 221);
tmp11648 = _mm512_shuffle_f32x4(tmp11696, tmp11704, 136);
tmp11652 = _mm512_shuffle_f32x4(tmp11696, tmp11704, 221);
in1646 = _mm512_shuffle_f32x4(tmp11698, tmp11706, 136);
in1654 = _mm512_shuffle_f32x4(tmp11698, tmp11706, 221);
in1648 = _mm512_shuffle_f32x4(tmp11700, tmp11708, 136);
in1656 = _mm512_shuffle_f32x4(tmp11700, tmp11708, 221);
in1647 = _mm512_shuffle_f32x4(tmp11702, tmp11710, 136);
in1655 = _mm512_shuffle_f32x4(tmp11702, tmp11710, 221);
// Second 8-point pass, again interleaving two independent register sets:
//   set A inputs z0..z7 = in1644, tmp11649, tmp11650, in1650, tmp11648,
//                         in1646, in1648, in1647 (temporaries tmp11655..58)
//   set B inputs        = in1652, tmp11653, tmp11654, in1658, tmp11652,
//                         in1654, in1656, in1655 (temporaries tmp11659..62)
// Results for set A (mathematically, ignoring FMA rounding):
//   in1644   = z0 - z6 + 5.25*(z4 - z2)
//   tmp11657 = (z2 + z6 - 4.25*z4) + (z1 + z5 - 4.25*z3)
//   tmp11658 = (z2 + z6 - 4.25*z4) - (z1 + z5 - 4.25*z3)
//   in1648   = (0.25*z2 - 1.25*z4 + z6) + 2*(0.25*z1 - 1.25*z3 + z5)
//   tmp11656 = (0.25*z2 - 1.25*z4 + z6) - 2*(0.25*z1 - 1.25*z3 + z5)
//   tmp11650 = (4*z2 - 5*z4 + z6) + 2*(z1 - 1.25*z3 + 0.25*z5)
//   tmp11648 = (4*z2 - 5*z4 + z6) - 2*(z1 - 1.25*z3 + 0.25*z5)
//   in1650   = z7 - z1 + 5.25*(z3 - z5)
// Set B follows the same formulas on its own inputs.
__m512 tmp11655 = _mm512_add_ps(tmp11649, in1646);
__m512 tmp11659 = _mm512_add_ps(tmp11653, in1654);
__m512 tmp11656 = _mm512_sub_ps(tmp11648, tmp11650);
__m512 tmp11660 = _mm512_sub_ps(tmp11652, tmp11654);
__m512 tmp11657 = _mm512_add_ps(tmp11650, in1648);
__m512 tmp11661 = _mm512_add_ps(tmp11654, in1656);
in1644 = _mm512_sub_ps(in1644, in1648);
in1652 = _mm512_sub_ps(in1652, in1656);
tmp11655 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-4.25e+00f), tmp11655);
tmp11659 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-4.25e+00f), tmp11659);
tmp11657 = _mm512_fmadd_ps(tmp11648, _mm512_set1_ps(-4.25e+00f), tmp11657);
tmp11661 = _mm512_fmadd_ps(tmp11652, _mm512_set1_ps(-4.25e+00f), tmp11661);
in1644 = _mm512_fmadd_ps(tmp11656, _mm512_set1_ps(5.25e+00f), in1644);
in1652 = _mm512_fmadd_ps(tmp11660, _mm512_set1_ps(5.25e+00f), in1652);
// tmp11656/tmp11660 are reused from here on for the 0.25*z2 + z6 partial sum.
tmp11656 = _mm512_fmadd_ps(tmp11650, _mm512_set1_ps(2.5e-01f), in1648);
tmp11660 = _mm512_fmadd_ps(tmp11654, _mm512_set1_ps(2.5e-01f), in1656);
tmp11650 = _mm512_fmadd_ps(tmp11650, _mm512_set1_ps(4e+00f), in1648);
tmp11654 = _mm512_fmadd_ps(tmp11654, _mm512_set1_ps(4e+00f), in1656);
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11658 = _mm512_sub_ps(tmp11657, tmp11655);
__m512 tmp11662 = _mm512_sub_ps(tmp11661, tmp11659);
tmp11657 = _mm512_add_ps(tmp11655, tmp11657);
tmp11661 = _mm512_add_ps(tmp11659, tmp11661);
tmp11655 = _mm512_fmadd_ps(tmp11649, _mm512_set1_ps(2.5e-01f), in1646);
tmp11659 = _mm512_fmadd_ps(tmp11653, _mm512_set1_ps(2.5e-01f), in1654);
tmp11656 = _mm512_fmadd_ps(tmp11648, _mm512_set1_ps(-1.25e+00f), tmp11656);
tmp11660 = _mm512_fmadd_ps(tmp11652, _mm512_set1_ps(-1.25e+00f), tmp11660);
tmp11648 = _mm512_fmadd_ps(tmp11648, _mm512_set1_ps(-5e+00f), tmp11650);
tmp11652 = _mm512_fmadd_ps(tmp11652, _mm512_set1_ps(-5e+00f), tmp11654);
tmp11655 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-1.25e+00f), tmp11655);
tmp11659 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-1.25e+00f), tmp11659);
// fmadd/fnmadd pair: same two operands combined with +2x and -2x.
in1648 = _mm512_fmadd_ps(tmp11655, _mm512_set1_ps(2e+00f), tmp11656);
in1656 = _mm512_fmadd_ps(tmp11659, _mm512_set1_ps(2e+00f), tmp11660);
tmp11656 = _mm512_fnmadd_ps(tmp11655, _mm512_set1_ps(2e+00f), tmp11656);
tmp11660 = _mm512_fnmadd_ps(tmp11659, _mm512_set1_ps(2e+00f), tmp11660);
tmp11655 = _mm512_fmadd_ps(in1646, _mm512_set1_ps(2.5e-01f), tmp11649);
tmp11659 = _mm512_fmadd_ps(in1654, _mm512_set1_ps(2.5e-01f), tmp11653);
// tmp11649/tmp11653 are overwritten with z7 - z1 only after their last use above.
tmp11649 = _mm512_sub_ps(in1647, tmp11649);
tmp11653 = _mm512_sub_ps(in1655, tmp11653);
tmp11655 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(-1.25e+00f), tmp11655);
tmp11659 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(-1.25e+00f), tmp11659);
in1650 = _mm512_sub_ps(in1650, in1646);
in1658 = _mm512_sub_ps(in1658, in1654);
in1650 = _mm512_fmadd_ps(in1650, _mm512_set1_ps(5.25e+00f), tmp11649);
in1658 = _mm512_fmadd_ps(in1658, _mm512_set1_ps(5.25e+00f), tmp11653);
tmp11650 = _mm512_fmadd_ps(tmp11655, _mm512_set1_ps(2e+00f), tmp11648);
tmp11654 = _mm512_fmadd_ps(tmp11659, _mm512_set1_ps(2e+00f), tmp11652);
tmp11648 = _mm512_fnmadd_ps(tmp11655, _mm512_set1_ps(2e+00f), tmp11648);
tmp11652 = _mm512_fnmadd_ps(tmp11659, _mm512_set1_ps(2e+00f), tmp11652);
// Pack the sixteen second-pass results into output vectors and store them.
// Each pair of results is combined twice: immediate 68 (0x44) keeps 128-bit
// lanes 0,1 of both operands (their low halves) and 238 (0xEE) keeps lanes 2,3
// (their high halves).
__m512 out1519 = _mm512_shuffle_f32x4(in1644, tmp11657, 68);
__m512 out1527 = _mm512_shuffle_f32x4(in1644, tmp11657, 238);
__m512 out1520 = _mm512_shuffle_f32x4(tmp11658, in1648, 68);
__m512 out1528 = _mm512_shuffle_f32x4(tmp11658, in1648, 238);
__m512 out1521 = _mm512_shuffle_f32x4(tmp11656, tmp11650, 68);
__m512 out1529 = _mm512_shuffle_f32x4(tmp11656, tmp11650, 238);
__m512 out1522 = _mm512_shuffle_f32x4(tmp11648, in1650, 68);
__m512 out1530 = _mm512_shuffle_f32x4(tmp11648, in1650, 238);
__m512 out1523 = _mm512_shuffle_f32x4(in1652, tmp11661, 68);
__m512 out1531 = _mm512_shuffle_f32x4(in1652, tmp11661, 238);
__m512 out1524 = _mm512_shuffle_f32x4(tmp11662, in1656, 68);
__m512 out1532 = _mm512_shuffle_f32x4(tmp11662, in1656, 238);
__m512 out1525 = _mm512_shuffle_f32x4(tmp11660, tmp11654, 68);
__m512 out1533 = _mm512_shuffle_f32x4(tmp11660, tmp11654, 238);
__m512 out1526 = _mm512_shuffle_f32x4(tmp11652, in1658, 68);
__m512 out1534 = _mm512_shuffle_f32x4(tmp11652, in1658, 238);
// Unaligned stores into dfPtr11. Four groups of four stores start at constant
// offsets 512, 13312, 26112 and 38912 (12800 apart); inside a group the
// offsets are +0, +128, +64, +192. The i49/j42/s46/k132 terms are shared by
// all sixteen stores.
// NOTE(review): offsets are in units of dfPtr11's pointee type, which is
// declared outside this excerpt -- presumably bytes (64 = one __m512); confirm.
_mm512_storeu_ps(dfPtr11+512+51200*i49+3072*j42+3072*s46+768*k132, out1519);
_mm512_storeu_ps(dfPtr11+640+51200*i49+3072*j42+3072*s46+768*k132, out1527);
_mm512_storeu_ps(dfPtr11+576+51200*i49+3072*j42+3072*s46+768*k132, out1523);
_mm512_storeu_ps(dfPtr11+704+51200*i49+3072*j42+3072*s46+768*k132, out1531);
_mm512_storeu_ps(dfPtr11+13312+51200*i49+3072*j42+3072*s46+768*k132, out1520);
_mm512_storeu_ps(dfPtr11+13440+51200*i49+3072*j42+3072*s46+768*k132, out1528);
_mm512_storeu_ps(dfPtr11+13376+51200*i49+3072*j42+3072*s46+768*k132, out1524);
_mm512_storeu_ps(dfPtr11+13504+51200*i49+3072*j42+3072*s46+768*k132, out1532);
_mm512_storeu_ps(dfPtr11+26112+51200*i49+3072*j42+3072*s46+768*k132, out1521);
_mm512_storeu_ps(dfPtr11+26240+51200*i49+3072*j42+3072*s46+768*k132, out1529);
_mm512_storeu_ps(dfPtr11+26176+51200*i49+3072*j42+3072*s46+768*k132, out1525);
_mm512_storeu_ps(dfPtr11+26304+51200*i49+3072*j42+3072*s46+768*k132, out1533);
_mm512_storeu_ps(dfPtr11+38912+51200*i49+3072*j42+3072*s46+768*k132, out1522);
_mm512_storeu_ps(dfPtr11+39040+51200*i49+3072*j42+3072*s46+768*k132, out1530);
_mm512_storeu_ps(dfPtr11+38976+51200*i49+3072*j42+3072*s46+768*k132, out1526);
_mm512_storeu_ps(dfPtr11+39104+51200*i49+3072*j42+3072*s46+768*k132, out1534);
}
++j42;
rel21 = 3;
}
if (rel21 < 4) {
ptrdiff_t h46 = base21+18;
ptrdiff_t w59 = 18;
ptrdiff_t k133 = 0;
for (; k133 != 4; ++k133) {
// Load stage for two register sets of unequal height.
// Set A (in1660..in1667): eight rows at constant offsets +0, +112, ..., +784,
// each a zero-masked load keeping lanes 0-10 (mask 2047), permuted by pm194.
// Set B (in1668..in1672): only five rows, at constant offsets +604, +716,
// ..., +1052, each keeping lanes 0-12 (mask 8191), permuted by pm195. No
// sixth to eighth row is loaded for set B in this stage.
// NOTE(review): offsets are in units of datPtr25's pointee type, which is
// declared outside this excerpt -- confirm.
__m512 dat2203 = _mm512_maskz_loadu_ps(2047, datPtr25+0+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2204 = _mm512_maskz_loadu_ps(8191, datPtr25+604+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
// pm194 (arguments are listed from lane 15 down to lane 0): result lanes 0-7
// take source lanes 0-7, lanes 8-12 take source lanes 6-10, and lanes 13-15
// take source lane 15, which the mask-2047 load zeroed.
__m512i pm194 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1660 = _mm512_permutexvar_ps(pm194, dat2203);
// pm195: result lane 0 takes source lane 15 (zeroed by the mask-8191 load),
// lanes 1-7 take source lanes 0-6, lanes 8-15 take source lanes 5-12.
__m512i pm195 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1668 = _mm512_permutexvar_ps(pm195, dat2204);
__m512 dat2205 = _mm512_maskz_loadu_ps(2047, datPtr25+112+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2206 = _mm512_maskz_loadu_ps(8191, datPtr25+716+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1661 = _mm512_permutexvar_ps(pm194, dat2205);
__m512 in1669 = _mm512_permutexvar_ps(pm195, dat2206);
__m512 dat2207 = _mm512_maskz_loadu_ps(2047, datPtr25+224+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2208 = _mm512_maskz_loadu_ps(8191, datPtr25+828+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1662 = _mm512_permutexvar_ps(pm194, dat2207);
__m512 in1670 = _mm512_permutexvar_ps(pm195, dat2208);
__m512 dat2209 = _mm512_maskz_loadu_ps(2047, datPtr25+336+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2210 = _mm512_maskz_loadu_ps(8191, datPtr25+940+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1663 = _mm512_permutexvar_ps(pm194, dat2209);
__m512 in1671 = _mm512_permutexvar_ps(pm195, dat2210);
__m512 dat2211 = _mm512_maskz_loadu_ps(2047, datPtr25+448+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2212 = _mm512_maskz_loadu_ps(8191, datPtr25+1052+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1664 = _mm512_permutexvar_ps(pm194, dat2211);
__m512 in1672 = _mm512_permutexvar_ps(pm195, dat2212);
// From here on only set A has rows to load.
__m512 dat2213 = _mm512_maskz_loadu_ps(2047, datPtr25+560+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1665 = _mm512_permutexvar_ps(pm194, dat2213);
__m512 dat2214 = _mm512_maskz_loadu_ps(2047, datPtr25+672+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1666 = _mm512_permutexvar_ps(pm194, dat2214);
__m512 dat2215 = _mm512_maskz_loadu_ps(2047, datPtr25+784+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1667 = _mm512_permutexvar_ps(pm194, dat2215);
// First 8-point pass for two interleaved register sets.
// Set A (in1660..in1667, temporaries tmp11711..tmp11714) uses the full form.
// With x0..x7 = in1660..in1667 on entry it leaves (ignoring FMA rounding):
//   in1660   = x0 - x6 + 5.25*(x4 - x2)
//   tmp11713 = (x2 + x6 - 4.25*x4) + (x1 + x5 - 4.25*x3)
//   tmp11714 = (x2 + x6 - 4.25*x4) - (x1 + x5 - 4.25*x3)
//   in1666   = (0.25*x2 - 1.25*x4 + x6) + 2*(0.25*x1 - 1.25*x3 + x5)
//   tmp11712 = (0.25*x2 - 1.25*x4 + x6) - 2*(0.25*x1 - 1.25*x3 + x5)
//   in1662   = (4*x2 - 5*x4 + x6) + 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1664   = (4*x2 - 5*x4 + x6) - 2*(x1 - 1.25*x3 + 0.25*x5)
//   in1663   = x7 - x1 + 5.25*(x3 - x5)
// Set B has only five inputs, in1668..in1672 (temporaries tmp11715..tmp11719).
// Its statements are the set-A statements with every term that would involve
// a sixth, seventh or eighth input removed, which is equivalent to treating
// those three inputs as zero: adds become plain copies, some fmadds become
// muls, and the result that set A keeps in in1666 goes to the new variable
// tmp11719 instead.
// NOTE(review): the coefficients match the 8x8 input-transform matrix of
// Winograd F(6x6, 3x3), and the missing rows look like zero padding below the
// input -- confirm against the generator.
__m512 tmp11711 = _mm512_add_ps(in1661, in1665);
__m512 tmp11715 = in1669;
__m512 tmp11712 = _mm512_sub_ps(in1664, in1662);
__m512 tmp11716 = _mm512_sub_ps(in1672, in1670);
__m512 tmp11713 = _mm512_add_ps(in1662, in1666);
__m512 tmp11717 = in1670;
in1660 = _mm512_sub_ps(in1660, in1666);
// Self-assignment: a no-op standing in for set A's subtraction on the line above.
in1668 = in1668;
tmp11711 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-4.25e+00f), tmp11711);
tmp11715 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-4.25e+00f), tmp11715);
tmp11713 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-4.25e+00f), tmp11713);
tmp11717 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-4.25e+00f), tmp11717);
in1660 = _mm512_fmadd_ps(tmp11712, _mm512_set1_ps(5.25e+00f), in1660);
in1668 = _mm512_fmadd_ps(tmp11716, _mm512_set1_ps(5.25e+00f), in1668);
// Set B uses a plain multiply where set A adds a further row via fmadd.
tmp11712 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(2.5e-01f), in1666);
tmp11716 = _mm512_mul_ps(in1670, _mm512_set1_ps(2.5e-01f));
in1662 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(4e+00f), in1666);
in1670 = _mm512_mul_ps(in1670, _mm512_set1_ps(4e+00f));
// Difference and sum of the two -4.25 partial sums.
__m512 tmp11714 = _mm512_sub_ps(tmp11713, tmp11711);
__m512 tmp11718 = _mm512_sub_ps(tmp11717, tmp11715);
tmp11713 = _mm512_add_ps(tmp11711, tmp11713);
tmp11717 = _mm512_add_ps(tmp11715, tmp11717);
tmp11711 = _mm512_fmadd_ps(in1661, _mm512_set1_ps(2.5e-01f), in1665);
tmp11715 = _mm512_mul_ps(in1669, _mm512_set1_ps(2.5e-01f));
tmp11712 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-1.25e+00f), tmp11712);
tmp11716 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-1.25e+00f), tmp11716);
in1664 = _mm512_fmadd_ps(in1664, _mm512_set1_ps(-5e+00f), in1662);
in1672 = _mm512_fmadd_ps(in1672, _mm512_set1_ps(-5e+00f), in1670);
tmp11711 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-1.25e+00f), tmp11711);
tmp11715 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-1.25e+00f), tmp11715);
// fmadd/fnmadd pair; set B's "+2x" result is declared here as tmp11719.
in1666 = _mm512_fmadd_ps(tmp11711, _mm512_set1_ps(2e+00f), tmp11712);
__m512 tmp11719 = _mm512_fmadd_ps(tmp11715, _mm512_set1_ps(2e+00f), tmp11716);
tmp11712 = _mm512_fnmadd_ps(tmp11711, _mm512_set1_ps(2e+00f), tmp11712);
tmp11716 = _mm512_fnmadd_ps(tmp11715, _mm512_set1_ps(2e+00f), tmp11716);
tmp11711 = _mm512_fmadd_ps(in1665, _mm512_set1_ps(2.5e-01f), in1661);
tmp11715 = in1669;
in1661 = _mm512_sub_ps(in1667, in1661);
// Set B: the minuend row is absent, so this is 0 - in1669.
in1669 = _mm512_sub_ps(_mm512_setzero_ps(), in1669);
tmp11711 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(-1.25e+00f), tmp11711);
tmp11715 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(-1.25e+00f), tmp11715);
in1663 = _mm512_sub_ps(in1663, in1665);
// Self-assignment: a no-op standing in for set A's subtraction on the line above.
in1671 = in1671;
in1663 = _mm512_fmadd_ps(in1663, _mm512_set1_ps(5.25e+00f), in1661);
in1671 = _mm512_fmadd_ps(in1671, _mm512_set1_ps(5.25e+00f), in1669);
in1662 = _mm512_fmadd_ps(tmp11711, _mm512_set1_ps(2e+00f), in1664);
in1670 = _mm512_fmadd_ps(tmp11715, _mm512_set1_ps(2e+00f), in1672);
in1664 = _mm512_fnmadd_ps(tmp11711, _mm512_set1_ps(2e+00f), in1664);
in1672 = _mm512_fnmadd_ps(tmp11715, _mm512_set1_ps(2e+00f), in1672);
__m512 tmp11728 = _mm512_unpacklo_ps(in1660, tmp11713);
__m512 tmp11729 = _mm512_unpackhi_ps(in1660, tmp11713);
__m512 tmp11730 = _mm512_unpacklo_ps(tmp11714, in1666);
__m512 tmp11731 = _mm512_unpackhi_ps(tmp11714, in1666);
__m512 tmp11732 = _mm512_unpacklo_ps(tmp11712, in1662);
__m512 tmp11733 = _mm512_unpackhi_ps(tmp11712, in1662);
__m512 tmp11734 = _mm512_unpacklo_ps(in1664, in1663);
__m512 tmp11735 = _mm512_unpackhi_ps(in1664, in1663);
__m512 tmp11736 = _mm512_unpacklo_ps(in1668, tmp11717);
__m512 tmp11737 = _mm512_unpackhi_ps(in1668, tmp11717);
__m512 tmp11738 = _mm512_unpacklo_ps(tmp11718, tmp11719);
__m512 tmp11739 = _mm512_unpackhi_ps(tmp11718, tmp11719);
__m512 tmp11740 = _mm512_unpacklo_ps(tmp11716, in1670);
__m512 tmp11741 = _mm512_unpackhi_ps(tmp11716, in1670);
__m512 tmp11742 = _mm512_unpacklo_ps(in1672, in1671);
__m512 tmp11743 = _mm512_unpackhi_ps(in1672, in1671);
__m512 tmp11744 = _mm512_shuffle_ps(tmp11728, tmp11730, 68);
__m512 tmp11745 = _mm512_shuffle_ps(tmp11728, tmp11730, 238);
__m512 tmp11746 = _mm512_shuffle_ps(tmp11729, tmp11731, 68);
__m512 tmp11747 = _mm512_shuffle_ps(tmp11729, tmp11731, 238);
__m512 tmp11748 = _mm512_shuffle_ps(tmp11732, tmp11734, 68);
__m512 tmp11749 = _mm512_shuffle_ps(tmp11732, tmp11734, 238);
__m512 tmp11750 = _mm512_shuffle_ps(tmp11733, tmp11735, 68);
__m512 tmp11751 = _mm512_shuffle_ps(tmp11733, tmp11735, 238);
__m512 tmp11752 = _mm512_shuffle_ps(tmp11736, tmp11738, 68);
__m512 tmp11753 = _mm512_shuffle_ps(tmp11736, tmp11738, 238);
__m512 tmp11754 = _mm512_shuffle_ps(tmp11737, tmp11739, 68);
__m512 tmp11755 = _mm512_shuffle_ps(tmp11737, tmp11739, 238);
__m512 tmp11756 = _mm512_shuffle_ps(tmp11740, tmp11742, 68);
__m512 tmp11757 = _mm512_shuffle_ps(tmp11740, tmp11742, 238);
__m512 tmp11758 = _mm512_shuffle_ps(tmp11741, tmp11743, 68);
__m512 tmp11759 = _mm512_shuffle_ps(tmp11741, tmp11743, 238);
__m512 tmp11760 = _mm512_shuffle_f32x4(tmp11744, tmp11748, 136);
__m512 tmp11761 = _mm512_shuffle_f32x4(tmp11744, tmp11748, 221);
__m512 tmp11762 = _mm512_shuffle_f32x4(tmp11745, tmp11749, 136);
__m512 tmp11763 = _mm512_shuffle_f32x4(tmp11745, tmp11749, 221);
__m512 tmp11764 = _mm512_shuffle_f32x4(tmp11746, tmp11750, 136);
__m512 tmp11765 = _mm512_shuffle_f32x4(tmp11746, tmp11750, 221);
__m512 tmp11766 = _mm512_shuffle_f32x4(tmp11747, tmp11751, 136);
__m512 tmp11767 = _mm512_shuffle_f32x4(tmp11747, tmp11751, 221);
__m512 tmp11768 = _mm512_shuffle_f32x4(tmp11752, tmp11756, 136);
__m512 tmp11769 = _mm512_shuffle_f32x4(tmp11752, tmp11756, 221);
__m512 tmp11770 = _mm512_shuffle_f32x4(tmp11753, tmp11757, 136);
__m512 tmp11771 = _mm512_shuffle_f32x4(tmp11753, tmp11757, 221);
__m512 tmp11772 = _mm512_shuffle_f32x4(tmp11754, tmp11758, 136);
__m512 tmp11773 = _mm512_shuffle_f32x4(tmp11754, tmp11758, 221);
__m512 tmp11774 = _mm512_shuffle_f32x4(tmp11755, tmp11759, 136);
__m512 tmp11775 = _mm512_shuffle_f32x4(tmp11755, tmp11759, 221);
in1660 = _mm512_shuffle_f32x4(tmp11760, tmp11768, 136);
in1668 = _mm512_shuffle_f32x4(tmp11760, tmp11768, 221);
tmp11713 = _mm512_shuffle_f32x4(tmp11762, tmp11770, 136);
tmp11717 = _mm512_shuffle_f32x4(tmp11762, tmp11770, 221);
tmp11714 = _mm512_shuffle_f32x4(tmp11764, tmp11772, 136);
tmp11718 = _mm512_shuffle_f32x4(tmp11764, tmp11772, 221);
in1666 = _mm512_shuffle_f32x4(tmp11766, tmp11774, 136);
tmp11719 = _mm512_shuffle_f32x4(tmp11766, tmp11774, 221);
tmp11712 = _mm512_shuffle_f32x4(tmp11761, tmp11769, 136);
tmp11716 = _mm512_shuffle_f32x4(tmp11761, tmp11769, 221);
in1662 = _mm512_shuffle_f32x4(tmp11763, tmp11771, 136);
in1670 = _mm512_shuffle_f32x4(tmp11763, tmp11771, 221);
in1664 = _mm512_shuffle_f32x4(tmp11765, tmp11773, 136);
in1672 = _mm512_shuffle_f32x4(tmp11765, tmp11773, 221);
in1663 = _mm512_shuffle_f32x4(tmp11767, tmp11775, 136);
in1671 = _mm512_shuffle_f32x4(tmp11767, tmp11775, 221);
__m512 tmp11720 = _mm512_add_ps(tmp11713, in1662);
__m512 tmp11724 = _mm512_add_ps(tmp11717, in1670);
__m512 tmp11721 = _mm512_sub_ps(tmp11712, tmp11714);
__m512 tmp11725 = _mm512_sub_ps(tmp11716, tmp11718);
__m512 tmp11722 = _mm512_add_ps(tmp11714, in1664);
__m512 tmp11726 = _mm512_add_ps(tmp11718, in1672);
in1660 = _mm512_sub_ps(in1660, in1664);
in1668 = _mm512_sub_ps(in1668, in1672);
tmp11720 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-4.25e+00f), tmp11720);
tmp11724 = _mm512_fmadd_ps(tmp11719, _mm512_set1_ps(-4.25e+00f), tmp11724);
tmp11722 = _mm512_fmadd_ps(tmp11712, _mm512_set1_ps(-4.25e+00f), tmp11722);
tmp11726 = _mm512_fmadd_ps(tmp11716, _mm512_set1_ps(-4.25e+00f), tmp11726);
in1660 = _mm512_fmadd_ps(tmp11721, _mm512_set1_ps(5.25e+00f), in1660);
in1668 = _mm512_fmadd_ps(tmp11725, _mm512_set1_ps(5.25e+00f), in1668);
tmp11721 = _mm512_fmadd_ps(tmp11714, _mm512_set1_ps(2.5e-01f), in1664);
tmp11725 = _mm512_fmadd_ps(tmp11718, _mm512_set1_ps(2.5e-01f), in1672);
tmp11714 = _mm512_fmadd_ps(tmp11714, _mm512_set1_ps(4e+00f), in1664);
tmp11718 = _mm512_fmadd_ps(tmp11718, _mm512_set1_ps(4e+00f), in1672);
__m512 tmp11723 = _mm512_sub_ps(tmp11722, tmp11720);
__m512 tmp11727 = _mm512_sub_ps(tmp11726, tmp11724);
tmp11722 = _mm512_add_ps(tmp11720, tmp11722);
tmp11726 = _mm512_add_ps(tmp11724, tmp11726);
tmp11720 = _mm512_fmadd_ps(tmp11713, _mm512_set1_ps(2.5e-01f), in1662);
tmp11724 = _mm512_fmadd_ps(tmp11717, _mm512_set1_ps(2.5e-01f), in1670);
tmp11721 = _mm512_fmadd_ps(tmp11712, _mm512_set1_ps(-1.25e+00f), tmp11721);
tmp11725 = _mm512_fmadd_ps(tmp11716, _mm512_set1_ps(-1.25e+00f), tmp11725);
tmp11712 = _mm512_fmadd_ps(tmp11712, _mm512_set1_ps(-5e+00f), tmp11714);
tmp11716 = _mm512_fmadd_ps(tmp11716, _mm512_set1_ps(-5e+00f), tmp11718);
tmp11720 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-1.25e+00f), tmp11720);
tmp11724 = _mm512_fmadd_ps(tmp11719, _mm512_set1_ps(-1.25e+00f), tmp11724);
in1664 = _mm512_fmadd_ps(tmp11720, _mm512_set1_ps(2e+00f), tmp11721);
in1672 = _mm512_fmadd_ps(tmp11724, _mm512_set1_ps(2e+00f), tmp11725);
tmp11721 = _mm512_fnmadd_ps(tmp11720, _mm512_set1_ps(2e+00f), tmp11721);
tmp11725 = _mm512_fnmadd_ps(tmp11724, _mm512_set1_ps(2e+00f), tmp11725);
tmp11720 = _mm512_fmadd_ps(in1662, _mm512_set1_ps(2.5e-01f), tmp11713);
tmp11724 = _mm512_fmadd_ps(in1670, _mm512_set1_ps(2.5e-01f), tmp11717);
tmp11713 = _mm512_sub_ps(in1663, tmp11713);
tmp11717 = _mm512_sub_ps(in1671, tmp11717);
tmp11720 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(-1.25e+00f), tmp11720);
tmp11724 = _mm512_fmadd_ps(tmp11719, _mm512_set1_ps(-1.25e+00f), tmp11724);
in1666 = _mm512_sub_ps(in1666, in1662);
tmp11719 = _mm512_sub_ps(tmp11719, in1670);
in1666 = _mm512_fmadd_ps(in1666, _mm512_set1_ps(5.25e+00f), tmp11713);
tmp11719 = _mm512_fmadd_ps(tmp11719, _mm512_set1_ps(5.25e+00f), tmp11717);
tmp11714 = _mm512_fmadd_ps(tmp11720, _mm512_set1_ps(2e+00f), tmp11712);
tmp11718 = _mm512_fmadd_ps(tmp11724, _mm512_set1_ps(2e+00f), tmp11716);
tmp11712 = _mm512_fnmadd_ps(tmp11720, _mm512_set1_ps(2e+00f), tmp11712);
tmp11716 = _mm512_fnmadd_ps(tmp11724, _mm512_set1_ps(2e+00f), tmp11716);
// Pack step: every pair of result vectors is merged into two 512-bit
// vectors. With _mm512_shuffle_f32x4, immediate 68 (0x44) selects 128-bit
// lanes 0-1 of each operand and immediate 238 (0xEE) selects 128-bit lanes
// 2-3 of each operand, so outN / outN+8 hold the lower / upper halves.
__m512 out1535 = _mm512_shuffle_f32x4(in1660, tmp11722, 68);
__m512 out1543 = _mm512_shuffle_f32x4(in1660, tmp11722, 238);
__m512 out1536 = _mm512_shuffle_f32x4(tmp11723, in1664, 68);
__m512 out1544 = _mm512_shuffle_f32x4(tmp11723, in1664, 238);
__m512 out1537 = _mm512_shuffle_f32x4(tmp11721, tmp11714, 68);
__m512 out1545 = _mm512_shuffle_f32x4(tmp11721, tmp11714, 238);
__m512 out1538 = _mm512_shuffle_f32x4(tmp11712, in1666, 68);
__m512 out1546 = _mm512_shuffle_f32x4(tmp11712, in1666, 238);
__m512 out1539 = _mm512_shuffle_f32x4(in1668, tmp11726, 68);
__m512 out1547 = _mm512_shuffle_f32x4(in1668, tmp11726, 238);
__m512 out1540 = _mm512_shuffle_f32x4(tmp11727, in1672, 68);
__m512 out1548 = _mm512_shuffle_f32x4(tmp11727, in1672, 238);
__m512 out1541 = _mm512_shuffle_f32x4(tmp11725, tmp11718, 68);
__m512 out1549 = _mm512_shuffle_f32x4(tmp11725, tmp11718, 238);
__m512 out1542 = _mm512_shuffle_f32x4(tmp11716, tmp11719, 68);
__m512 out1550 = _mm512_shuffle_f32x4(tmp11716, tmp11719, 238);
// Unaligned stores of the 16 packed vectors into dfPtr11. All stores share
// the index expression 51200*i49+3072*j42+3072*s46+768*k133; the leading
// constant is 12800*g + {0, 128, 64, 192} for g = 0..3. Within each group the
// lower-half vectors go to +0 and +64, the upper-half vectors to +128 and +192.
// NOTE(review): dfPtr11's declaration is not visible here; the offsets look
// like byte offsets (64 bytes per __m512) -- confirm the pointer type.
_mm512_storeu_ps(dfPtr11+0+51200*i49+3072*j42+3072*s46+768*k133, out1535);
_mm512_storeu_ps(dfPtr11+128+51200*i49+3072*j42+3072*s46+768*k133, out1543);
_mm512_storeu_ps(dfPtr11+64+51200*i49+3072*j42+3072*s46+768*k133, out1539);
_mm512_storeu_ps(dfPtr11+192+51200*i49+3072*j42+3072*s46+768*k133, out1547);
_mm512_storeu_ps(dfPtr11+12800+51200*i49+3072*j42+3072*s46+768*k133, out1536);
_mm512_storeu_ps(dfPtr11+12928+51200*i49+3072*j42+3072*s46+768*k133, out1544);
_mm512_storeu_ps(dfPtr11+12864+51200*i49+3072*j42+3072*s46+768*k133, out1540);
_mm512_storeu_ps(dfPtr11+12992+51200*i49+3072*j42+3072*s46+768*k133, out1548);
_mm512_storeu_ps(dfPtr11+25600+51200*i49+3072*j42+3072*s46+768*k133, out1537);
_mm512_storeu_ps(dfPtr11+25728+51200*i49+3072*j42+3072*s46+768*k133, out1545);
_mm512_storeu_ps(dfPtr11+25664+51200*i49+3072*j42+3072*s46+768*k133, out1541);
_mm512_storeu_ps(dfPtr11+25792+51200*i49+3072*j42+3072*s46+768*k133, out1549);
_mm512_storeu_ps(dfPtr11+38400+51200*i49+3072*j42+3072*s46+768*k133, out1538);
_mm512_storeu_ps(dfPtr11+38528+51200*i49+3072*j42+3072*s46+768*k133, out1546);
_mm512_storeu_ps(dfPtr11+38464+51200*i49+3072*j42+3072*s46+768*k133, out1542);
_mm512_storeu_ps(dfPtr11+38592+51200*i49+3072*j42+3072*s46+768*k133, out1550);
// One complete load -> transform -> shuffle -> transform -> pack -> store
// group. Two independent chains are processed in lock-step, with their
// statements interleaved line by line:
//   chain A: in1673..in1677 (5 input vectors, loaded from datPtr25+648.. in
//            steps of 112, mask 16383 = lanes 0..13, remaining lanes zeroed)
//   chain B: in1678..in1685 (8 input vectors, loaded from datPtr25+3136.. in
//            steps of 112, mask 2047 = lanes 0..10, remaining lanes zeroed)
// NOTE(review): the coefficient set used below (5.25, -4.25, 0.25, -1.25, 4,
// -5, 2) matches the 8-point Winograd F(6,3) input transform -- presumably
// this is that transform applied along both axes; confirm against the
// generator. datPtr25's declaration is not visible; the 112 / 4*w59 steps
// look like byte offsets into rows of 28 floats -- TODO confirm.
__m512 dat2216 = _mm512_maskz_loadu_ps(16383, datPtr25+648+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2217 = _mm512_maskz_loadu_ps(2047, datPtr25+3136+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
// pm196: output lanes 0..7 take source lanes 0..7, output lanes 8..15 take
// source lanes 6..13 (two 8-wide windows that overlap by two elements).
__m512i pm196 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1673 = _mm512_permutexvar_ps(pm196, dat2216);
// pm197: output lanes 0..7 take source lanes 0..7, lanes 8..12 take source
// lanes 6..10, and lanes 13..15 take source lane 15, which the 2047 load mask
// has zeroed -- so those three lanes become 0.
__m512i pm197 = _mm512_set_epi32(15, 15, 15, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1678 = _mm512_permutexvar_ps(pm197, dat2217);
__m512 dat2218 = _mm512_maskz_loadu_ps(16383, datPtr25+760+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2219 = _mm512_maskz_loadu_ps(2047, datPtr25+3248+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1674 = _mm512_permutexvar_ps(pm196, dat2218);
__m512 in1679 = _mm512_permutexvar_ps(pm197, dat2219);
__m512 dat2220 = _mm512_maskz_loadu_ps(16383, datPtr25+872+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2221 = _mm512_maskz_loadu_ps(2047, datPtr25+3360+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1675 = _mm512_permutexvar_ps(pm196, dat2220);
__m512 in1680 = _mm512_permutexvar_ps(pm197, dat2221);
__m512 dat2222 = _mm512_maskz_loadu_ps(16383, datPtr25+984+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2223 = _mm512_maskz_loadu_ps(2047, datPtr25+3472+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1676 = _mm512_permutexvar_ps(pm196, dat2222);
__m512 in1681 = _mm512_permutexvar_ps(pm197, dat2223);
__m512 dat2224 = _mm512_maskz_loadu_ps(16383, datPtr25+1096+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2225 = _mm512_maskz_loadu_ps(2047, datPtr25+3584+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1677 = _mm512_permutexvar_ps(pm196, dat2224);
__m512 in1682 = _mm512_permutexvar_ps(pm197, dat2225);
// Chain B only: three further input vectors (in1683..in1685). Chain A has no
// counterparts for them.
__m512 dat2226 = _mm512_maskz_loadu_ps(2047, datPtr25+3696+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1683 = _mm512_permutexvar_ps(pm197, dat2226);
__m512 dat2227 = _mm512_maskz_loadu_ps(2047, datPtr25+3808+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1684 = _mm512_permutexvar_ps(pm197, dat2227);
__m512 dat2228 = _mm512_maskz_loadu_ps(2047, datPtr25+3920+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1685 = _mm512_permutexvar_ps(pm197, dat2228);
// First arithmetic pass: linear combinations across the input vectors
// (lane-wise, so every lane is processed independently). Wherever chain B
// adds or subtracts in1683/in1684/in1685, chain A has the reduced form with
// that operand absent: a plain copy, _mm512_mul_ps instead of
// _mm512_fmadd_ps, or a subtraction from zero.
__m512 tmp11776 = in1674;
__m512 tmp11781 = _mm512_add_ps(in1679, in1683);
__m512 tmp11777 = _mm512_sub_ps(in1677, in1675);
__m512 tmp11782 = _mm512_sub_ps(in1682, in1680);
__m512 tmp11778 = in1675;
__m512 tmp11783 = _mm512_add_ps(in1680, in1684);
// Self-assignment is a no-op; it stands where chain B subtracts in1684.
in1673 = in1673;
in1678 = _mm512_sub_ps(in1678, in1684);
tmp11776 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-4.25e+00f), tmp11776);
tmp11781 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-4.25e+00f), tmp11781);
tmp11778 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-4.25e+00f), tmp11778);
tmp11783 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-4.25e+00f), tmp11783);
in1673 = _mm512_fmadd_ps(tmp11777, _mm512_set1_ps(5.25e+00f), in1673);
in1678 = _mm512_fmadd_ps(tmp11782, _mm512_set1_ps(5.25e+00f), in1678);
tmp11777 = _mm512_mul_ps(in1675, _mm512_set1_ps(2.5e-01f));
tmp11782 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(2.5e-01f), in1684);
in1675 = _mm512_mul_ps(in1675, _mm512_set1_ps(4e+00f));
in1680 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(4e+00f), in1684);
// Difference and sum of the two partial results computed above.
__m512 tmp11779 = _mm512_sub_ps(tmp11778, tmp11776);
__m512 tmp11784 = _mm512_sub_ps(tmp11783, tmp11781);
tmp11778 = _mm512_add_ps(tmp11776, tmp11778);
tmp11783 = _mm512_add_ps(tmp11781, tmp11783);
tmp11776 = _mm512_mul_ps(in1674, _mm512_set1_ps(2.5e-01f));
tmp11781 = _mm512_fmadd_ps(in1679, _mm512_set1_ps(2.5e-01f), in1683);
tmp11777 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-1.25e+00f), tmp11777);
tmp11782 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-1.25e+00f), tmp11782);
in1677 = _mm512_fmadd_ps(in1677, _mm512_set1_ps(-5e+00f), in1675);
in1682 = _mm512_fmadd_ps(in1682, _mm512_set1_ps(-5e+00f), in1680);
tmp11776 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-1.25e+00f), tmp11776);
tmp11781 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-1.25e+00f), tmp11781);
// fmadd / fnmadd with the same operands yield the (b + 2a) / (b - 2a) pair.
__m512 tmp11780 = _mm512_fmadd_ps(tmp11776, _mm512_set1_ps(2e+00f), tmp11777);
in1684 = _mm512_fmadd_ps(tmp11781, _mm512_set1_ps(2e+00f), tmp11782);
tmp11777 = _mm512_fnmadd_ps(tmp11776, _mm512_set1_ps(2e+00f), tmp11777);
tmp11782 = _mm512_fnmadd_ps(tmp11781, _mm512_set1_ps(2e+00f), tmp11782);
tmp11776 = in1674;
tmp11781 = _mm512_fmadd_ps(in1683, _mm512_set1_ps(2.5e-01f), in1679);
in1674 = _mm512_sub_ps(_mm512_setzero_ps(), in1674);
in1679 = _mm512_sub_ps(in1685, in1679);
tmp11776 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(-1.25e+00f), tmp11776);
tmp11781 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(-1.25e+00f), tmp11781);
// Self-assignment is a no-op; it stands where chain B subtracts in1683.
in1676 = in1676;
in1681 = _mm512_sub_ps(in1681, in1683);
in1676 = _mm512_fmadd_ps(in1676, _mm512_set1_ps(5.25e+00f), in1674);
in1681 = _mm512_fmadd_ps(in1681, _mm512_set1_ps(5.25e+00f), in1679);
in1675 = _mm512_fmadd_ps(tmp11776, _mm512_set1_ps(2e+00f), in1677);
in1680 = _mm512_fmadd_ps(tmp11781, _mm512_set1_ps(2e+00f), in1682);
in1677 = _mm512_fnmadd_ps(tmp11776, _mm512_set1_ps(2e+00f), in1677);
in1682 = _mm512_fnmadd_ps(tmp11781, _mm512_set1_ps(2e+00f), in1682);
// Shuffle network over the 16 first-pass results (8 per chain), stage 1:
// interleave 32-bit elements of adjacent vector pairs within each 128-bit
// lane.
// NOTE(review): the three stages together look like a transpose, so that the
// second pass works along the other axis -- confirm.
__m512 tmp11793 = _mm512_unpacklo_ps(in1673, tmp11778);
__m512 tmp11794 = _mm512_unpackhi_ps(in1673, tmp11778);
__m512 tmp11795 = _mm512_unpacklo_ps(tmp11779, tmp11780);
__m512 tmp11796 = _mm512_unpackhi_ps(tmp11779, tmp11780);
__m512 tmp11797 = _mm512_unpacklo_ps(tmp11777, in1675);
__m512 tmp11798 = _mm512_unpackhi_ps(tmp11777, in1675);
__m512 tmp11799 = _mm512_unpacklo_ps(in1677, in1676);
__m512 tmp11800 = _mm512_unpackhi_ps(in1677, in1676);
__m512 tmp11801 = _mm512_unpacklo_ps(in1678, tmp11783);
__m512 tmp11802 = _mm512_unpackhi_ps(in1678, tmp11783);
__m512 tmp11803 = _mm512_unpacklo_ps(tmp11784, in1684);
__m512 tmp11804 = _mm512_unpackhi_ps(tmp11784, in1684);
__m512 tmp11805 = _mm512_unpacklo_ps(tmp11782, in1680);
__m512 tmp11806 = _mm512_unpackhi_ps(tmp11782, in1680);
__m512 tmp11807 = _mm512_unpacklo_ps(in1682, in1681);
__m512 tmp11808 = _mm512_unpackhi_ps(in1682, in1681);
// Stage 2: within each 128-bit lane, immediate 68 (0x44) keeps elements 0-1
// of both operands and 238 (0xEE) keeps elements 2-3 of both operands.
__m512 tmp11809 = _mm512_shuffle_ps(tmp11793, tmp11795, 68);
__m512 tmp11810 = _mm512_shuffle_ps(tmp11793, tmp11795, 238);
__m512 tmp11811 = _mm512_shuffle_ps(tmp11794, tmp11796, 68);
__m512 tmp11812 = _mm512_shuffle_ps(tmp11794, tmp11796, 238);
__m512 tmp11813 = _mm512_shuffle_ps(tmp11797, tmp11799, 68);
__m512 tmp11814 = _mm512_shuffle_ps(tmp11797, tmp11799, 238);
__m512 tmp11815 = _mm512_shuffle_ps(tmp11798, tmp11800, 68);
__m512 tmp11816 = _mm512_shuffle_ps(tmp11798, tmp11800, 238);
__m512 tmp11817 = _mm512_shuffle_ps(tmp11801, tmp11803, 68);
__m512 tmp11818 = _mm512_shuffle_ps(tmp11801, tmp11803, 238);
__m512 tmp11819 = _mm512_shuffle_ps(tmp11802, tmp11804, 68);
__m512 tmp11820 = _mm512_shuffle_ps(tmp11802, tmp11804, 238);
__m512 tmp11821 = _mm512_shuffle_ps(tmp11805, tmp11807, 68);
__m512 tmp11822 = _mm512_shuffle_ps(tmp11805, tmp11807, 238);
__m512 tmp11823 = _mm512_shuffle_ps(tmp11806, tmp11808, 68);
__m512 tmp11824 = _mm512_shuffle_ps(tmp11806, tmp11808, 238);
// Stage 3a: whole 128-bit lanes; immediate 136 (0x88) picks the even lanes
// (0, 2) of both operands, 221 (0xDD) picks the odd lanes (1, 3).
__m512 tmp11825 = _mm512_shuffle_f32x4(tmp11809, tmp11813, 136);
__m512 tmp11826 = _mm512_shuffle_f32x4(tmp11809, tmp11813, 221);
__m512 tmp11827 = _mm512_shuffle_f32x4(tmp11810, tmp11814, 136);
__m512 tmp11828 = _mm512_shuffle_f32x4(tmp11810, tmp11814, 221);
__m512 tmp11829 = _mm512_shuffle_f32x4(tmp11811, tmp11815, 136);
__m512 tmp11830 = _mm512_shuffle_f32x4(tmp11811, tmp11815, 221);
__m512 tmp11831 = _mm512_shuffle_f32x4(tmp11812, tmp11816, 136);
__m512 tmp11832 = _mm512_shuffle_f32x4(tmp11812, tmp11816, 221);
__m512 tmp11833 = _mm512_shuffle_f32x4(tmp11817, tmp11821, 136);
__m512 tmp11834 = _mm512_shuffle_f32x4(tmp11817, tmp11821, 221);
__m512 tmp11835 = _mm512_shuffle_f32x4(tmp11818, tmp11822, 136);
__m512 tmp11836 = _mm512_shuffle_f32x4(tmp11818, tmp11822, 221);
__m512 tmp11837 = _mm512_shuffle_f32x4(tmp11819, tmp11823, 136);
__m512 tmp11838 = _mm512_shuffle_f32x4(tmp11819, tmp11823, 221);
__m512 tmp11839 = _mm512_shuffle_f32x4(tmp11820, tmp11824, 136);
__m512 tmp11840 = _mm512_shuffle_f32x4(tmp11820, tmp11824, 221);
// Stage 3b: the same even/odd lane selection once more, this time combining
// a chain-A-derived vector with a chain-B-derived one. The results overwrite
// the 16 first-pass variable names, which are reused as second-pass inputs.
in1673 = _mm512_shuffle_f32x4(tmp11825, tmp11833, 136);
in1678 = _mm512_shuffle_f32x4(tmp11825, tmp11833, 221);
tmp11778 = _mm512_shuffle_f32x4(tmp11827, tmp11835, 136);
tmp11783 = _mm512_shuffle_f32x4(tmp11827, tmp11835, 221);
tmp11779 = _mm512_shuffle_f32x4(tmp11829, tmp11837, 136);
tmp11784 = _mm512_shuffle_f32x4(tmp11829, tmp11837, 221);
tmp11780 = _mm512_shuffle_f32x4(tmp11831, tmp11839, 136);
in1684 = _mm512_shuffle_f32x4(tmp11831, tmp11839, 221);
tmp11777 = _mm512_shuffle_f32x4(tmp11826, tmp11834, 136);
tmp11782 = _mm512_shuffle_f32x4(tmp11826, tmp11834, 221);
in1675 = _mm512_shuffle_f32x4(tmp11828, tmp11836, 136);
in1680 = _mm512_shuffle_f32x4(tmp11828, tmp11836, 221);
in1677 = _mm512_shuffle_f32x4(tmp11830, tmp11838, 136);
in1682 = _mm512_shuffle_f32x4(tmp11830, tmp11838, 221);
in1676 = _mm512_shuffle_f32x4(tmp11832, tmp11840, 136);
in1681 = _mm512_shuffle_f32x4(tmp11832, tmp11840, 221);
// Second arithmetic pass: same coefficient pattern as the first pass, but
// here both interleaved chains use the full form (every statement pair
// applies identical operations to its own eight vectors).
__m512 tmp11785 = _mm512_add_ps(tmp11778, in1675);
__m512 tmp11789 = _mm512_add_ps(tmp11783, in1680);
__m512 tmp11786 = _mm512_sub_ps(tmp11777, tmp11779);
__m512 tmp11790 = _mm512_sub_ps(tmp11782, tmp11784);
__m512 tmp11787 = _mm512_add_ps(tmp11779, in1677);
__m512 tmp11791 = _mm512_add_ps(tmp11784, in1682);
in1673 = _mm512_sub_ps(in1673, in1677);
in1678 = _mm512_sub_ps(in1678, in1682);
tmp11785 = _mm512_fmadd_ps(tmp11780, _mm512_set1_ps(-4.25e+00f), tmp11785);
tmp11789 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-4.25e+00f), tmp11789);
tmp11787 = _mm512_fmadd_ps(tmp11777, _mm512_set1_ps(-4.25e+00f), tmp11787);
tmp11791 = _mm512_fmadd_ps(tmp11782, _mm512_set1_ps(-4.25e+00f), tmp11791);
in1673 = _mm512_fmadd_ps(tmp11786, _mm512_set1_ps(5.25e+00f), in1673);
in1678 = _mm512_fmadd_ps(tmp11790, _mm512_set1_ps(5.25e+00f), in1678);
tmp11786 = _mm512_fmadd_ps(tmp11779, _mm512_set1_ps(2.5e-01f), in1677);
tmp11790 = _mm512_fmadd_ps(tmp11784, _mm512_set1_ps(2.5e-01f), in1682);
tmp11779 = _mm512_fmadd_ps(tmp11779, _mm512_set1_ps(4e+00f), in1677);
tmp11784 = _mm512_fmadd_ps(tmp11784, _mm512_set1_ps(4e+00f), in1682);
__m512 tmp11788 = _mm512_sub_ps(tmp11787, tmp11785);
__m512 tmp11792 = _mm512_sub_ps(tmp11791, tmp11789);
tmp11787 = _mm512_add_ps(tmp11785, tmp11787);
tmp11791 = _mm512_add_ps(tmp11789, tmp11791);
tmp11785 = _mm512_fmadd_ps(tmp11778, _mm512_set1_ps(2.5e-01f), in1675);
tmp11789 = _mm512_fmadd_ps(tmp11783, _mm512_set1_ps(2.5e-01f), in1680);
tmp11786 = _mm512_fmadd_ps(tmp11777, _mm512_set1_ps(-1.25e+00f), tmp11786);
tmp11790 = _mm512_fmadd_ps(tmp11782, _mm512_set1_ps(-1.25e+00f), tmp11790);
tmp11777 = _mm512_fmadd_ps(tmp11777, _mm512_set1_ps(-5e+00f), tmp11779);
tmp11782 = _mm512_fmadd_ps(tmp11782, _mm512_set1_ps(-5e+00f), tmp11784);
tmp11785 = _mm512_fmadd_ps(tmp11780, _mm512_set1_ps(-1.25e+00f), tmp11785);
tmp11789 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-1.25e+00f), tmp11789);
in1677 = _mm512_fmadd_ps(tmp11785, _mm512_set1_ps(2e+00f), tmp11786);
in1682 = _mm512_fmadd_ps(tmp11789, _mm512_set1_ps(2e+00f), tmp11790);
tmp11786 = _mm512_fnmadd_ps(tmp11785, _mm512_set1_ps(2e+00f), tmp11786);
tmp11790 = _mm512_fnmadd_ps(tmp11789, _mm512_set1_ps(2e+00f), tmp11790);
tmp11785 = _mm512_fmadd_ps(in1675, _mm512_set1_ps(2.5e-01f), tmp11778);
tmp11789 = _mm512_fmadd_ps(in1680, _mm512_set1_ps(2.5e-01f), tmp11783);
tmp11778 = _mm512_sub_ps(in1676, tmp11778);
tmp11783 = _mm512_sub_ps(in1681, tmp11783);
tmp11785 = _mm512_fmadd_ps(tmp11780, _mm512_set1_ps(-1.25e+00f), tmp11785);
tmp11789 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(-1.25e+00f), tmp11789);
tmp11780 = _mm512_sub_ps(tmp11780, in1675);
in1684 = _mm512_sub_ps(in1684, in1680);
tmp11780 = _mm512_fmadd_ps(tmp11780, _mm512_set1_ps(5.25e+00f), tmp11778);
in1684 = _mm512_fmadd_ps(in1684, _mm512_set1_ps(5.25e+00f), tmp11783);
tmp11779 = _mm512_fmadd_ps(tmp11785, _mm512_set1_ps(2e+00f), tmp11777);
tmp11784 = _mm512_fmadd_ps(tmp11789, _mm512_set1_ps(2e+00f), tmp11782);
tmp11777 = _mm512_fnmadd_ps(tmp11785, _mm512_set1_ps(2e+00f), tmp11777);
tmp11782 = _mm512_fnmadd_ps(tmp11789, _mm512_set1_ps(2e+00f), tmp11782);
// Pack step: merge each pair of second-pass results into two vectors.
// Immediate 68 (0x44) takes 128-bit lanes 0-1 of each operand; 238 (0xEE)
// takes 128-bit lanes 2-3 of each operand.
__m512 out1551 = _mm512_shuffle_f32x4(in1673, tmp11787, 68);
__m512 out1559 = _mm512_shuffle_f32x4(in1673, tmp11787, 238);
__m512 out1552 = _mm512_shuffle_f32x4(tmp11788, in1677, 68);
__m512 out1560 = _mm512_shuffle_f32x4(tmp11788, in1677, 238);
__m512 out1553 = _mm512_shuffle_f32x4(tmp11786, tmp11779, 68);
__m512 out1561 = _mm512_shuffle_f32x4(tmp11786, tmp11779, 238);
__m512 out1554 = _mm512_shuffle_f32x4(tmp11777, tmp11780, 68);
__m512 out1562 = _mm512_shuffle_f32x4(tmp11777, tmp11780, 238);
__m512 out1555 = _mm512_shuffle_f32x4(in1678, tmp11791, 68);
__m512 out1563 = _mm512_shuffle_f32x4(in1678, tmp11791, 238);
__m512 out1556 = _mm512_shuffle_f32x4(tmp11792, in1682, 68);
__m512 out1564 = _mm512_shuffle_f32x4(tmp11792, in1682, 238);
__m512 out1557 = _mm512_shuffle_f32x4(tmp11790, tmp11784, 68);
__m512 out1565 = _mm512_shuffle_f32x4(tmp11790, tmp11784, 238);
__m512 out1558 = _mm512_shuffle_f32x4(tmp11782, in1684, 68);
__m512 out1566 = _mm512_shuffle_f32x4(tmp11782, in1684, 238);
// Unaligned stores of the 16 packed vectors into dfPtr11. The leading
// constant is 12800*g + {256, 384, 320, 448} for g = 0..3; the rest of the
// index expression is identical on every line.
// NOTE(review): dfPtr11's declaration is not visible here; the offsets look
// like byte offsets (64 bytes per __m512) -- confirm the pointer type.
_mm512_storeu_ps(dfPtr11+256+51200*i49+3072*j42+3072*s46+768*k133, out1551);
_mm512_storeu_ps(dfPtr11+384+51200*i49+3072*j42+3072*s46+768*k133, out1559);
_mm512_storeu_ps(dfPtr11+320+51200*i49+3072*j42+3072*s46+768*k133, out1555);
_mm512_storeu_ps(dfPtr11+448+51200*i49+3072*j42+3072*s46+768*k133, out1563);
_mm512_storeu_ps(dfPtr11+13056+51200*i49+3072*j42+3072*s46+768*k133, out1552);
_mm512_storeu_ps(dfPtr11+13184+51200*i49+3072*j42+3072*s46+768*k133, out1560);
_mm512_storeu_ps(dfPtr11+13120+51200*i49+3072*j42+3072*s46+768*k133, out1556);
_mm512_storeu_ps(dfPtr11+13248+51200*i49+3072*j42+3072*s46+768*k133, out1564);
_mm512_storeu_ps(dfPtr11+25856+51200*i49+3072*j42+3072*s46+768*k133, out1553);
_mm512_storeu_ps(dfPtr11+25984+51200*i49+3072*j42+3072*s46+768*k133, out1561);
_mm512_storeu_ps(dfPtr11+25920+51200*i49+3072*j42+3072*s46+768*k133, out1557);
_mm512_storeu_ps(dfPtr11+26048+51200*i49+3072*j42+3072*s46+768*k133, out1565);
_mm512_storeu_ps(dfPtr11+38656+51200*i49+3072*j42+3072*s46+768*k133, out1554);
_mm512_storeu_ps(dfPtr11+38784+51200*i49+3072*j42+3072*s46+768*k133, out1562);
_mm512_storeu_ps(dfPtr11+38720+51200*i49+3072*j42+3072*s46+768*k133, out1558);
_mm512_storeu_ps(dfPtr11+38848+51200*i49+3072*j42+3072*s46+768*k133, out1566);
// Load phase of the next group: five input vectors for each of two chains,
// with consecutive loads 112 apart in address.
//   first chain  (in1686..in1690): mask 8191  = lanes 0..12, from +3740..
//   second chain (in1691..in1695): mask 16383 = lanes 0..13, from +3784..
// Lanes outside the mask are zeroed by the maskz load.
// NOTE(review): datPtr25's declaration is not visible; the 112 / 4*w59 steps
// look like byte offsets into rows of 28 floats -- TODO confirm.
__m512 dat2229 = _mm512_maskz_loadu_ps(8191, datPtr25+3740+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2230 = _mm512_maskz_loadu_ps(16383, datPtr25+3784+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
// pm198: output lane 0 takes source lane 15 (zeroed by the 8191 mask, so it
// becomes 0), lanes 1..7 take source lanes 0..6, and lanes 8..15 take source
// lanes 5..12.
__m512i pm198 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1686 = _mm512_permutexvar_ps(pm198, dat2229);
// pm199: output lanes 0..7 take source lanes 0..7, lanes 8..15 take source
// lanes 6..13.
__m512i pm199 = _mm512_set_epi32(13, 12, 11, 10, 9, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1691 = _mm512_permutexvar_ps(pm199, dat2230);
__m512 dat2231 = _mm512_maskz_loadu_ps(8191, datPtr25+3852+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2232 = _mm512_maskz_loadu_ps(16383, datPtr25+3896+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1687 = _mm512_permutexvar_ps(pm198, dat2231);
__m512 in1692 = _mm512_permutexvar_ps(pm199, dat2232);
__m512 dat2233 = _mm512_maskz_loadu_ps(8191, datPtr25+3964+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2234 = _mm512_maskz_loadu_ps(16383, datPtr25+4008+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1688 = _mm512_permutexvar_ps(pm198, dat2233);
__m512 in1693 = _mm512_permutexvar_ps(pm199, dat2234);
__m512 dat2235 = _mm512_maskz_loadu_ps(8191, datPtr25+4076+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2236 = _mm512_maskz_loadu_ps(16383, datPtr25+4120+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1689 = _mm512_permutexvar_ps(pm198, dat2235);
__m512 in1694 = _mm512_permutexvar_ps(pm199, dat2236);
__m512 dat2237 = _mm512_maskz_loadu_ps(8191, datPtr25+4188+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 dat2238 = _mm512_maskz_loadu_ps(16383, datPtr25+4232+25088*i49+112*h46+4*w59+25088*s46+6272*k133);
__m512 in1690 = _mm512_permutexvar_ps(pm198, dat2237);
__m512 in1695 = _mm512_permutexvar_ps(pm199, dat2238);
__m512 tmp11841 = in1687;
__m512 tmp11846 = in1692;
__m512 tmp11842 = _mm512_sub_ps(in1690, in1688);
__m512 tmp11847 = _mm512_sub_ps(in1695, in1693);
__m512 tmp11843 = in1688;
__m512 tmp11848 = in1693;
in1686 = in1686;
in1691 = in1691;
tmp11841 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-4.25e+00f), tmp11841);
tmp11846 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-4.25e+00f), tmp11846);
tmp11843 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-4.25e+00f), tmp11843);
tmp11848 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-4.25e+00f), tmp11848);
in1686 = _mm512_fmadd_ps(tmp11842, _mm512_set1_ps(5.25e+00f), in1686);
in1691 = _mm512_fmadd_ps(tmp11847, _mm512_set1_ps(5.25e+00f), in1691);
tmp11842 = _mm512_mul_ps(in1688, _mm512_set1_ps(2.5e-01f));
tmp11847 = _mm512_mul_ps(in1693, _mm512_set1_ps(2.5e-01f));
in1688 = _mm512_mul_ps(in1688, _mm512_set1_ps(4e+00f));
in1693 = _mm512_mul_ps(in1693, _mm512_set1_ps(4e+00f));
__m512 tmp11844 = _mm512_sub_ps(tmp11843, tmp11841);
__m512 tmp11849 = _mm512_sub_ps(tmp11848, tmp11846);
tmp11843 = _mm512_add_ps(tmp11841, tmp11843);
tmp11848 = _mm512_add_ps(tmp11846, tmp11848);
tmp11841 = _mm512_mul_ps(in1687, _mm512_set1_ps(2.5e-01f));
tmp11846 = _mm512_mul_ps(in1692, _mm512_set1_ps(2.5e-01f));
tmp11842 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-1.25e+00f), tmp11842);
tmp11847 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-1.25e+00f), tmp11847);
in1690 = _mm512_fmadd_ps(in1690, _mm512_set1_ps(-5e+00f), in1688);
in1695 = _mm512_fmadd_ps(in1695, _mm512_set1_ps(-5e+00f), in1693);
tmp11841 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-1.25e+00f), tmp11841);
tmp11846 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-1.25e+00f), tmp11846);
__m512 tmp11845 = _mm512_fmadd_ps(tmp11841, _mm512_set1_ps(2e+00f), tmp11842);
__m512 tmp11850 = _mm512_fmadd_ps(tmp11846, _mm512_set1_ps(2e+00f), tmp11847);
tmp11842 = _mm512_fnmadd_ps(tmp11841, _mm512_set1_ps(2e+00f), tmp11842);
tmp11847 = _mm512_fnmadd_ps(tmp11846, _mm512_set1_ps(2e+00f), tmp11847);
tmp11841 = in1687;
tmp11846 = in1692;
in1687 = _mm512_sub_ps(_mm512_setzero_ps(), in1687);
in1692 = _mm512_sub_ps(_mm512_setzero_ps(), in1692);
tmp11841 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(-1.25e+00f), tmp11841);
tmp11846 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(-1.25e+00f), tmp11846);
in1689 = in1689;
in1694 = in1694;
in1689 = _mm512_fmadd_ps(in1689, _mm512_set1_ps(5.25e+00f), in1687);
in1694 = _mm512_fmadd_ps(in1694, _mm512_set1_ps(5.25e+00f), in1692);
in1688 = _mm512_fmadd_ps(tmp11841, _mm512_set1_ps(2e+00f), in1690);
in1693 = _mm512_fmadd_ps(tmp11846, _mm512_set1_ps(2e+00f), in1695);
in1690 = _mm512_fnmadd_ps(tmp11841, _mm512_set1_ps(2e+00f), in1690);
in1695 = _mm512_fnmadd_ps(tmp11846, _mm512_set1_ps(2e+00f), in1695);
__m512 tmp11859 = _mm512_unpacklo_ps(in1686, tmp11843);
__m512 tmp11860 = _mm512_unpackhi_ps(in1686, tmp11843);
__m512 tmp11861 = _mm512_unpacklo_ps(tmp11844, tmp11845);
__m512 tmp11862 = _mm512_unpackhi_ps(tmp11844, tmp11845);
__m512 tmp11863 = _mm512_unpacklo_ps(tmp11842, in1688);
__m512 tmp11864 = _mm512_unpackhi_ps(tmp11842, in1688);
__m512 tmp11865 = _mm512_unpacklo_ps(in1690, in1689);
__m512 tmp11866 = _mm512_unpackhi_ps(in1690, in1689);
__m512 tmp11867 = _mm512_unpacklo_ps(in1691, tmp11848);
__m512 tmp11868 = _mm512_unpackhi_ps(in1691, tmp11848);
__m512 tmp11869 = _mm512_unpacklo_ps(tmp11849, tmp11850);
__m512 tmp11870 = _mm512_unpackhi_ps(tmp11849, tmp11850);
__m512 tmp11871 = _mm512_unpacklo_ps(tmp11847, in1693);
__m512 tmp11872 = _mm512_unpackhi_ps(tmp11847, in1693);
__m512 tmp11873 = _mm512_unpacklo_ps(in1695, in1694);
__m512 tmp11874 = _mm512_unpackhi_ps(in1695, in1694);
__m512 tmp11875 = _mm512_shuffle_ps(tmp11859, tmp11861, 68);
__m512 tmp11876 = _mm512_shuffle_ps(tmp11859, tmp11861, 238);
__m512 tmp11877 = _mm512_shuffle_ps(tmp11860, tmp11862, 68);
__m512 tmp11878 = _mm512_shuffle_ps(tmp11860, tmp11862, 238);
__m512 tmp11879 = _mm512_shuffle_ps(tmp11863, tmp11865, 68);
__m512 tmp11880 = _mm512_shuffle_ps(tmp11863, tmp11865, 238);
__m512 tmp11881 = _mm512_shuffle_ps(tmp11864, tmp11866, 68);
__m512 tmp11882 = _mm512_shuffle_ps(tmp11864, tmp11866, 238);
__m512 tmp11883 = _mm512_shuffle_ps(tmp11867, tmp11869, 68);
__m512 tmp11884 = _mm512_shuffle_ps(tmp11867, tmp11869, 238);
__m512 tmp11885 = _mm512_shuffle_ps(tmp11868, tmp11870, 68);
__m512 tmp11886 = _mm512_shuffle_ps(tmp11868, tmp11870, 238);
__m512 tmp11887 = _mm512_shuffle_ps(tmp11871, tmp11873, 68);
__m512 tmp11888 = _mm512_shuffle_ps(tmp11871, tmp11873, 238);
__m512 tmp11889 = _mm512_shuffle_ps(tmp11872, tmp11874, 68);
__m512 tmp11890 = _mm512_shuffle_ps(tmp11872, tmp11874, 238);
__m512 tmp11891 = _mm512_shuffle_f32x4(tmp11875, tmp11879, 136);
__m512 tmp11892 = _mm512_shuffle_f32x4(tmp11875, tmp11879, 221);
__m512 tmp11893 = _mm512_shuffle_f32x4(tmp11876, tmp11880, 136);
__m512 tmp11894 = _mm512_shuffle_f32x4(tmp11876, tmp11880, 221);
__m512 tmp11895 = _mm512_shuffle_f32x4(tmp11877, tmp11881, 136);
__m512 tmp11896 = _mm512_shuffle_f32x4(tmp11877, tmp11881, 221);
__m512 tmp11897 = _mm512_shuffle_f32x4(tmp11878, tmp11882, 136);
__m512 tmp11898 = _mm512_shuffle_f32x4(tmp11878, tmp11882, 221);
__m512 tmp11899 = _mm512_shuffle_f32x4(tmp11883, tmp11887, 136);
__m512 tmp11900 = _mm512_shuffle_f32x4(tmp11883, tmp11887, 221);
__m512 tmp11901 = _mm512_shuffle_f32x4(tmp11884, tmp11888, 136);
__m512 tmp11902 = _mm512_shuffle_f32x4(tmp11884, tmp11888, 221);
__m512 tmp11903 = _mm512_shuffle_f32x4(tmp11885, tmp11889, 136);
__m512 tmp11904 = _mm512_shuffle_f32x4(tmp11885, tmp11889, 221);
__m512 tmp11905 = _mm512_shuffle_f32x4(tmp11886, tmp11890, 136);
__m512 tmp11906 = _mm512_shuffle_f32x4(tmp11886, tmp11890, 221);
in1686 = _mm512_shuffle_f32x4(tmp11891, tmp11899, 136);
in1691 = _mm512_shuffle_f32x4(tmp11891, tmp11899, 221);
tmp11843 = _mm512_shuffle_f32x4(tmp11893, tmp11901, 136);
tmp11848 = _mm512_shuffle_f32x4(tmp11893, tmp11901, 221);
tmp11844 = _mm512_shuffle_f32x4(tmp11895, tmp11903, 136);
tmp11849 = _mm512_shuffle_f32x4(tmp11895, tmp11903, 221);
tmp11845 = _mm512_shuffle_f32x4(tmp11897, tmp11905, 136);
tmp11850 = _mm512_shuffle_f32x4(tmp11897, tmp11905, 221);
tmp11842 = _mm512_shuffle_f32x4(tmp11892, tmp11900, 136);
tmp11847 = _mm512_shuffle_f32x4(tmp11892, tmp11900, 221);
in1688 = _mm512_shuffle_f32x4(tmp11894, tmp11902, 136);
in1693 = _mm512_shuffle_f32x4(tmp11894, tmp11902, 221);
in1690 = _mm512_shuffle_f32x4(tmp11896, tmp11904, 136);
in1695 = _mm512_shuffle_f32x4(tmp11896, tmp11904, 221);
in1689 = _mm512_shuffle_f32x4(tmp11898, tmp11906, 136);
in1694 = _mm512_shuffle_f32x4(tmp11898, tmp11906, 221);
__m512 tmp11851 = _mm512_add_ps(tmp11843, in1688);
__m512 tmp11855 = _mm512_add_ps(tmp11848, in1693);
__m512 tmp11852 = _mm512_sub_ps(tmp11842, tmp11844);
__m512 tmp11856 = _mm512_sub_ps(tmp11847, tmp11849);
__m512 tmp11853 = _mm512_add_ps(tmp11844, in1690);
__m512 tmp11857 = _mm512_add_ps(tmp11849, in1695);
in1686 = _mm512_sub_ps(in1686, in1690);
in1691 = _mm512_sub_ps(in1691, in1695);
tmp11851 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(-4.25e+00f), tmp11851);
tmp11855 = _mm512_fmadd_ps(tmp11850, _mm512_set1_ps(-4.25e+00f), tmp11855);
tmp11853 = _mm512_fmadd_ps(tmp11842, _mm512_set1_ps(-4.25e+00f), tmp11853);
tmp11857 = _mm512_fmadd_ps(tmp11847, _mm512_set1_ps(-4.25e+00f), tmp11857);
in1686 = _mm512_fmadd_ps(tmp11852, _mm512_set1_ps(5.25e+00f), in1686);
in1691 = _mm512_fmadd_ps(tmp11856, _mm512_set1_ps(5.25e+00f), in1691);
tmp11852 = _mm512_fmadd_ps(tmp11844, _mm512_set1_ps(2.5e-01f), in1690);
tmp11856 = _mm512_fmadd_ps(tmp11849, _mm512_set1_ps(2.5e-01f), in1695);
tmp11844 = _mm512_fmadd_ps(tmp11844, _mm512_set1_ps(4e+00f), in1690);
tmp11849 = _mm512_fmadd_ps(tmp11849, _mm512_set1_ps(4e+00f), in1695);
__m512 tmp11854 = _mm512_sub_ps(tmp11853, tmp11851);
__m512 tmp11858 = _mm512_sub_ps(tmp11857, tmp11855);
tmp11853 = _mm512_add_ps(tmp11851, tmp11853);
tmp11857 = _mm512_add_ps(tmp11855, tmp11857);
tmp11851 = _mm512_fmadd_ps(tmp11843, _mm512_set1_ps(2.5e-01f), in1688);
tmp11855 = _mm512_fmadd_ps(tmp11848, _mm512_set1_ps(2.5e-01f), in1693);
tmp11852 = _mm512_fmadd_ps(tmp11842, _mm512_set1_ps(-1.25e+00f), tmp11852);
tmp11856 = _mm512_fmadd_ps(tmp11847, _mm512_set1_ps(-1.25e+00f), tmp11856);
tmp11842 = _mm512_fmadd_ps(tmp11842, _mm512_set1_ps(-5e+00f), tmp11844);
tmp11847 = _mm512_fmadd_ps(tmp11847, _mm512_set1_ps(-5e+00f), tmp11849);
tmp11851 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(-1.25e+00f), tmp11851);
tmp11855 = _mm512_fmadd_ps(tmp11850, _mm512_set1_ps(-1.25e+00f), tmp11855);
in1690 = _mm512_fmadd_ps(tmp11851, _mm512_set1_ps(2e+00f), tmp11852);
in1695 = _mm512_fmadd_ps(tmp11855, _mm512_set1_ps(2e+00f), tmp11856);
tmp11852 = _mm512_fnmadd_ps(tmp11851, _mm512_set1_ps(2e+00f), tmp11852);
tmp11856 = _mm512_fnmadd_ps(tmp11855, _mm512_set1_ps(2e+00f), tmp11856);
tmp11851 = _mm512_fmadd_ps(in1688, _mm512_set1_ps(2.5e-01f), tmp11843);
tmp11855 = _mm512_fmadd_ps(in1693, _mm512_set1_ps(2.5e-01f), tmp11848);
tmp11843 = _mm512_sub_ps(in1689, tmp11843);
tmp11848 = _mm512_sub_ps(in1694, tmp11848);
tmp11851 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(-1.25e+00f), tmp11851);
tmp11855 = _mm512_fmadd_ps(tmp11850, _mm512_set1_ps(-1.25e+00f), tmp11855);
tmp11845 = _mm512_sub_ps(tmp11845, in1688);
tmp11850 = _mm512_sub_ps(tmp11850, in1693);
tmp11845 = _mm512_fmadd_ps(tmp11845, _mm512_set1_ps(5.25e+00f), tmp11843);
tmp11850 = _mm512_fmadd_ps(tmp11850, _mm512_set1_ps(5.25e+00f), tmp11848);
tmp11844 = _mm512_fmadd_ps(tmp11851, _mm512_set1_ps(2e+00f), tmp11842);
tmp11849 = _mm512_fmadd_ps(tmp11855, _mm512_set1_ps(2e+00f), tmp11847);
tmp11842 = _mm512_fnmadd_ps(tmp11851, _mm512_set1_ps(2e+00f), tmp11842);
tmp11847 = _mm512_fnmadd_ps(tmp11855, _mm512_set1_ps(2e+00f), tmp11847);
__m512 out1567 = _mm512_shuffle_f32x4(in1686, tmp11853, 68);
__m512 out1575 = _mm512_shuffle_f32x4(in1686, tmp11853, 238);
__m512 out1568 = _mm512_shuffle_f32x4(tmp11854, in1690, 68);
__m512 out1576 = _mm512_shuffle_f32x4(tmp11854, in1690, 238);
__m512 out1569 = _mm512_shuffle_f32x4(tmp11852, tmp11844, 68);
__m512 out1577 = _mm512_shuffle_f32x4(tmp11852, tmp11844, 238);
__m512 out1570 = _mm512_shuffle_f32x4(tmp11842, tmp11845, 68);
__m512 out1578 = _mm512_shuffle_f32x4(tmp11842, tmp11845, 238);
__m512 out1571 = _mm512_shuffle_f32x4(in1691, tmp11857, 68);
__m512 out1579 = _mm512_shuffle_f32x4(in1691, tmp11857, 238);
__m512 out1572 = _mm512_shuffle_f32x4(tmp11858, in1695, 68);
__m512 out1580 = _mm512_shuffle_f32x4(tmp11858, in1695, 238);
__m512 out1573 = _mm512_shuffle_f32x4(tmp11856, tmp11849, 68);
__m512 out1581 = _mm512_shuffle_f32x4(tmp11856, tmp11849, 238);
__m512 out1574 = _mm512_shuffle_f32x4(tmp11847, tmp11850, 68);
__m512 out1582 = _mm512_shuffle_f32x4(tmp11847, tmp11850, 238);
_mm512_storeu_ps(dfPtr11+512+51200*i49+3072*j42+3072*s46+768*k133, out1567);
_mm512_storeu_ps(dfPtr11+640+51200*i49+3072*j42+3072*s46+768*k133, out1575);
_mm512_storeu_ps(dfPtr11+576+51200*i49+3072*j42+3072*s46+768*k133, out1571);
_mm512_storeu_ps(dfPtr11+704+51200*i49+3072*j42+3072*s46+768*k133, out1579);
_mm512_storeu_ps(dfPtr11+13312+51200*i49+3072*j42+3072*s46+768*k133, out1568);
_mm512_storeu_ps(dfPtr11+13440+51200*i49+3072*j42+3072*s46+768*k133, out1576);
_mm512_storeu_ps(dfPtr11+13376+51200*i49+3072*j42+3072*s46+768*k133, out1572);
_mm512_storeu_ps(dfPtr11+13504+51200*i49+3072*j42+3072*s46+768*k133, out1580);
_mm512_storeu_ps(dfPtr11+26112+51200*i49+3072*j42+3072*s46+768*k133, out1569);
_mm512_storeu_ps(dfPtr11+26240+51200*i49+3072*j42+3072*s46+768*k133, out1577);
_mm512_storeu_ps(dfPtr11+26176+51200*i49+3072*j42+3072*s46+768*k133, out1573);
_mm512_storeu_ps(dfPtr11+26304+51200*i49+3072*j42+3072*s46+768*k133, out1581);
_mm512_storeu_ps(dfPtr11+38912+51200*i49+3072*j42+3072*s46+768*k133, out1570);
_mm512_storeu_ps(dfPtr11+39040+51200*i49+3072*j42+3072*s46+768*k133, out1578);
_mm512_storeu_ps(dfPtr11+38976+51200*i49+3072*j42+3072*s46+768*k133, out1574);
_mm512_storeu_ps(dfPtr11+39104+51200*i49+3072*j42+3072*s46+768*k133, out1582);
}
++j42;
rel21 = 4;
}
ptrdiff_t h47 = base21+24;
ptrdiff_t w60 = 24;
ptrdiff_t k134 = 0;
for (; k134 != 2; ++k134) {
__m512 dat2239 = _mm512_maskz_loadu_ps(31, datPtr25+0+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2240 = _mm512_maskz_loadu_ps(31, datPtr25+3136+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2241 = _mm512_maskz_loadu_ps(31, datPtr25+6272+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2242 = _mm512_maskz_loadu_ps(31, datPtr25+9408+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512i pm200 = _mm512_set_epi32(15, 15, 15, 20, 19, 18, 17, 16, 15, 15, 15, 4, 3, 2, 1, 0);
__m512 in1696 = _mm512_permutex2var_ps(dat2239, pm200, dat2240);
__m512 in1701 = _mm512_permutex2var_ps(dat2241, pm200, dat2242);
__m512 dat2243 = _mm512_maskz_loadu_ps(31, datPtr25+112+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2244 = _mm512_maskz_loadu_ps(31, datPtr25+3248+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2245 = _mm512_maskz_loadu_ps(31, datPtr25+6384+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2246 = _mm512_maskz_loadu_ps(31, datPtr25+9520+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 in1697 = _mm512_permutex2var_ps(dat2243, pm200, dat2244);
__m512 in1702 = _mm512_permutex2var_ps(dat2245, pm200, dat2246);
__m512 dat2247 = _mm512_maskz_loadu_ps(31, datPtr25+224+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2248 = _mm512_maskz_loadu_ps(31, datPtr25+3360+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2249 = _mm512_maskz_loadu_ps(31, datPtr25+6496+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2250 = _mm512_maskz_loadu_ps(31, datPtr25+9632+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 in1698 = _mm512_permutex2var_ps(dat2247, pm200, dat2248);
__m512 in1703 = _mm512_permutex2var_ps(dat2249, pm200, dat2250);
__m512 dat2251 = _mm512_maskz_loadu_ps(31, datPtr25+336+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2252 = _mm512_maskz_loadu_ps(31, datPtr25+3472+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2253 = _mm512_maskz_loadu_ps(31, datPtr25+6608+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2254 = _mm512_maskz_loadu_ps(31, datPtr25+9744+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 in1699 = _mm512_permutex2var_ps(dat2251, pm200, dat2252);
__m512 in1704 = _mm512_permutex2var_ps(dat2253, pm200, dat2254);
__m512 dat2255 = _mm512_maskz_loadu_ps(31, datPtr25+448+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2256 = _mm512_maskz_loadu_ps(31, datPtr25+3584+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2257 = _mm512_maskz_loadu_ps(31, datPtr25+6720+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 dat2258 = _mm512_maskz_loadu_ps(31, datPtr25+9856+25088*i49+112*h47+4*w60+25088*s46+12544*k134);
__m512 in1700 = _mm512_permutex2var_ps(dat2255, pm200, dat2256);
__m512 in1705 = _mm512_permutex2var_ps(dat2257, pm200, dat2258);
__m512 tmp11907 = in1697;
__m512 tmp11912 = in1702;
__m512 tmp11908 = _mm512_sub_ps(in1700, in1698);
__m512 tmp11913 = _mm512_sub_ps(in1705, in1703);
__m512 tmp11909 = in1698;
__m512 tmp11914 = in1703;
in1696 = in1696;
in1701 = in1701;
tmp11907 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-4.25e+00f), tmp11907);
tmp11912 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-4.25e+00f), tmp11912);
tmp11909 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-4.25e+00f), tmp11909);
tmp11914 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-4.25e+00f), tmp11914);
in1696 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(5.25e+00f), in1696);
in1701 = _mm512_fmadd_ps(tmp11913, _mm512_set1_ps(5.25e+00f), in1701);
tmp11908 = _mm512_mul_ps(in1698, _mm512_set1_ps(2.5e-01f));
tmp11913 = _mm512_mul_ps(in1703, _mm512_set1_ps(2.5e-01f));
in1698 = _mm512_mul_ps(in1698, _mm512_set1_ps(4e+00f));
in1703 = _mm512_mul_ps(in1703, _mm512_set1_ps(4e+00f));
__m512 tmp11910 = _mm512_sub_ps(tmp11909, tmp11907);
__m512 tmp11915 = _mm512_sub_ps(tmp11914, tmp11912);
tmp11909 = _mm512_add_ps(tmp11907, tmp11909);
tmp11914 = _mm512_add_ps(tmp11912, tmp11914);
tmp11907 = _mm512_mul_ps(in1697, _mm512_set1_ps(2.5e-01f));
tmp11912 = _mm512_mul_ps(in1702, _mm512_set1_ps(2.5e-01f));
tmp11908 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-1.25e+00f), tmp11908);
tmp11913 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-1.25e+00f), tmp11913);
in1700 = _mm512_fmadd_ps(in1700, _mm512_set1_ps(-5e+00f), in1698);
in1705 = _mm512_fmadd_ps(in1705, _mm512_set1_ps(-5e+00f), in1703);
tmp11907 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-1.25e+00f), tmp11907);
tmp11912 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-1.25e+00f), tmp11912);
__m512 tmp11911 = _mm512_fmadd_ps(tmp11907, _mm512_set1_ps(2e+00f), tmp11908);
__m512 tmp11916 = _mm512_fmadd_ps(tmp11912, _mm512_set1_ps(2e+00f), tmp11913);
tmp11908 = _mm512_fnmadd_ps(tmp11907, _mm512_set1_ps(2e+00f), tmp11908);
tmp11913 = _mm512_fnmadd_ps(tmp11912, _mm512_set1_ps(2e+00f), tmp11913);
tmp11907 = in1697;
tmp11912 = in1702;
in1697 = _mm512_sub_ps(_mm512_setzero_ps(), in1697);
in1702 = _mm512_sub_ps(_mm512_setzero_ps(), in1702);
tmp11907 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(-1.25e+00f), tmp11907);
tmp11912 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(-1.25e+00f), tmp11912);
in1699 = in1699;
in1704 = in1704;
in1699 = _mm512_fmadd_ps(in1699, _mm512_set1_ps(5.25e+00f), in1697);
in1704 = _mm512_fmadd_ps(in1704, _mm512_set1_ps(5.25e+00f), in1702);
in1698 = _mm512_fmadd_ps(tmp11907, _mm512_set1_ps(2e+00f), in1700);
in1703 = _mm512_fmadd_ps(tmp11912, _mm512_set1_ps(2e+00f), in1705);
in1700 = _mm512_fnmadd_ps(tmp11907, _mm512_set1_ps(2e+00f), in1700);
in1705 = _mm512_fnmadd_ps(tmp11912, _mm512_set1_ps(2e+00f), in1705);
__m512 tmp11927 = _mm512_unpacklo_ps(in1696, tmp11909);
__m512 tmp11928 = _mm512_unpackhi_ps(in1696, tmp11909);
__m512 tmp11929 = _mm512_unpacklo_ps(tmp11910, tmp11911);
__m512 tmp11930 = _mm512_unpackhi_ps(tmp11910, tmp11911);
__m512 tmp11931 = _mm512_unpacklo_ps(tmp11908, in1698);
__m512 tmp11932 = _mm512_unpackhi_ps(tmp11908, in1698);
__m512 tmp11933 = _mm512_unpacklo_ps(in1700, in1699);
__m512 tmp11934 = _mm512_unpackhi_ps(in1700, in1699);
__m512 tmp11935 = _mm512_unpacklo_ps(in1701, tmp11914);
__m512 tmp11936 = _mm512_unpackhi_ps(in1701, tmp11914);
__m512 tmp11937 = _mm512_unpacklo_ps(tmp11915, tmp11916);
__m512 tmp11938 = _mm512_unpackhi_ps(tmp11915, tmp11916);
__m512 tmp11939 = _mm512_unpacklo_ps(tmp11913, in1703);
__m512 tmp11940 = _mm512_unpackhi_ps(tmp11913, in1703);
__m512 tmp11941 = _mm512_unpacklo_ps(in1705, in1704);
__m512 tmp11942 = _mm512_unpackhi_ps(in1705, in1704);
__m512 tmp11943 = _mm512_shuffle_ps(tmp11927, tmp11929, 68);
__m512 tmp11944 = _mm512_shuffle_ps(tmp11927, tmp11929, 238);
__m512 tmp11945 = _mm512_shuffle_ps(tmp11928, tmp11930, 68);
__m512 tmp11946 = _mm512_shuffle_ps(tmp11928, tmp11930, 238);
__m512 tmp11947 = _mm512_shuffle_ps(tmp11931, tmp11933, 68);
__m512 tmp11948 = _mm512_shuffle_ps(tmp11931, tmp11933, 238);
__m512 tmp11949 = _mm512_shuffle_ps(tmp11932, tmp11934, 68);
__m512 tmp11950 = _mm512_shuffle_ps(tmp11932, tmp11934, 238);
__m512 tmp11951 = _mm512_shuffle_ps(tmp11935, tmp11937, 68);
__m512 tmp11952 = _mm512_shuffle_ps(tmp11935, tmp11937, 238);
__m512 tmp11953 = _mm512_shuffle_ps(tmp11936, tmp11938, 68);
__m512 tmp11954 = _mm512_shuffle_ps(tmp11936, tmp11938, 238);
__m512 tmp11955 = _mm512_shuffle_ps(tmp11939, tmp11941, 68);
__m512 tmp11956 = _mm512_shuffle_ps(tmp11939, tmp11941, 238);
__m512 tmp11957 = _mm512_shuffle_ps(tmp11940, tmp11942, 68);
__m512 tmp11958 = _mm512_shuffle_ps(tmp11940, tmp11942, 238);
__m512 tmp11959 = _mm512_shuffle_f32x4(tmp11943, tmp11947, 136);
__m512 tmp11960 = _mm512_shuffle_f32x4(tmp11943, tmp11947, 221);
__m512 tmp11961 = _mm512_shuffle_f32x4(tmp11944, tmp11948, 136);
__m512 tmp11962 = _mm512_shuffle_f32x4(tmp11944, tmp11948, 221);
__m512 tmp11963 = _mm512_shuffle_f32x4(tmp11945, tmp11949, 136);
__m512 tmp11964 = _mm512_shuffle_f32x4(tmp11945, tmp11949, 221);
__m512 tmp11965 = _mm512_shuffle_f32x4(tmp11946, tmp11950, 136);
__m512 tmp11966 = _mm512_shuffle_f32x4(tmp11946, tmp11950, 221);
__m512 tmp11967 = _mm512_shuffle_f32x4(tmp11951, tmp11955, 136);
__m512 tmp11968 = _mm512_shuffle_f32x4(tmp11951, tmp11955, 221);
__m512 tmp11969 = _mm512_shuffle_f32x4(tmp11952, tmp11956, 136);
__m512 tmp11970 = _mm512_shuffle_f32x4(tmp11952, tmp11956, 221);
__m512 tmp11971 = _mm512_shuffle_f32x4(tmp11953, tmp11957, 136);
__m512 tmp11972 = _mm512_shuffle_f32x4(tmp11953, tmp11957, 221);
__m512 tmp11973 = _mm512_shuffle_f32x4(tmp11954, tmp11958, 136);
__m512 tmp11974 = _mm512_shuffle_f32x4(tmp11954, tmp11958, 221);
in1696 = _mm512_shuffle_f32x4(tmp11959, tmp11967, 136);
in1701 = _mm512_shuffle_f32x4(tmp11959, tmp11967, 221);
tmp11909 = _mm512_shuffle_f32x4(tmp11961, tmp11969, 136);
tmp11914 = _mm512_shuffle_f32x4(tmp11961, tmp11969, 221);
tmp11910 = _mm512_shuffle_f32x4(tmp11963, tmp11971, 136);
tmp11915 = _mm512_shuffle_f32x4(tmp11963, tmp11971, 221);
tmp11911 = _mm512_shuffle_f32x4(tmp11965, tmp11973, 136);
tmp11916 = _mm512_shuffle_f32x4(tmp11965, tmp11973, 221);
tmp11908 = _mm512_shuffle_f32x4(tmp11960, tmp11968, 136);
tmp11913 = _mm512_shuffle_f32x4(tmp11960, tmp11968, 221);
in1698 = _mm512_shuffle_f32x4(tmp11962, tmp11970, 136);
in1700 = _mm512_shuffle_f32x4(tmp11964, tmp11972, 136);
in1699 = _mm512_shuffle_f32x4(tmp11966, tmp11974, 136);
(void)in1698;
(void)in1700;
(void)in1699;
__m512 tmp11917 = tmp11909;
__m512 tmp11922 = tmp11914;
__m512 tmp11918 = _mm512_sub_ps(tmp11908, tmp11910);
__m512 tmp11923 = _mm512_sub_ps(tmp11913, tmp11915);
__m512 tmp11919 = tmp11910;
__m512 tmp11924 = tmp11915;
in1696 = in1696;
in1701 = in1701;
tmp11917 = _mm512_fmadd_ps(tmp11911, _mm512_set1_ps(-4.25e+00f), tmp11917);
tmp11922 = _mm512_fmadd_ps(tmp11916, _mm512_set1_ps(-4.25e+00f), tmp11922);
tmp11919 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(-4.25e+00f), tmp11919);
tmp11924 = _mm512_fmadd_ps(tmp11913, _mm512_set1_ps(-4.25e+00f), tmp11924);
in1696 = _mm512_fmadd_ps(tmp11918, _mm512_set1_ps(5.25e+00f), in1696);
in1701 = _mm512_fmadd_ps(tmp11923, _mm512_set1_ps(5.25e+00f), in1701);
tmp11918 = _mm512_mul_ps(tmp11910, _mm512_set1_ps(2.5e-01f));
tmp11923 = _mm512_mul_ps(tmp11915, _mm512_set1_ps(2.5e-01f));
tmp11910 = _mm512_mul_ps(tmp11910, _mm512_set1_ps(4e+00f));
tmp11915 = _mm512_mul_ps(tmp11915, _mm512_set1_ps(4e+00f));
__m512 tmp11920 = _mm512_sub_ps(tmp11919, tmp11917);
__m512 tmp11925 = _mm512_sub_ps(tmp11924, tmp11922);
tmp11919 = _mm512_add_ps(tmp11917, tmp11919);
tmp11924 = _mm512_add_ps(tmp11922, tmp11924);
tmp11917 = _mm512_mul_ps(tmp11909, _mm512_set1_ps(2.5e-01f));
tmp11922 = _mm512_mul_ps(tmp11914, _mm512_set1_ps(2.5e-01f));
tmp11918 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(-1.25e+00f), tmp11918);
tmp11923 = _mm512_fmadd_ps(tmp11913, _mm512_set1_ps(-1.25e+00f), tmp11923);
tmp11908 = _mm512_fmadd_ps(tmp11908, _mm512_set1_ps(-5e+00f), tmp11910);
tmp11913 = _mm512_fmadd_ps(tmp11913, _mm512_set1_ps(-5e+00f), tmp11915);
tmp11917 = _mm512_fmadd_ps(tmp11911, _mm512_set1_ps(-1.25e+00f), tmp11917);
tmp11922 = _mm512_fmadd_ps(tmp11916, _mm512_set1_ps(-1.25e+00f), tmp11922);
__m512 tmp11921 = _mm512_fmadd_ps(tmp11917, _mm512_set1_ps(2e+00f), tmp11918);
__m512 tmp11926 = _mm512_fmadd_ps(tmp11922, _mm512_set1_ps(2e+00f), tmp11923);
tmp11918 = _mm512_fnmadd_ps(tmp11917, _mm512_set1_ps(2e+00f), tmp11918);
tmp11923 = _mm512_fnmadd_ps(tmp11922, _mm512_set1_ps(2e+00f), tmp11923);
tmp11917 = tmp11909;
tmp11922 = tmp11914;
tmp11909 = _mm512_sub_ps(_mm512_setzero_ps(), tmp11909);
tmp11914 = _mm512_sub_ps(_mm512_setzero_ps(), tmp11914);
tmp11917 = _mm512_fmadd_ps(tmp11911, _mm512_set1_ps(-1.25e+00f), tmp11917);
tmp11922 = _mm512_fmadd_ps(tmp11916, _mm512_set1_ps(-1.25e+00f), tmp11922);
tmp11911 = tmp11911;
tmp11916 = tmp11916;
tmp11911 = _mm512_fmadd_ps(tmp11911, _mm512_set1_ps(5.25e+00f), tmp11909);
tmp11916 = _mm512_fmadd_ps(tmp11916, _mm512_set1_ps(5.25e+00f), tmp11914);
tmp11910 = _mm512_fmadd_ps(tmp11917, _mm512_set1_ps(2e+00f), tmp11908);
tmp11915 = _mm512_fmadd_ps(tmp11922, _mm512_set1_ps(2e+00f), tmp11913);
tmp11908 = _mm512_fnmadd_ps(tmp11917, _mm512_set1_ps(2e+00f), tmp11908);
tmp11913 = _mm512_fnmadd_ps(tmp11922, _mm512_set1_ps(2e+00f), tmp11913);
__m512 out1583 = _mm512_shuffle_f32x4(in1696, tmp11919, 68);
__m512 out1591 = _mm512_shuffle_f32x4(in1696, tmp11919, 238);
__m512 out1584 = _mm512_shuffle_f32x4(tmp11920, tmp11921, 68);
__m512 out1592 = _mm512_shuffle_f32x4(tmp11920, tmp11921, 238);
__m512 out1585 = _mm512_shuffle_f32x4(tmp11918, tmp11910, 68);
__m512 out1593 = _mm512_shuffle_f32x4(tmp11918, tmp11910, 238);
__m512 out1586 = _mm512_shuffle_f32x4(tmp11908, tmp11911, 68);
__m512 out1594 = _mm512_shuffle_f32x4(tmp11908, tmp11911, 238);
__m512 out1587 = _mm512_shuffle_f32x4(in1701, tmp11924, 68);
__m512 out1595 = _mm512_shuffle_f32x4(in1701, tmp11924, 238);
__m512 out1588 = _mm512_shuffle_f32x4(tmp11925, tmp11926, 68);
__m512 out1596 = _mm512_shuffle_f32x4(tmp11925, tmp11926, 238);
__m512 out1589 = _mm512_shuffle_f32x4(tmp11923, tmp11915, 68);
__m512 out1597 = _mm512_shuffle_f32x4(tmp11923, tmp11915, 238);
__m512 out1590 = _mm512_shuffle_f32x4(tmp11913, tmp11916, 68);
__m512 out1598 = _mm512_shuffle_f32x4(tmp11913, tmp11916, 238);
_mm512_storeu_ps(dfPtr11+0+51200*i49+3072*j42+512*s46+256*k134, out1583);
_mm512_storeu_ps(dfPtr11+128+51200*i49+3072*j42+512*s46+256*k134, out1591);
_mm512_storeu_ps(dfPtr11+64+51200*i49+3072*j42+512*s46+256*k134, out1587);
_mm512_storeu_ps(dfPtr11+192+51200*i49+3072*j42+512*s46+256*k134, out1595);
_mm512_storeu_ps(dfPtr11+12800+51200*i49+3072*j42+512*s46+256*k134, out1584);
_mm512_storeu_ps(dfPtr11+12928+51200*i49+3072*j42+512*s46+256*k134, out1592);
_mm512_storeu_ps(dfPtr11+12864+51200*i49+3072*j42+512*s46+256*k134, out1588);
_mm512_storeu_ps(dfPtr11+12992+51200*i49+3072*j42+512*s46+256*k134, out1596);
_mm512_storeu_ps(dfPtr11+25600+51200*i49+3072*j42+512*s46+256*k134, out1585);
_mm512_storeu_ps(dfPtr11+25728+51200*i49+3072*j42+512*s46+256*k134, out1593);
_mm512_storeu_ps(dfPtr11+25664+51200*i49+3072*j42+512*s46+256*k134, out1589);
_mm512_storeu_ps(dfPtr11+25792+51200*i49+3072*j42+512*s46+256*k134, out1597);
_mm512_storeu_ps(dfPtr11+38400+51200*i49+3072*j42+512*s46+256*k134, out1586);
_mm512_storeu_ps(dfPtr11+38528+51200*i49+3072*j42+512*s46+256*k134, out1594);
_mm512_storeu_ps(dfPtr11+38464+51200*i49+3072*j42+512*s46+256*k134, out1590);
_mm512_storeu_ps(dfPtr11+38592+51200*i49+3072*j42+512*s46+256*k134, out1598);
}
++j42;
}
}

/*
 * Builds a 4-dimensional task whose callee is
 * ResNeXt50ThreeArrangeDats3Callee1 and whose opaque argument is tensors79,
 * then hands it to ResNeXt50ThreaderDo1 on team53. The hull extents are
 * 1 x 1 x 10 x 1 (only axis 2 is wider than one).
 * NOTE(review): presumably the threader invokes the callee once per hull
 * point -- confirm against ResNeXt50ThreaderDo1.
 */
static void ResNeXt50ThreeArrangeDats3(ResNeXt50ThreaderTeam1* team53, char** tensors79) {
    ResNeXt50ThreaderTask1 job;
    int axis;
    job.nd1 = 4;
    /* Every axis has extent 1 except axis 2, which has extent 10. */
    for (axis = 0; axis != 4; ++axis) {
        job.hull1[axis] = 1;
    }
    job.hull1[2] = 10;
    job.any1 = tensors79;
    job.callee1 = ResNeXt50ThreeArrangeDats3Callee1;
    ResNeXt50ThreaderDo1(team53, &job);
}

/*
 * Threader callee that fills the sums buffer for one task point.
 *
 * task84->any1 is read as an array of pointers; element 0 is the tensor
 * table. From that table:
 *   entry 0: fp32 biases starting at byte 0, fp16 weights starting at byte 1024
 *   entry 1: data buffer (read only here)
 *   entry 2: sums buffer (written here)
 * pt47[3] selects the pair of consecutive i indices 2*pt47[3] and
 * 2*pt47[3]+1 processed by this call.
 *
 * For every (i, j) the data is consumed as four blocks of six 64-byte
 * vectors (k = 0..3) followed by one trailing block of a single vector.
 * Each block is multiplied against four weight vectors per l (l = 0..1) and
 * accumulated over eight b steps with fused multiply-adds.
 * When j == 0 the accumulators start from the bias placed in lane 9 only
 * (mask 512 == 1 << 9); otherwise they start from zero.
 */
static void ResNeXt50ThreeProduceSums3Callee1(ResNeXt50ThreaderTask1* task84, int64_t* pt47) {
    void** argPair = task84->any1;
    char** tensorTab = argPair[0];
    char*restrict biasBuf = tensorTab[0];
    char*restrict weightBuf = tensorTab[0]+1024;
    char*restrict dataBuf = tensorTab[1];
    char*restrict sumBuf = tensorTab[2];
    const ptrdiff_t taskIdx = pt47[3];
    const ptrdiff_t firstI = 2*taskIdx;
    const ptrdiff_t lastI = firstI+1;
    ptrdiff_t i;
    for (i = firstI; i <= lastI; ++i) {
        ptrdiff_t j;
        for (j = 0; j <= 3; ++j) {
            /* Byte offsets shared by every access for this (i, j). */
            const ptrdiff_t wtOff = 8192*i+2048*j;
            const ptrdiff_t ioOff = 51200*i+12800*j;
            ptrdiff_t k = 0;
            ptrdiff_t l;
            /* Full-width blocks: 4 weight vectors x 6 data vectors. */
            for (; k != 4; ++k) {
                for (l = 0; l != 2; ++l) {
                    const __m512 zero = _mm512_setzero_ps();
                    __m512 seed0 = zero;
                    __m512 seed1 = zero;
                    __m512 seed2 = zero;
                    __m512 seed3 = zero;
                    if (__builtin_expect(!j, 0)) {
                        /* Four consecutive fp32 biases, one per weight row. */
                        const float* bias = (const float*)(biasBuf+32*i+16*l);
                        seed0 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[0]));
                        seed1 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[1]));
                        seed2 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[2]));
                        seed3 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[3]));
                    }
                    /* sRC: accumulator for weight row R and data column C. */
                    __m512 s00 = seed0, s01 = seed0, s02 = seed0, s03 = seed0, s04 = seed0, s05 = seed0;
                    __m512 s10 = seed1, s11 = seed1, s12 = seed1, s13 = seed1, s14 = seed1, s15 = seed1;
                    __m512 s20 = seed2, s21 = seed2, s22 = seed2, s23 = seed2, s24 = seed2, s25 = seed2;
                    __m512 s30 = seed3, s31 = seed3, s32 = seed3, s33 = seed3, s34 = seed3, s35 = seed3;
                    ptrdiff_t b;
                    for (b = 0; b != 8; ++b) {
                        const char* wt = weightBuf+wtOff+1024*l+128*b;
                        const char* dat = dataBuf+ioOff+3072*k+384*b;
                        /* Each 64-byte load carries two rows of 16 fp16 weights. */
                        const __m512i packedLo = _mm512_maskz_loadu_epi32(65535, wt);
                        const __m512i packedHi = _mm512_maskz_loadu_epi32(65535, wt+64);
                        const __m512 x0 = _mm512_loadu_ps(dat);
                        const __m512 x1 = _mm512_loadu_ps(dat+64);
                        const __m512 x2 = _mm512_loadu_ps(dat+128);
                        const __m512 x3 = _mm512_loadu_ps(dat+192);
                        const __m512 x4 = _mm512_loadu_ps(dat+256);
                        const __m512 x5 = _mm512_loadu_ps(dat+320);
                        const __m512 w0 = _mm512_cvtph_ps(_mm512_castsi512_si256(packedLo));
                        s00 = _mm512_fmadd_ps(w0, x0, s00);
                        s01 = _mm512_fmadd_ps(w0, x1, s01);
                        s02 = _mm512_fmadd_ps(w0, x2, s02);
                        s03 = _mm512_fmadd_ps(w0, x3, s03);
                        s04 = _mm512_fmadd_ps(w0, x4, s04);
                        s05 = _mm512_fmadd_ps(w0, x5, s05);
                        const __m512 w1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedLo, 1));
                        s10 = _mm512_fmadd_ps(w1, x0, s10);
                        s11 = _mm512_fmadd_ps(w1, x1, s11);
                        s12 = _mm512_fmadd_ps(w1, x2, s12);
                        s13 = _mm512_fmadd_ps(w1, x3, s13);
                        s14 = _mm512_fmadd_ps(w1, x4, s14);
                        s15 = _mm512_fmadd_ps(w1, x5, s15);
                        const __m512 w2 = _mm512_cvtph_ps(_mm512_castsi512_si256(packedHi));
                        s20 = _mm512_fmadd_ps(w2, x0, s20);
                        s21 = _mm512_fmadd_ps(w2, x1, s21);
                        s22 = _mm512_fmadd_ps(w2, x2, s22);
                        s23 = _mm512_fmadd_ps(w2, x3, s23);
                        s24 = _mm512_fmadd_ps(w2, x4, s24);
                        s25 = _mm512_fmadd_ps(w2, x5, s25);
                        const __m512 w3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedHi, 1));
                        s30 = _mm512_fmadd_ps(w3, x0, s30);
                        s31 = _mm512_fmadd_ps(w3, x1, s31);
                        s32 = _mm512_fmadd_ps(w3, x2, s32);
                        s33 = _mm512_fmadd_ps(w3, x3, s33);
                        s34 = _mm512_fmadd_ps(w3, x4, s34);
                        s35 = _mm512_fmadd_ps(w3, x5, s35);
                    }
                    /* Rows are written back to back: 6 vectors (384 bytes) per row. */
                    char* out = sumBuf+ioOff+3072*k+1536*l;
                    _mm512_storeu_ps(out+0, s00);
                    _mm512_storeu_ps(out+64, s01);
                    _mm512_storeu_ps(out+128, s02);
                    _mm512_storeu_ps(out+192, s03);
                    _mm512_storeu_ps(out+256, s04);
                    _mm512_storeu_ps(out+320, s05);
                    _mm512_storeu_ps(out+384, s10);
                    _mm512_storeu_ps(out+448, s11);
                    _mm512_storeu_ps(out+512, s12);
                    _mm512_storeu_ps(out+576, s13);
                    _mm512_storeu_ps(out+640, s14);
                    _mm512_storeu_ps(out+704, s15);
                    _mm512_storeu_ps(out+768, s20);
                    _mm512_storeu_ps(out+832, s21);
                    _mm512_storeu_ps(out+896, s22);
                    _mm512_storeu_ps(out+960, s23);
                    _mm512_storeu_ps(out+1024, s24);
                    _mm512_storeu_ps(out+1088, s25);
                    _mm512_storeu_ps(out+1152, s30);
                    _mm512_storeu_ps(out+1216, s31);
                    _mm512_storeu_ps(out+1280, s32);
                    _mm512_storeu_ps(out+1344, s33);
                    _mm512_storeu_ps(out+1408, s34);
                    _mm512_storeu_ps(out+1472, s35);
                }
            }
            /* Trailing block (k == 4 here): one data vector per b step. */
            for (l = 0; l != 2; ++l) {
                const __m512 zero = _mm512_setzero_ps();
                __m512 t0 = zero;
                __m512 t1 = zero;
                __m512 t2 = zero;
                __m512 t3 = zero;
                if (__builtin_expect(!j, 0)) {
                    const float* bias = (const float*)(biasBuf+32*i+16*l);
                    t0 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[0]));
                    t1 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[1]));
                    t2 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[2]));
                    t3 = _mm512_mask_mov_ps(zero, 512, _mm512_set1_ps(bias[3]));
                }
                ptrdiff_t b;
                for (b = 0; b != 8; ++b) {
                    const char* wt = weightBuf+wtOff+1024*l+128*b;
                    const __m512i packedLo = _mm512_maskz_loadu_epi32(65535, wt);
                    const __m512i packedHi = _mm512_maskz_loadu_epi32(65535, wt+64);
                    const __m512 x = _mm512_loadu_ps(dataBuf+ioOff+3072*k+64*b);
                    t0 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_castsi512_si256(packedLo)), x, t0);
                    t1 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedLo, 1)), x, t1);
                    t2 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_castsi512_si256(packedHi)), x, t2);
                    t3 = _mm512_fmadd_ps(_mm512_cvtph_ps(_mm512_extracti64x4_epi64(packedHi, 1)), x, t3);
                }
                char* out = sumBuf+ioOff+3072*k+256*l;
                _mm512_storeu_ps(out+0, t0);
                _mm512_storeu_ps(out+64, t1);
                _mm512_storeu_ps(out+128, t2);
                _mm512_storeu_ps(out+192, t3);
            }
        }
    }
}

/*
 * Wraps tensors81 in a two-slot pointer array (second slot is null), builds
 * a 4-dimensional task whose callee is ResNeXt50ThreeProduceSums3Callee1,
 * and submits it to ResNeXt50ThreaderDo1 on team54. The hull extents are
 * 1 x 1 x 1 x 16 (only axis 3 is wider than one).
 * NOTE(review): presumably the threader invokes the callee once per hull
 * point -- confirm against ResNeXt50ThreaderDo1.
 */
static void ResNeXt50ThreeProduceSums3(ResNeXt50ThreaderTeam1* team54, char** tensors81) {
    void* argPair[] = {tensors81, 0};
    ResNeXt50ThreaderTask1 job;
    int axis;
    job.nd1 = 4;
    /* Axes 0..2 have extent 1; axis 3 has extent 16. */
    for (axis = 0; axis != 3; ++axis) {
        job.hull1[axis] = 1;
    }
    job.hull1[3] = 16;
    job.any1 = argPair;
    job.callee1 = ResNeXt50ThreeProduceSums3Callee1;
    ResNeXt50ThreaderDo1(team54, &job);
}

static void ResNeXt50ThreeConsumeSums3Callee1(ResNeXt50ThreaderTask1* task86, int64_t* pt48) {
char** tensors84 = task86->any1;
ptrdiff_t w62 = 0;
ptrdiff_t d17 = 0;
ptrdiff_t g28 = pt48[2];
char*restrict sfPtr12 = tensors84[0];
char*restrict datPtr26 = tensors84[1];
ptrdiff_t i51 = 3*g28;
ptrdiff_t ii37 = i51+(g28 < 9 ? 2 : 4);
for (; i51 <= ii37; ++i51) {
ptrdiff_t j44 = 5*d17;
ptrdiff_t rel22 = j44-0;
ptrdiff_t base22 = 0;
if (rel22 < 2) {
if (rel22 < 1) {
ptrdiff_t toH41 = base22+0;
ptrdiff_t toW41 = 0;
ptrdiff_t k136 = 2*w62;
for (; k136 != 2; ++k136) {
ptrdiff_t l55 = 0;
for (; l55 != 2; ++l55) {
__m512 sf801 = _mm512_loadu_ps(sfPtr12+0+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf802 = _mm512_loadu_ps(sfPtr12+128+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1706 = _mm512_shuffle_f32x4(sf801, sf802, 68);
__m512 in1707 = _mm512_shuffle_f32x4(sf801, sf802, 238);
__m512 sf803 = _mm512_loadu_ps(sfPtr12+64+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf804 = _mm512_loadu_ps(sfPtr12+192+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1714 = _mm512_shuffle_f32x4(sf803, sf804, 68);
__m512 in1715 = _mm512_shuffle_f32x4(sf803, sf804, 238);
__m512 sf805 = _mm512_loadu_ps(sfPtr12+12800+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf806 = _mm512_loadu_ps(sfPtr12+12928+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1708 = _mm512_shuffle_f32x4(sf805, sf806, 68);
__m512 in1709 = _mm512_shuffle_f32x4(sf805, sf806, 238);
__m512 sf807 = _mm512_loadu_ps(sfPtr12+12864+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf808 = _mm512_loadu_ps(sfPtr12+12992+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1716 = _mm512_shuffle_f32x4(sf807, sf808, 68);
__m512 in1717 = _mm512_shuffle_f32x4(sf807, sf808, 238);
__m512 sf809 = _mm512_loadu_ps(sfPtr12+25600+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf810 = _mm512_loadu_ps(sfPtr12+25728+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1710 = _mm512_shuffle_f32x4(sf809, sf810, 68);
__m512 in1711 = _mm512_shuffle_f32x4(sf809, sf810, 238);
__m512 sf811 = _mm512_loadu_ps(sfPtr12+25664+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf812 = _mm512_loadu_ps(sfPtr12+25792+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1718 = _mm512_shuffle_f32x4(sf811, sf812, 68);
__m512 in1719 = _mm512_shuffle_f32x4(sf811, sf812, 238);
__m512 sf813 = _mm512_loadu_ps(sfPtr12+38400+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf814 = _mm512_loadu_ps(sfPtr12+38528+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1712 = _mm512_shuffle_f32x4(sf813, sf814, 68);
__m512 in1713 = _mm512_shuffle_f32x4(sf813, sf814, 238);
__m512 sf815 = _mm512_loadu_ps(sfPtr12+38464+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf816 = _mm512_loadu_ps(sfPtr12+38592+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1720 = _mm512_shuffle_f32x4(sf815, sf816, 68);
__m512 in1721 = _mm512_shuffle_f32x4(sf815, sf816, 238);
__m512 tmp11991 = _mm512_add_ps(in1707, in1708);
__m512 tmp12011 = _mm512_add_ps(in1715, in1716);
__m512 tmp11990 = _mm512_add_ps(in1709, in1710);
__m512 tmp12010 = _mm512_add_ps(in1717, in1718);
__m512 tmp11996 = _mm512_sub_ps(in1709, in1710);
__m512 tmp12016 = _mm512_sub_ps(in1717, in1718);
__m512 tmp11995 = _mm512_sub_ps(in1707, in1708);
__m512 tmp12015 = _mm512_sub_ps(in1715, in1716);
__m512 tmp11992 = _mm512_add_ps(in1711, in1712);
__m512 tmp12012 = _mm512_add_ps(in1719, in1720);
__m512 tmp11997 = _mm512_sub_ps(in1711, in1712);
__m512 tmp12017 = _mm512_sub_ps(in1719, in1720);
__m512 tmp11994 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(2e+00f), tmp11995);
__m512 tmp12014 = _mm512_fmadd_ps(tmp12016, _mm512_set1_ps(2e+00f), tmp12015);
__m512 tmp12001 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(8e+00f), tmp11995);
__m512 tmp12021 = _mm512_fmadd_ps(tmp12016, _mm512_set1_ps(8e+00f), tmp12015);
__m512 tmp11989 = _mm512_add_ps(tmp11990, tmp11991);
__m512 tmp12009 = _mm512_add_ps(tmp12010, tmp12011);
__m512 tmp11993 = _mm512_fmadd_ps(tmp11997, _mm512_set1_ps(1.6e+01f), tmp11994);
__m512 tmp12013 = _mm512_fmadd_ps(tmp12017, _mm512_set1_ps(1.6e+01f), tmp12014);
__m512 tmp12000 = _mm512_fmadd_ps(tmp11997, _mm512_set1_ps(4e+00f), tmp12001);
__m512 tmp12020 = _mm512_fmadd_ps(tmp12017, _mm512_set1_ps(4e+00f), tmp12021);
__m512 tmp12006 = _mm512_add_ps(tmp11997, tmp11995);
__m512 tmp12026 = _mm512_add_ps(tmp12017, tmp12015);
__m512 tmp11999 = _mm512_fmadd_ps(tmp11990, _mm512_set1_ps(4e+00f), tmp11991);
__m512 tmp12019 = _mm512_fmadd_ps(tmp12010, _mm512_set1_ps(4e+00f), tmp12011);
__m512 tmp12003 = _mm512_fmadd_ps(tmp11990, _mm512_set1_ps(1.6e+01f), tmp11991);
__m512 tmp12023 = _mm512_fmadd_ps(tmp12010, _mm512_set1_ps(1.6e+01f), tmp12011);
__m512 tmp11988 = _mm512_add_ps(tmp11989, in1706);
__m512 tmp12008 = _mm512_add_ps(tmp12009, in1714);
__m512 tmp12005 = _mm512_add_ps(tmp12006, in1713);
__m512 tmp12025 = _mm512_add_ps(tmp12026, in1721);
__m512 tmp11987 = _mm512_fmadd_ps(tmp11992, _mm512_set1_ps(3.2e+01f), tmp11988);
__m512 tmp12007 = _mm512_fmadd_ps(tmp12012, _mm512_set1_ps(3.2e+01f), tmp12008);
__m512 tmp11998 = _mm512_fmadd_ps(tmp11992, _mm512_set1_ps(8e+00f), tmp11999);
__m512 tmp12018 = _mm512_fmadd_ps(tmp12012, _mm512_set1_ps(8e+00f), tmp12019);
__m512 tmp12004 = _mm512_fmadd_ps(tmp11996, _mm512_set1_ps(3.2e+01f), tmp12005);
__m512 tmp12024 = _mm512_fmadd_ps(tmp12016, _mm512_set1_ps(3.2e+01f), tmp12025);
__m512 tmp12002 = _mm512_fmadd_ps(tmp11992, _mm512_set1_ps(2e+00f), tmp12003);
__m512 tmp12022 = _mm512_fmadd_ps(tmp12012, _mm512_set1_ps(2e+00f), tmp12023);
__m512 tmp11975 = tmp11987;
__m512 tmp11981 = tmp12007;
__m512 tmp11976 = tmp11993;
__m512 tmp11982 = tmp12013;
__m512 tmp11977 = tmp11998;
__m512 tmp11983 = tmp12018;
__m512 tmp11978 = tmp12000;
__m512 tmp11984 = tmp12020;
__m512 tmp11979 = tmp12002;
__m512 tmp11985 = tmp12022;
__m512 tmp11980 = tmp12004;
__m512 tmp11986 = tmp12024;
__m512 tmp12071 = _mm512_unpacklo_ps(tmp11975, tmp11976);
__m512 tmp12072 = _mm512_unpackhi_ps(tmp11975, tmp11976);
__m512 tmp12073 = _mm512_unpacklo_ps(tmp11977, tmp11978);
__m512 tmp12074 = _mm512_unpackhi_ps(tmp11977, tmp11978);
__m512 tmp12075 = _mm512_unpacklo_ps(tmp11979, tmp11980);
__m512 tmp12076 = _mm512_unpackhi_ps(tmp11979, tmp11980);
__m512 tmp12077 = _mm512_unpacklo_ps(tmp11981, tmp11982);
__m512 tmp12078 = _mm512_unpackhi_ps(tmp11981, tmp11982);
__m512 tmp12079 = _mm512_unpacklo_ps(tmp11983, tmp11984);
__m512 tmp12080 = _mm512_unpackhi_ps(tmp11983, tmp11984);
__m512 tmp12081 = _mm512_unpacklo_ps(tmp11985, tmp11986);
__m512 tmp12082 = _mm512_unpackhi_ps(tmp11985, tmp11986);
__m512 tmp12083 = _mm512_shuffle_ps(tmp12071, tmp12073, 68);
__m512 tmp12084 = _mm512_shuffle_ps(tmp12071, tmp12073, 238);
__m512 tmp12085 = _mm512_shuffle_ps(tmp12072, tmp12074, 68);
__m512 tmp12086 = _mm512_shuffle_ps(tmp12072, tmp12074, 238);
__m512 tmp12087 = _mm512_shuffle_ps(tmp12075, tmp12077, 68);
__m512 tmp12088 = _mm512_shuffle_ps(tmp12075, tmp12077, 238);
__m512 tmp12089 = _mm512_shuffle_ps(tmp12076, tmp12078, 68);
__m512 tmp12090 = _mm512_shuffle_ps(tmp12076, tmp12078, 238);
__m512 tmp12091 = _mm512_shuffle_ps(tmp12079, tmp12081, 68);
__m512 tmp12092 = _mm512_shuffle_ps(tmp12079, tmp12081, 238);
__m512 tmp12093 = _mm512_shuffle_ps(tmp12080, tmp12082, 68);
__m512 tmp12094 = _mm512_shuffle_ps(tmp12080, tmp12082, 238);
__m512 tmp12095 = _mm512_shuffle_f32x4(tmp12083, tmp12087, 136);
__m512 tmp12096 = _mm512_shuffle_f32x4(tmp12083, tmp12087, 221);
__m512 tmp12097 = _mm512_shuffle_f32x4(tmp12084, tmp12088, 136);
__m512 tmp12098 = _mm512_shuffle_f32x4(tmp12084, tmp12088, 221);
__m512 tmp12099 = _mm512_shuffle_f32x4(tmp12085, tmp12089, 136);
__m512 tmp12100 = _mm512_shuffle_f32x4(tmp12085, tmp12089, 221);
__m512 tmp12101 = _mm512_shuffle_f32x4(tmp12086, tmp12090, 136);
__m512 tmp12102 = _mm512_shuffle_f32x4(tmp12086, tmp12090, 221);
__m512 tmp12103 = _mm512_shuffle_f32x4(tmp12091, tmp12091, 136);
__m512 tmp12104 = _mm512_shuffle_f32x4(tmp12091, tmp12091, 221);
__m512 tmp12105 = _mm512_shuffle_f32x4(tmp12092, tmp12092, 136);
__m512 tmp12106 = _mm512_shuffle_f32x4(tmp12092, tmp12092, 221);
__m512 tmp12107 = _mm512_shuffle_f32x4(tmp12093, tmp12093, 136);
__m512 tmp12108 = _mm512_shuffle_f32x4(tmp12093, tmp12093, 221);
__m512 tmp12109 = _mm512_shuffle_f32x4(tmp12094, tmp12094, 136);
__m512 tmp12110 = _mm512_shuffle_f32x4(tmp12094, tmp12094, 221);
tmp11975 = _mm512_shuffle_f32x4(tmp12095, tmp12103, 136);
tmp11983 = _mm512_shuffle_f32x4(tmp12095, tmp12103, 221);
tmp11976 = _mm512_shuffle_f32x4(tmp12097, tmp12105, 136);
tmp11984 = _mm512_shuffle_f32x4(tmp12097, tmp12105, 221);
tmp11977 = _mm512_shuffle_f32x4(tmp12099, tmp12107, 136);
tmp11985 = _mm512_shuffle_f32x4(tmp12099, tmp12107, 221);
tmp11978 = _mm512_shuffle_f32x4(tmp12101, tmp12109, 136);
tmp11986 = _mm512_shuffle_f32x4(tmp12101, tmp12109, 221);
tmp11979 = _mm512_shuffle_f32x4(tmp12096, tmp12104, 136);
__m512 tmp12027 = _mm512_shuffle_f32x4(tmp12096, tmp12104, 221);
tmp11980 = _mm512_shuffle_f32x4(tmp12098, tmp12106, 136);
__m512 tmp12028 = _mm512_shuffle_f32x4(tmp12098, tmp12106, 221);
tmp11981 = _mm512_shuffle_f32x4(tmp12100, tmp12108, 136);
__m512 tmp12029 = _mm512_shuffle_f32x4(tmp12100, tmp12108, 221);
tmp11982 = _mm512_shuffle_f32x4(tmp12102, tmp12110, 136);
__m512 tmp12030 = _mm512_shuffle_f32x4(tmp12102, tmp12110, 221);
__m512 tmp12035 = _mm512_add_ps(tmp11976, tmp11977);
__m512 tmp12055 = _mm512_add_ps(tmp11984, tmp11985);
__m512 tmp12034 = _mm512_add_ps(tmp11978, tmp11979);
__m512 tmp12054 = _mm512_add_ps(tmp11986, tmp12027);
__m512 tmp12040 = _mm512_sub_ps(tmp11978, tmp11979);
__m512 tmp12060 = _mm512_sub_ps(tmp11986, tmp12027);
__m512 tmp12039 = _mm512_sub_ps(tmp11976, tmp11977);
__m512 tmp12059 = _mm512_sub_ps(tmp11984, tmp11985);
__m512 tmp12036 = _mm512_add_ps(tmp11980, tmp11981);
__m512 tmp12056 = _mm512_add_ps(tmp12028, tmp12029);
__m512 tmp12041 = _mm512_sub_ps(tmp11980, tmp11981);
__m512 tmp12061 = _mm512_sub_ps(tmp12028, tmp12029);
__m512 tmp12038 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(2e+00f), tmp12039);
__m512 tmp12058 = _mm512_fmadd_ps(tmp12060, _mm512_set1_ps(2e+00f), tmp12059);
__m512 tmp12045 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(8e+00f), tmp12039);
__m512 tmp12065 = _mm512_fmadd_ps(tmp12060, _mm512_set1_ps(8e+00f), tmp12059);
__m512 tmp12033 = _mm512_add_ps(tmp12034, tmp12035);
__m512 tmp12053 = _mm512_add_ps(tmp12054, tmp12055);
__m512 tmp12037 = _mm512_fmadd_ps(tmp12041, _mm512_set1_ps(1.6e+01f), tmp12038);
__m512 tmp12057 = _mm512_fmadd_ps(tmp12061, _mm512_set1_ps(1.6e+01f), tmp12058);
__m512 tmp12044 = _mm512_fmadd_ps(tmp12041, _mm512_set1_ps(4e+00f), tmp12045);
__m512 tmp12064 = _mm512_fmadd_ps(tmp12061, _mm512_set1_ps(4e+00f), tmp12065);
__m512 tmp12050 = _mm512_add_ps(tmp12041, tmp12039);
__m512 tmp12070 = _mm512_add_ps(tmp12061, tmp12059);
__m512 tmp12043 = _mm512_fmadd_ps(tmp12034, _mm512_set1_ps(4e+00f), tmp12035);
__m512 tmp12063 = _mm512_fmadd_ps(tmp12054, _mm512_set1_ps(4e+00f), tmp12055);
__m512 tmp12047 = _mm512_fmadd_ps(tmp12034, _mm512_set1_ps(1.6e+01f), tmp12035);
__m512 tmp12067 = _mm512_fmadd_ps(tmp12054, _mm512_set1_ps(1.6e+01f), tmp12055);
__m512 tmp12032 = _mm512_add_ps(tmp12033, tmp11975);
__m512 tmp12052 = _mm512_add_ps(tmp12053, tmp11983);
__m512 tmp12049 = _mm512_add_ps(tmp12050, tmp11982);
__m512 tmp12069 = _mm512_add_ps(tmp12070, tmp12030);
__m512 tmp12031 = _mm512_fmadd_ps(tmp12036, _mm512_set1_ps(3.2e+01f), tmp12032);
__m512 tmp12051 = _mm512_fmadd_ps(tmp12056, _mm512_set1_ps(3.2e+01f), tmp12052);
__m512 tmp12042 = _mm512_fmadd_ps(tmp12036, _mm512_set1_ps(8e+00f), tmp12043);
__m512 tmp12062 = _mm512_fmadd_ps(tmp12056, _mm512_set1_ps(8e+00f), tmp12063);
__m512 tmp12048 = _mm512_fmadd_ps(tmp12040, _mm512_set1_ps(3.2e+01f), tmp12049);
__m512 tmp12068 = _mm512_fmadd_ps(tmp12060, _mm512_set1_ps(3.2e+01f), tmp12069);
__m512 tmp12046 = _mm512_fmadd_ps(tmp12036, _mm512_set1_ps(2e+00f), tmp12047);
__m512 tmp12066 = _mm512_fmadd_ps(tmp12056, _mm512_set1_ps(2e+00f), tmp12067);
__m512 out1599 = tmp12031;
__m512 out1605 = tmp12051;
__m512 out1600 = tmp12037;
__m512 out1606 = tmp12057;
__m512 out1601 = tmp12042;
__m512 out1607 = tmp12062;
__m512 out1602 = tmp12044;
__m512 out1608 = tmp12064;
__m512 out1603 = tmp12046;
__m512 out1609 = tmp12066;
__m512 out1604 = tmp12048;
__m512 out1610 = tmp12068;
out1599 = _mm512_max_ps(_mm512_setzero_ps(), out1599);
out1605 = _mm512_max_ps(_mm512_setzero_ps(), out1605);
out1600 = _mm512_max_ps(_mm512_setzero_ps(), out1600);
out1606 = _mm512_max_ps(_mm512_setzero_ps(), out1606);
out1601 = _mm512_max_ps(_mm512_setzero_ps(), out1601);
out1607 = _mm512_max_ps(_mm512_setzero_ps(), out1607);
out1602 = _mm512_max_ps(_mm512_setzero_ps(), out1602);
out1608 = _mm512_max_ps(_mm512_setzero_ps(), out1608);
out1603 = _mm512_max_ps(_mm512_setzero_ps(), out1603);
out1609 = _mm512_max_ps(_mm512_setzero_ps(), out1609);
out1604 = _mm512_max_ps(_mm512_setzero_ps(), out1604);
out1610 = _mm512_max_ps(_mm512_setzero_ps(), out1610);
_mm512_mask_storeu_ps(datPtr26+0+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1599);
_mm512_mask_storeu_ps(datPtr26+48+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1605);
_mm512_mask_storeu_ps(datPtr26+112+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1600);
_mm512_mask_storeu_ps(datPtr26+160+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1606);
_mm512_mask_storeu_ps(datPtr26+224+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1601);
_mm512_mask_storeu_ps(datPtr26+272+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1607);
_mm512_mask_storeu_ps(datPtr26+336+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1602);
_mm512_mask_storeu_ps(datPtr26+384+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1608);
_mm512_mask_storeu_ps(datPtr26+448+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1603);
_mm512_mask_storeu_ps(datPtr26+496+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1609);
_mm512_mask_storeu_ps(datPtr26+560+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1604);
_mm512_mask_storeu_ps(datPtr26+608+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1610);
__m512 sf817 = _mm512_loadu_ps(sfPtr12+256+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf818 = _mm512_loadu_ps(sfPtr12+384+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1722 = _mm512_shuffle_f32x4(sf818, sf817, 68);
__m512 in1723 = _mm512_shuffle_f32x4(sf818, sf817, 238);
__m512 sf819 = _mm512_loadu_ps(sfPtr12+320+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf820 = _mm512_loadu_ps(sfPtr12+448+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1730 = _mm512_shuffle_f32x4(sf820, sf819, 68);
__m512 in1731 = _mm512_shuffle_f32x4(sf820, sf819, 238);
__m512 sf821 = _mm512_loadu_ps(sfPtr12+13056+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf822 = _mm512_loadu_ps(sfPtr12+13184+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1724 = _mm512_shuffle_f32x4(sf822, sf821, 68);
__m512 in1725 = _mm512_shuffle_f32x4(sf822, sf821, 238);
__m512 sf823 = _mm512_loadu_ps(sfPtr12+13120+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf824 = _mm512_loadu_ps(sfPtr12+13248+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1732 = _mm512_shuffle_f32x4(sf824, sf823, 68);
__m512 in1733 = _mm512_shuffle_f32x4(sf824, sf823, 238);
__m512 sf825 = _mm512_loadu_ps(sfPtr12+25856+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf826 = _mm512_loadu_ps(sfPtr12+25984+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1726 = _mm512_shuffle_f32x4(sf826, sf825, 68);
__m512 in1727 = _mm512_shuffle_f32x4(sf826, sf825, 238);
__m512 sf827 = _mm512_loadu_ps(sfPtr12+25920+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf828 = _mm512_loadu_ps(sfPtr12+26048+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1734 = _mm512_shuffle_f32x4(sf828, sf827, 68);
__m512 in1735 = _mm512_shuffle_f32x4(sf828, sf827, 238);
__m512 sf829 = _mm512_loadu_ps(sfPtr12+38656+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf830 = _mm512_loadu_ps(sfPtr12+38784+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1728 = _mm512_shuffle_f32x4(sf830, sf829, 68);
__m512 in1729 = _mm512_shuffle_f32x4(sf830, sf829, 238);
__m512 sf831 = _mm512_loadu_ps(sfPtr12+38720+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf832 = _mm512_loadu_ps(sfPtr12+38848+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1736 = _mm512_shuffle_f32x4(sf832, sf831, 68);
__m512 in1737 = _mm512_shuffle_f32x4(sf832, sf831, 238);
__m512 tmp12127 = _mm512_add_ps(in1723, in1724);
__m512 tmp12147 = _mm512_add_ps(in1731, in1732);
__m512 tmp12126 = _mm512_add_ps(in1725, in1726);
__m512 tmp12146 = _mm512_add_ps(in1733, in1734);
__m512 tmp12132 = _mm512_sub_ps(in1725, in1726);
__m512 tmp12152 = _mm512_sub_ps(in1733, in1734);
__m512 tmp12131 = _mm512_sub_ps(in1723, in1724);
__m512 tmp12151 = _mm512_sub_ps(in1731, in1732);
__m512 tmp12128 = _mm512_add_ps(in1727, in1728);
__m512 tmp12148 = _mm512_add_ps(in1735, in1736);
__m512 tmp12133 = _mm512_sub_ps(in1727, in1728);
__m512 tmp12153 = _mm512_sub_ps(in1735, in1736);
__m512 tmp12130 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(2e+00f), tmp12131);
__m512 tmp12150 = _mm512_fmadd_ps(tmp12152, _mm512_set1_ps(2e+00f), tmp12151);
__m512 tmp12137 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(8e+00f), tmp12131);
__m512 tmp12157 = _mm512_fmadd_ps(tmp12152, _mm512_set1_ps(8e+00f), tmp12151);
__m512 tmp12125 = _mm512_add_ps(tmp12126, tmp12127);
__m512 tmp12145 = _mm512_add_ps(tmp12146, tmp12147);
__m512 tmp12129 = _mm512_fmadd_ps(tmp12133, _mm512_set1_ps(1.6e+01f), tmp12130);
__m512 tmp12149 = _mm512_fmadd_ps(tmp12153, _mm512_set1_ps(1.6e+01f), tmp12150);
__m512 tmp12136 = _mm512_fmadd_ps(tmp12133, _mm512_set1_ps(4e+00f), tmp12137);
__m512 tmp12156 = _mm512_fmadd_ps(tmp12153, _mm512_set1_ps(4e+00f), tmp12157);
__m512 tmp12142 = _mm512_add_ps(tmp12133, tmp12131);
__m512 tmp12162 = _mm512_add_ps(tmp12153, tmp12151);
__m512 tmp12135 = _mm512_fmadd_ps(tmp12126, _mm512_set1_ps(4e+00f), tmp12127);
__m512 tmp12155 = _mm512_fmadd_ps(tmp12146, _mm512_set1_ps(4e+00f), tmp12147);
__m512 tmp12139 = _mm512_fmadd_ps(tmp12126, _mm512_set1_ps(1.6e+01f), tmp12127);
__m512 tmp12159 = _mm512_fmadd_ps(tmp12146, _mm512_set1_ps(1.6e+01f), tmp12147);
__m512 tmp12124 = _mm512_add_ps(tmp12125, in1722);
__m512 tmp12144 = _mm512_add_ps(tmp12145, in1730);
__m512 tmp12141 = _mm512_add_ps(tmp12142, in1729);
__m512 tmp12161 = _mm512_add_ps(tmp12162, in1737);
__m512 tmp12123 = _mm512_fmadd_ps(tmp12128, _mm512_set1_ps(3.2e+01f), tmp12124);
__m512 tmp12143 = _mm512_fmadd_ps(tmp12148, _mm512_set1_ps(3.2e+01f), tmp12144);
__m512 tmp12134 = _mm512_fmadd_ps(tmp12128, _mm512_set1_ps(8e+00f), tmp12135);
__m512 tmp12154 = _mm512_fmadd_ps(tmp12148, _mm512_set1_ps(8e+00f), tmp12155);
__m512 tmp12140 = _mm512_fmadd_ps(tmp12132, _mm512_set1_ps(3.2e+01f), tmp12141);
__m512 tmp12160 = _mm512_fmadd_ps(tmp12152, _mm512_set1_ps(3.2e+01f), tmp12161);
__m512 tmp12138 = _mm512_fmadd_ps(tmp12128, _mm512_set1_ps(2e+00f), tmp12139);
__m512 tmp12158 = _mm512_fmadd_ps(tmp12148, _mm512_set1_ps(2e+00f), tmp12159);
__m512 tmp12111 = tmp12123;
__m512 tmp12117 = tmp12143;
__m512 tmp12112 = tmp12129;
__m512 tmp12118 = tmp12149;
__m512 tmp12113 = tmp12134;
__m512 tmp12119 = tmp12154;
__m512 tmp12114 = tmp12136;
__m512 tmp12120 = tmp12156;
__m512 tmp12115 = tmp12138;
__m512 tmp12121 = tmp12158;
__m512 tmp12116 = tmp12140;
__m512 tmp12122 = tmp12160;
__m512 tmp12207 = _mm512_unpacklo_ps(tmp12111, tmp12112);
__m512 tmp12208 = _mm512_unpackhi_ps(tmp12111, tmp12112);
__m512 tmp12209 = _mm512_unpacklo_ps(tmp12113, tmp12114);
__m512 tmp12210 = _mm512_unpackhi_ps(tmp12113, tmp12114);
__m512 tmp12211 = _mm512_unpacklo_ps(tmp12115, tmp12116);
__m512 tmp12212 = _mm512_unpackhi_ps(tmp12115, tmp12116);
__m512 tmp12213 = _mm512_unpacklo_ps(tmp12117, tmp12118);
__m512 tmp12214 = _mm512_unpackhi_ps(tmp12117, tmp12118);
__m512 tmp12215 = _mm512_unpacklo_ps(tmp12119, tmp12120);
__m512 tmp12216 = _mm512_unpackhi_ps(tmp12119, tmp12120);
__m512 tmp12217 = _mm512_unpacklo_ps(tmp12121, tmp12122);
__m512 tmp12218 = _mm512_unpackhi_ps(tmp12121, tmp12122);
__m512 tmp12219 = _mm512_shuffle_ps(tmp12207, tmp12209, 68);
__m512 tmp12220 = _mm512_shuffle_ps(tmp12207, tmp12209, 238);
__m512 tmp12221 = _mm512_shuffle_ps(tmp12208, tmp12210, 68);
__m512 tmp12222 = _mm512_shuffle_ps(tmp12208, tmp12210, 238);
__m512 tmp12223 = _mm512_shuffle_ps(tmp12211, tmp12213, 68);
__m512 tmp12224 = _mm512_shuffle_ps(tmp12211, tmp12213, 238);
__m512 tmp12225 = _mm512_shuffle_ps(tmp12212, tmp12214, 68);
__m512 tmp12226 = _mm512_shuffle_ps(tmp12212, tmp12214, 238);
__m512 tmp12227 = _mm512_shuffle_ps(tmp12215, tmp12217, 68);
__m512 tmp12228 = _mm512_shuffle_ps(tmp12215, tmp12217, 238);
__m512 tmp12229 = _mm512_shuffle_ps(tmp12216, tmp12218, 68);
__m512 tmp12230 = _mm512_shuffle_ps(tmp12216, tmp12218, 238);
__m512 tmp12231 = _mm512_shuffle_f32x4(tmp12219, tmp12223, 136);
__m512 tmp12232 = _mm512_shuffle_f32x4(tmp12219, tmp12223, 221);
__m512 tmp12233 = _mm512_shuffle_f32x4(tmp12220, tmp12224, 136);
__m512 tmp12234 = _mm512_shuffle_f32x4(tmp12220, tmp12224, 221);
__m512 tmp12235 = _mm512_shuffle_f32x4(tmp12221, tmp12225, 136);
__m512 tmp12236 = _mm512_shuffle_f32x4(tmp12221, tmp12225, 221);
__m512 tmp12237 = _mm512_shuffle_f32x4(tmp12222, tmp12226, 136);
__m512 tmp12238 = _mm512_shuffle_f32x4(tmp12222, tmp12226, 221);
__m512 tmp12239 = _mm512_shuffle_f32x4(tmp12227, tmp12227, 136);
__m512 tmp12240 = _mm512_shuffle_f32x4(tmp12227, tmp12227, 221);
__m512 tmp12241 = _mm512_shuffle_f32x4(tmp12228, tmp12228, 136);
__m512 tmp12242 = _mm512_shuffle_f32x4(tmp12228, tmp12228, 221);
__m512 tmp12243 = _mm512_shuffle_f32x4(tmp12229, tmp12229, 136);
__m512 tmp12244 = _mm512_shuffle_f32x4(tmp12229, tmp12229, 221);
__m512 tmp12245 = _mm512_shuffle_f32x4(tmp12230, tmp12230, 136);
__m512 tmp12246 = _mm512_shuffle_f32x4(tmp12230, tmp12230, 221);
tmp12111 = _mm512_shuffle_f32x4(tmp12231, tmp12239, 136);
tmp12119 = _mm512_shuffle_f32x4(tmp12231, tmp12239, 221);
tmp12112 = _mm512_shuffle_f32x4(tmp12233, tmp12241, 136);
tmp12120 = _mm512_shuffle_f32x4(tmp12233, tmp12241, 221);
tmp12113 = _mm512_shuffle_f32x4(tmp12235, tmp12243, 136);
tmp12121 = _mm512_shuffle_f32x4(tmp12235, tmp12243, 221);
tmp12114 = _mm512_shuffle_f32x4(tmp12237, tmp12245, 136);
tmp12122 = _mm512_shuffle_f32x4(tmp12237, tmp12245, 221);
tmp12115 = _mm512_shuffle_f32x4(tmp12232, tmp12240, 136);
__m512 tmp12163 = _mm512_shuffle_f32x4(tmp12232, tmp12240, 221);
tmp12116 = _mm512_shuffle_f32x4(tmp12234, tmp12242, 136);
__m512 tmp12164 = _mm512_shuffle_f32x4(tmp12234, tmp12242, 221);
tmp12117 = _mm512_shuffle_f32x4(tmp12236, tmp12244, 136);
__m512 tmp12165 = _mm512_shuffle_f32x4(tmp12236, tmp12244, 221);
tmp12118 = _mm512_shuffle_f32x4(tmp12238, tmp12246, 136);
__m512 tmp12166 = _mm512_shuffle_f32x4(tmp12238, tmp12246, 221);
__m512 tmp12171 = _mm512_add_ps(tmp12112, tmp12113);
__m512 tmp12191 = _mm512_add_ps(tmp12120, tmp12121);
__m512 tmp12170 = _mm512_add_ps(tmp12114, tmp12115);
__m512 tmp12190 = _mm512_add_ps(tmp12122, tmp12163);
__m512 tmp12176 = _mm512_sub_ps(tmp12114, tmp12115);
__m512 tmp12196 = _mm512_sub_ps(tmp12122, tmp12163);
__m512 tmp12175 = _mm512_sub_ps(tmp12112, tmp12113);
__m512 tmp12195 = _mm512_sub_ps(tmp12120, tmp12121);
__m512 tmp12172 = _mm512_add_ps(tmp12116, tmp12117);
__m512 tmp12192 = _mm512_add_ps(tmp12164, tmp12165);
__m512 tmp12177 = _mm512_sub_ps(tmp12116, tmp12117);
__m512 tmp12197 = _mm512_sub_ps(tmp12164, tmp12165);
__m512 tmp12174 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(2e+00f), tmp12175);
__m512 tmp12194 = _mm512_fmadd_ps(tmp12196, _mm512_set1_ps(2e+00f), tmp12195);
__m512 tmp12181 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(8e+00f), tmp12175);
__m512 tmp12201 = _mm512_fmadd_ps(tmp12196, _mm512_set1_ps(8e+00f), tmp12195);
__m512 tmp12169 = _mm512_add_ps(tmp12170, tmp12171);
__m512 tmp12189 = _mm512_add_ps(tmp12190, tmp12191);
__m512 tmp12173 = _mm512_fmadd_ps(tmp12177, _mm512_set1_ps(1.6e+01f), tmp12174);
__m512 tmp12193 = _mm512_fmadd_ps(tmp12197, _mm512_set1_ps(1.6e+01f), tmp12194);
__m512 tmp12180 = _mm512_fmadd_ps(tmp12177, _mm512_set1_ps(4e+00f), tmp12181);
__m512 tmp12200 = _mm512_fmadd_ps(tmp12197, _mm512_set1_ps(4e+00f), tmp12201);
__m512 tmp12186 = _mm512_add_ps(tmp12177, tmp12175);
__m512 tmp12206 = _mm512_add_ps(tmp12197, tmp12195);
__m512 tmp12179 = _mm512_fmadd_ps(tmp12170, _mm512_set1_ps(4e+00f), tmp12171);
__m512 tmp12199 = _mm512_fmadd_ps(tmp12190, _mm512_set1_ps(4e+00f), tmp12191);
__m512 tmp12183 = _mm512_fmadd_ps(tmp12170, _mm512_set1_ps(1.6e+01f), tmp12171);
__m512 tmp12203 = _mm512_fmadd_ps(tmp12190, _mm512_set1_ps(1.6e+01f), tmp12191);
__m512 tmp12168 = _mm512_add_ps(tmp12169, tmp12111);
__m512 tmp12188 = _mm512_add_ps(tmp12189, tmp12119);
__m512 tmp12185 = _mm512_add_ps(tmp12186, tmp12118);
__m512 tmp12205 = _mm512_add_ps(tmp12206, tmp12166);
__m512 tmp12167 = _mm512_fmadd_ps(tmp12172, _mm512_set1_ps(3.2e+01f), tmp12168);
__m512 tmp12187 = _mm512_fmadd_ps(tmp12192, _mm512_set1_ps(3.2e+01f), tmp12188);
__m512 tmp12178 = _mm512_fmadd_ps(tmp12172, _mm512_set1_ps(8e+00f), tmp12179);
__m512 tmp12198 = _mm512_fmadd_ps(tmp12192, _mm512_set1_ps(8e+00f), tmp12199);
__m512 tmp12184 = _mm512_fmadd_ps(tmp12176, _mm512_set1_ps(3.2e+01f), tmp12185);
__m512 tmp12204 = _mm512_fmadd_ps(tmp12196, _mm512_set1_ps(3.2e+01f), tmp12205);
__m512 tmp12182 = _mm512_fmadd_ps(tmp12172, _mm512_set1_ps(2e+00f), tmp12183);
__m512 tmp12202 = _mm512_fmadd_ps(tmp12192, _mm512_set1_ps(2e+00f), tmp12203);
__m512 out1617 = tmp12167;
__m512 out1611 = tmp12187;
__m512 out1618 = tmp12173;
__m512 out1612 = tmp12193;
__m512 out1619 = tmp12178;
__m512 out1613 = tmp12198;
__m512 out1620 = tmp12180;
__m512 out1614 = tmp12200;
__m512 out1621 = tmp12182;
__m512 out1615 = tmp12202;
__m512 out1622 = tmp12184;
__m512 out1616 = tmp12204;
out1617 = _mm512_max_ps(_mm512_setzero_ps(), out1617);
out1611 = _mm512_max_ps(_mm512_setzero_ps(), out1611);
out1618 = _mm512_max_ps(_mm512_setzero_ps(), out1618);
out1612 = _mm512_max_ps(_mm512_setzero_ps(), out1612);
out1619 = _mm512_max_ps(_mm512_setzero_ps(), out1619);
out1613 = _mm512_max_ps(_mm512_setzero_ps(), out1613);
out1620 = _mm512_max_ps(_mm512_setzero_ps(), out1620);
out1614 = _mm512_max_ps(_mm512_setzero_ps(), out1614);
out1621 = _mm512_max_ps(_mm512_setzero_ps(), out1621);
out1615 = _mm512_max_ps(_mm512_setzero_ps(), out1615);
out1622 = _mm512_max_ps(_mm512_setzero_ps(), out1622);
out1616 = _mm512_max_ps(_mm512_setzero_ps(), out1616);
_mm512_mask_storeu_ps(datPtr26+3136+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1617);
_mm512_mask_storeu_ps(datPtr26+96+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1611);
_mm512_mask_storeu_ps(datPtr26+648+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1611);
_mm512_mask_storeu_ps(datPtr26+3248+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1618);
_mm512_mask_storeu_ps(datPtr26+208+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1612);
_mm512_mask_storeu_ps(datPtr26+760+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1612);
_mm512_mask_storeu_ps(datPtr26+3360+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1619);
_mm512_mask_storeu_ps(datPtr26+320+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1613);
_mm512_mask_storeu_ps(datPtr26+872+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1613);
_mm512_mask_storeu_ps(datPtr26+3472+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1620);
_mm512_mask_storeu_ps(datPtr26+432+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1614);
_mm512_mask_storeu_ps(datPtr26+984+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1614);
_mm512_mask_storeu_ps(datPtr26+3584+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1621);
_mm512_mask_storeu_ps(datPtr26+544+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1615);
_mm512_mask_storeu_ps(datPtr26+1096+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1615);
_mm512_mask_storeu_ps(datPtr26+3696+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1622);
_mm512_mask_storeu_ps(datPtr26+656+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1616);
_mm512_mask_storeu_ps(datPtr26+1208+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1616);
__m512 sf833 = _mm512_loadu_ps(sfPtr12+512+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf834 = _mm512_loadu_ps(sfPtr12+640+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1738 = _mm512_shuffle_f32x4(sf833, sf834, 68);
__m512 in1739 = _mm512_shuffle_f32x4(sf833, sf834, 238);
__m512 sf835 = _mm512_loadu_ps(sfPtr12+576+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf836 = _mm512_loadu_ps(sfPtr12+704+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1746 = _mm512_shuffle_f32x4(sf835, sf836, 68);
__m512 in1747 = _mm512_shuffle_f32x4(sf835, sf836, 238);
__m512 sf837 = _mm512_loadu_ps(sfPtr12+13312+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf838 = _mm512_loadu_ps(sfPtr12+13440+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1740 = _mm512_shuffle_f32x4(sf837, sf838, 68);
__m512 in1741 = _mm512_shuffle_f32x4(sf837, sf838, 238);
__m512 sf839 = _mm512_loadu_ps(sfPtr12+13376+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf840 = _mm512_loadu_ps(sfPtr12+13504+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1748 = _mm512_shuffle_f32x4(sf839, sf840, 68);
__m512 in1749 = _mm512_shuffle_f32x4(sf839, sf840, 238);
__m512 sf841 = _mm512_loadu_ps(sfPtr12+26112+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf842 = _mm512_loadu_ps(sfPtr12+26240+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1742 = _mm512_shuffle_f32x4(sf841, sf842, 68);
__m512 in1743 = _mm512_shuffle_f32x4(sf841, sf842, 238);
__m512 sf843 = _mm512_loadu_ps(sfPtr12+26176+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf844 = _mm512_loadu_ps(sfPtr12+26304+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1750 = _mm512_shuffle_f32x4(sf843, sf844, 68);
__m512 in1751 = _mm512_shuffle_f32x4(sf843, sf844, 238);
__m512 sf845 = _mm512_loadu_ps(sfPtr12+38912+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf846 = _mm512_loadu_ps(sfPtr12+39040+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1744 = _mm512_shuffle_f32x4(sf845, sf846, 68);
__m512 in1745 = _mm512_shuffle_f32x4(sf845, sf846, 238);
__m512 sf847 = _mm512_loadu_ps(sfPtr12+38976+51200*i51+3072*j44+1536*k136+768*l55);
__m512 sf848 = _mm512_loadu_ps(sfPtr12+39104+51200*i51+3072*j44+1536*k136+768*l55);
__m512 in1752 = _mm512_shuffle_f32x4(sf847, sf848, 68);
__m512 in1753 = _mm512_shuffle_f32x4(sf847, sf848, 238);
__m512 tmp12263 = _mm512_add_ps(in1739, in1740);
__m512 tmp12283 = _mm512_add_ps(in1747, in1748);
__m512 tmp12262 = _mm512_add_ps(in1741, in1742);
__m512 tmp12282 = _mm512_add_ps(in1749, in1750);
__m512 tmp12268 = _mm512_sub_ps(in1741, in1742);
__m512 tmp12288 = _mm512_sub_ps(in1749, in1750);
__m512 tmp12267 = _mm512_sub_ps(in1739, in1740);
__m512 tmp12287 = _mm512_sub_ps(in1747, in1748);
__m512 tmp12264 = _mm512_add_ps(in1743, in1744);
__m512 tmp12284 = _mm512_add_ps(in1751, in1752);
__m512 tmp12269 = _mm512_sub_ps(in1743, in1744);
__m512 tmp12289 = _mm512_sub_ps(in1751, in1752);
__m512 tmp12266 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(2e+00f), tmp12267);
__m512 tmp12286 = _mm512_fmadd_ps(tmp12288, _mm512_set1_ps(2e+00f), tmp12287);
__m512 tmp12273 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(8e+00f), tmp12267);
__m512 tmp12293 = _mm512_fmadd_ps(tmp12288, _mm512_set1_ps(8e+00f), tmp12287);
__m512 tmp12261 = _mm512_add_ps(tmp12262, tmp12263);
__m512 tmp12281 = _mm512_add_ps(tmp12282, tmp12283);
__m512 tmp12265 = _mm512_fmadd_ps(tmp12269, _mm512_set1_ps(1.6e+01f), tmp12266);
__m512 tmp12285 = _mm512_fmadd_ps(tmp12289, _mm512_set1_ps(1.6e+01f), tmp12286);
__m512 tmp12272 = _mm512_fmadd_ps(tmp12269, _mm512_set1_ps(4e+00f), tmp12273);
__m512 tmp12292 = _mm512_fmadd_ps(tmp12289, _mm512_set1_ps(4e+00f), tmp12293);
__m512 tmp12278 = _mm512_add_ps(tmp12269, tmp12267);
__m512 tmp12298 = _mm512_add_ps(tmp12289, tmp12287);
__m512 tmp12271 = _mm512_fmadd_ps(tmp12262, _mm512_set1_ps(4e+00f), tmp12263);
__m512 tmp12291 = _mm512_fmadd_ps(tmp12282, _mm512_set1_ps(4e+00f), tmp12283);
__m512 tmp12275 = _mm512_fmadd_ps(tmp12262, _mm512_set1_ps(1.6e+01f), tmp12263);
__m512 tmp12295 = _mm512_fmadd_ps(tmp12282, _mm512_set1_ps(1.6e+01f), tmp12283);
__m512 tmp12260 = _mm512_add_ps(tmp12261, in1738);
__m512 tmp12280 = _mm512_add_ps(tmp12281, in1746);
__m512 tmp12277 = _mm512_add_ps(tmp12278, in1745);
__m512 tmp12297 = _mm512_add_ps(tmp12298, in1753);
__m512 tmp12259 = _mm512_fmadd_ps(tmp12264, _mm512_set1_ps(3.2e+01f), tmp12260);
__m512 tmp12279 = _mm512_fmadd_ps(tmp12284, _mm512_set1_ps(3.2e+01f), tmp12280);
__m512 tmp12270 = _mm512_fmadd_ps(tmp12264, _mm512_set1_ps(8e+00f), tmp12271);
__m512 tmp12290 = _mm512_fmadd_ps(tmp12284, _mm512_set1_ps(8e+00f), tmp12291);
__m512 tmp12276 = _mm512_fmadd_ps(tmp12268, _mm512_set1_ps(3.2e+01f), tmp12277);
__m512 tmp12296 = _mm512_fmadd_ps(tmp12288, _mm512_set1_ps(3.2e+01f), tmp12297);
__m512 tmp12274 = _mm512_fmadd_ps(tmp12264, _mm512_set1_ps(2e+00f), tmp12275);
__m512 tmp12294 = _mm512_fmadd_ps(tmp12284, _mm512_set1_ps(2e+00f), tmp12295);
__m512 tmp12247 = tmp12259;
__m512 tmp12253 = tmp12279;
__m512 tmp12248 = tmp12265;
__m512 tmp12254 = tmp12285;
__m512 tmp12249 = tmp12270;
__m512 tmp12255 = tmp12290;
__m512 tmp12250 = tmp12272;
__m512 tmp12256 = tmp12292;
__m512 tmp12251 = tmp12274;
__m512 tmp12257 = tmp12294;
__m512 tmp12252 = tmp12276;
__m512 tmp12258 = tmp12296;
__m512 tmp12343 = _mm512_unpacklo_ps(tmp12247, tmp12248);
__m512 tmp12344 = _mm512_unpackhi_ps(tmp12247, tmp12248);
__m512 tmp12345 = _mm512_unpacklo_ps(tmp12249, tmp12250);
__m512 tmp12346 = _mm512_unpackhi_ps(tmp12249, tmp12250);
__m512 tmp12347 = _mm512_unpacklo_ps(tmp12251, tmp12252);
__m512 tmp12348 = _mm512_unpackhi_ps(tmp12251, tmp12252);
__m512 tmp12349 = _mm512_unpacklo_ps(tmp12253, tmp12254);
__m512 tmp12350 = _mm512_unpackhi_ps(tmp12253, tmp12254);
__m512 tmp12351 = _mm512_unpacklo_ps(tmp12255, tmp12256);
__m512 tmp12352 = _mm512_unpackhi_ps(tmp12255, tmp12256);
__m512 tmp12353 = _mm512_unpacklo_ps(tmp12257, tmp12258);
__m512 tmp12354 = _mm512_unpackhi_ps(tmp12257, tmp12258);
__m512 tmp12355 = _mm512_shuffle_ps(tmp12343, tmp12345, 68);
__m512 tmp12356 = _mm512_shuffle_ps(tmp12343, tmp12345, 238);
__m512 tmp12357 = _mm512_shuffle_ps(tmp12344, tmp12346, 68);
__m512 tmp12358 = _mm512_shuffle_ps(tmp12344, tmp12346, 238);
__m512 tmp12359 = _mm512_shuffle_ps(tmp12347, tmp12349, 68);
__m512 tmp12360 = _mm512_shuffle_ps(tmp12347, tmp12349, 238);
__m512 tmp12361 = _mm512_shuffle_ps(tmp12348, tmp12350, 68);
__m512 tmp12362 = _mm512_shuffle_ps(tmp12348, tmp12350, 238);
__m512 tmp12363 = _mm512_shuffle_ps(tmp12351, tmp12353, 68);
__m512 tmp12364 = _mm512_shuffle_ps(tmp12351, tmp12353, 238);
__m512 tmp12365 = _mm512_shuffle_ps(tmp12352, tmp12354, 68);
__m512 tmp12366 = _mm512_shuffle_ps(tmp12352, tmp12354, 238);
__m512 tmp12367 = _mm512_shuffle_f32x4(tmp12355, tmp12359, 136);
__m512 tmp12368 = _mm512_shuffle_f32x4(tmp12355, tmp12359, 221);
__m512 tmp12369 = _mm512_shuffle_f32x4(tmp12356, tmp12360, 136);
__m512 tmp12370 = _mm512_shuffle_f32x4(tmp12356, tmp12360, 221);
__m512 tmp12371 = _mm512_shuffle_f32x4(tmp12357, tmp12361, 136);
__m512 tmp12372 = _mm512_shuffle_f32x4(tmp12357, tmp12361, 221);
__m512 tmp12373 = _mm512_shuffle_f32x4(tmp12358, tmp12362, 136);
__m512 tmp12374 = _mm512_shuffle_f32x4(tmp12358, tmp12362, 221);
__m512 tmp12375 = _mm512_shuffle_f32x4(tmp12363, tmp12363, 136);
__m512 tmp12376 = _mm512_shuffle_f32x4(tmp12363, tmp12363, 221);
__m512 tmp12377 = _mm512_shuffle_f32x4(tmp12364, tmp12364, 136);
__m512 tmp12378 = _mm512_shuffle_f32x4(tmp12364, tmp12364, 221);
__m512 tmp12379 = _mm512_shuffle_f32x4(tmp12365, tmp12365, 136);
__m512 tmp12380 = _mm512_shuffle_f32x4(tmp12365, tmp12365, 221);
__m512 tmp12381 = _mm512_shuffle_f32x4(tmp12366, tmp12366, 136);
__m512 tmp12382 = _mm512_shuffle_f32x4(tmp12366, tmp12366, 221);
tmp12247 = _mm512_shuffle_f32x4(tmp12367, tmp12375, 136);
tmp12255 = _mm512_shuffle_f32x4(tmp12367, tmp12375, 221);
tmp12248 = _mm512_shuffle_f32x4(tmp12369, tmp12377, 136);
tmp12256 = _mm512_shuffle_f32x4(tmp12369, tmp12377, 221);
tmp12249 = _mm512_shuffle_f32x4(tmp12371, tmp12379, 136);
tmp12257 = _mm512_shuffle_f32x4(tmp12371, tmp12379, 221);
tmp12250 = _mm512_shuffle_f32x4(tmp12373, tmp12381, 136);
tmp12258 = _mm512_shuffle_f32x4(tmp12373, tmp12381, 221);
tmp12251 = _mm512_shuffle_f32x4(tmp12368, tmp12376, 136);
__m512 tmp12299 = _mm512_shuffle_f32x4(tmp12368, tmp12376, 221);
tmp12252 = _mm512_shuffle_f32x4(tmp12370, tmp12378, 136);
__m512 tmp12300 = _mm512_shuffle_f32x4(tmp12370, tmp12378, 221);
tmp12253 = _mm512_shuffle_f32x4(tmp12372, tmp12380, 136);
__m512 tmp12301 = _mm512_shuffle_f32x4(tmp12372, tmp12380, 221);
tmp12254 = _mm512_shuffle_f32x4(tmp12374, tmp12382, 136);
__m512 tmp12302 = _mm512_shuffle_f32x4(tmp12374, tmp12382, 221);
__m512 tmp12307 = _mm512_add_ps(tmp12248, tmp12249);
__m512 tmp12327 = _mm512_add_ps(tmp12256, tmp12257);
__m512 tmp12306 = _mm512_add_ps(tmp12250, tmp12251);
__m512 tmp12326 = _mm512_add_ps(tmp12258, tmp12299);
__m512 tmp12312 = _mm512_sub_ps(tmp12250, tmp12251);
__m512 tmp12332 = _mm512_sub_ps(tmp12258, tmp12299);
__m512 tmp12311 = _mm512_sub_ps(tmp12248, tmp12249);
__m512 tmp12331 = _mm512_sub_ps(tmp12256, tmp12257);
__m512 tmp12308 = _mm512_add_ps(tmp12252, tmp12253);
__m512 tmp12328 = _mm512_add_ps(tmp12300, tmp12301);
__m512 tmp12313 = _mm512_sub_ps(tmp12252, tmp12253);
__m512 tmp12333 = _mm512_sub_ps(tmp12300, tmp12301);
__m512 tmp12310 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(2e+00f), tmp12311);
__m512 tmp12330 = _mm512_fmadd_ps(tmp12332, _mm512_set1_ps(2e+00f), tmp12331);
__m512 tmp12317 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(8e+00f), tmp12311);
__m512 tmp12337 = _mm512_fmadd_ps(tmp12332, _mm512_set1_ps(8e+00f), tmp12331);
__m512 tmp12305 = _mm512_add_ps(tmp12306, tmp12307);
__m512 tmp12325 = _mm512_add_ps(tmp12326, tmp12327);
__m512 tmp12309 = _mm512_fmadd_ps(tmp12313, _mm512_set1_ps(1.6e+01f), tmp12310);
__m512 tmp12329 = _mm512_fmadd_ps(tmp12333, _mm512_set1_ps(1.6e+01f), tmp12330);
__m512 tmp12316 = _mm512_fmadd_ps(tmp12313, _mm512_set1_ps(4e+00f), tmp12317);
__m512 tmp12336 = _mm512_fmadd_ps(tmp12333, _mm512_set1_ps(4e+00f), tmp12337);
__m512 tmp12322 = _mm512_add_ps(tmp12313, tmp12311);
__m512 tmp12342 = _mm512_add_ps(tmp12333, tmp12331);
__m512 tmp12315 = _mm512_fmadd_ps(tmp12306, _mm512_set1_ps(4e+00f), tmp12307);
__m512 tmp12335 = _mm512_fmadd_ps(tmp12326, _mm512_set1_ps(4e+00f), tmp12327);
__m512 tmp12319 = _mm512_fmadd_ps(tmp12306, _mm512_set1_ps(1.6e+01f), tmp12307);
__m512 tmp12339 = _mm512_fmadd_ps(tmp12326, _mm512_set1_ps(1.6e+01f), tmp12327);
__m512 tmp12304 = _mm512_add_ps(tmp12305, tmp12247);
__m512 tmp12324 = _mm512_add_ps(tmp12325, tmp12255);
__m512 tmp12321 = _mm512_add_ps(tmp12322, tmp12254);
__m512 tmp12341 = _mm512_add_ps(tmp12342, tmp12302);
__m512 tmp12303 = _mm512_fmadd_ps(tmp12308, _mm512_set1_ps(3.2e+01f), tmp12304);
__m512 tmp12323 = _mm512_fmadd_ps(tmp12328, _mm512_set1_ps(3.2e+01f), tmp12324);
__m512 tmp12314 = _mm512_fmadd_ps(tmp12308, _mm512_set1_ps(8e+00f), tmp12315);
__m512 tmp12334 = _mm512_fmadd_ps(tmp12328, _mm512_set1_ps(8e+00f), tmp12335);
__m512 tmp12320 = _mm512_fmadd_ps(tmp12312, _mm512_set1_ps(3.2e+01f), tmp12321);
__m512 tmp12340 = _mm512_fmadd_ps(tmp12332, _mm512_set1_ps(3.2e+01f), tmp12341);
__m512 tmp12318 = _mm512_fmadd_ps(tmp12308, _mm512_set1_ps(2e+00f), tmp12319);
__m512 tmp12338 = _mm512_fmadd_ps(tmp12328, _mm512_set1_ps(2e+00f), tmp12339);
__m512 out1623 = tmp12303;
__m512 out1629 = tmp12323;
__m512 out1624 = tmp12309;
__m512 out1630 = tmp12329;
__m512 out1625 = tmp12314;
__m512 out1631 = tmp12334;
__m512 out1626 = tmp12316;
__m512 out1632 = tmp12336;
__m512 out1627 = tmp12318;
__m512 out1633 = tmp12338;
__m512 out1628 = tmp12320;
__m512 out1634 = tmp12340;
out1623 = _mm512_max_ps(_mm512_setzero_ps(), out1623);
out1629 = _mm512_max_ps(_mm512_setzero_ps(), out1629);
out1624 = _mm512_max_ps(_mm512_setzero_ps(), out1624);
out1630 = _mm512_max_ps(_mm512_setzero_ps(), out1630);
out1625 = _mm512_max_ps(_mm512_setzero_ps(), out1625);
out1631 = _mm512_max_ps(_mm512_setzero_ps(), out1631);
out1626 = _mm512_max_ps(_mm512_setzero_ps(), out1626);
out1632 = _mm512_max_ps(_mm512_setzero_ps(), out1632);
out1627 = _mm512_max_ps(_mm512_setzero_ps(), out1627);
out1633 = _mm512_max_ps(_mm512_setzero_ps(), out1633);
out1628 = _mm512_max_ps(_mm512_setzero_ps(), out1628);
out1634 = _mm512_max_ps(_mm512_setzero_ps(), out1634);
_mm512_mask_storeu_ps(datPtr26+3184+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1623);
_mm512_mask_storeu_ps(datPtr26+3232+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1629);
_mm512_mask_storeu_ps(datPtr26+3784+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1629);
_mm512_mask_storeu_ps(datPtr26+3296+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1624);
_mm512_mask_storeu_ps(datPtr26+3344+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1630);
_mm512_mask_storeu_ps(datPtr26+3896+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1630);
_mm512_mask_storeu_ps(datPtr26+3408+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1625);
_mm512_mask_storeu_ps(datPtr26+3456+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1631);
_mm512_mask_storeu_ps(datPtr26+4008+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1631);
_mm512_mask_storeu_ps(datPtr26+3520+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1626);
_mm512_mask_storeu_ps(datPtr26+3568+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1632);
_mm512_mask_storeu_ps(datPtr26+4120+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1632);
_mm512_mask_storeu_ps(datPtr26+3632+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1627);
_mm512_mask_storeu_ps(datPtr26+3680+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1633);
_mm512_mask_storeu_ps(datPtr26+4232+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1633);
_mm512_mask_storeu_ps(datPtr26+3744+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4095, out1628);
_mm512_mask_storeu_ps(datPtr26+3792+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 15, out1634);
_mm512_mask_storeu_ps(datPtr26+4344+25088*i51+112*toH41+4*toW41+12544*k136+6272*l55, 4032, out1634);
}
}
++j44;
rel22 = 1;
}
// Coordinates of the next output tile: row base22+6, column 6. They enter the
// datPtr26 store offsets below as 112*toH42 + 4*toW42.
ptrdiff_t toH42 = base22+6;
ptrdiff_t toW42 = 6;
// Start index of the outer loop that follows; that loop runs until k137 == 2.
ptrdiff_t k137 = 2*w62;
for (; k137 != 2; ++k137) {
ptrdiff_t l56 = 0;
for (; l56 != 2; ++l56) {
// Load sixteen 512-bit vectors from sfPtr12 and regroup them into the sixteen
// inputs in1754..in1769. Loads are taken in pairs whose offsets differ by 128;
// _mm512_shuffle_f32x4 with control 68 concatenates the low 256 bits of both
// loads of a pair, control 238 concatenates their high 256 bits.
//   in1754..in1761 <- pairs at offsets     0/128  + 12800*n, n = 0..3
//   in1762..in1769 <- pairs at offsets    64/192  + 12800*n, n = 0..3
// NOTE(review): the offsets look like byte offsets (64 = one 512-bit vector);
// confirm against the declaration of sfPtr12.
__m512 sf849 = _mm512_loadu_ps(sfPtr12+0+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf850 = _mm512_loadu_ps(sfPtr12+128+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1754 = _mm512_shuffle_f32x4(sf849, sf850, 68);
__m512 in1755 = _mm512_shuffle_f32x4(sf849, sf850, 238);
__m512 sf851 = _mm512_loadu_ps(sfPtr12+64+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf852 = _mm512_loadu_ps(sfPtr12+192+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1762 = _mm512_shuffle_f32x4(sf851, sf852, 68);
__m512 in1763 = _mm512_shuffle_f32x4(sf851, sf852, 238);
__m512 sf853 = _mm512_loadu_ps(sfPtr12+12800+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf854 = _mm512_loadu_ps(sfPtr12+12928+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1756 = _mm512_shuffle_f32x4(sf853, sf854, 68);
__m512 in1757 = _mm512_shuffle_f32x4(sf853, sf854, 238);
__m512 sf855 = _mm512_loadu_ps(sfPtr12+12864+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf856 = _mm512_loadu_ps(sfPtr12+12992+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1764 = _mm512_shuffle_f32x4(sf855, sf856, 68);
__m512 in1765 = _mm512_shuffle_f32x4(sf855, sf856, 238);
__m512 sf857 = _mm512_loadu_ps(sfPtr12+25600+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf858 = _mm512_loadu_ps(sfPtr12+25728+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1758 = _mm512_shuffle_f32x4(sf857, sf858, 68);
__m512 in1759 = _mm512_shuffle_f32x4(sf857, sf858, 238);
__m512 sf859 = _mm512_loadu_ps(sfPtr12+25664+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf860 = _mm512_loadu_ps(sfPtr12+25792+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1766 = _mm512_shuffle_f32x4(sf859, sf860, 68);
__m512 in1767 = _mm512_shuffle_f32x4(sf859, sf860, 238);
__m512 sf861 = _mm512_loadu_ps(sfPtr12+38400+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf862 = _mm512_loadu_ps(sfPtr12+38528+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1760 = _mm512_shuffle_f32x4(sf861, sf862, 68);
__m512 in1761 = _mm512_shuffle_f32x4(sf861, sf862, 238);
__m512 sf863 = _mm512_loadu_ps(sfPtr12+38464+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf864 = _mm512_loadu_ps(sfPtr12+38592+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1768 = _mm512_shuffle_f32x4(sf863, sf864, 68);
__m512 in1769 = _mm512_shuffle_f32x4(sf863, sf864, 238);
// First linear pass: map eight input vectors to six output vectors, done for
// two independent input sets whose statements are interleaved line by line.
// With x0..x7 = in1754..in1761 (set A) or in1762..in1769 (set B), the results
// are, up to floating-point evaluation order:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> tmp12395 / tmp12415
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> tmp12401 / tmp12421
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> tmp12406 / tmp12426
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> tmp12408 / tmp12428
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> tmp12410 / tmp12430
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> tmp12412 / tmp12432
// NOTE(review): these coefficients have the form of a Winograd F(6,3) output
// transform; presumably this is one dimension of a 2-D output transform for a
// 3x3 convolution -- confirm against the generator.
// Pairwise sums and differences (x1±x2, x3±x4, x5±x6).
__m512 tmp12399 = _mm512_add_ps(in1755, in1756);
__m512 tmp12419 = _mm512_add_ps(in1763, in1764);
__m512 tmp12398 = _mm512_add_ps(in1757, in1758);
__m512 tmp12418 = _mm512_add_ps(in1765, in1766);
__m512 tmp12404 = _mm512_sub_ps(in1757, in1758);
__m512 tmp12424 = _mm512_sub_ps(in1765, in1766);
__m512 tmp12403 = _mm512_sub_ps(in1755, in1756);
__m512 tmp12423 = _mm512_sub_ps(in1763, in1764);
__m512 tmp12400 = _mm512_add_ps(in1759, in1760);
__m512 tmp12420 = _mm512_add_ps(in1767, in1768);
__m512 tmp12405 = _mm512_sub_ps(in1759, in1760);
__m512 tmp12425 = _mm512_sub_ps(in1767, in1768);
// Weighted combinations; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp12402 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(2e+00f), tmp12403);
__m512 tmp12422 = _mm512_fmadd_ps(tmp12424, _mm512_set1_ps(2e+00f), tmp12423);
__m512 tmp12409 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(8e+00f), tmp12403);
__m512 tmp12429 = _mm512_fmadd_ps(tmp12424, _mm512_set1_ps(8e+00f), tmp12423);
__m512 tmp12397 = _mm512_add_ps(tmp12398, tmp12399);
__m512 tmp12417 = _mm512_add_ps(tmp12418, tmp12419);
__m512 tmp12401 = _mm512_fmadd_ps(tmp12405, _mm512_set1_ps(1.6e+01f), tmp12402);
__m512 tmp12421 = _mm512_fmadd_ps(tmp12425, _mm512_set1_ps(1.6e+01f), tmp12422);
__m512 tmp12408 = _mm512_fmadd_ps(tmp12405, _mm512_set1_ps(4e+00f), tmp12409);
__m512 tmp12428 = _mm512_fmadd_ps(tmp12425, _mm512_set1_ps(4e+00f), tmp12429);
__m512 tmp12414 = _mm512_add_ps(tmp12405, tmp12403);
__m512 tmp12434 = _mm512_add_ps(tmp12425, tmp12423);
__m512 tmp12407 = _mm512_fmadd_ps(tmp12398, _mm512_set1_ps(4e+00f), tmp12399);
__m512 tmp12427 = _mm512_fmadd_ps(tmp12418, _mm512_set1_ps(4e+00f), tmp12419);
__m512 tmp12411 = _mm512_fmadd_ps(tmp12398, _mm512_set1_ps(1.6e+01f), tmp12399);
__m512 tmp12431 = _mm512_fmadd_ps(tmp12418, _mm512_set1_ps(1.6e+01f), tmp12419);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp12396 = _mm512_add_ps(tmp12397, in1754);
__m512 tmp12416 = _mm512_add_ps(tmp12417, in1762);
__m512 tmp12413 = _mm512_add_ps(tmp12414, in1761);
__m512 tmp12433 = _mm512_add_ps(tmp12434, in1769);
__m512 tmp12395 = _mm512_fmadd_ps(tmp12400, _mm512_set1_ps(3.2e+01f), tmp12396);
__m512 tmp12415 = _mm512_fmadd_ps(tmp12420, _mm512_set1_ps(3.2e+01f), tmp12416);
__m512 tmp12406 = _mm512_fmadd_ps(tmp12400, _mm512_set1_ps(8e+00f), tmp12407);
__m512 tmp12426 = _mm512_fmadd_ps(tmp12420, _mm512_set1_ps(8e+00f), tmp12427);
__m512 tmp12412 = _mm512_fmadd_ps(tmp12404, _mm512_set1_ps(3.2e+01f), tmp12413);
__m512 tmp12432 = _mm512_fmadd_ps(tmp12424, _mm512_set1_ps(3.2e+01f), tmp12433);
__m512 tmp12410 = _mm512_fmadd_ps(tmp12400, _mm512_set1_ps(2e+00f), tmp12411);
__m512 tmp12430 = _mm512_fmadd_ps(tmp12420, _mm512_set1_ps(2e+00f), tmp12431);
// Gather the twelve results: set A's y0..y5 into tmp12383..tmp12388,
// set B's y0..y5 into tmp12389..tmp12394.
__m512 tmp12383 = tmp12395;
__m512 tmp12389 = tmp12415;
__m512 tmp12384 = tmp12401;
__m512 tmp12390 = tmp12421;
__m512 tmp12385 = tmp12406;
__m512 tmp12391 = tmp12426;
__m512 tmp12386 = tmp12408;
__m512 tmp12392 = tmp12428;
__m512 tmp12387 = tmp12410;
__m512 tmp12393 = tmp12430;
__m512 tmp12388 = tmp12412;
__m512 tmp12394 = tmp12432;
// In-register transpose. Treat tmp12383..tmp12394 as 12 rows of 16 floats and
// produce 16 vectors that each hold one column in lanes 0..11 (lanes 12..15
// end up repeating lanes 8..11). On exit:
//   columns  0..7  -> tmp12383..tmp12390
//   columns  8..11 -> tmp12391..tmp12394
//   columns 12..15 -> tmp12435..tmp12438 (newly declared)
// Stage 1: interleave adjacent row pairs within each 128-bit lane.
__m512 tmp12479 = _mm512_unpacklo_ps(tmp12383, tmp12384);
__m512 tmp12480 = _mm512_unpackhi_ps(tmp12383, tmp12384);
__m512 tmp12481 = _mm512_unpacklo_ps(tmp12385, tmp12386);
__m512 tmp12482 = _mm512_unpackhi_ps(tmp12385, tmp12386);
__m512 tmp12483 = _mm512_unpacklo_ps(tmp12387, tmp12388);
__m512 tmp12484 = _mm512_unpackhi_ps(tmp12387, tmp12388);
__m512 tmp12485 = _mm512_unpacklo_ps(tmp12389, tmp12390);
__m512 tmp12486 = _mm512_unpackhi_ps(tmp12389, tmp12390);
__m512 tmp12487 = _mm512_unpacklo_ps(tmp12391, tmp12392);
__m512 tmp12488 = _mm512_unpackhi_ps(tmp12391, tmp12392);
__m512 tmp12489 = _mm512_unpacklo_ps(tmp12393, tmp12394);
__m512 tmp12490 = _mm512_unpackhi_ps(tmp12393, tmp12394);
// Stage 2: combine pairs so each 128-bit lane holds one element index from
// four consecutive rows (control 68 = low halves, 238 = high halves).
__m512 tmp12491 = _mm512_shuffle_ps(tmp12479, tmp12481, 68);
__m512 tmp12492 = _mm512_shuffle_ps(tmp12479, tmp12481, 238);
__m512 tmp12493 = _mm512_shuffle_ps(tmp12480, tmp12482, 68);
__m512 tmp12494 = _mm512_shuffle_ps(tmp12480, tmp12482, 238);
__m512 tmp12495 = _mm512_shuffle_ps(tmp12483, tmp12485, 68);
__m512 tmp12496 = _mm512_shuffle_ps(tmp12483, tmp12485, 238);
__m512 tmp12497 = _mm512_shuffle_ps(tmp12484, tmp12486, 68);
__m512 tmp12498 = _mm512_shuffle_ps(tmp12484, tmp12486, 238);
__m512 tmp12499 = _mm512_shuffle_ps(tmp12487, tmp12489, 68);
__m512 tmp12500 = _mm512_shuffle_ps(tmp12487, tmp12489, 238);
__m512 tmp12501 = _mm512_shuffle_ps(tmp12488, tmp12490, 68);
__m512 tmp12502 = _mm512_shuffle_ps(tmp12488, tmp12490, 238);
// Stage 3: select even (control 136) or odd (control 221) 128-bit lanes.
// Rows 8..11 have no partner group, so they are shuffled with themselves.
__m512 tmp12503 = _mm512_shuffle_f32x4(tmp12491, tmp12495, 136);
__m512 tmp12504 = _mm512_shuffle_f32x4(tmp12491, tmp12495, 221);
__m512 tmp12505 = _mm512_shuffle_f32x4(tmp12492, tmp12496, 136);
__m512 tmp12506 = _mm512_shuffle_f32x4(tmp12492, tmp12496, 221);
__m512 tmp12507 = _mm512_shuffle_f32x4(tmp12493, tmp12497, 136);
__m512 tmp12508 = _mm512_shuffle_f32x4(tmp12493, tmp12497, 221);
__m512 tmp12509 = _mm512_shuffle_f32x4(tmp12494, tmp12498, 136);
__m512 tmp12510 = _mm512_shuffle_f32x4(tmp12494, tmp12498, 221);
__m512 tmp12511 = _mm512_shuffle_f32x4(tmp12499, tmp12499, 136);
__m512 tmp12512 = _mm512_shuffle_f32x4(tmp12499, tmp12499, 221);
__m512 tmp12513 = _mm512_shuffle_f32x4(tmp12500, tmp12500, 136);
__m512 tmp12514 = _mm512_shuffle_f32x4(tmp12500, tmp12500, 221);
__m512 tmp12515 = _mm512_shuffle_f32x4(tmp12501, tmp12501, 136);
__m512 tmp12516 = _mm512_shuffle_f32x4(tmp12501, tmp12501, 221);
__m512 tmp12517 = _mm512_shuffle_f32x4(tmp12502, tmp12502, 136);
__m512 tmp12518 = _mm512_shuffle_f32x4(tmp12502, tmp12502, 221);
// Stage 4: final lane selection; the original row variables are overwritten
// with column vectors and four extra columns are declared.
tmp12383 = _mm512_shuffle_f32x4(tmp12503, tmp12511, 136);
tmp12391 = _mm512_shuffle_f32x4(tmp12503, tmp12511, 221);
tmp12384 = _mm512_shuffle_f32x4(tmp12505, tmp12513, 136);
tmp12392 = _mm512_shuffle_f32x4(tmp12505, tmp12513, 221);
tmp12385 = _mm512_shuffle_f32x4(tmp12507, tmp12515, 136);
tmp12393 = _mm512_shuffle_f32x4(tmp12507, tmp12515, 221);
tmp12386 = _mm512_shuffle_f32x4(tmp12509, tmp12517, 136);
tmp12394 = _mm512_shuffle_f32x4(tmp12509, tmp12517, 221);
tmp12387 = _mm512_shuffle_f32x4(tmp12504, tmp12512, 136);
__m512 tmp12435 = _mm512_shuffle_f32x4(tmp12504, tmp12512, 221);
tmp12388 = _mm512_shuffle_f32x4(tmp12506, tmp12514, 136);
__m512 tmp12436 = _mm512_shuffle_f32x4(tmp12506, tmp12514, 221);
tmp12389 = _mm512_shuffle_f32x4(tmp12508, tmp12516, 136);
__m512 tmp12437 = _mm512_shuffle_f32x4(tmp12508, tmp12516, 221);
tmp12390 = _mm512_shuffle_f32x4(tmp12510, tmp12518, 136);
__m512 tmp12438 = _mm512_shuffle_f32x4(tmp12510, tmp12518, 221);
// Second linear pass: the same eight-to-six map as the first pass, applied to
// two groups of eight vectors whose statements are interleaved line by line.
//   group A: x0..x7 = tmp12383..tmp12390
//   group B: x0..x7 = tmp12391..tmp12394, tmp12435..tmp12438
// Results, up to floating-point evaluation order:
//   y0 = x0 + (x1+x2) +    (x3+x4) + 32*(x5+x6)        -> out1635 / out1641
//   y1 =      (x1-x2) +  2*(x3-x4) + 16*(x5-x6)        -> out1636 / out1642
//   y2 =      (x1+x2) +  4*(x3+x4) +  8*(x5+x6)        -> out1637 / out1643
//   y3 =      (x1-x2) +  8*(x3-x4) +  4*(x5-x6)        -> out1638 / out1644
//   y4 =      (x1+x2) + 16*(x3+x4) +  2*(x5+x6)        -> out1639 / out1645
//   y5 =      (x1-x2) + 32*(x3-x4) +    (x5-x6) + x7   -> out1640 / out1646
// Pairwise sums and differences (x1±x2, x3±x4, x5±x6).
__m512 tmp12443 = _mm512_add_ps(tmp12384, tmp12385);
__m512 tmp12463 = _mm512_add_ps(tmp12392, tmp12393);
__m512 tmp12442 = _mm512_add_ps(tmp12386, tmp12387);
__m512 tmp12462 = _mm512_add_ps(tmp12394, tmp12435);
__m512 tmp12448 = _mm512_sub_ps(tmp12386, tmp12387);
__m512 tmp12468 = _mm512_sub_ps(tmp12394, tmp12435);
__m512 tmp12447 = _mm512_sub_ps(tmp12384, tmp12385);
__m512 tmp12467 = _mm512_sub_ps(tmp12392, tmp12393);
__m512 tmp12444 = _mm512_add_ps(tmp12388, tmp12389);
__m512 tmp12464 = _mm512_add_ps(tmp12436, tmp12437);
__m512 tmp12449 = _mm512_sub_ps(tmp12388, tmp12389);
__m512 tmp12469 = _mm512_sub_ps(tmp12436, tmp12437);
// Weighted combinations; _mm512_fmadd_ps(a, b, c) computes a*b + c.
__m512 tmp12446 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(2e+00f), tmp12447);
__m512 tmp12466 = _mm512_fmadd_ps(tmp12468, _mm512_set1_ps(2e+00f), tmp12467);
__m512 tmp12453 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(8e+00f), tmp12447);
__m512 tmp12473 = _mm512_fmadd_ps(tmp12468, _mm512_set1_ps(8e+00f), tmp12467);
__m512 tmp12441 = _mm512_add_ps(tmp12442, tmp12443);
__m512 tmp12461 = _mm512_add_ps(tmp12462, tmp12463);
__m512 tmp12445 = _mm512_fmadd_ps(tmp12449, _mm512_set1_ps(1.6e+01f), tmp12446);
__m512 tmp12465 = _mm512_fmadd_ps(tmp12469, _mm512_set1_ps(1.6e+01f), tmp12466);
__m512 tmp12452 = _mm512_fmadd_ps(tmp12449, _mm512_set1_ps(4e+00f), tmp12453);
__m512 tmp12472 = _mm512_fmadd_ps(tmp12469, _mm512_set1_ps(4e+00f), tmp12473);
__m512 tmp12458 = _mm512_add_ps(tmp12449, tmp12447);
__m512 tmp12478 = _mm512_add_ps(tmp12469, tmp12467);
__m512 tmp12451 = _mm512_fmadd_ps(tmp12442, _mm512_set1_ps(4e+00f), tmp12443);
__m512 tmp12471 = _mm512_fmadd_ps(tmp12462, _mm512_set1_ps(4e+00f), tmp12463);
__m512 tmp12455 = _mm512_fmadd_ps(tmp12442, _mm512_set1_ps(1.6e+01f), tmp12443);
__m512 tmp12475 = _mm512_fmadd_ps(tmp12462, _mm512_set1_ps(1.6e+01f), tmp12463);
// x0 only contributes to y0 and x7 only to y5.
__m512 tmp12440 = _mm512_add_ps(tmp12441, tmp12383);
__m512 tmp12460 = _mm512_add_ps(tmp12461, tmp12391);
__m512 tmp12457 = _mm512_add_ps(tmp12458, tmp12390);
__m512 tmp12477 = _mm512_add_ps(tmp12478, tmp12438);
__m512 tmp12439 = _mm512_fmadd_ps(tmp12444, _mm512_set1_ps(3.2e+01f), tmp12440);
__m512 tmp12459 = _mm512_fmadd_ps(tmp12464, _mm512_set1_ps(3.2e+01f), tmp12460);
__m512 tmp12450 = _mm512_fmadd_ps(tmp12444, _mm512_set1_ps(8e+00f), tmp12451);
__m512 tmp12470 = _mm512_fmadd_ps(tmp12464, _mm512_set1_ps(8e+00f), tmp12471);
__m512 tmp12456 = _mm512_fmadd_ps(tmp12448, _mm512_set1_ps(3.2e+01f), tmp12457);
__m512 tmp12476 = _mm512_fmadd_ps(tmp12468, _mm512_set1_ps(3.2e+01f), tmp12477);
__m512 tmp12454 = _mm512_fmadd_ps(tmp12444, _mm512_set1_ps(2e+00f), tmp12455);
__m512 tmp12474 = _mm512_fmadd_ps(tmp12464, _mm512_set1_ps(2e+00f), tmp12475);
// Name the twelve results as output vectors: group A -> out1635..out1640,
// group B -> out1641..out1646.
__m512 out1635 = tmp12439;
__m512 out1641 = tmp12459;
__m512 out1636 = tmp12445;
__m512 out1642 = tmp12465;
__m512 out1637 = tmp12450;
__m512 out1643 = tmp12470;
__m512 out1638 = tmp12452;
__m512 out1644 = tmp12472;
__m512 out1639 = tmp12454;
__m512 out1645 = tmp12474;
__m512 out1640 = tmp12456;
__m512 out1646 = tmp12476;
// Clamp every output vector at zero: out = max(0, out), i.e. a ReLU.
out1635 = _mm512_max_ps(_mm512_setzero_ps(), out1635);
out1641 = _mm512_max_ps(_mm512_setzero_ps(), out1641);
out1636 = _mm512_max_ps(_mm512_setzero_ps(), out1636);
out1642 = _mm512_max_ps(_mm512_setzero_ps(), out1642);
out1637 = _mm512_max_ps(_mm512_setzero_ps(), out1637);
out1643 = _mm512_max_ps(_mm512_setzero_ps(), out1643);
out1638 = _mm512_max_ps(_mm512_setzero_ps(), out1638);
out1644 = _mm512_max_ps(_mm512_setzero_ps(), out1644);
out1639 = _mm512_max_ps(_mm512_setzero_ps(), out1639);
out1645 = _mm512_max_ps(_mm512_setzero_ps(), out1645);
out1640 = _mm512_max_ps(_mm512_setzero_ps(), out1640);
out1646 = _mm512_max_ps(_mm512_setzero_ps(), out1646);
// Masked stores to datPtr26. For each of the six output rows (offsets 0, 112,
// ..., 560) two vectors are written: mask 4095 stores lanes 0..11 of
// out1635..out1640, and mask 1023 stores lanes 0..9 of out1641..out1646 at an
// offset 48 higher. Unselected lanes are not written.
// NOTE(review): presumably datPtr26 is a byte pointer, making 48 = 12 floats
// and 112 = one 28-float row -- confirm against its declaration.
_mm512_mask_storeu_ps(datPtr26+0+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1635);
_mm512_mask_storeu_ps(datPtr26+48+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1641);
_mm512_mask_storeu_ps(datPtr26+112+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1636);
_mm512_mask_storeu_ps(datPtr26+160+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1642);
_mm512_mask_storeu_ps(datPtr26+224+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1637);
_mm512_mask_storeu_ps(datPtr26+272+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1643);
_mm512_mask_storeu_ps(datPtr26+336+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1638);
_mm512_mask_storeu_ps(datPtr26+384+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1644);
_mm512_mask_storeu_ps(datPtr26+448+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1639);
_mm512_mask_storeu_ps(datPtr26+496+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1645);
_mm512_mask_storeu_ps(datPtr26+560+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1640);
_mm512_mask_storeu_ps(datPtr26+608+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1646);
__m512 sf865 = _mm512_loadu_ps(sfPtr12+256+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf866 = _mm512_loadu_ps(sfPtr12+384+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1770 = _mm512_shuffle_f32x4(sf865, sf866, 68);
__m512 in1771 = _mm512_shuffle_f32x4(sf865, sf866, 238);
__m512 sf867 = _mm512_loadu_ps(sfPtr12+320+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf868 = _mm512_loadu_ps(sfPtr12+448+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1778 = _mm512_shuffle_f32x4(sf867, sf868, 68);
__m512 in1779 = _mm512_shuffle_f32x4(sf867, sf868, 238);
__m512 sf869 = _mm512_loadu_ps(sfPtr12+13056+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf870 = _mm512_loadu_ps(sfPtr12+13184+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1772 = _mm512_shuffle_f32x4(sf869, sf870, 68);
__m512 in1773 = _mm512_shuffle_f32x4(sf869, sf870, 238);
__m512 sf871 = _mm512_loadu_ps(sfPtr12+13120+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf872 = _mm512_loadu_ps(sfPtr12+13248+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1780 = _mm512_shuffle_f32x4(sf871, sf872, 68);
__m512 in1781 = _mm512_shuffle_f32x4(sf871, sf872, 238);
__m512 sf873 = _mm512_loadu_ps(sfPtr12+25856+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf874 = _mm512_loadu_ps(sfPtr12+25984+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1774 = _mm512_shuffle_f32x4(sf873, sf874, 68);
__m512 in1775 = _mm512_shuffle_f32x4(sf873, sf874, 238);
__m512 sf875 = _mm512_loadu_ps(sfPtr12+25920+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf876 = _mm512_loadu_ps(sfPtr12+26048+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1782 = _mm512_shuffle_f32x4(sf875, sf876, 68);
__m512 in1783 = _mm512_shuffle_f32x4(sf875, sf876, 238);
__m512 sf877 = _mm512_loadu_ps(sfPtr12+38656+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf878 = _mm512_loadu_ps(sfPtr12+38784+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1776 = _mm512_shuffle_f32x4(sf877, sf878, 68);
__m512 in1777 = _mm512_shuffle_f32x4(sf877, sf878, 238);
__m512 sf879 = _mm512_loadu_ps(sfPtr12+38720+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf880 = _mm512_loadu_ps(sfPtr12+38848+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1784 = _mm512_shuffle_f32x4(sf879, sf880, 68);
__m512 in1785 = _mm512_shuffle_f32x4(sf879, sf880, 238);
__m512 tmp12535 = _mm512_add_ps(in1771, in1772);
__m512 tmp12555 = _mm512_add_ps(in1779, in1780);
__m512 tmp12534 = _mm512_add_ps(in1773, in1774);
__m512 tmp12554 = _mm512_add_ps(in1781, in1782);
__m512 tmp12540 = _mm512_sub_ps(in1773, in1774);
__m512 tmp12560 = _mm512_sub_ps(in1781, in1782);
__m512 tmp12539 = _mm512_sub_ps(in1771, in1772);
__m512 tmp12559 = _mm512_sub_ps(in1779, in1780);
__m512 tmp12536 = _mm512_add_ps(in1775, in1776);
__m512 tmp12556 = _mm512_add_ps(in1783, in1784);
__m512 tmp12541 = _mm512_sub_ps(in1775, in1776);
__m512 tmp12561 = _mm512_sub_ps(in1783, in1784);
__m512 tmp12538 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(2e+00f), tmp12539);
__m512 tmp12558 = _mm512_fmadd_ps(tmp12560, _mm512_set1_ps(2e+00f), tmp12559);
__m512 tmp12545 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(8e+00f), tmp12539);
__m512 tmp12565 = _mm512_fmadd_ps(tmp12560, _mm512_set1_ps(8e+00f), tmp12559);
__m512 tmp12533 = _mm512_add_ps(tmp12534, tmp12535);
__m512 tmp12553 = _mm512_add_ps(tmp12554, tmp12555);
__m512 tmp12537 = _mm512_fmadd_ps(tmp12541, _mm512_set1_ps(1.6e+01f), tmp12538);
__m512 tmp12557 = _mm512_fmadd_ps(tmp12561, _mm512_set1_ps(1.6e+01f), tmp12558);
__m512 tmp12544 = _mm512_fmadd_ps(tmp12541, _mm512_set1_ps(4e+00f), tmp12545);
__m512 tmp12564 = _mm512_fmadd_ps(tmp12561, _mm512_set1_ps(4e+00f), tmp12565);
__m512 tmp12550 = _mm512_add_ps(tmp12541, tmp12539);
__m512 tmp12570 = _mm512_add_ps(tmp12561, tmp12559);
__m512 tmp12543 = _mm512_fmadd_ps(tmp12534, _mm512_set1_ps(4e+00f), tmp12535);
__m512 tmp12563 = _mm512_fmadd_ps(tmp12554, _mm512_set1_ps(4e+00f), tmp12555);
__m512 tmp12547 = _mm512_fmadd_ps(tmp12534, _mm512_set1_ps(1.6e+01f), tmp12535);
__m512 tmp12567 = _mm512_fmadd_ps(tmp12554, _mm512_set1_ps(1.6e+01f), tmp12555);
__m512 tmp12532 = _mm512_add_ps(tmp12533, in1770);
__m512 tmp12552 = _mm512_add_ps(tmp12553, in1778);
__m512 tmp12549 = _mm512_add_ps(tmp12550, in1777);
__m512 tmp12569 = _mm512_add_ps(tmp12570, in1785);
__m512 tmp12531 = _mm512_fmadd_ps(tmp12536, _mm512_set1_ps(3.2e+01f), tmp12532);
__m512 tmp12551 = _mm512_fmadd_ps(tmp12556, _mm512_set1_ps(3.2e+01f), tmp12552);
__m512 tmp12542 = _mm512_fmadd_ps(tmp12536, _mm512_set1_ps(8e+00f), tmp12543);
__m512 tmp12562 = _mm512_fmadd_ps(tmp12556, _mm512_set1_ps(8e+00f), tmp12563);
__m512 tmp12548 = _mm512_fmadd_ps(tmp12540, _mm512_set1_ps(3.2e+01f), tmp12549);
__m512 tmp12568 = _mm512_fmadd_ps(tmp12560, _mm512_set1_ps(3.2e+01f), tmp12569);
__m512 tmp12546 = _mm512_fmadd_ps(tmp12536, _mm512_set1_ps(2e+00f), tmp12547);
__m512 tmp12566 = _mm512_fmadd_ps(tmp12556, _mm512_set1_ps(2e+00f), tmp12567);
__m512 tmp12519 = tmp12531;
__m512 tmp12525 = tmp12551;
__m512 tmp12520 = tmp12537;
__m512 tmp12526 = tmp12557;
__m512 tmp12521 = tmp12542;
__m512 tmp12527 = tmp12562;
__m512 tmp12522 = tmp12544;
__m512 tmp12528 = tmp12564;
__m512 tmp12523 = tmp12546;
__m512 tmp12529 = tmp12566;
__m512 tmp12524 = tmp12548;
__m512 tmp12530 = tmp12568;
__m512 tmp12615 = _mm512_unpacklo_ps(tmp12519, tmp12520);
__m512 tmp12616 = _mm512_unpackhi_ps(tmp12519, tmp12520);
__m512 tmp12617 = _mm512_unpacklo_ps(tmp12521, tmp12522);
__m512 tmp12618 = _mm512_unpackhi_ps(tmp12521, tmp12522);
__m512 tmp12619 = _mm512_unpacklo_ps(tmp12523, tmp12524);
__m512 tmp12620 = _mm512_unpackhi_ps(tmp12523, tmp12524);
__m512 tmp12621 = _mm512_unpacklo_ps(tmp12525, tmp12526);
__m512 tmp12622 = _mm512_unpackhi_ps(tmp12525, tmp12526);
__m512 tmp12623 = _mm512_unpacklo_ps(tmp12527, tmp12528);
__m512 tmp12624 = _mm512_unpackhi_ps(tmp12527, tmp12528);
__m512 tmp12625 = _mm512_unpacklo_ps(tmp12529, tmp12530);
__m512 tmp12626 = _mm512_unpackhi_ps(tmp12529, tmp12530);
__m512 tmp12627 = _mm512_shuffle_ps(tmp12615, tmp12617, 68);
__m512 tmp12628 = _mm512_shuffle_ps(tmp12615, tmp12617, 238);
__m512 tmp12629 = _mm512_shuffle_ps(tmp12616, tmp12618, 68);
__m512 tmp12630 = _mm512_shuffle_ps(tmp12616, tmp12618, 238);
__m512 tmp12631 = _mm512_shuffle_ps(tmp12619, tmp12621, 68);
__m512 tmp12632 = _mm512_shuffle_ps(tmp12619, tmp12621, 238);
__m512 tmp12633 = _mm512_shuffle_ps(tmp12620, tmp12622, 68);
__m512 tmp12634 = _mm512_shuffle_ps(tmp12620, tmp12622, 238);
__m512 tmp12635 = _mm512_shuffle_ps(tmp12623, tmp12625, 68);
__m512 tmp12636 = _mm512_shuffle_ps(tmp12623, tmp12625, 238);
__m512 tmp12637 = _mm512_shuffle_ps(tmp12624, tmp12626, 68);
__m512 tmp12638 = _mm512_shuffle_ps(tmp12624, tmp12626, 238);
__m512 tmp12639 = _mm512_shuffle_f32x4(tmp12627, tmp12631, 136);
__m512 tmp12640 = _mm512_shuffle_f32x4(tmp12627, tmp12631, 221);
__m512 tmp12641 = _mm512_shuffle_f32x4(tmp12628, tmp12632, 136);
__m512 tmp12642 = _mm512_shuffle_f32x4(tmp12628, tmp12632, 221);
__m512 tmp12643 = _mm512_shuffle_f32x4(tmp12629, tmp12633, 136);
__m512 tmp12644 = _mm512_shuffle_f32x4(tmp12629, tmp12633, 221);
__m512 tmp12645 = _mm512_shuffle_f32x4(tmp12630, tmp12634, 136);
__m512 tmp12646 = _mm512_shuffle_f32x4(tmp12630, tmp12634, 221);
__m512 tmp12647 = _mm512_shuffle_f32x4(tmp12635, tmp12635, 136);
__m512 tmp12648 = _mm512_shuffle_f32x4(tmp12635, tmp12635, 221);
__m512 tmp12649 = _mm512_shuffle_f32x4(tmp12636, tmp12636, 136);
__m512 tmp12650 = _mm512_shuffle_f32x4(tmp12636, tmp12636, 221);
__m512 tmp12651 = _mm512_shuffle_f32x4(tmp12637, tmp12637, 136);
__m512 tmp12652 = _mm512_shuffle_f32x4(tmp12637, tmp12637, 221);
__m512 tmp12653 = _mm512_shuffle_f32x4(tmp12638, tmp12638, 136);
__m512 tmp12654 = _mm512_shuffle_f32x4(tmp12638, tmp12638, 221);
tmp12519 = _mm512_shuffle_f32x4(tmp12639, tmp12647, 136);
tmp12527 = _mm512_shuffle_f32x4(tmp12639, tmp12647, 221);
tmp12520 = _mm512_shuffle_f32x4(tmp12641, tmp12649, 136);
tmp12528 = _mm512_shuffle_f32x4(tmp12641, tmp12649, 221);
tmp12521 = _mm512_shuffle_f32x4(tmp12643, tmp12651, 136);
tmp12529 = _mm512_shuffle_f32x4(tmp12643, tmp12651, 221);
tmp12522 = _mm512_shuffle_f32x4(tmp12645, tmp12653, 136);
tmp12530 = _mm512_shuffle_f32x4(tmp12645, tmp12653, 221);
tmp12523 = _mm512_shuffle_f32x4(tmp12640, tmp12648, 136);
__m512 tmp12571 = _mm512_shuffle_f32x4(tmp12640, tmp12648, 221);
tmp12524 = _mm512_shuffle_f32x4(tmp12642, tmp12650, 136);
__m512 tmp12572 = _mm512_shuffle_f32x4(tmp12642, tmp12650, 221);
tmp12525 = _mm512_shuffle_f32x4(tmp12644, tmp12652, 136);
__m512 tmp12573 = _mm512_shuffle_f32x4(tmp12644, tmp12652, 221);
tmp12526 = _mm512_shuffle_f32x4(tmp12646, tmp12654, 136);
__m512 tmp12574 = _mm512_shuffle_f32x4(tmp12646, tmp12654, 221);
__m512 tmp12579 = _mm512_add_ps(tmp12520, tmp12521);
__m512 tmp12599 = _mm512_add_ps(tmp12528, tmp12529);
__m512 tmp12578 = _mm512_add_ps(tmp12522, tmp12523);
__m512 tmp12598 = _mm512_add_ps(tmp12530, tmp12571);
__m512 tmp12584 = _mm512_sub_ps(tmp12522, tmp12523);
__m512 tmp12604 = _mm512_sub_ps(tmp12530, tmp12571);
__m512 tmp12583 = _mm512_sub_ps(tmp12520, tmp12521);
__m512 tmp12603 = _mm512_sub_ps(tmp12528, tmp12529);
__m512 tmp12580 = _mm512_add_ps(tmp12524, tmp12525);
__m512 tmp12600 = _mm512_add_ps(tmp12572, tmp12573);
__m512 tmp12585 = _mm512_sub_ps(tmp12524, tmp12525);
__m512 tmp12605 = _mm512_sub_ps(tmp12572, tmp12573);
__m512 tmp12582 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(2e+00f), tmp12583);
__m512 tmp12602 = _mm512_fmadd_ps(tmp12604, _mm512_set1_ps(2e+00f), tmp12603);
__m512 tmp12589 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(8e+00f), tmp12583);
__m512 tmp12609 = _mm512_fmadd_ps(tmp12604, _mm512_set1_ps(8e+00f), tmp12603);
__m512 tmp12577 = _mm512_add_ps(tmp12578, tmp12579);
__m512 tmp12597 = _mm512_add_ps(tmp12598, tmp12599);
__m512 tmp12581 = _mm512_fmadd_ps(tmp12585, _mm512_set1_ps(1.6e+01f), tmp12582);
__m512 tmp12601 = _mm512_fmadd_ps(tmp12605, _mm512_set1_ps(1.6e+01f), tmp12602);
__m512 tmp12588 = _mm512_fmadd_ps(tmp12585, _mm512_set1_ps(4e+00f), tmp12589);
__m512 tmp12608 = _mm512_fmadd_ps(tmp12605, _mm512_set1_ps(4e+00f), tmp12609);
__m512 tmp12594 = _mm512_add_ps(tmp12585, tmp12583);
__m512 tmp12614 = _mm512_add_ps(tmp12605, tmp12603);
__m512 tmp12587 = _mm512_fmadd_ps(tmp12578, _mm512_set1_ps(4e+00f), tmp12579);
__m512 tmp12607 = _mm512_fmadd_ps(tmp12598, _mm512_set1_ps(4e+00f), tmp12599);
__m512 tmp12591 = _mm512_fmadd_ps(tmp12578, _mm512_set1_ps(1.6e+01f), tmp12579);
__m512 tmp12611 = _mm512_fmadd_ps(tmp12598, _mm512_set1_ps(1.6e+01f), tmp12599);
__m512 tmp12576 = _mm512_add_ps(tmp12577, tmp12519);
__m512 tmp12596 = _mm512_add_ps(tmp12597, tmp12527);
__m512 tmp12593 = _mm512_add_ps(tmp12594, tmp12526);
__m512 tmp12613 = _mm512_add_ps(tmp12614, tmp12574);
__m512 tmp12575 = _mm512_fmadd_ps(tmp12580, _mm512_set1_ps(3.2e+01f), tmp12576);
__m512 tmp12595 = _mm512_fmadd_ps(tmp12600, _mm512_set1_ps(3.2e+01f), tmp12596);
__m512 tmp12586 = _mm512_fmadd_ps(tmp12580, _mm512_set1_ps(8e+00f), tmp12587);
__m512 tmp12606 = _mm512_fmadd_ps(tmp12600, _mm512_set1_ps(8e+00f), tmp12607);
__m512 tmp12592 = _mm512_fmadd_ps(tmp12584, _mm512_set1_ps(3.2e+01f), tmp12593);
__m512 tmp12612 = _mm512_fmadd_ps(tmp12604, _mm512_set1_ps(3.2e+01f), tmp12613);
__m512 tmp12590 = _mm512_fmadd_ps(tmp12580, _mm512_set1_ps(2e+00f), tmp12591);
__m512 tmp12610 = _mm512_fmadd_ps(tmp12600, _mm512_set1_ps(2e+00f), tmp12611);
__m512 out1647 = tmp12575;
__m512 out1653 = tmp12595;
__m512 out1648 = tmp12581;
__m512 out1654 = tmp12601;
__m512 out1649 = tmp12586;
__m512 out1655 = tmp12606;
__m512 out1650 = tmp12588;
__m512 out1656 = tmp12608;
__m512 out1651 = tmp12590;
__m512 out1657 = tmp12610;
__m512 out1652 = tmp12592;
__m512 out1658 = tmp12612;
out1647 = _mm512_max_ps(_mm512_setzero_ps(), out1647);
out1653 = _mm512_max_ps(_mm512_setzero_ps(), out1653);
out1648 = _mm512_max_ps(_mm512_setzero_ps(), out1648);
out1654 = _mm512_max_ps(_mm512_setzero_ps(), out1654);
out1649 = _mm512_max_ps(_mm512_setzero_ps(), out1649);
out1655 = _mm512_max_ps(_mm512_setzero_ps(), out1655);
out1650 = _mm512_max_ps(_mm512_setzero_ps(), out1650);
out1656 = _mm512_max_ps(_mm512_setzero_ps(), out1656);
out1651 = _mm512_max_ps(_mm512_setzero_ps(), out1651);
out1657 = _mm512_max_ps(_mm512_setzero_ps(), out1657);
out1652 = _mm512_max_ps(_mm512_setzero_ps(), out1652);
out1658 = _mm512_max_ps(_mm512_setzero_ps(), out1658);
_mm512_mask_storeu_ps(datPtr26+648+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1647);
_mm512_mask_storeu_ps(datPtr26+3136+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1653);
_mm512_mask_storeu_ps(datPtr26+760+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1648);
_mm512_mask_storeu_ps(datPtr26+3248+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1654);
_mm512_mask_storeu_ps(datPtr26+872+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1649);
_mm512_mask_storeu_ps(datPtr26+3360+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1655);
_mm512_mask_storeu_ps(datPtr26+984+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1650);
_mm512_mask_storeu_ps(datPtr26+3472+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1656);
_mm512_mask_storeu_ps(datPtr26+1096+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1651);
_mm512_mask_storeu_ps(datPtr26+3584+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1657);
_mm512_mask_storeu_ps(datPtr26+1208+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1652);
_mm512_mask_storeu_ps(datPtr26+3696+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1658);
__m512 sf881 = _mm512_loadu_ps(sfPtr12+512+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf882 = _mm512_loadu_ps(sfPtr12+640+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1786 = _mm512_shuffle_f32x4(sf881, sf882, 68);
__m512 in1787 = _mm512_shuffle_f32x4(sf881, sf882, 238);
__m512 sf883 = _mm512_loadu_ps(sfPtr12+576+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf884 = _mm512_loadu_ps(sfPtr12+704+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1794 = _mm512_shuffle_f32x4(sf883, sf884, 68);
__m512 in1795 = _mm512_shuffle_f32x4(sf883, sf884, 238);
__m512 sf885 = _mm512_loadu_ps(sfPtr12+13312+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf886 = _mm512_loadu_ps(sfPtr12+13440+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1788 = _mm512_shuffle_f32x4(sf885, sf886, 68);
__m512 in1789 = _mm512_shuffle_f32x4(sf885, sf886, 238);
__m512 sf887 = _mm512_loadu_ps(sfPtr12+13376+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf888 = _mm512_loadu_ps(sfPtr12+13504+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1796 = _mm512_shuffle_f32x4(sf887, sf888, 68);
__m512 in1797 = _mm512_shuffle_f32x4(sf887, sf888, 238);
__m512 sf889 = _mm512_loadu_ps(sfPtr12+26112+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf890 = _mm512_loadu_ps(sfPtr12+26240+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1790 = _mm512_shuffle_f32x4(sf889, sf890, 68);
__m512 in1791 = _mm512_shuffle_f32x4(sf889, sf890, 238);
__m512 sf891 = _mm512_loadu_ps(sfPtr12+26176+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf892 = _mm512_loadu_ps(sfPtr12+26304+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1798 = _mm512_shuffle_f32x4(sf891, sf892, 68);
__m512 in1799 = _mm512_shuffle_f32x4(sf891, sf892, 238);
__m512 sf893 = _mm512_loadu_ps(sfPtr12+38912+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf894 = _mm512_loadu_ps(sfPtr12+39040+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1792 = _mm512_shuffle_f32x4(sf893, sf894, 68);
__m512 in1793 = _mm512_shuffle_f32x4(sf893, sf894, 238);
__m512 sf895 = _mm512_loadu_ps(sfPtr12+38976+51200*i51+3072*j44+1536*k137+768*l56);
__m512 sf896 = _mm512_loadu_ps(sfPtr12+39104+51200*i51+3072*j44+1536*k137+768*l56);
__m512 in1800 = _mm512_shuffle_f32x4(sf895, sf896, 68);
__m512 in1801 = _mm512_shuffle_f32x4(sf895, sf896, 238);
__m512 tmp12671 = _mm512_add_ps(in1787, in1788);
__m512 tmp12691 = _mm512_add_ps(in1795, in1796);
__m512 tmp12670 = _mm512_add_ps(in1789, in1790);
__m512 tmp12690 = _mm512_add_ps(in1797, in1798);
__m512 tmp12676 = _mm512_sub_ps(in1789, in1790);
__m512 tmp12696 = _mm512_sub_ps(in1797, in1798);
__m512 tmp12675 = _mm512_sub_ps(in1787, in1788);
__m512 tmp12695 = _mm512_sub_ps(in1795, in1796);
__m512 tmp12672 = _mm512_add_ps(in1791, in1792);
__m512 tmp12692 = _mm512_add_ps(in1799, in1800);
__m512 tmp12677 = _mm512_sub_ps(in1791, in1792);
__m512 tmp12697 = _mm512_sub_ps(in1799, in1800);
__m512 tmp12674 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(2e+00f), tmp12675);
__m512 tmp12694 = _mm512_fmadd_ps(tmp12696, _mm512_set1_ps(2e+00f), tmp12695);
__m512 tmp12681 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(8e+00f), tmp12675);
__m512 tmp12701 = _mm512_fmadd_ps(tmp12696, _mm512_set1_ps(8e+00f), tmp12695);
__m512 tmp12669 = _mm512_add_ps(tmp12670, tmp12671);
__m512 tmp12689 = _mm512_add_ps(tmp12690, tmp12691);
__m512 tmp12673 = _mm512_fmadd_ps(tmp12677, _mm512_set1_ps(1.6e+01f), tmp12674);
__m512 tmp12693 = _mm512_fmadd_ps(tmp12697, _mm512_set1_ps(1.6e+01f), tmp12694);
__m512 tmp12680 = _mm512_fmadd_ps(tmp12677, _mm512_set1_ps(4e+00f), tmp12681);
__m512 tmp12700 = _mm512_fmadd_ps(tmp12697, _mm512_set1_ps(4e+00f), tmp12701);
__m512 tmp12686 = _mm512_add_ps(tmp12677, tmp12675);
__m512 tmp12706 = _mm512_add_ps(tmp12697, tmp12695);
__m512 tmp12679 = _mm512_fmadd_ps(tmp12670, _mm512_set1_ps(4e+00f), tmp12671);
__m512 tmp12699 = _mm512_fmadd_ps(tmp12690, _mm512_set1_ps(4e+00f), tmp12691);
__m512 tmp12683 = _mm512_fmadd_ps(tmp12670, _mm512_set1_ps(1.6e+01f), tmp12671);
__m512 tmp12703 = _mm512_fmadd_ps(tmp12690, _mm512_set1_ps(1.6e+01f), tmp12691);
__m512 tmp12668 = _mm512_add_ps(tmp12669, in1786);
__m512 tmp12688 = _mm512_add_ps(tmp12689, in1794);
__m512 tmp12685 = _mm512_add_ps(tmp12686, in1793);
__m512 tmp12705 = _mm512_add_ps(tmp12706, in1801);
__m512 tmp12667 = _mm512_fmadd_ps(tmp12672, _mm512_set1_ps(3.2e+01f), tmp12668);
__m512 tmp12687 = _mm512_fmadd_ps(tmp12692, _mm512_set1_ps(3.2e+01f), tmp12688);
__m512 tmp12678 = _mm512_fmadd_ps(tmp12672, _mm512_set1_ps(8e+00f), tmp12679);
__m512 tmp12698 = _mm512_fmadd_ps(tmp12692, _mm512_set1_ps(8e+00f), tmp12699);
__m512 tmp12684 = _mm512_fmadd_ps(tmp12676, _mm512_set1_ps(3.2e+01f), tmp12685);
__m512 tmp12704 = _mm512_fmadd_ps(tmp12696, _mm512_set1_ps(3.2e+01f), tmp12705);
__m512 tmp12682 = _mm512_fmadd_ps(tmp12672, _mm512_set1_ps(2e+00f), tmp12683);
__m512 tmp12702 = _mm512_fmadd_ps(tmp12692, _mm512_set1_ps(2e+00f), tmp12703);
__m512 tmp12655 = tmp12667;
__m512 tmp12661 = tmp12687;
__m512 tmp12656 = tmp12673;
__m512 tmp12662 = tmp12693;
__m512 tmp12657 = tmp12678;
__m512 tmp12663 = tmp12698;
__m512 tmp12658 = tmp12680;
__m512 tmp12664 = tmp12700;
__m512 tmp12659 = tmp12682;
__m512 tmp12665 = tmp12702;
__m512 tmp12660 = tmp12684;
__m512 tmp12666 = tmp12704;
__m512 tmp12751 = _mm512_unpacklo_ps(tmp12655, tmp12656);
__m512 tmp12752 = _mm512_unpackhi_ps(tmp12655, tmp12656);
__m512 tmp12753 = _mm512_unpacklo_ps(tmp12657, tmp12658);
__m512 tmp12754 = _mm512_unpackhi_ps(tmp12657, tmp12658);
__m512 tmp12755 = _mm512_unpacklo_ps(tmp12659, tmp12660);
__m512 tmp12756 = _mm512_unpackhi_ps(tmp12659, tmp12660);
__m512 tmp12757 = _mm512_unpacklo_ps(tmp12661, tmp12662);
__m512 tmp12758 = _mm512_unpackhi_ps(tmp12661, tmp12662);
__m512 tmp12759 = _mm512_unpacklo_ps(tmp12663, tmp12664);
__m512 tmp12760 = _mm512_unpackhi_ps(tmp12663, tmp12664);
__m512 tmp12761 = _mm512_unpacklo_ps(tmp12665, tmp12666);
__m512 tmp12762 = _mm512_unpackhi_ps(tmp12665, tmp12666);
__m512 tmp12763 = _mm512_shuffle_ps(tmp12751, tmp12753, 68);
__m512 tmp12764 = _mm512_shuffle_ps(tmp12751, tmp12753, 238);
__m512 tmp12765 = _mm512_shuffle_ps(tmp12752, tmp12754, 68);
__m512 tmp12766 = _mm512_shuffle_ps(tmp12752, tmp12754, 238);
__m512 tmp12767 = _mm512_shuffle_ps(tmp12755, tmp12757, 68);
__m512 tmp12768 = _mm512_shuffle_ps(tmp12755, tmp12757, 238);
__m512 tmp12769 = _mm512_shuffle_ps(tmp12756, tmp12758, 68);
__m512 tmp12770 = _mm512_shuffle_ps(tmp12756, tmp12758, 238);
__m512 tmp12771 = _mm512_shuffle_ps(tmp12759, tmp12761, 68);
__m512 tmp12772 = _mm512_shuffle_ps(tmp12759, tmp12761, 238);
__m512 tmp12773 = _mm512_shuffle_ps(tmp12760, tmp12762, 68);
__m512 tmp12774 = _mm512_shuffle_ps(tmp12760, tmp12762, 238);
__m512 tmp12775 = _mm512_shuffle_f32x4(tmp12763, tmp12767, 136);
__m512 tmp12776 = _mm512_shuffle_f32x4(tmp12763, tmp12767, 221);
__m512 tmp12777 = _mm512_shuffle_f32x4(tmp12764, tmp12768, 136);
__m512 tmp12778 = _mm512_shuffle_f32x4(tmp12764, tmp12768, 221);
__m512 tmp12779 = _mm512_shuffle_f32x4(tmp12765, tmp12769, 136);
__m512 tmp12780 = _mm512_shuffle_f32x4(tmp12765, tmp12769, 221);
__m512 tmp12781 = _mm512_shuffle_f32x4(tmp12766, tmp12770, 136);
__m512 tmp12782 = _mm512_shuffle_f32x4(tmp12766, tmp12770, 221);
__m512 tmp12783 = _mm512_shuffle_f32x4(tmp12771, tmp12771, 136);
__m512 tmp12784 = _mm512_shuffle_f32x4(tmp12771, tmp12771, 221);
__m512 tmp12785 = _mm512_shuffle_f32x4(tmp12772, tmp12772, 136);
__m512 tmp12786 = _mm512_shuffle_f32x4(tmp12772, tmp12772, 221);
__m512 tmp12787 = _mm512_shuffle_f32x4(tmp12773, tmp12773, 136);
__m512 tmp12788 = _mm512_shuffle_f32x4(tmp12773, tmp12773, 221);
__m512 tmp12789 = _mm512_shuffle_f32x4(tmp12774, tmp12774, 136);
__m512 tmp12790 = _mm512_shuffle_f32x4(tmp12774, tmp12774, 221);
tmp12655 = _mm512_shuffle_f32x4(tmp12775, tmp12783, 136);
tmp12663 = _mm512_shuffle_f32x4(tmp12775, tmp12783, 221);
tmp12656 = _mm512_shuffle_f32x4(tmp12777, tmp12785, 136);
tmp12664 = _mm512_shuffle_f32x4(tmp12777, tmp12785, 221);
tmp12657 = _mm512_shuffle_f32x4(tmp12779, tmp12787, 136);
tmp12665 = _mm512_shuffle_f32x4(tmp12779, tmp12787, 221);
tmp12658 = _mm512_shuffle_f32x4(tmp12781, tmp12789, 136);
tmp12666 = _mm512_shuffle_f32x4(tmp12781, tmp12789, 221);
tmp12659 = _mm512_shuffle_f32x4(tmp12776, tmp12784, 136);
__m512 tmp12707 = _mm512_shuffle_f32x4(tmp12776, tmp12784, 221);
tmp12660 = _mm512_shuffle_f32x4(tmp12778, tmp12786, 136);
__m512 tmp12708 = _mm512_shuffle_f32x4(tmp12778, tmp12786, 221);
tmp12661 = _mm512_shuffle_f32x4(tmp12780, tmp12788, 136);
__m512 tmp12709 = _mm512_shuffle_f32x4(tmp12780, tmp12788, 221);
tmp12662 = _mm512_shuffle_f32x4(tmp12782, tmp12790, 136);
__m512 tmp12710 = _mm512_shuffle_f32x4(tmp12782, tmp12790, 221);
__m512 tmp12715 = _mm512_add_ps(tmp12656, tmp12657);
__m512 tmp12735 = _mm512_add_ps(tmp12664, tmp12665);
__m512 tmp12714 = _mm512_add_ps(tmp12658, tmp12659);
__m512 tmp12734 = _mm512_add_ps(tmp12666, tmp12707);
__m512 tmp12720 = _mm512_sub_ps(tmp12658, tmp12659);
__m512 tmp12740 = _mm512_sub_ps(tmp12666, tmp12707);
__m512 tmp12719 = _mm512_sub_ps(tmp12656, tmp12657);
__m512 tmp12739 = _mm512_sub_ps(tmp12664, tmp12665);
__m512 tmp12716 = _mm512_add_ps(tmp12660, tmp12661);
__m512 tmp12736 = _mm512_add_ps(tmp12708, tmp12709);
__m512 tmp12721 = _mm512_sub_ps(tmp12660, tmp12661);
__m512 tmp12741 = _mm512_sub_ps(tmp12708, tmp12709);
__m512 tmp12718 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(2e+00f), tmp12719);
__m512 tmp12738 = _mm512_fmadd_ps(tmp12740, _mm512_set1_ps(2e+00f), tmp12739);
__m512 tmp12725 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(8e+00f), tmp12719);
__m512 tmp12745 = _mm512_fmadd_ps(tmp12740, _mm512_set1_ps(8e+00f), tmp12739);
__m512 tmp12713 = _mm512_add_ps(tmp12714, tmp12715);
__m512 tmp12733 = _mm512_add_ps(tmp12734, tmp12735);
__m512 tmp12717 = _mm512_fmadd_ps(tmp12721, _mm512_set1_ps(1.6e+01f), tmp12718);
__m512 tmp12737 = _mm512_fmadd_ps(tmp12741, _mm512_set1_ps(1.6e+01f), tmp12738);
__m512 tmp12724 = _mm512_fmadd_ps(tmp12721, _mm512_set1_ps(4e+00f), tmp12725);
__m512 tmp12744 = _mm512_fmadd_ps(tmp12741, _mm512_set1_ps(4e+00f), tmp12745);
__m512 tmp12730 = _mm512_add_ps(tmp12721, tmp12719);
__m512 tmp12750 = _mm512_add_ps(tmp12741, tmp12739);
__m512 tmp12723 = _mm512_fmadd_ps(tmp12714, _mm512_set1_ps(4e+00f), tmp12715);
__m512 tmp12743 = _mm512_fmadd_ps(tmp12734, _mm512_set1_ps(4e+00f), tmp12735);
__m512 tmp12727 = _mm512_fmadd_ps(tmp12714, _mm512_set1_ps(1.6e+01f), tmp12715);
__m512 tmp12747 = _mm512_fmadd_ps(tmp12734, _mm512_set1_ps(1.6e+01f), tmp12735);
__m512 tmp12712 = _mm512_add_ps(tmp12713, tmp12655);
__m512 tmp12732 = _mm512_add_ps(tmp12733, tmp12663);
__m512 tmp12729 = _mm512_add_ps(tmp12730, tmp12662);
__m512 tmp12749 = _mm512_add_ps(tmp12750, tmp12710);
__m512 tmp12711 = _mm512_fmadd_ps(tmp12716, _mm512_set1_ps(3.2e+01f), tmp12712);
__m512 tmp12731 = _mm512_fmadd_ps(tmp12736, _mm512_set1_ps(3.2e+01f), tmp12732);
__m512 tmp12722 = _mm512_fmadd_ps(tmp12716, _mm512_set1_ps(8e+00f), tmp12723);
__m512 tmp12742 = _mm512_fmadd_ps(tmp12736, _mm512_set1_ps(8e+00f), tmp12743);
__m512 tmp12728 = _mm512_fmadd_ps(tmp12720, _mm512_set1_ps(3.2e+01f), tmp12729);
__m512 tmp12748 = _mm512_fmadd_ps(tmp12740, _mm512_set1_ps(3.2e+01f), tmp12749);
__m512 tmp12726 = _mm512_fmadd_ps(tmp12716, _mm512_set1_ps(2e+00f), tmp12727);
__m512 tmp12746 = _mm512_fmadd_ps(tmp12736, _mm512_set1_ps(2e+00f), tmp12747);
__m512 out1659 = tmp12711;
__m512 out1665 = tmp12731;
__m512 out1660 = tmp12717;
__m512 out1666 = tmp12737;
__m512 out1661 = tmp12722;
__m512 out1667 = tmp12742;
__m512 out1662 = tmp12724;
__m512 out1668 = tmp12744;
__m512 out1663 = tmp12726;
__m512 out1669 = tmp12746;
__m512 out1664 = tmp12728;
__m512 out1670 = tmp12748;
out1659 = _mm512_max_ps(_mm512_setzero_ps(), out1659);
out1665 = _mm512_max_ps(_mm512_setzero_ps(), out1665);
out1660 = _mm512_max_ps(_mm512_setzero_ps(), out1660);
out1666 = _mm512_max_ps(_mm512_setzero_ps(), out1666);
out1661 = _mm512_max_ps(_mm512_setzero_ps(), out1661);
out1667 = _mm512_max_ps(_mm512_setzero_ps(), out1667);
out1662 = _mm512_max_ps(_mm512_setzero_ps(), out1662);
out1668 = _mm512_max_ps(_mm512_setzero_ps(), out1668);
out1663 = _mm512_max_ps(_mm512_setzero_ps(), out1663);
out1669 = _mm512_max_ps(_mm512_setzero_ps(), out1669);
out1664 = _mm512_max_ps(_mm512_setzero_ps(), out1664);
out1670 = _mm512_max_ps(_mm512_setzero_ps(), out1670);
_mm512_mask_storeu_ps(datPtr26+3184+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1659);
_mm512_mask_storeu_ps(datPtr26+3784+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1665);
_mm512_mask_storeu_ps(datPtr26+3296+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1660);
_mm512_mask_storeu_ps(datPtr26+3896+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1666);
_mm512_mask_storeu_ps(datPtr26+3408+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1661);
_mm512_mask_storeu_ps(datPtr26+4008+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1667);
_mm512_mask_storeu_ps(datPtr26+3520+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1662);
_mm512_mask_storeu_ps(datPtr26+4120+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1668);
_mm512_mask_storeu_ps(datPtr26+3632+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1663);
_mm512_mask_storeu_ps(datPtr26+4232+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1669);
_mm512_mask_storeu_ps(datPtr26+3744+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 1023, out1664);
_mm512_mask_storeu_ps(datPtr26+4344+25088*i51+112*toH42+4*toW42+12544*k137+6272*l56, 4095, out1670);
}
}
++j44;
rel22 = 2;
}
if (rel22 < 3) {
ptrdiff_t toH43 = base22+12;
ptrdiff_t toW43 = 12;
ptrdiff_t k138 = 2*w62;
for (; k138 != 2; ++k138) {
ptrdiff_t l57 = 0;
for (; l57 != 2; ++l57) {
__m512 sf897 = _mm512_loadu_ps(sfPtr12+0+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf898 = _mm512_loadu_ps(sfPtr12+128+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1802 = _mm512_shuffle_f32x4(sf897, sf898, 68);
__m512 in1803 = _mm512_shuffle_f32x4(sf897, sf898, 238);
__m512 sf899 = _mm512_loadu_ps(sfPtr12+64+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf900 = _mm512_loadu_ps(sfPtr12+192+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1810 = _mm512_shuffle_f32x4(sf899, sf900, 68);
__m512 in1811 = _mm512_shuffle_f32x4(sf899, sf900, 238);
__m512 sf901 = _mm512_loadu_ps(sfPtr12+12800+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf902 = _mm512_loadu_ps(sfPtr12+12928+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1804 = _mm512_shuffle_f32x4(sf901, sf902, 68);
__m512 in1805 = _mm512_shuffle_f32x4(sf901, sf902, 238);
__m512 sf903 = _mm512_loadu_ps(sfPtr12+12864+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf904 = _mm512_loadu_ps(sfPtr12+12992+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1812 = _mm512_shuffle_f32x4(sf903, sf904, 68);
__m512 in1813 = _mm512_shuffle_f32x4(sf903, sf904, 238);
__m512 sf905 = _mm512_loadu_ps(sfPtr12+25600+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf906 = _mm512_loadu_ps(sfPtr12+25728+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1806 = _mm512_shuffle_f32x4(sf905, sf906, 68);
__m512 in1807 = _mm512_shuffle_f32x4(sf905, sf906, 238);
__m512 sf907 = _mm512_loadu_ps(sfPtr12+25664+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf908 = _mm512_loadu_ps(sfPtr12+25792+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1814 = _mm512_shuffle_f32x4(sf907, sf908, 68);
__m512 in1815 = _mm512_shuffle_f32x4(sf907, sf908, 238);
__m512 sf909 = _mm512_loadu_ps(sfPtr12+38400+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf910 = _mm512_loadu_ps(sfPtr12+38528+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1808 = _mm512_shuffle_f32x4(sf909, sf910, 68);
__m512 in1809 = _mm512_shuffle_f32x4(sf909, sf910, 238);
__m512 sf911 = _mm512_loadu_ps(sfPtr12+38464+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf912 = _mm512_loadu_ps(sfPtr12+38592+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1816 = _mm512_shuffle_f32x4(sf911, sf912, 68);
__m512 in1817 = _mm512_shuffle_f32x4(sf911, sf912, 238);
__m512 tmp12807 = _mm512_add_ps(in1803, in1804);
__m512 tmp12827 = _mm512_add_ps(in1811, in1812);
__m512 tmp12806 = _mm512_add_ps(in1805, in1806);
__m512 tmp12826 = _mm512_add_ps(in1813, in1814);
__m512 tmp12812 = _mm512_sub_ps(in1805, in1806);
__m512 tmp12832 = _mm512_sub_ps(in1813, in1814);
__m512 tmp12811 = _mm512_sub_ps(in1803, in1804);
__m512 tmp12831 = _mm512_sub_ps(in1811, in1812);
__m512 tmp12808 = _mm512_add_ps(in1807, in1808);
__m512 tmp12828 = _mm512_add_ps(in1815, in1816);
__m512 tmp12813 = _mm512_sub_ps(in1807, in1808);
__m512 tmp12833 = _mm512_sub_ps(in1815, in1816);
__m512 tmp12810 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(2e+00f), tmp12811);
__m512 tmp12830 = _mm512_fmadd_ps(tmp12832, _mm512_set1_ps(2e+00f), tmp12831);
__m512 tmp12817 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(8e+00f), tmp12811);
__m512 tmp12837 = _mm512_fmadd_ps(tmp12832, _mm512_set1_ps(8e+00f), tmp12831);
__m512 tmp12805 = _mm512_add_ps(tmp12806, tmp12807);
__m512 tmp12825 = _mm512_add_ps(tmp12826, tmp12827);
__m512 tmp12809 = _mm512_fmadd_ps(tmp12813, _mm512_set1_ps(1.6e+01f), tmp12810);
__m512 tmp12829 = _mm512_fmadd_ps(tmp12833, _mm512_set1_ps(1.6e+01f), tmp12830);
__m512 tmp12816 = _mm512_fmadd_ps(tmp12813, _mm512_set1_ps(4e+00f), tmp12817);
__m512 tmp12836 = _mm512_fmadd_ps(tmp12833, _mm512_set1_ps(4e+00f), tmp12837);
__m512 tmp12822 = _mm512_add_ps(tmp12813, tmp12811);
__m512 tmp12842 = _mm512_add_ps(tmp12833, tmp12831);
__m512 tmp12815 = _mm512_fmadd_ps(tmp12806, _mm512_set1_ps(4e+00f), tmp12807);
__m512 tmp12835 = _mm512_fmadd_ps(tmp12826, _mm512_set1_ps(4e+00f), tmp12827);
__m512 tmp12819 = _mm512_fmadd_ps(tmp12806, _mm512_set1_ps(1.6e+01f), tmp12807);
__m512 tmp12839 = _mm512_fmadd_ps(tmp12826, _mm512_set1_ps(1.6e+01f), tmp12827);
__m512 tmp12804 = _mm512_add_ps(tmp12805, in1802);
__m512 tmp12824 = _mm512_add_ps(tmp12825, in1810);
__m512 tmp12821 = _mm512_add_ps(tmp12822, in1809);
__m512 tmp12841 = _mm512_add_ps(tmp12842, in1817);
__m512 tmp12803 = _mm512_fmadd_ps(tmp12808, _mm512_set1_ps(3.2e+01f), tmp12804);
__m512 tmp12823 = _mm512_fmadd_ps(tmp12828, _mm512_set1_ps(3.2e+01f), tmp12824);
__m512 tmp12814 = _mm512_fmadd_ps(tmp12808, _mm512_set1_ps(8e+00f), tmp12815);
__m512 tmp12834 = _mm512_fmadd_ps(tmp12828, _mm512_set1_ps(8e+00f), tmp12835);
__m512 tmp12820 = _mm512_fmadd_ps(tmp12812, _mm512_set1_ps(3.2e+01f), tmp12821);
__m512 tmp12840 = _mm512_fmadd_ps(tmp12832, _mm512_set1_ps(3.2e+01f), tmp12841);
__m512 tmp12818 = _mm512_fmadd_ps(tmp12808, _mm512_set1_ps(2e+00f), tmp12819);
__m512 tmp12838 = _mm512_fmadd_ps(tmp12828, _mm512_set1_ps(2e+00f), tmp12839);
__m512 tmp12791 = tmp12803;
__m512 tmp12797 = tmp12823;
__m512 tmp12792 = tmp12809;
__m512 tmp12798 = tmp12829;
__m512 tmp12793 = tmp12814;
__m512 tmp12799 = tmp12834;
__m512 tmp12794 = tmp12816;
__m512 tmp12800 = tmp12836;
__m512 tmp12795 = tmp12818;
__m512 tmp12801 = tmp12838;
__m512 tmp12796 = tmp12820;
__m512 tmp12802 = tmp12840;
__m512 tmp12887 = _mm512_unpacklo_ps(tmp12791, tmp12792);
__m512 tmp12888 = _mm512_unpackhi_ps(tmp12791, tmp12792);
__m512 tmp12889 = _mm512_unpacklo_ps(tmp12793, tmp12794);
__m512 tmp12890 = _mm512_unpackhi_ps(tmp12793, tmp12794);
__m512 tmp12891 = _mm512_unpacklo_ps(tmp12795, tmp12796);
__m512 tmp12892 = _mm512_unpackhi_ps(tmp12795, tmp12796);
__m512 tmp12893 = _mm512_unpacklo_ps(tmp12797, tmp12798);
__m512 tmp12894 = _mm512_unpackhi_ps(tmp12797, tmp12798);
__m512 tmp12895 = _mm512_unpacklo_ps(tmp12799, tmp12800);
__m512 tmp12896 = _mm512_unpackhi_ps(tmp12799, tmp12800);
__m512 tmp12897 = _mm512_unpacklo_ps(tmp12801, tmp12802);
__m512 tmp12898 = _mm512_unpackhi_ps(tmp12801, tmp12802);
__m512 tmp12899 = _mm512_shuffle_ps(tmp12887, tmp12889, 68);
__m512 tmp12900 = _mm512_shuffle_ps(tmp12887, tmp12889, 238);
__m512 tmp12901 = _mm512_shuffle_ps(tmp12888, tmp12890, 68);
__m512 tmp12902 = _mm512_shuffle_ps(tmp12888, tmp12890, 238);
__m512 tmp12903 = _mm512_shuffle_ps(tmp12891, tmp12893, 68);
__m512 tmp12904 = _mm512_shuffle_ps(tmp12891, tmp12893, 238);
__m512 tmp12905 = _mm512_shuffle_ps(tmp12892, tmp12894, 68);
__m512 tmp12906 = _mm512_shuffle_ps(tmp12892, tmp12894, 238);
__m512 tmp12907 = _mm512_shuffle_ps(tmp12895, tmp12897, 68);
__m512 tmp12908 = _mm512_shuffle_ps(tmp12895, tmp12897, 238);
__m512 tmp12909 = _mm512_shuffle_ps(tmp12896, tmp12898, 68);
__m512 tmp12910 = _mm512_shuffle_ps(tmp12896, tmp12898, 238);
__m512 tmp12911 = _mm512_shuffle_f32x4(tmp12899, tmp12903, 136);
__m512 tmp12912 = _mm512_shuffle_f32x4(tmp12899, tmp12903, 221);
__m512 tmp12913 = _mm512_shuffle_f32x4(tmp12900, tmp12904, 136);
__m512 tmp12914 = _mm512_shuffle_f32x4(tmp12900, tmp12904, 221);
__m512 tmp12915 = _mm512_shuffle_f32x4(tmp12901, tmp12905, 136);
__m512 tmp12916 = _mm512_shuffle_f32x4(tmp12901, tmp12905, 221);
__m512 tmp12917 = _mm512_shuffle_f32x4(tmp12902, tmp12906, 136);
__m512 tmp12918 = _mm512_shuffle_f32x4(tmp12902, tmp12906, 221);
__m512 tmp12919 = _mm512_shuffle_f32x4(tmp12907, tmp12907, 136);
__m512 tmp12920 = _mm512_shuffle_f32x4(tmp12907, tmp12907, 221);
__m512 tmp12921 = _mm512_shuffle_f32x4(tmp12908, tmp12908, 136);
__m512 tmp12922 = _mm512_shuffle_f32x4(tmp12908, tmp12908, 221);
__m512 tmp12923 = _mm512_shuffle_f32x4(tmp12909, tmp12909, 136);
__m512 tmp12924 = _mm512_shuffle_f32x4(tmp12909, tmp12909, 221);
__m512 tmp12925 = _mm512_shuffle_f32x4(tmp12910, tmp12910, 136);
__m512 tmp12926 = _mm512_shuffle_f32x4(tmp12910, tmp12910, 221);
tmp12791 = _mm512_shuffle_f32x4(tmp12911, tmp12919, 136);
tmp12799 = _mm512_shuffle_f32x4(tmp12911, tmp12919, 221);
tmp12792 = _mm512_shuffle_f32x4(tmp12913, tmp12921, 136);
tmp12800 = _mm512_shuffle_f32x4(tmp12913, tmp12921, 221);
tmp12793 = _mm512_shuffle_f32x4(tmp12915, tmp12923, 136);
tmp12801 = _mm512_shuffle_f32x4(tmp12915, tmp12923, 221);
tmp12794 = _mm512_shuffle_f32x4(tmp12917, tmp12925, 136);
tmp12802 = _mm512_shuffle_f32x4(tmp12917, tmp12925, 221);
tmp12795 = _mm512_shuffle_f32x4(tmp12912, tmp12920, 136);
__m512 tmp12843 = _mm512_shuffle_f32x4(tmp12912, tmp12920, 221);
tmp12796 = _mm512_shuffle_f32x4(tmp12914, tmp12922, 136);
__m512 tmp12844 = _mm512_shuffle_f32x4(tmp12914, tmp12922, 221);
tmp12797 = _mm512_shuffle_f32x4(tmp12916, tmp12924, 136);
__m512 tmp12845 = _mm512_shuffle_f32x4(tmp12916, tmp12924, 221);
tmp12798 = _mm512_shuffle_f32x4(tmp12918, tmp12926, 136);
__m512 tmp12846 = _mm512_shuffle_f32x4(tmp12918, tmp12926, 221);
__m512 tmp12851 = _mm512_add_ps(tmp12792, tmp12793);
__m512 tmp12871 = _mm512_add_ps(tmp12800, tmp12801);
__m512 tmp12850 = _mm512_add_ps(tmp12794, tmp12795);
__m512 tmp12870 = _mm512_add_ps(tmp12802, tmp12843);
__m512 tmp12856 = _mm512_sub_ps(tmp12794, tmp12795);
__m512 tmp12876 = _mm512_sub_ps(tmp12802, tmp12843);
__m512 tmp12855 = _mm512_sub_ps(tmp12792, tmp12793);
__m512 tmp12875 = _mm512_sub_ps(tmp12800, tmp12801);
__m512 tmp12852 = _mm512_add_ps(tmp12796, tmp12797);
__m512 tmp12872 = _mm512_add_ps(tmp12844, tmp12845);
__m512 tmp12857 = _mm512_sub_ps(tmp12796, tmp12797);
__m512 tmp12877 = _mm512_sub_ps(tmp12844, tmp12845);
__m512 tmp12854 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(2e+00f), tmp12855);
__m512 tmp12874 = _mm512_fmadd_ps(tmp12876, _mm512_set1_ps(2e+00f), tmp12875);
__m512 tmp12861 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(8e+00f), tmp12855);
__m512 tmp12881 = _mm512_fmadd_ps(tmp12876, _mm512_set1_ps(8e+00f), tmp12875);
__m512 tmp12849 = _mm512_add_ps(tmp12850, tmp12851);
__m512 tmp12869 = _mm512_add_ps(tmp12870, tmp12871);
__m512 tmp12853 = _mm512_fmadd_ps(tmp12857, _mm512_set1_ps(1.6e+01f), tmp12854);
__m512 tmp12873 = _mm512_fmadd_ps(tmp12877, _mm512_set1_ps(1.6e+01f), tmp12874);
__m512 tmp12860 = _mm512_fmadd_ps(tmp12857, _mm512_set1_ps(4e+00f), tmp12861);
__m512 tmp12880 = _mm512_fmadd_ps(tmp12877, _mm512_set1_ps(4e+00f), tmp12881);
__m512 tmp12866 = _mm512_add_ps(tmp12857, tmp12855);
__m512 tmp12886 = _mm512_add_ps(tmp12877, tmp12875);
__m512 tmp12859 = _mm512_fmadd_ps(tmp12850, _mm512_set1_ps(4e+00f), tmp12851);
__m512 tmp12879 = _mm512_fmadd_ps(tmp12870, _mm512_set1_ps(4e+00f), tmp12871);
__m512 tmp12863 = _mm512_fmadd_ps(tmp12850, _mm512_set1_ps(1.6e+01f), tmp12851);
__m512 tmp12883 = _mm512_fmadd_ps(tmp12870, _mm512_set1_ps(1.6e+01f), tmp12871);
__m512 tmp12848 = _mm512_add_ps(tmp12849, tmp12791);
__m512 tmp12868 = _mm512_add_ps(tmp12869, tmp12799);
__m512 tmp12865 = _mm512_add_ps(tmp12866, tmp12798);
__m512 tmp12885 = _mm512_add_ps(tmp12886, tmp12846);
__m512 tmp12847 = _mm512_fmadd_ps(tmp12852, _mm512_set1_ps(3.2e+01f), tmp12848);
__m512 tmp12867 = _mm512_fmadd_ps(tmp12872, _mm512_set1_ps(3.2e+01f), tmp12868);
__m512 tmp12858 = _mm512_fmadd_ps(tmp12852, _mm512_set1_ps(8e+00f), tmp12859);
__m512 tmp12878 = _mm512_fmadd_ps(tmp12872, _mm512_set1_ps(8e+00f), tmp12879);
__m512 tmp12864 = _mm512_fmadd_ps(tmp12856, _mm512_set1_ps(3.2e+01f), tmp12865);
__m512 tmp12884 = _mm512_fmadd_ps(tmp12876, _mm512_set1_ps(3.2e+01f), tmp12885);
__m512 tmp12862 = _mm512_fmadd_ps(tmp12852, _mm512_set1_ps(2e+00f), tmp12863);
__m512 tmp12882 = _mm512_fmadd_ps(tmp12872, _mm512_set1_ps(2e+00f), tmp12883);
__m512 out1671 = tmp12847;
__m512 out1677 = tmp12867;
__m512 out1672 = tmp12853;
__m512 out1678 = tmp12873;
__m512 out1673 = tmp12858;
__m512 out1679 = tmp12878;
__m512 out1674 = tmp12860;
__m512 out1680 = tmp12880;
__m512 out1675 = tmp12862;
__m512 out1681 = tmp12882;
__m512 out1676 = tmp12864;
__m512 out1682 = tmp12884;
out1671 = _mm512_max_ps(_mm512_setzero_ps(), out1671);
out1677 = _mm512_max_ps(_mm512_setzero_ps(), out1677);
out1672 = _mm512_max_ps(_mm512_setzero_ps(), out1672);
out1678 = _mm512_max_ps(_mm512_setzero_ps(), out1678);
out1673 = _mm512_max_ps(_mm512_setzero_ps(), out1673);
out1679 = _mm512_max_ps(_mm512_setzero_ps(), out1679);
out1674 = _mm512_max_ps(_mm512_setzero_ps(), out1674);
out1680 = _mm512_max_ps(_mm512_setzero_ps(), out1680);
out1675 = _mm512_max_ps(_mm512_setzero_ps(), out1675);
out1681 = _mm512_max_ps(_mm512_setzero_ps(), out1681);
out1676 = _mm512_max_ps(_mm512_setzero_ps(), out1676);
out1682 = _mm512_max_ps(_mm512_setzero_ps(), out1682);
_mm512_mask_storeu_ps(datPtr26+0+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1671);
_mm512_mask_storeu_ps(datPtr26+48+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1677);
_mm512_mask_storeu_ps(datPtr26+600+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1677);
_mm512_mask_storeu_ps(datPtr26+112+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1672);
_mm512_mask_storeu_ps(datPtr26+160+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1678);
_mm512_mask_storeu_ps(datPtr26+712+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1678);
_mm512_mask_storeu_ps(datPtr26+224+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1673);
_mm512_mask_storeu_ps(datPtr26+272+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1679);
_mm512_mask_storeu_ps(datPtr26+824+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1679);
_mm512_mask_storeu_ps(datPtr26+336+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1674);
_mm512_mask_storeu_ps(datPtr26+384+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1680);
_mm512_mask_storeu_ps(datPtr26+936+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1680);
_mm512_mask_storeu_ps(datPtr26+448+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1675);
_mm512_mask_storeu_ps(datPtr26+496+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1681);
_mm512_mask_storeu_ps(datPtr26+1048+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1681);
_mm512_mask_storeu_ps(datPtr26+560+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1676);
_mm512_mask_storeu_ps(datPtr26+608+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1682);
_mm512_mask_storeu_ps(datPtr26+1160+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1682);
__m512 sf913 = _mm512_loadu_ps(sfPtr12+256+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf914 = _mm512_loadu_ps(sfPtr12+384+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1818 = _mm512_shuffle_f32x4(sf913, sf914, 68);
__m512 in1819 = _mm512_shuffle_f32x4(sf913, sf914, 238);
__m512 sf915 = _mm512_loadu_ps(sfPtr12+320+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf916 = _mm512_loadu_ps(sfPtr12+448+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1826 = _mm512_shuffle_f32x4(sf915, sf916, 68);
__m512 in1827 = _mm512_shuffle_f32x4(sf915, sf916, 238);
__m512 sf917 = _mm512_loadu_ps(sfPtr12+13056+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf918 = _mm512_loadu_ps(sfPtr12+13184+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1820 = _mm512_shuffle_f32x4(sf917, sf918, 68);
__m512 in1821 = _mm512_shuffle_f32x4(sf917, sf918, 238);
__m512 sf919 = _mm512_loadu_ps(sfPtr12+13120+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf920 = _mm512_loadu_ps(sfPtr12+13248+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1828 = _mm512_shuffle_f32x4(sf919, sf920, 68);
__m512 in1829 = _mm512_shuffle_f32x4(sf919, sf920, 238);
__m512 sf921 = _mm512_loadu_ps(sfPtr12+25856+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf922 = _mm512_loadu_ps(sfPtr12+25984+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1822 = _mm512_shuffle_f32x4(sf921, sf922, 68);
__m512 in1823 = _mm512_shuffle_f32x4(sf921, sf922, 238);
__m512 sf923 = _mm512_loadu_ps(sfPtr12+25920+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf924 = _mm512_loadu_ps(sfPtr12+26048+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1830 = _mm512_shuffle_f32x4(sf923, sf924, 68);
__m512 in1831 = _mm512_shuffle_f32x4(sf923, sf924, 238);
__m512 sf925 = _mm512_loadu_ps(sfPtr12+38656+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf926 = _mm512_loadu_ps(sfPtr12+38784+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1824 = _mm512_shuffle_f32x4(sf925, sf926, 68);
__m512 in1825 = _mm512_shuffle_f32x4(sf925, sf926, 238);
__m512 sf927 = _mm512_loadu_ps(sfPtr12+38720+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf928 = _mm512_loadu_ps(sfPtr12+38848+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1832 = _mm512_shuffle_f32x4(sf927, sf928, 68);
__m512 in1833 = _mm512_shuffle_f32x4(sf927, sf928, 238);
__m512 tmp12943 = _mm512_add_ps(in1819, in1820);
__m512 tmp12963 = _mm512_add_ps(in1827, in1828);
__m512 tmp12942 = _mm512_add_ps(in1821, in1822);
__m512 tmp12962 = _mm512_add_ps(in1829, in1830);
__m512 tmp12948 = _mm512_sub_ps(in1821, in1822);
__m512 tmp12968 = _mm512_sub_ps(in1829, in1830);
__m512 tmp12947 = _mm512_sub_ps(in1819, in1820);
__m512 tmp12967 = _mm512_sub_ps(in1827, in1828);
__m512 tmp12944 = _mm512_add_ps(in1823, in1824);
__m512 tmp12964 = _mm512_add_ps(in1831, in1832);
__m512 tmp12949 = _mm512_sub_ps(in1823, in1824);
__m512 tmp12969 = _mm512_sub_ps(in1831, in1832);
__m512 tmp12946 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(2e+00f), tmp12947);
__m512 tmp12966 = _mm512_fmadd_ps(tmp12968, _mm512_set1_ps(2e+00f), tmp12967);
__m512 tmp12953 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(8e+00f), tmp12947);
__m512 tmp12973 = _mm512_fmadd_ps(tmp12968, _mm512_set1_ps(8e+00f), tmp12967);
__m512 tmp12941 = _mm512_add_ps(tmp12942, tmp12943);
__m512 tmp12961 = _mm512_add_ps(tmp12962, tmp12963);
__m512 tmp12945 = _mm512_fmadd_ps(tmp12949, _mm512_set1_ps(1.6e+01f), tmp12946);
__m512 tmp12965 = _mm512_fmadd_ps(tmp12969, _mm512_set1_ps(1.6e+01f), tmp12966);
__m512 tmp12952 = _mm512_fmadd_ps(tmp12949, _mm512_set1_ps(4e+00f), tmp12953);
__m512 tmp12972 = _mm512_fmadd_ps(tmp12969, _mm512_set1_ps(4e+00f), tmp12973);
__m512 tmp12958 = _mm512_add_ps(tmp12949, tmp12947);
__m512 tmp12978 = _mm512_add_ps(tmp12969, tmp12967);
__m512 tmp12951 = _mm512_fmadd_ps(tmp12942, _mm512_set1_ps(4e+00f), tmp12943);
__m512 tmp12971 = _mm512_fmadd_ps(tmp12962, _mm512_set1_ps(4e+00f), tmp12963);
__m512 tmp12955 = _mm512_fmadd_ps(tmp12942, _mm512_set1_ps(1.6e+01f), tmp12943);
__m512 tmp12975 = _mm512_fmadd_ps(tmp12962, _mm512_set1_ps(1.6e+01f), tmp12963);
__m512 tmp12940 = _mm512_add_ps(tmp12941, in1818);
__m512 tmp12960 = _mm512_add_ps(tmp12961, in1826);
__m512 tmp12957 = _mm512_add_ps(tmp12958, in1825);
__m512 tmp12977 = _mm512_add_ps(tmp12978, in1833);
__m512 tmp12939 = _mm512_fmadd_ps(tmp12944, _mm512_set1_ps(3.2e+01f), tmp12940);
__m512 tmp12959 = _mm512_fmadd_ps(tmp12964, _mm512_set1_ps(3.2e+01f), tmp12960);
__m512 tmp12950 = _mm512_fmadd_ps(tmp12944, _mm512_set1_ps(8e+00f), tmp12951);
__m512 tmp12970 = _mm512_fmadd_ps(tmp12964, _mm512_set1_ps(8e+00f), tmp12971);
__m512 tmp12956 = _mm512_fmadd_ps(tmp12948, _mm512_set1_ps(3.2e+01f), tmp12957);
__m512 tmp12976 = _mm512_fmadd_ps(tmp12968, _mm512_set1_ps(3.2e+01f), tmp12977);
__m512 tmp12954 = _mm512_fmadd_ps(tmp12944, _mm512_set1_ps(2e+00f), tmp12955);
__m512 tmp12974 = _mm512_fmadd_ps(tmp12964, _mm512_set1_ps(2e+00f), tmp12975);
__m512 tmp12927 = tmp12939;
__m512 tmp12933 = tmp12959;
__m512 tmp12928 = tmp12945;
__m512 tmp12934 = tmp12965;
__m512 tmp12929 = tmp12950;
__m512 tmp12935 = tmp12970;
__m512 tmp12930 = tmp12952;
__m512 tmp12936 = tmp12972;
__m512 tmp12931 = tmp12954;
__m512 tmp12937 = tmp12974;
__m512 tmp12932 = tmp12956;
__m512 tmp12938 = tmp12976;
__m512 tmp13023 = _mm512_unpacklo_ps(tmp12927, tmp12928);
__m512 tmp13024 = _mm512_unpackhi_ps(tmp12927, tmp12928);
__m512 tmp13025 = _mm512_unpacklo_ps(tmp12929, tmp12930);
__m512 tmp13026 = _mm512_unpackhi_ps(tmp12929, tmp12930);
__m512 tmp13027 = _mm512_unpacklo_ps(tmp12931, tmp12932);
__m512 tmp13028 = _mm512_unpackhi_ps(tmp12931, tmp12932);
__m512 tmp13029 = _mm512_unpacklo_ps(tmp12933, tmp12934);
__m512 tmp13030 = _mm512_unpackhi_ps(tmp12933, tmp12934);
__m512 tmp13031 = _mm512_unpacklo_ps(tmp12935, tmp12936);
__m512 tmp13032 = _mm512_unpackhi_ps(tmp12935, tmp12936);
__m512 tmp13033 = _mm512_unpacklo_ps(tmp12937, tmp12938);
__m512 tmp13034 = _mm512_unpackhi_ps(tmp12937, tmp12938);
__m512 tmp13035 = _mm512_shuffle_ps(tmp13023, tmp13025, 68);
__m512 tmp13036 = _mm512_shuffle_ps(tmp13023, tmp13025, 238);
__m512 tmp13037 = _mm512_shuffle_ps(tmp13024, tmp13026, 68);
__m512 tmp13038 = _mm512_shuffle_ps(tmp13024, tmp13026, 238);
__m512 tmp13039 = _mm512_shuffle_ps(tmp13027, tmp13029, 68);
__m512 tmp13040 = _mm512_shuffle_ps(tmp13027, tmp13029, 238);
__m512 tmp13041 = _mm512_shuffle_ps(tmp13028, tmp13030, 68);
__m512 tmp13042 = _mm512_shuffle_ps(tmp13028, tmp13030, 238);
__m512 tmp13043 = _mm512_shuffle_ps(tmp13031, tmp13033, 68);
__m512 tmp13044 = _mm512_shuffle_ps(tmp13031, tmp13033, 238);
__m512 tmp13045 = _mm512_shuffle_ps(tmp13032, tmp13034, 68);
__m512 tmp13046 = _mm512_shuffle_ps(tmp13032, tmp13034, 238);
__m512 tmp13047 = _mm512_shuffle_f32x4(tmp13035, tmp13039, 136);
__m512 tmp13048 = _mm512_shuffle_f32x4(tmp13035, tmp13039, 221);
__m512 tmp13049 = _mm512_shuffle_f32x4(tmp13036, tmp13040, 136);
__m512 tmp13050 = _mm512_shuffle_f32x4(tmp13036, tmp13040, 221);
__m512 tmp13051 = _mm512_shuffle_f32x4(tmp13037, tmp13041, 136);
__m512 tmp13052 = _mm512_shuffle_f32x4(tmp13037, tmp13041, 221);
__m512 tmp13053 = _mm512_shuffle_f32x4(tmp13038, tmp13042, 136);
__m512 tmp13054 = _mm512_shuffle_f32x4(tmp13038, tmp13042, 221);
__m512 tmp13055 = _mm512_shuffle_f32x4(tmp13043, tmp13043, 136);
__m512 tmp13056 = _mm512_shuffle_f32x4(tmp13043, tmp13043, 221);
__m512 tmp13057 = _mm512_shuffle_f32x4(tmp13044, tmp13044, 136);
__m512 tmp13058 = _mm512_shuffle_f32x4(tmp13044, tmp13044, 221);
__m512 tmp13059 = _mm512_shuffle_f32x4(tmp13045, tmp13045, 136);
__m512 tmp13060 = _mm512_shuffle_f32x4(tmp13045, tmp13045, 221);
__m512 tmp13061 = _mm512_shuffle_f32x4(tmp13046, tmp13046, 136);
__m512 tmp13062 = _mm512_shuffle_f32x4(tmp13046, tmp13046, 221);
tmp12927 = _mm512_shuffle_f32x4(tmp13047, tmp13055, 136);
tmp12935 = _mm512_shuffle_f32x4(tmp13047, tmp13055, 221);
tmp12928 = _mm512_shuffle_f32x4(tmp13049, tmp13057, 136);
tmp12936 = _mm512_shuffle_f32x4(tmp13049, tmp13057, 221);
tmp12929 = _mm512_shuffle_f32x4(tmp13051, tmp13059, 136);
tmp12937 = _mm512_shuffle_f32x4(tmp13051, tmp13059, 221);
tmp12930 = _mm512_shuffle_f32x4(tmp13053, tmp13061, 136);
tmp12938 = _mm512_shuffle_f32x4(tmp13053, tmp13061, 221);
tmp12931 = _mm512_shuffle_f32x4(tmp13048, tmp13056, 136);
__m512 tmp12979 = _mm512_shuffle_f32x4(tmp13048, tmp13056, 221);
tmp12932 = _mm512_shuffle_f32x4(tmp13050, tmp13058, 136);
__m512 tmp12980 = _mm512_shuffle_f32x4(tmp13050, tmp13058, 221);
tmp12933 = _mm512_shuffle_f32x4(tmp13052, tmp13060, 136);
__m512 tmp12981 = _mm512_shuffle_f32x4(tmp13052, tmp13060, 221);
tmp12934 = _mm512_shuffle_f32x4(tmp13054, tmp13062, 136);
__m512 tmp12982 = _mm512_shuffle_f32x4(tmp13054, tmp13062, 221);
__m512 tmp12987 = _mm512_add_ps(tmp12928, tmp12929);
__m512 tmp13007 = _mm512_add_ps(tmp12936, tmp12937);
__m512 tmp12986 = _mm512_add_ps(tmp12930, tmp12931);
__m512 tmp13006 = _mm512_add_ps(tmp12938, tmp12979);
__m512 tmp12992 = _mm512_sub_ps(tmp12930, tmp12931);
__m512 tmp13012 = _mm512_sub_ps(tmp12938, tmp12979);
__m512 tmp12991 = _mm512_sub_ps(tmp12928, tmp12929);
__m512 tmp13011 = _mm512_sub_ps(tmp12936, tmp12937);
__m512 tmp12988 = _mm512_add_ps(tmp12932, tmp12933);
__m512 tmp13008 = _mm512_add_ps(tmp12980, tmp12981);
__m512 tmp12993 = _mm512_sub_ps(tmp12932, tmp12933);
__m512 tmp13013 = _mm512_sub_ps(tmp12980, tmp12981);
__m512 tmp12990 = _mm512_fmadd_ps(tmp12992, _mm512_set1_ps(2e+00f), tmp12991);
__m512 tmp13010 = _mm512_fmadd_ps(tmp13012, _mm512_set1_ps(2e+00f), tmp13011);
__m512 tmp12997 = _mm512_fmadd_ps(tmp12992, _mm512_set1_ps(8e+00f), tmp12991);
__m512 tmp13017 = _mm512_fmadd_ps(tmp13012, _mm512_set1_ps(8e+00f), tmp13011);
__m512 tmp12985 = _mm512_add_ps(tmp12986, tmp12987);
__m512 tmp13005 = _mm512_add_ps(tmp13006, tmp13007);
__m512 tmp12989 = _mm512_fmadd_ps(tmp12993, _mm512_set1_ps(1.6e+01f), tmp12990);
__m512 tmp13009 = _mm512_fmadd_ps(tmp13013, _mm512_set1_ps(1.6e+01f), tmp13010);
__m512 tmp12996 = _mm512_fmadd_ps(tmp12993, _mm512_set1_ps(4e+00f), tmp12997);
__m512 tmp13016 = _mm512_fmadd_ps(tmp13013, _mm512_set1_ps(4e+00f), tmp13017);
__m512 tmp13002 = _mm512_add_ps(tmp12993, tmp12991);
__m512 tmp13022 = _mm512_add_ps(tmp13013, tmp13011);
__m512 tmp12995 = _mm512_fmadd_ps(tmp12986, _mm512_set1_ps(4e+00f), tmp12987);
__m512 tmp13015 = _mm512_fmadd_ps(tmp13006, _mm512_set1_ps(4e+00f), tmp13007);
__m512 tmp12999 = _mm512_fmadd_ps(tmp12986, _mm512_set1_ps(1.6e+01f), tmp12987);
__m512 tmp13019 = _mm512_fmadd_ps(tmp13006, _mm512_set1_ps(1.6e+01f), tmp13007);
__m512 tmp12984 = _mm512_add_ps(tmp12985, tmp12927);
__m512 tmp13004 = _mm512_add_ps(tmp13005, tmp12935);
__m512 tmp13001 = _mm512_add_ps(tmp13002, tmp12934);
__m512 tmp13021 = _mm512_add_ps(tmp13022, tmp12982);
__m512 tmp12983 = _mm512_fmadd_ps(tmp12988, _mm512_set1_ps(3.2e+01f), tmp12984);
__m512 tmp13003 = _mm512_fmadd_ps(tmp13008, _mm512_set1_ps(3.2e+01f), tmp13004);
__m512 tmp12994 = _mm512_fmadd_ps(tmp12988, _mm512_set1_ps(8e+00f), tmp12995);
__m512 tmp13014 = _mm512_fmadd_ps(tmp13008, _mm512_set1_ps(8e+00f), tmp13015);
__m512 tmp13000 = _mm512_fmadd_ps(tmp12992, _mm512_set1_ps(3.2e+01f), tmp13001);
__m512 tmp13020 = _mm512_fmadd_ps(tmp13012, _mm512_set1_ps(3.2e+01f), tmp13021);
__m512 tmp12998 = _mm512_fmadd_ps(tmp12988, _mm512_set1_ps(2e+00f), tmp12999);
__m512 tmp13018 = _mm512_fmadd_ps(tmp13008, _mm512_set1_ps(2e+00f), tmp13019);
__m512 out1683 = tmp12983;
__m512 out1689 = tmp13003;
__m512 out1684 = tmp12989;
__m512 out1690 = tmp13009;
__m512 out1685 = tmp12994;
__m512 out1691 = tmp13014;
__m512 out1686 = tmp12996;
__m512 out1692 = tmp13016;
__m512 out1687 = tmp12998;
__m512 out1693 = tmp13018;
__m512 out1688 = tmp13000;
__m512 out1694 = tmp13020;
out1683 = _mm512_max_ps(_mm512_setzero_ps(), out1683);
out1689 = _mm512_max_ps(_mm512_setzero_ps(), out1689);
out1684 = _mm512_max_ps(_mm512_setzero_ps(), out1684);
out1690 = _mm512_max_ps(_mm512_setzero_ps(), out1690);
out1685 = _mm512_max_ps(_mm512_setzero_ps(), out1685);
out1691 = _mm512_max_ps(_mm512_setzero_ps(), out1691);
out1686 = _mm512_max_ps(_mm512_setzero_ps(), out1686);
out1692 = _mm512_max_ps(_mm512_setzero_ps(), out1692);
out1687 = _mm512_max_ps(_mm512_setzero_ps(), out1687);
out1693 = _mm512_max_ps(_mm512_setzero_ps(), out1693);
out1688 = _mm512_max_ps(_mm512_setzero_ps(), out1688);
out1694 = _mm512_max_ps(_mm512_setzero_ps(), out1694);
_mm512_mask_storeu_ps(datPtr26+648+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1683);
_mm512_mask_storeu_ps(datPtr26+3136+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1689);
_mm512_mask_storeu_ps(datPtr26+760+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1684);
_mm512_mask_storeu_ps(datPtr26+3248+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1690);
_mm512_mask_storeu_ps(datPtr26+872+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1685);
_mm512_mask_storeu_ps(datPtr26+3360+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1691);
_mm512_mask_storeu_ps(datPtr26+984+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1686);
_mm512_mask_storeu_ps(datPtr26+3472+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1692);
_mm512_mask_storeu_ps(datPtr26+1096+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1687);
_mm512_mask_storeu_ps(datPtr26+3584+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1693);
_mm512_mask_storeu_ps(datPtr26+1208+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1688);
_mm512_mask_storeu_ps(datPtr26+3696+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1694);
__m512 sf929 = _mm512_loadu_ps(sfPtr12+512+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf930 = _mm512_loadu_ps(sfPtr12+576+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1834 = _mm512_shuffle_f32x4(sf930, sf929, 68);
__m512 in1835 = _mm512_shuffle_f32x4(sf930, sf929, 238);
__m512 sf931 = _mm512_loadu_ps(sfPtr12+640+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf932 = _mm512_loadu_ps(sfPtr12+704+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1842 = _mm512_shuffle_f32x4(sf931, sf932, 68);
__m512 in1843 = _mm512_shuffle_f32x4(sf931, sf932, 238);
__m512 sf933 = _mm512_loadu_ps(sfPtr12+13312+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf934 = _mm512_loadu_ps(sfPtr12+13376+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1836 = _mm512_shuffle_f32x4(sf934, sf933, 68);
__m512 in1837 = _mm512_shuffle_f32x4(sf934, sf933, 238);
__m512 sf935 = _mm512_loadu_ps(sfPtr12+13440+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf936 = _mm512_loadu_ps(sfPtr12+13504+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1844 = _mm512_shuffle_f32x4(sf935, sf936, 68);
__m512 in1845 = _mm512_shuffle_f32x4(sf935, sf936, 238);
__m512 sf937 = _mm512_loadu_ps(sfPtr12+26112+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf938 = _mm512_loadu_ps(sfPtr12+26176+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1838 = _mm512_shuffle_f32x4(sf938, sf937, 68);
__m512 in1839 = _mm512_shuffle_f32x4(sf938, sf937, 238);
__m512 sf939 = _mm512_loadu_ps(sfPtr12+26240+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf940 = _mm512_loadu_ps(sfPtr12+26304+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1846 = _mm512_shuffle_f32x4(sf939, sf940, 68);
__m512 in1847 = _mm512_shuffle_f32x4(sf939, sf940, 238);
__m512 sf941 = _mm512_loadu_ps(sfPtr12+38912+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf942 = _mm512_loadu_ps(sfPtr12+38976+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1840 = _mm512_shuffle_f32x4(sf942, sf941, 68);
__m512 in1841 = _mm512_shuffle_f32x4(sf942, sf941, 238);
__m512 sf943 = _mm512_loadu_ps(sfPtr12+39040+51200*i51+3072*j44+1536*k138+768*l57);
__m512 sf944 = _mm512_loadu_ps(sfPtr12+39104+51200*i51+3072*j44+1536*k138+768*l57);
__m512 in1848 = _mm512_shuffle_f32x4(sf943, sf944, 68);
__m512 in1849 = _mm512_shuffle_f32x4(sf943, sf944, 238);
__m512 tmp13079 = _mm512_add_ps(in1835, in1836);
__m512 tmp13099 = _mm512_add_ps(in1843, in1844);
__m512 tmp13078 = _mm512_add_ps(in1837, in1838);
__m512 tmp13098 = _mm512_add_ps(in1845, in1846);
__m512 tmp13084 = _mm512_sub_ps(in1837, in1838);
__m512 tmp13104 = _mm512_sub_ps(in1845, in1846);
__m512 tmp13083 = _mm512_sub_ps(in1835, in1836);
__m512 tmp13103 = _mm512_sub_ps(in1843, in1844);
__m512 tmp13080 = _mm512_add_ps(in1839, in1840);
__m512 tmp13100 = _mm512_add_ps(in1847, in1848);
__m512 tmp13085 = _mm512_sub_ps(in1839, in1840);
__m512 tmp13105 = _mm512_sub_ps(in1847, in1848);
__m512 tmp13082 = _mm512_fmadd_ps(tmp13084, _mm512_set1_ps(2e+00f), tmp13083);
__m512 tmp13102 = _mm512_fmadd_ps(tmp13104, _mm512_set1_ps(2e+00f), tmp13103);
__m512 tmp13089 = _mm512_fmadd_ps(tmp13084, _mm512_set1_ps(8e+00f), tmp13083);
__m512 tmp13109 = _mm512_fmadd_ps(tmp13104, _mm512_set1_ps(8e+00f), tmp13103);
__m512 tmp13077 = _mm512_add_ps(tmp13078, tmp13079);
__m512 tmp13097 = _mm512_add_ps(tmp13098, tmp13099);
__m512 tmp13081 = _mm512_fmadd_ps(tmp13085, _mm512_set1_ps(1.6e+01f), tmp13082);
__m512 tmp13101 = _mm512_fmadd_ps(tmp13105, _mm512_set1_ps(1.6e+01f), tmp13102);
__m512 tmp13088 = _mm512_fmadd_ps(tmp13085, _mm512_set1_ps(4e+00f), tmp13089);
__m512 tmp13108 = _mm512_fmadd_ps(tmp13105, _mm512_set1_ps(4e+00f), tmp13109);
__m512 tmp13094 = _mm512_add_ps(tmp13085, tmp13083);
__m512 tmp13114 = _mm512_add_ps(tmp13105, tmp13103);
__m512 tmp13087 = _mm512_fmadd_ps(tmp13078, _mm512_set1_ps(4e+00f), tmp13079);
__m512 tmp13107 = _mm512_fmadd_ps(tmp13098, _mm512_set1_ps(4e+00f), tmp13099);
__m512 tmp13091 = _mm512_fmadd_ps(tmp13078, _mm512_set1_ps(1.6e+01f), tmp13079);
__m512 tmp13111 = _mm512_fmadd_ps(tmp13098, _mm512_set1_ps(1.6e+01f), tmp13099);
__m512 tmp13076 = _mm512_add_ps(tmp13077, in1834);
__m512 tmp13096 = _mm512_add_ps(tmp13097, in1842);
__m512 tmp13093 = _mm512_add_ps(tmp13094, in1841);
__m512 tmp13113 = _mm512_add_ps(tmp13114, in1849);
__m512 tmp13075 = _mm512_fmadd_ps(tmp13080, _mm512_set1_ps(3.2e+01f), tmp13076);
__m512 tmp13095 = _mm512_fmadd_ps(tmp13100, _mm512_set1_ps(3.2e+01f), tmp13096);
__m512 tmp13086 = _mm512_fmadd_ps(tmp13080, _mm512_set1_ps(8e+00f), tmp13087);
__m512 tmp13106 = _mm512_fmadd_ps(tmp13100, _mm512_set1_ps(8e+00f), tmp13107);
__m512 tmp13092 = _mm512_fmadd_ps(tmp13084, _mm512_set1_ps(3.2e+01f), tmp13093);
__m512 tmp13112 = _mm512_fmadd_ps(tmp13104, _mm512_set1_ps(3.2e+01f), tmp13113);
__m512 tmp13090 = _mm512_fmadd_ps(tmp13080, _mm512_set1_ps(2e+00f), tmp13091);
__m512 tmp13110 = _mm512_fmadd_ps(tmp13100, _mm512_set1_ps(2e+00f), tmp13111);
__m512 tmp13063 = tmp13075;
__m512 tmp13069 = tmp13095;
__m512 tmp13064 = tmp13081;
__m512 tmp13070 = tmp13101;
__m512 tmp13065 = tmp13086;
__m512 tmp13071 = tmp13106;
__m512 tmp13066 = tmp13088;
__m512 tmp13072 = tmp13108;
__m512 tmp13067 = tmp13090;
__m512 tmp13073 = tmp13110;
__m512 tmp13068 = tmp13092;
__m512 tmp13074 = tmp13112;
__m512 tmp13159 = _mm512_unpacklo_ps(tmp13063, tmp13064);
__m512 tmp13160 = _mm512_unpackhi_ps(tmp13063, tmp13064);
__m512 tmp13161 = _mm512_unpacklo_ps(tmp13065, tmp13066);
__m512 tmp13162 = _mm512_unpackhi_ps(tmp13065, tmp13066);
__m512 tmp13163 = _mm512_unpacklo_ps(tmp13067, tmp13068);
__m512 tmp13164 = _mm512_unpackhi_ps(tmp13067, tmp13068);
__m512 tmp13165 = _mm512_unpacklo_ps(tmp13069, tmp13070);
__m512 tmp13166 = _mm512_unpackhi_ps(tmp13069, tmp13070);
__m512 tmp13167 = _mm512_unpacklo_ps(tmp13071, tmp13072);
__m512 tmp13168 = _mm512_unpackhi_ps(tmp13071, tmp13072);
__m512 tmp13169 = _mm512_unpacklo_ps(tmp13073, tmp13074);
__m512 tmp13170 = _mm512_unpackhi_ps(tmp13073, tmp13074);
__m512 tmp13171 = _mm512_shuffle_ps(tmp13159, tmp13161, 68);
__m512 tmp13172 = _mm512_shuffle_ps(tmp13159, tmp13161, 238);
__m512 tmp13173 = _mm512_shuffle_ps(tmp13160, tmp13162, 68);
__m512 tmp13174 = _mm512_shuffle_ps(tmp13160, tmp13162, 238);
__m512 tmp13175 = _mm512_shuffle_ps(tmp13163, tmp13165, 68);
__m512 tmp13176 = _mm512_shuffle_ps(tmp13163, tmp13165, 238);
__m512 tmp13177 = _mm512_shuffle_ps(tmp13164, tmp13166, 68);
__m512 tmp13178 = _mm512_shuffle_ps(tmp13164, tmp13166, 238);
__m512 tmp13179 = _mm512_shuffle_ps(tmp13167, tmp13169, 68);
__m512 tmp13180 = _mm512_shuffle_ps(tmp13167, tmp13169, 238);
__m512 tmp13181 = _mm512_shuffle_ps(tmp13168, tmp13170, 68);
__m512 tmp13182 = _mm512_shuffle_ps(tmp13168, tmp13170, 238);
__m512 tmp13183 = _mm512_shuffle_f32x4(tmp13171, tmp13175, 136);
__m512 tmp13184 = _mm512_shuffle_f32x4(tmp13171, tmp13175, 221);
__m512 tmp13185 = _mm512_shuffle_f32x4(tmp13172, tmp13176, 136);
__m512 tmp13186 = _mm512_shuffle_f32x4(tmp13172, tmp13176, 221);
__m512 tmp13187 = _mm512_shuffle_f32x4(tmp13173, tmp13177, 136);
__m512 tmp13188 = _mm512_shuffle_f32x4(tmp13173, tmp13177, 221);
__m512 tmp13189 = _mm512_shuffle_f32x4(tmp13174, tmp13178, 136);
__m512 tmp13190 = _mm512_shuffle_f32x4(tmp13174, tmp13178, 221);
__m512 tmp13191 = _mm512_shuffle_f32x4(tmp13179, tmp13179, 136);
__m512 tmp13192 = _mm512_shuffle_f32x4(tmp13179, tmp13179, 221);
__m512 tmp13193 = _mm512_shuffle_f32x4(tmp13180, tmp13180, 136);
__m512 tmp13194 = _mm512_shuffle_f32x4(tmp13180, tmp13180, 221);
__m512 tmp13195 = _mm512_shuffle_f32x4(tmp13181, tmp13181, 136);
__m512 tmp13196 = _mm512_shuffle_f32x4(tmp13181, tmp13181, 221);
__m512 tmp13197 = _mm512_shuffle_f32x4(tmp13182, tmp13182, 136);
__m512 tmp13198 = _mm512_shuffle_f32x4(tmp13182, tmp13182, 221);
tmp13063 = _mm512_shuffle_f32x4(tmp13183, tmp13191, 136);
tmp13071 = _mm512_shuffle_f32x4(tmp13183, tmp13191, 221);
tmp13064 = _mm512_shuffle_f32x4(tmp13185, tmp13193, 136);
tmp13072 = _mm512_shuffle_f32x4(tmp13185, tmp13193, 221);
tmp13065 = _mm512_shuffle_f32x4(tmp13187, tmp13195, 136);
tmp13073 = _mm512_shuffle_f32x4(tmp13187, tmp13195, 221);
tmp13066 = _mm512_shuffle_f32x4(tmp13189, tmp13197, 136);
tmp13074 = _mm512_shuffle_f32x4(tmp13189, tmp13197, 221);
tmp13067 = _mm512_shuffle_f32x4(tmp13184, tmp13192, 136);
__m512 tmp13115 = _mm512_shuffle_f32x4(tmp13184, tmp13192, 221);
tmp13068 = _mm512_shuffle_f32x4(tmp13186, tmp13194, 136);
__m512 tmp13116 = _mm512_shuffle_f32x4(tmp13186, tmp13194, 221);
tmp13069 = _mm512_shuffle_f32x4(tmp13188, tmp13196, 136);
__m512 tmp13117 = _mm512_shuffle_f32x4(tmp13188, tmp13196, 221);
tmp13070 = _mm512_shuffle_f32x4(tmp13190, tmp13198, 136);
__m512 tmp13118 = _mm512_shuffle_f32x4(tmp13190, tmp13198, 221);
__m512 tmp13123 = _mm512_add_ps(tmp13064, tmp13065);
__m512 tmp13143 = _mm512_add_ps(tmp13072, tmp13073);
__m512 tmp13122 = _mm512_add_ps(tmp13066, tmp13067);
__m512 tmp13142 = _mm512_add_ps(tmp13074, tmp13115);
__m512 tmp13128 = _mm512_sub_ps(tmp13066, tmp13067);
__m512 tmp13148 = _mm512_sub_ps(tmp13074, tmp13115);
__m512 tmp13127 = _mm512_sub_ps(tmp13064, tmp13065);
__m512 tmp13147 = _mm512_sub_ps(tmp13072, tmp13073);
__m512 tmp13124 = _mm512_add_ps(tmp13068, tmp13069);
__m512 tmp13144 = _mm512_add_ps(tmp13116, tmp13117);
__m512 tmp13129 = _mm512_sub_ps(tmp13068, tmp13069);
__m512 tmp13149 = _mm512_sub_ps(tmp13116, tmp13117);
__m512 tmp13126 = _mm512_fmadd_ps(tmp13128, _mm512_set1_ps(2e+00f), tmp13127);
__m512 tmp13146 = _mm512_fmadd_ps(tmp13148, _mm512_set1_ps(2e+00f), tmp13147);
__m512 tmp13133 = _mm512_fmadd_ps(tmp13128, _mm512_set1_ps(8e+00f), tmp13127);
__m512 tmp13153 = _mm512_fmadd_ps(tmp13148, _mm512_set1_ps(8e+00f), tmp13147);
__m512 tmp13121 = _mm512_add_ps(tmp13122, tmp13123);
__m512 tmp13141 = _mm512_add_ps(tmp13142, tmp13143);
__m512 tmp13125 = _mm512_fmadd_ps(tmp13129, _mm512_set1_ps(1.6e+01f), tmp13126);
__m512 tmp13145 = _mm512_fmadd_ps(tmp13149, _mm512_set1_ps(1.6e+01f), tmp13146);
__m512 tmp13132 = _mm512_fmadd_ps(tmp13129, _mm512_set1_ps(4e+00f), tmp13133);
__m512 tmp13152 = _mm512_fmadd_ps(tmp13149, _mm512_set1_ps(4e+00f), tmp13153);
__m512 tmp13138 = _mm512_add_ps(tmp13129, tmp13127);
__m512 tmp13158 = _mm512_add_ps(tmp13149, tmp13147);
__m512 tmp13131 = _mm512_fmadd_ps(tmp13122, _mm512_set1_ps(4e+00f), tmp13123);
__m512 tmp13151 = _mm512_fmadd_ps(tmp13142, _mm512_set1_ps(4e+00f), tmp13143);
__m512 tmp13135 = _mm512_fmadd_ps(tmp13122, _mm512_set1_ps(1.6e+01f), tmp13123);
__m512 tmp13155 = _mm512_fmadd_ps(tmp13142, _mm512_set1_ps(1.6e+01f), tmp13143);
__m512 tmp13120 = _mm512_add_ps(tmp13121, tmp13063);
__m512 tmp13140 = _mm512_add_ps(tmp13141, tmp13071);
__m512 tmp13137 = _mm512_add_ps(tmp13138, tmp13070);
__m512 tmp13157 = _mm512_add_ps(tmp13158, tmp13118);
__m512 tmp13119 = _mm512_fmadd_ps(tmp13124, _mm512_set1_ps(3.2e+01f), tmp13120);
__m512 tmp13139 = _mm512_fmadd_ps(tmp13144, _mm512_set1_ps(3.2e+01f), tmp13140);
__m512 tmp13130 = _mm512_fmadd_ps(tmp13124, _mm512_set1_ps(8e+00f), tmp13131);
__m512 tmp13150 = _mm512_fmadd_ps(tmp13144, _mm512_set1_ps(8e+00f), tmp13151);
__m512 tmp13136 = _mm512_fmadd_ps(tmp13128, _mm512_set1_ps(3.2e+01f), tmp13137);
__m512 tmp13156 = _mm512_fmadd_ps(tmp13148, _mm512_set1_ps(3.2e+01f), tmp13157);
__m512 tmp13134 = _mm512_fmadd_ps(tmp13124, _mm512_set1_ps(2e+00f), tmp13135);
__m512 tmp13154 = _mm512_fmadd_ps(tmp13144, _mm512_set1_ps(2e+00f), tmp13155);
__m512 out1701 = tmp13119;
__m512 out1695 = tmp13139;
__m512 out1702 = tmp13125;
__m512 out1696 = tmp13145;
__m512 out1703 = tmp13130;
__m512 out1697 = tmp13150;
__m512 out1704 = tmp13132;
__m512 out1698 = tmp13152;
__m512 out1705 = tmp13134;
__m512 out1699 = tmp13154;
__m512 out1706 = tmp13136;
__m512 out1700 = tmp13156;
out1701 = _mm512_max_ps(_mm512_setzero_ps(), out1701);
out1695 = _mm512_max_ps(_mm512_setzero_ps(), out1695);
out1702 = _mm512_max_ps(_mm512_setzero_ps(), out1702);
out1696 = _mm512_max_ps(_mm512_setzero_ps(), out1696);
out1703 = _mm512_max_ps(_mm512_setzero_ps(), out1703);
out1697 = _mm512_max_ps(_mm512_setzero_ps(), out1697);
out1704 = _mm512_max_ps(_mm512_setzero_ps(), out1704);
out1698 = _mm512_max_ps(_mm512_setzero_ps(), out1698);
out1705 = _mm512_max_ps(_mm512_setzero_ps(), out1705);
out1699 = _mm512_max_ps(_mm512_setzero_ps(), out1699);
out1706 = _mm512_max_ps(_mm512_setzero_ps(), out1706);
out1700 = _mm512_max_ps(_mm512_setzero_ps(), out1700);
_mm512_mask_storeu_ps(datPtr26+3760+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1701);
_mm512_mask_storeu_ps(datPtr26+3184+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1695);
_mm512_mask_storeu_ps(datPtr26+3784+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1695);
_mm512_mask_storeu_ps(datPtr26+3872+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1702);
_mm512_mask_storeu_ps(datPtr26+3296+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1696);
_mm512_mask_storeu_ps(datPtr26+3896+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1696);
_mm512_mask_storeu_ps(datPtr26+3984+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1703);
_mm512_mask_storeu_ps(datPtr26+3408+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1697);
_mm512_mask_storeu_ps(datPtr26+4008+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1697);
_mm512_mask_storeu_ps(datPtr26+4096+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1704);
_mm512_mask_storeu_ps(datPtr26+3520+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1698);
_mm512_mask_storeu_ps(datPtr26+4120+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1698);
_mm512_mask_storeu_ps(datPtr26+4208+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1705);
_mm512_mask_storeu_ps(datPtr26+3632+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1699);
_mm512_mask_storeu_ps(datPtr26+4232+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1699);
_mm512_mask_storeu_ps(datPtr26+4320+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4095, out1706);
_mm512_mask_storeu_ps(datPtr26+3744+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 15, out1700);
_mm512_mask_storeu_ps(datPtr26+4344+25088*i51+112*toH43+4*toW43+12544*k138+6272*l57, 4032, out1700);
}
}
++j44;
rel22 = 3;
}
if (rel22 < 4) {
ptrdiff_t toH44 = base22+18;
ptrdiff_t toW44 = 18;
ptrdiff_t k139 = 2*w62;
for (; k139 != 2; ++k139) {
ptrdiff_t l58 = 0;
for (; l58 != 2; ++l58) {
__m512 sf945 = _mm512_loadu_ps(sfPtr12+0+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf946 = _mm512_loadu_ps(sfPtr12+128+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1850 = _mm512_shuffle_f32x4(sf945, sf946, 68);
__m512 in1851 = _mm512_shuffle_f32x4(sf945, sf946, 238);
__m512 sf947 = _mm512_loadu_ps(sfPtr12+64+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf948 = _mm512_loadu_ps(sfPtr12+192+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1858 = _mm512_shuffle_f32x4(sf947, sf948, 68);
__m512 in1859 = _mm512_shuffle_f32x4(sf947, sf948, 238);
__m512 sf949 = _mm512_loadu_ps(sfPtr12+12800+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf950 = _mm512_loadu_ps(sfPtr12+12928+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1852 = _mm512_shuffle_f32x4(sf949, sf950, 68);
__m512 in1853 = _mm512_shuffle_f32x4(sf949, sf950, 238);
__m512 sf951 = _mm512_loadu_ps(sfPtr12+12864+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf952 = _mm512_loadu_ps(sfPtr12+12992+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1860 = _mm512_shuffle_f32x4(sf951, sf952, 68);
__m512 in1861 = _mm512_shuffle_f32x4(sf951, sf952, 238);
__m512 sf953 = _mm512_loadu_ps(sfPtr12+25600+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf954 = _mm512_loadu_ps(sfPtr12+25728+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1854 = _mm512_shuffle_f32x4(sf953, sf954, 68);
__m512 in1855 = _mm512_shuffle_f32x4(sf953, sf954, 238);
__m512 sf955 = _mm512_loadu_ps(sfPtr12+25664+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf956 = _mm512_loadu_ps(sfPtr12+25792+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1862 = _mm512_shuffle_f32x4(sf955, sf956, 68);
__m512 in1863 = _mm512_shuffle_f32x4(sf955, sf956, 238);
__m512 sf957 = _mm512_loadu_ps(sfPtr12+38400+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf958 = _mm512_loadu_ps(sfPtr12+38528+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1856 = _mm512_shuffle_f32x4(sf957, sf958, 68);
__m512 in1857 = _mm512_shuffle_f32x4(sf957, sf958, 238);
__m512 sf959 = _mm512_loadu_ps(sfPtr12+38464+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf960 = _mm512_loadu_ps(sfPtr12+38592+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1864 = _mm512_shuffle_f32x4(sf959, sf960, 68);
__m512 in1865 = _mm512_shuffle_f32x4(sf959, sf960, 238);
__m512 tmp13215 = _mm512_add_ps(in1851, in1852);
__m512 tmp13235 = _mm512_add_ps(in1859, in1860);
__m512 tmp13214 = _mm512_add_ps(in1853, in1854);
__m512 tmp13234 = _mm512_add_ps(in1861, in1862);
__m512 tmp13220 = _mm512_sub_ps(in1853, in1854);
__m512 tmp13240 = _mm512_sub_ps(in1861, in1862);
__m512 tmp13219 = _mm512_sub_ps(in1851, in1852);
__m512 tmp13239 = _mm512_sub_ps(in1859, in1860);
__m512 tmp13216 = _mm512_add_ps(in1855, in1856);
__m512 tmp13236 = _mm512_add_ps(in1863, in1864);
__m512 tmp13221 = _mm512_sub_ps(in1855, in1856);
__m512 tmp13241 = _mm512_sub_ps(in1863, in1864);
__m512 tmp13218 = _mm512_fmadd_ps(tmp13220, _mm512_set1_ps(2e+00f), tmp13219);
__m512 tmp13238 = _mm512_fmadd_ps(tmp13240, _mm512_set1_ps(2e+00f), tmp13239);
__m512 tmp13225 = _mm512_fmadd_ps(tmp13220, _mm512_set1_ps(8e+00f), tmp13219);
__m512 tmp13245 = _mm512_fmadd_ps(tmp13240, _mm512_set1_ps(8e+00f), tmp13239);
__m512 tmp13213 = _mm512_add_ps(tmp13214, tmp13215);
__m512 tmp13233 = _mm512_add_ps(tmp13234, tmp13235);
__m512 tmp13217 = _mm512_fmadd_ps(tmp13221, _mm512_set1_ps(1.6e+01f), tmp13218);
__m512 tmp13237 = _mm512_fmadd_ps(tmp13241, _mm512_set1_ps(1.6e+01f), tmp13238);
__m512 tmp13224 = _mm512_fmadd_ps(tmp13221, _mm512_set1_ps(4e+00f), tmp13225);
__m512 tmp13244 = _mm512_fmadd_ps(tmp13241, _mm512_set1_ps(4e+00f), tmp13245);
__m512 tmp13230 = _mm512_add_ps(tmp13221, tmp13219);
__m512 tmp13250 = _mm512_add_ps(tmp13241, tmp13239);
__m512 tmp13223 = _mm512_fmadd_ps(tmp13214, _mm512_set1_ps(4e+00f), tmp13215);
__m512 tmp13243 = _mm512_fmadd_ps(tmp13234, _mm512_set1_ps(4e+00f), tmp13235);
__m512 tmp13227 = _mm512_fmadd_ps(tmp13214, _mm512_set1_ps(1.6e+01f), tmp13215);
__m512 tmp13247 = _mm512_fmadd_ps(tmp13234, _mm512_set1_ps(1.6e+01f), tmp13235);
__m512 tmp13212 = _mm512_add_ps(tmp13213, in1850);
__m512 tmp13232 = _mm512_add_ps(tmp13233, in1858);
__m512 tmp13229 = _mm512_add_ps(tmp13230, in1857);
__m512 tmp13249 = _mm512_add_ps(tmp13250, in1865);
__m512 tmp13211 = _mm512_fmadd_ps(tmp13216, _mm512_set1_ps(3.2e+01f), tmp13212);
__m512 tmp13231 = _mm512_fmadd_ps(tmp13236, _mm512_set1_ps(3.2e+01f), tmp13232);
__m512 tmp13222 = _mm512_fmadd_ps(tmp13216, _mm512_set1_ps(8e+00f), tmp13223);
__m512 tmp13242 = _mm512_fmadd_ps(tmp13236, _mm512_set1_ps(8e+00f), tmp13243);
__m512 tmp13228 = _mm512_fmadd_ps(tmp13220, _mm512_set1_ps(3.2e+01f), tmp13229);
__m512 tmp13248 = _mm512_fmadd_ps(tmp13240, _mm512_set1_ps(3.2e+01f), tmp13249);
__m512 tmp13226 = _mm512_fmadd_ps(tmp13216, _mm512_set1_ps(2e+00f), tmp13227);
__m512 tmp13246 = _mm512_fmadd_ps(tmp13236, _mm512_set1_ps(2e+00f), tmp13247);
__m512 tmp13199 = tmp13211;
__m512 tmp13205 = tmp13231;
__m512 tmp13200 = tmp13217;
__m512 tmp13206 = tmp13237;
__m512 tmp13201 = tmp13222;
__m512 tmp13207 = tmp13242;
__m512 tmp13202 = tmp13224;
__m512 tmp13208 = tmp13244;
__m512 tmp13203 = tmp13226;
__m512 tmp13209 = tmp13246;
__m512 tmp13204 = tmp13228;
__m512 tmp13210 = tmp13248;
__m512 tmp13290 = _mm512_unpacklo_ps(tmp13199, tmp13200);
__m512 tmp13291 = _mm512_unpackhi_ps(tmp13199, tmp13200);
__m512 tmp13292 = _mm512_unpacklo_ps(tmp13201, tmp13202);
__m512 tmp13293 = _mm512_unpackhi_ps(tmp13201, tmp13202);
__m512 tmp13294 = _mm512_unpacklo_ps(tmp13203, tmp13204);
__m512 tmp13295 = _mm512_unpackhi_ps(tmp13203, tmp13204);
__m512 tmp13296 = _mm512_unpacklo_ps(tmp13205, tmp13206);
__m512 tmp13297 = _mm512_unpackhi_ps(tmp13205, tmp13206);
__m512 tmp13298 = _mm512_unpacklo_ps(tmp13207, tmp13208);
__m512 tmp13299 = _mm512_unpackhi_ps(tmp13207, tmp13208);
__m512 tmp13300 = _mm512_unpacklo_ps(tmp13209, tmp13210);
__m512 tmp13301 = _mm512_unpackhi_ps(tmp13209, tmp13210);
__m512 tmp13302 = _mm512_shuffle_ps(tmp13290, tmp13292, 68);
__m512 tmp13303 = _mm512_shuffle_ps(tmp13290, tmp13292, 238);
__m512 tmp13304 = _mm512_shuffle_ps(tmp13291, tmp13293, 68);
__m512 tmp13305 = _mm512_shuffle_ps(tmp13291, tmp13293, 238);
__m512 tmp13306 = _mm512_shuffle_ps(tmp13294, tmp13296, 68);
__m512 tmp13307 = _mm512_shuffle_ps(tmp13294, tmp13296, 238);
__m512 tmp13308 = _mm512_shuffle_ps(tmp13295, tmp13297, 68);
__m512 tmp13309 = _mm512_shuffle_ps(tmp13295, tmp13297, 238);
__m512 tmp13310 = _mm512_shuffle_ps(tmp13298, tmp13300, 68);
__m512 tmp13311 = _mm512_shuffle_ps(tmp13298, tmp13300, 238);
__m512 tmp13312 = _mm512_shuffle_ps(tmp13299, tmp13301, 68);
__m512 tmp13313 = _mm512_shuffle_ps(tmp13299, tmp13301, 238);
__m512 tmp13314 = _mm512_shuffle_f32x4(tmp13302, tmp13306, 136);
__m512 tmp13315 = _mm512_shuffle_f32x4(tmp13302, tmp13306, 221);
__m512 tmp13316 = _mm512_shuffle_f32x4(tmp13303, tmp13307, 136);
__m512 tmp13317 = _mm512_shuffle_f32x4(tmp13303, tmp13307, 221);
__m512 tmp13318 = _mm512_shuffle_f32x4(tmp13304, tmp13308, 136);
__m512 tmp13319 = _mm512_shuffle_f32x4(tmp13304, tmp13308, 221);
__m512 tmp13320 = _mm512_shuffle_f32x4(tmp13305, tmp13309, 136);
__m512 tmp13321 = _mm512_shuffle_f32x4(tmp13305, tmp13309, 221);
__m512 tmp13322 = _mm512_shuffle_f32x4(tmp13310, tmp13310, 136);
__m512 tmp13323 = _mm512_shuffle_f32x4(tmp13310, tmp13310, 221);
__m512 tmp13324 = _mm512_shuffle_f32x4(tmp13311, tmp13311, 136);
__m512 tmp13325 = _mm512_shuffle_f32x4(tmp13311, tmp13311, 221);
__m512 tmp13326 = _mm512_shuffle_f32x4(tmp13312, tmp13312, 136);
__m512 tmp13327 = _mm512_shuffle_f32x4(tmp13312, tmp13312, 221);
__m512 tmp13328 = _mm512_shuffle_f32x4(tmp13313, tmp13313, 136);
__m512 tmp13329 = _mm512_shuffle_f32x4(tmp13313, tmp13313, 221);
tmp13199 = _mm512_shuffle_f32x4(tmp13314, tmp13322, 136);
tmp13207 = _mm512_shuffle_f32x4(tmp13314, tmp13322, 221);
tmp13200 = _mm512_shuffle_f32x4(tmp13316, tmp13324, 136);
tmp13208 = _mm512_shuffle_f32x4(tmp13316, tmp13324, 221);
tmp13201 = _mm512_shuffle_f32x4(tmp13318, tmp13326, 136);
tmp13209 = _mm512_shuffle_f32x4(tmp13318, tmp13326, 221);
tmp13202 = _mm512_shuffle_f32x4(tmp13320, tmp13328, 136);
tmp13210 = _mm512_shuffle_f32x4(tmp13320, tmp13328, 221);
tmp13203 = _mm512_shuffle_f32x4(tmp13315, tmp13323, 136);
__m512 tmp13251 = _mm512_shuffle_f32x4(tmp13315, tmp13323, 221);
tmp13204 = _mm512_shuffle_f32x4(tmp13317, tmp13325, 136);
__m512 tmp13252 = _mm512_shuffle_f32x4(tmp13317, tmp13325, 221);
tmp13205 = _mm512_shuffle_f32x4(tmp13319, tmp13327, 136);
__m512 tmp13253 = _mm512_shuffle_f32x4(tmp13319, tmp13327, 221);
tmp13206 = _mm512_shuffle_f32x4(tmp13321, tmp13329, 136);
__m512 tmp13254 = _mm512_shuffle_f32x4(tmp13321, tmp13329, 221);
(void)tmp13254;
__m512 tmp13259 = _mm512_add_ps(tmp13200, tmp13201);
__m512 tmp13279 = _mm512_add_ps(tmp13208, tmp13209);
__m512 tmp13258 = _mm512_add_ps(tmp13202, tmp13203);
__m512 tmp13278 = _mm512_add_ps(tmp13210, tmp13251);
__m512 tmp13264 = _mm512_sub_ps(tmp13202, tmp13203);
__m512 tmp13284 = _mm512_sub_ps(tmp13210, tmp13251);
__m512 tmp13263 = _mm512_sub_ps(tmp13200, tmp13201);
__m512 tmp13283 = _mm512_sub_ps(tmp13208, tmp13209);
__m512 tmp13260 = _mm512_add_ps(tmp13204, tmp13205);
__m512 tmp13280 = _mm512_add_ps(tmp13252, tmp13253);
__m512 tmp13265 = _mm512_sub_ps(tmp13204, tmp13205);
__m512 tmp13285 = _mm512_sub_ps(tmp13252, tmp13253);
__m512 tmp13262 = _mm512_fmadd_ps(tmp13264, _mm512_set1_ps(2e+00f), tmp13263);
__m512 tmp13282 = _mm512_fmadd_ps(tmp13284, _mm512_set1_ps(2e+00f), tmp13283);
__m512 tmp13269 = _mm512_fmadd_ps(tmp13264, _mm512_set1_ps(8e+00f), tmp13263);
__m512 tmp13289 = _mm512_fmadd_ps(tmp13284, _mm512_set1_ps(8e+00f), tmp13283);
__m512 tmp13257 = _mm512_add_ps(tmp13258, tmp13259);
__m512 tmp13277 = _mm512_add_ps(tmp13278, tmp13279);
__m512 tmp13261 = _mm512_fmadd_ps(tmp13265, _mm512_set1_ps(1.6e+01f), tmp13262);
__m512 tmp13281 = _mm512_fmadd_ps(tmp13285, _mm512_set1_ps(1.6e+01f), tmp13282);
__m512 tmp13268 = _mm512_fmadd_ps(tmp13265, _mm512_set1_ps(4e+00f), tmp13269);
__m512 tmp13288 = _mm512_fmadd_ps(tmp13285, _mm512_set1_ps(4e+00f), tmp13289);
__m512 tmp13274 = _mm512_add_ps(tmp13265, tmp13263);
__m512 tmp13267 = _mm512_fmadd_ps(tmp13258, _mm512_set1_ps(4e+00f), tmp13259);
__m512 tmp13287 = _mm512_fmadd_ps(tmp13278, _mm512_set1_ps(4e+00f), tmp13279);
__m512 tmp13271 = _mm512_fmadd_ps(tmp13258, _mm512_set1_ps(1.6e+01f), tmp13259);
__m512 tmp13256 = _mm512_add_ps(tmp13257, tmp13199);
__m512 tmp13276 = _mm512_add_ps(tmp13277, tmp13207);
__m512 tmp13273 = _mm512_add_ps(tmp13274, tmp13206);
__m512 tmp13255 = _mm512_fmadd_ps(tmp13260, _mm512_set1_ps(3.2e+01f), tmp13256);
__m512 tmp13275 = _mm512_fmadd_ps(tmp13280, _mm512_set1_ps(3.2e+01f), tmp13276);
__m512 tmp13266 = _mm512_fmadd_ps(tmp13260, _mm512_set1_ps(8e+00f), tmp13267);
__m512 tmp13286 = _mm512_fmadd_ps(tmp13280, _mm512_set1_ps(8e+00f), tmp13287);
__m512 tmp13272 = _mm512_fmadd_ps(tmp13264, _mm512_set1_ps(3.2e+01f), tmp13273);
__m512 tmp13270 = _mm512_fmadd_ps(tmp13260, _mm512_set1_ps(2e+00f), tmp13271);
__m512 out1707 = tmp13255;
__m512 out1713 = tmp13275;
__m512 out1708 = tmp13261;
__m512 out1714 = tmp13281;
__m512 out1709 = tmp13266;
__m512 out1715 = tmp13286;
__m512 out1710 = tmp13268;
__m512 out1716 = tmp13288;
__m512 out1711 = tmp13270;
__m512 out1712 = tmp13272;
out1707 = _mm512_max_ps(_mm512_setzero_ps(), out1707);
out1713 = _mm512_max_ps(_mm512_setzero_ps(), out1713);
out1708 = _mm512_max_ps(_mm512_setzero_ps(), out1708);
out1714 = _mm512_max_ps(_mm512_setzero_ps(), out1714);
out1709 = _mm512_max_ps(_mm512_setzero_ps(), out1709);
out1715 = _mm512_max_ps(_mm512_setzero_ps(), out1715);
out1710 = _mm512_max_ps(_mm512_setzero_ps(), out1710);
out1716 = _mm512_max_ps(_mm512_setzero_ps(), out1716);
out1711 = _mm512_max_ps(_mm512_setzero_ps(), out1711);
out1712 = _mm512_max_ps(_mm512_setzero_ps(), out1712);
_mm512_mask_storeu_ps(datPtr26+0+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1707);
_mm512_mask_storeu_ps(datPtr26+600+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1713);
_mm512_mask_storeu_ps(datPtr26+112+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1708);
_mm512_mask_storeu_ps(datPtr26+712+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1714);
_mm512_mask_storeu_ps(datPtr26+224+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1709);
_mm512_mask_storeu_ps(datPtr26+824+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1715);
_mm512_mask_storeu_ps(datPtr26+336+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1710);
_mm512_mask_storeu_ps(datPtr26+936+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1716);
_mm512_mask_storeu_ps(datPtr26+448+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1711);
_mm512_mask_storeu_ps(datPtr26+560+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1712);
// Tile unit: read 16 vectors from sfPtr12 (base offsets 256..38848), run two
// passes of weighted sums/differences with a permutation in between, take the
// max with zero, and write 10 vectors to datPtr26 with masked stores.
// NOTE(review): the pointer types and loop variables (i51, j44, k139, l58,
// toH44, toW44) are declared outside this view; offsets are presumably in
// bytes -- confirm against the declarations.
//
// Loads come in pairs 128 offset units apart. Immediate 68 joins the low
// 256 bits of both loads, immediate 238 joins the high 256 bits. The four
// groups of loads are 12800 offset units apart.
__m512 sf961 = _mm512_loadu_ps(sfPtr12+256+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf962 = _mm512_loadu_ps(sfPtr12+384+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1866 = _mm512_shuffle_f32x4(sf961, sf962, 68);
__m512 in1867 = _mm512_shuffle_f32x4(sf961, sf962, 238);
__m512 sf963 = _mm512_loadu_ps(sfPtr12+320+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf964 = _mm512_loadu_ps(sfPtr12+448+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1874 = _mm512_shuffle_f32x4(sf963, sf964, 68);
__m512 in1875 = _mm512_shuffle_f32x4(sf963, sf964, 238);
__m512 sf965 = _mm512_loadu_ps(sfPtr12+13056+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf966 = _mm512_loadu_ps(sfPtr12+13184+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1868 = _mm512_shuffle_f32x4(sf965, sf966, 68);
__m512 in1869 = _mm512_shuffle_f32x4(sf965, sf966, 238);
__m512 sf967 = _mm512_loadu_ps(sfPtr12+13120+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf968 = _mm512_loadu_ps(sfPtr12+13248+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1876 = _mm512_shuffle_f32x4(sf967, sf968, 68);
__m512 in1877 = _mm512_shuffle_f32x4(sf967, sf968, 238);
__m512 sf969 = _mm512_loadu_ps(sfPtr12+25856+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf970 = _mm512_loadu_ps(sfPtr12+25984+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1870 = _mm512_shuffle_f32x4(sf969, sf970, 68);
__m512 in1871 = _mm512_shuffle_f32x4(sf969, sf970, 238);
__m512 sf971 = _mm512_loadu_ps(sfPtr12+25920+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf972 = _mm512_loadu_ps(sfPtr12+26048+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1878 = _mm512_shuffle_f32x4(sf971, sf972, 68);
__m512 in1879 = _mm512_shuffle_f32x4(sf971, sf972, 238);
__m512 sf973 = _mm512_loadu_ps(sfPtr12+38656+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf974 = _mm512_loadu_ps(sfPtr12+38784+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1872 = _mm512_shuffle_f32x4(sf973, sf974, 68);
__m512 in1873 = _mm512_shuffle_f32x4(sf973, sf974, 238);
__m512 sf975 = _mm512_loadu_ps(sfPtr12+38720+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf976 = _mm512_loadu_ps(sfPtr12+38848+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1880 = _mm512_shuffle_f32x4(sf975, sf976, 68);
__m512 in1881 = _mm512_shuffle_f32x4(sf975, sf976, 238);
// First pass, 8 vectors in -> 6 vectors out, done for two interleaved sets
// (in1866..in1873 and in1874..in1881). With a..f = in1867..in1872:
//   r0 = in1866 + (a+b) +  (c+d)  + 32(e+f)
//   r1 =          (a-b) +  2(c-d) + 16(e-f)
//   r2 =          (a+b) +  4(c+d) +  8(e+f)
//   r3 =          (a-b) +  8(c-d) +  4(e-f)
//   r4 =          (a+b) + 16(c+d) +  2(e+f)
//   r5 = in1873 + (a-b) + 32(c-d) +   (e-f)
// NOTE(review): this has the shape of a Winograd-style output transform --
// confirm against the generator.
__m512 tmp13346 = _mm512_add_ps(in1867, in1868);
__m512 tmp13366 = _mm512_add_ps(in1875, in1876);
__m512 tmp13345 = _mm512_add_ps(in1869, in1870);
__m512 tmp13365 = _mm512_add_ps(in1877, in1878);
__m512 tmp13351 = _mm512_sub_ps(in1869, in1870);
__m512 tmp13371 = _mm512_sub_ps(in1877, in1878);
__m512 tmp13350 = _mm512_sub_ps(in1867, in1868);
__m512 tmp13370 = _mm512_sub_ps(in1875, in1876);
__m512 tmp13347 = _mm512_add_ps(in1871, in1872);
__m512 tmp13367 = _mm512_add_ps(in1879, in1880);
__m512 tmp13352 = _mm512_sub_ps(in1871, in1872);
__m512 tmp13372 = _mm512_sub_ps(in1879, in1880);
__m512 tmp13349 = _mm512_fmadd_ps(tmp13351, _mm512_set1_ps(2e+00f), tmp13350);
__m512 tmp13369 = _mm512_fmadd_ps(tmp13371, _mm512_set1_ps(2e+00f), tmp13370);
__m512 tmp13356 = _mm512_fmadd_ps(tmp13351, _mm512_set1_ps(8e+00f), tmp13350);
__m512 tmp13376 = _mm512_fmadd_ps(tmp13371, _mm512_set1_ps(8e+00f), tmp13370);
__m512 tmp13344 = _mm512_add_ps(tmp13345, tmp13346);
__m512 tmp13364 = _mm512_add_ps(tmp13365, tmp13366);
__m512 tmp13348 = _mm512_fmadd_ps(tmp13352, _mm512_set1_ps(1.6e+01f), tmp13349);
__m512 tmp13368 = _mm512_fmadd_ps(tmp13372, _mm512_set1_ps(1.6e+01f), tmp13369);
__m512 tmp13355 = _mm512_fmadd_ps(tmp13352, _mm512_set1_ps(4e+00f), tmp13356);
__m512 tmp13375 = _mm512_fmadd_ps(tmp13372, _mm512_set1_ps(4e+00f), tmp13376);
__m512 tmp13361 = _mm512_add_ps(tmp13352, tmp13350);
__m512 tmp13381 = _mm512_add_ps(tmp13372, tmp13370);
__m512 tmp13354 = _mm512_fmadd_ps(tmp13345, _mm512_set1_ps(4e+00f), tmp13346);
__m512 tmp13374 = _mm512_fmadd_ps(tmp13365, _mm512_set1_ps(4e+00f), tmp13366);
__m512 tmp13358 = _mm512_fmadd_ps(tmp13345, _mm512_set1_ps(1.6e+01f), tmp13346);
__m512 tmp13378 = _mm512_fmadd_ps(tmp13365, _mm512_set1_ps(1.6e+01f), tmp13366);
__m512 tmp13343 = _mm512_add_ps(tmp13344, in1866);
__m512 tmp13363 = _mm512_add_ps(tmp13364, in1874);
__m512 tmp13360 = _mm512_add_ps(tmp13361, in1873);
__m512 tmp13380 = _mm512_add_ps(tmp13381, in1881);
__m512 tmp13342 = _mm512_fmadd_ps(tmp13347, _mm512_set1_ps(3.2e+01f), tmp13343);
__m512 tmp13362 = _mm512_fmadd_ps(tmp13367, _mm512_set1_ps(3.2e+01f), tmp13363);
__m512 tmp13353 = _mm512_fmadd_ps(tmp13347, _mm512_set1_ps(8e+00f), tmp13354);
__m512 tmp13373 = _mm512_fmadd_ps(tmp13367, _mm512_set1_ps(8e+00f), tmp13374);
__m512 tmp13359 = _mm512_fmadd_ps(tmp13351, _mm512_set1_ps(3.2e+01f), tmp13360);
__m512 tmp13379 = _mm512_fmadd_ps(tmp13371, _mm512_set1_ps(3.2e+01f), tmp13380);
__m512 tmp13357 = _mm512_fmadd_ps(tmp13347, _mm512_set1_ps(2e+00f), tmp13358);
__m512 tmp13377 = _mm512_fmadd_ps(tmp13367, _mm512_set1_ps(2e+00f), tmp13378);
// Gather the twelve first-pass results (r0..r5 of each set) under
// consecutive names; these variables are overwritten by the permutation below.
__m512 tmp13330 = tmp13342;
__m512 tmp13336 = tmp13362;
__m512 tmp13331 = tmp13348;
__m512 tmp13337 = tmp13368;
__m512 tmp13332 = tmp13353;
__m512 tmp13338 = tmp13373;
__m512 tmp13333 = tmp13355;
__m512 tmp13339 = tmp13375;
__m512 tmp13334 = tmp13357;
__m512 tmp13340 = tmp13377;
__m512 tmp13335 = tmp13359;
__m512 tmp13341 = tmp13379;
// Permute the twelve vectors into sixteen: 32-bit unpack, then 64-bit
// shuffle_ps, then two rounds of 128-bit lane shuffles (136 picks lanes 0,2
// of each operand, 221 picks lanes 1,3).
// NOTE(review): this appears to be a register transpose so the second pass
// can combine along the other axis -- confirm.
__m512 tmp13421 = _mm512_unpacklo_ps(tmp13330, tmp13331);
__m512 tmp13422 = _mm512_unpackhi_ps(tmp13330, tmp13331);
__m512 tmp13423 = _mm512_unpacklo_ps(tmp13332, tmp13333);
__m512 tmp13424 = _mm512_unpackhi_ps(tmp13332, tmp13333);
__m512 tmp13425 = _mm512_unpacklo_ps(tmp13334, tmp13335);
__m512 tmp13426 = _mm512_unpackhi_ps(tmp13334, tmp13335);
__m512 tmp13427 = _mm512_unpacklo_ps(tmp13336, tmp13337);
__m512 tmp13428 = _mm512_unpackhi_ps(tmp13336, tmp13337);
__m512 tmp13429 = _mm512_unpacklo_ps(tmp13338, tmp13339);
__m512 tmp13430 = _mm512_unpackhi_ps(tmp13338, tmp13339);
__m512 tmp13431 = _mm512_unpacklo_ps(tmp13340, tmp13341);
__m512 tmp13432 = _mm512_unpackhi_ps(tmp13340, tmp13341);
__m512 tmp13433 = _mm512_shuffle_ps(tmp13421, tmp13423, 68);
__m512 tmp13434 = _mm512_shuffle_ps(tmp13421, tmp13423, 238);
__m512 tmp13435 = _mm512_shuffle_ps(tmp13422, tmp13424, 68);
__m512 tmp13436 = _mm512_shuffle_ps(tmp13422, tmp13424, 238);
__m512 tmp13437 = _mm512_shuffle_ps(tmp13425, tmp13427, 68);
__m512 tmp13438 = _mm512_shuffle_ps(tmp13425, tmp13427, 238);
__m512 tmp13439 = _mm512_shuffle_ps(tmp13426, tmp13428, 68);
__m512 tmp13440 = _mm512_shuffle_ps(tmp13426, tmp13428, 238);
__m512 tmp13441 = _mm512_shuffle_ps(tmp13429, tmp13431, 68);
__m512 tmp13442 = _mm512_shuffle_ps(tmp13429, tmp13431, 238);
__m512 tmp13443 = _mm512_shuffle_ps(tmp13430, tmp13432, 68);
__m512 tmp13444 = _mm512_shuffle_ps(tmp13430, tmp13432, 238);
__m512 tmp13445 = _mm512_shuffle_f32x4(tmp13433, tmp13437, 136);
__m512 tmp13446 = _mm512_shuffle_f32x4(tmp13433, tmp13437, 221);
__m512 tmp13447 = _mm512_shuffle_f32x4(tmp13434, tmp13438, 136);
__m512 tmp13448 = _mm512_shuffle_f32x4(tmp13434, tmp13438, 221);
__m512 tmp13449 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 136);
__m512 tmp13450 = _mm512_shuffle_f32x4(tmp13435, tmp13439, 221);
__m512 tmp13451 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 136);
__m512 tmp13452 = _mm512_shuffle_f32x4(tmp13436, tmp13440, 221);
__m512 tmp13453 = _mm512_shuffle_f32x4(tmp13441, tmp13441, 136);
__m512 tmp13454 = _mm512_shuffle_f32x4(tmp13441, tmp13441, 221);
__m512 tmp13455 = _mm512_shuffle_f32x4(tmp13442, tmp13442, 136);
__m512 tmp13456 = _mm512_shuffle_f32x4(tmp13442, tmp13442, 221);
__m512 tmp13457 = _mm512_shuffle_f32x4(tmp13443, tmp13443, 136);
__m512 tmp13458 = _mm512_shuffle_f32x4(tmp13443, tmp13443, 221);
__m512 tmp13459 = _mm512_shuffle_f32x4(tmp13444, tmp13444, 136);
__m512 tmp13460 = _mm512_shuffle_f32x4(tmp13444, tmp13444, 221);
tmp13330 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 136);
tmp13338 = _mm512_shuffle_f32x4(tmp13445, tmp13453, 221);
tmp13331 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 136);
tmp13339 = _mm512_shuffle_f32x4(tmp13447, tmp13455, 221);
tmp13332 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 136);
tmp13340 = _mm512_shuffle_f32x4(tmp13449, tmp13457, 221);
tmp13333 = _mm512_shuffle_f32x4(tmp13451, tmp13459, 136);
tmp13341 = _mm512_shuffle_f32x4(tmp13451, tmp13459, 221);
tmp13334 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 136);
__m512 tmp13382 = _mm512_shuffle_f32x4(tmp13446, tmp13454, 221);
tmp13335 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 136);
__m512 tmp13383 = _mm512_shuffle_f32x4(tmp13448, tmp13456, 221);
tmp13336 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 136);
__m512 tmp13384 = _mm512_shuffle_f32x4(tmp13450, tmp13458, 221);
tmp13337 = _mm512_shuffle_f32x4(tmp13452, tmp13460, 136);
__m512 tmp13385 = _mm512_shuffle_f32x4(tmp13452, tmp13460, 221);
// tmp13337 is never read again: the first set below only produces r0..r3,
// and the trailing term is needed only for r5. The cast marks it as
// deliberately unused.
(void)tmp13337;
// Second pass: same coefficient pattern as the first pass.
//   set 1: tmp13330 leads, tmp13331..tmp13336 are a..f; only r0..r3 computed.
//   set 2: tmp13338 leads, tmp13339, tmp13340, tmp13341, tmp13382, tmp13383,
//          tmp13384 are a..f, tmp13385 trails; all of r0..r5 computed.
__m512 tmp13390 = _mm512_add_ps(tmp13331, tmp13332);
__m512 tmp13405 = _mm512_add_ps(tmp13339, tmp13340);
__m512 tmp13389 = _mm512_add_ps(tmp13333, tmp13334);
__m512 tmp13404 = _mm512_add_ps(tmp13341, tmp13382);
__m512 tmp13395 = _mm512_sub_ps(tmp13333, tmp13334);
__m512 tmp13410 = _mm512_sub_ps(tmp13341, tmp13382);
__m512 tmp13394 = _mm512_sub_ps(tmp13331, tmp13332);
__m512 tmp13409 = _mm512_sub_ps(tmp13339, tmp13340);
__m512 tmp13391 = _mm512_add_ps(tmp13335, tmp13336);
__m512 tmp13406 = _mm512_add_ps(tmp13383, tmp13384);
__m512 tmp13396 = _mm512_sub_ps(tmp13335, tmp13336);
__m512 tmp13411 = _mm512_sub_ps(tmp13383, tmp13384);
__m512 tmp13393 = _mm512_fmadd_ps(tmp13395, _mm512_set1_ps(2e+00f), tmp13394);
__m512 tmp13408 = _mm512_fmadd_ps(tmp13410, _mm512_set1_ps(2e+00f), tmp13409);
__m512 tmp13400 = _mm512_fmadd_ps(tmp13395, _mm512_set1_ps(8e+00f), tmp13394);
__m512 tmp13415 = _mm512_fmadd_ps(tmp13410, _mm512_set1_ps(8e+00f), tmp13409);
__m512 tmp13388 = _mm512_add_ps(tmp13389, tmp13390);
__m512 tmp13403 = _mm512_add_ps(tmp13404, tmp13405);
__m512 tmp13392 = _mm512_fmadd_ps(tmp13396, _mm512_set1_ps(1.6e+01f), tmp13393);
__m512 tmp13407 = _mm512_fmadd_ps(tmp13411, _mm512_set1_ps(1.6e+01f), tmp13408);
__m512 tmp13399 = _mm512_fmadd_ps(tmp13396, _mm512_set1_ps(4e+00f), tmp13400);
__m512 tmp13414 = _mm512_fmadd_ps(tmp13411, _mm512_set1_ps(4e+00f), tmp13415);
__m512 tmp13420 = _mm512_add_ps(tmp13411, tmp13409);
__m512 tmp13398 = _mm512_fmadd_ps(tmp13389, _mm512_set1_ps(4e+00f), tmp13390);
__m512 tmp13413 = _mm512_fmadd_ps(tmp13404, _mm512_set1_ps(4e+00f), tmp13405);
__m512 tmp13417 = _mm512_fmadd_ps(tmp13404, _mm512_set1_ps(1.6e+01f), tmp13405);
__m512 tmp13387 = _mm512_add_ps(tmp13388, tmp13330);
__m512 tmp13402 = _mm512_add_ps(tmp13403, tmp13338);
__m512 tmp13419 = _mm512_add_ps(tmp13420, tmp13385);
__m512 tmp13386 = _mm512_fmadd_ps(tmp13391, _mm512_set1_ps(3.2e+01f), tmp13387);
__m512 tmp13401 = _mm512_fmadd_ps(tmp13406, _mm512_set1_ps(3.2e+01f), tmp13402);
__m512 tmp13397 = _mm512_fmadd_ps(tmp13391, _mm512_set1_ps(8e+00f), tmp13398);
__m512 tmp13412 = _mm512_fmadd_ps(tmp13406, _mm512_set1_ps(8e+00f), tmp13413);
__m512 tmp13418 = _mm512_fmadd_ps(tmp13410, _mm512_set1_ps(3.2e+01f), tmp13419);
__m512 tmp13416 = _mm512_fmadd_ps(tmp13406, _mm512_set1_ps(2e+00f), tmp13417);
// Name the ten results: out1717..out1720 are r0..r3 of set 1,
// out1721..out1726 are r0..r5 of set 2.
__m512 out1717 = tmp13386;
__m512 out1721 = tmp13401;
__m512 out1718 = tmp13392;
__m512 out1722 = tmp13407;
__m512 out1719 = tmp13397;
__m512 out1723 = tmp13412;
__m512 out1720 = tmp13399;
__m512 out1724 = tmp13414;
__m512 out1725 = tmp13416;
__m512 out1726 = tmp13418;
// Element-wise max with zero (ReLU).
out1717 = _mm512_max_ps(_mm512_setzero_ps(), out1717);
out1721 = _mm512_max_ps(_mm512_setzero_ps(), out1721);
out1718 = _mm512_max_ps(_mm512_setzero_ps(), out1718);
out1722 = _mm512_max_ps(_mm512_setzero_ps(), out1722);
out1719 = _mm512_max_ps(_mm512_setzero_ps(), out1719);
out1723 = _mm512_max_ps(_mm512_setzero_ps(), out1723);
out1720 = _mm512_max_ps(_mm512_setzero_ps(), out1720);
out1724 = _mm512_max_ps(_mm512_setzero_ps(), out1724);
out1725 = _mm512_max_ps(_mm512_setzero_ps(), out1725);
out1726 = _mm512_max_ps(_mm512_setzero_ps(), out1726);
// Masked stores: mask 4095 writes lanes 0..11, mask 1023 writes lanes 0..9.
// Set 1 goes to base offsets 648, 760, 872, 984; set 2 goes to 3136..3696.
// Consecutive results within a set are 112 offset units apart.
_mm512_mask_storeu_ps(datPtr26+648+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1717);
_mm512_mask_storeu_ps(datPtr26+3136+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1721);
_mm512_mask_storeu_ps(datPtr26+760+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1718);
_mm512_mask_storeu_ps(datPtr26+3248+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1722);
_mm512_mask_storeu_ps(datPtr26+872+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1719);
_mm512_mask_storeu_ps(datPtr26+3360+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1723);
_mm512_mask_storeu_ps(datPtr26+984+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1720);
_mm512_mask_storeu_ps(datPtr26+3472+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1724);
_mm512_mask_storeu_ps(datPtr26+3584+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1725);
_mm512_mask_storeu_ps(datPtr26+3696+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 1023, out1726);
// Tile unit: read 16 vectors from sfPtr12 (base offsets 512..39104), run two
// passes of weighted sums/differences with a permutation in between, take the
// max with zero, and write 8 vectors to datPtr26 with masked stores.
// NOTE(review): the pointer types and loop variables (i51, j44, k139, l58,
// toH44, toW44) are declared outside this view; offsets are presumably in
// bytes -- confirm against the declarations.
//
// Loads come in pairs 128 offset units apart. Immediate 68 joins the low
// 256 bits of both loads, immediate 238 joins the high 256 bits. The four
// groups of loads are 12800 offset units apart.
__m512 sf977 = _mm512_loadu_ps(sfPtr12+512+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf978 = _mm512_loadu_ps(sfPtr12+640+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1882 = _mm512_shuffle_f32x4(sf977, sf978, 68);
__m512 in1883 = _mm512_shuffle_f32x4(sf977, sf978, 238);
__m512 sf979 = _mm512_loadu_ps(sfPtr12+576+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf980 = _mm512_loadu_ps(sfPtr12+704+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1890 = _mm512_shuffle_f32x4(sf979, sf980, 68);
__m512 in1891 = _mm512_shuffle_f32x4(sf979, sf980, 238);
__m512 sf981 = _mm512_loadu_ps(sfPtr12+13312+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf982 = _mm512_loadu_ps(sfPtr12+13440+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1884 = _mm512_shuffle_f32x4(sf981, sf982, 68);
__m512 in1885 = _mm512_shuffle_f32x4(sf981, sf982, 238);
__m512 sf983 = _mm512_loadu_ps(sfPtr12+13376+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf984 = _mm512_loadu_ps(sfPtr12+13504+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1892 = _mm512_shuffle_f32x4(sf983, sf984, 68);
__m512 in1893 = _mm512_shuffle_f32x4(sf983, sf984, 238);
__m512 sf985 = _mm512_loadu_ps(sfPtr12+26112+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf986 = _mm512_loadu_ps(sfPtr12+26240+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1886 = _mm512_shuffle_f32x4(sf985, sf986, 68);
__m512 in1887 = _mm512_shuffle_f32x4(sf985, sf986, 238);
__m512 sf987 = _mm512_loadu_ps(sfPtr12+26176+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf988 = _mm512_loadu_ps(sfPtr12+26304+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1894 = _mm512_shuffle_f32x4(sf987, sf988, 68);
__m512 in1895 = _mm512_shuffle_f32x4(sf987, sf988, 238);
__m512 sf989 = _mm512_loadu_ps(sfPtr12+38912+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf990 = _mm512_loadu_ps(sfPtr12+39040+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1888 = _mm512_shuffle_f32x4(sf989, sf990, 68);
__m512 in1889 = _mm512_shuffle_f32x4(sf989, sf990, 238);
__m512 sf991 = _mm512_loadu_ps(sfPtr12+38976+51200*i51+3072*j44+1536*k139+768*l58);
__m512 sf992 = _mm512_loadu_ps(sfPtr12+39104+51200*i51+3072*j44+1536*k139+768*l58);
__m512 in1896 = _mm512_shuffle_f32x4(sf991, sf992, 68);
__m512 in1897 = _mm512_shuffle_f32x4(sf991, sf992, 238);
// First pass, 8 vectors in -> 6 vectors out, done for two interleaved sets
// (in1882..in1889 and in1890..in1897). With a..f = in1883..in1888:
//   r0 = in1882 + (a+b) +  (c+d)  + 32(e+f)
//   r1 =          (a-b) +  2(c-d) + 16(e-f)
//   r2 =          (a+b) +  4(c+d) +  8(e+f)
//   r3 =          (a-b) +  8(c-d) +  4(e-f)
//   r4 =          (a+b) + 16(c+d) +  2(e+f)
//   r5 = in1889 + (a-b) + 32(c-d) +   (e-f)
// NOTE(review): this has the shape of a Winograd-style output transform --
// confirm against the generator.
__m512 tmp13477 = _mm512_add_ps(in1883, in1884);
__m512 tmp13497 = _mm512_add_ps(in1891, in1892);
__m512 tmp13476 = _mm512_add_ps(in1885, in1886);
__m512 tmp13496 = _mm512_add_ps(in1893, in1894);
__m512 tmp13482 = _mm512_sub_ps(in1885, in1886);
__m512 tmp13502 = _mm512_sub_ps(in1893, in1894);
__m512 tmp13481 = _mm512_sub_ps(in1883, in1884);
__m512 tmp13501 = _mm512_sub_ps(in1891, in1892);
__m512 tmp13478 = _mm512_add_ps(in1887, in1888);
__m512 tmp13498 = _mm512_add_ps(in1895, in1896);
__m512 tmp13483 = _mm512_sub_ps(in1887, in1888);
__m512 tmp13503 = _mm512_sub_ps(in1895, in1896);
__m512 tmp13480 = _mm512_fmadd_ps(tmp13482, _mm512_set1_ps(2e+00f), tmp13481);
__m512 tmp13500 = _mm512_fmadd_ps(tmp13502, _mm512_set1_ps(2e+00f), tmp13501);
__m512 tmp13487 = _mm512_fmadd_ps(tmp13482, _mm512_set1_ps(8e+00f), tmp13481);
__m512 tmp13507 = _mm512_fmadd_ps(tmp13502, _mm512_set1_ps(8e+00f), tmp13501);
__m512 tmp13475 = _mm512_add_ps(tmp13476, tmp13477);
__m512 tmp13495 = _mm512_add_ps(tmp13496, tmp13497);
__m512 tmp13479 = _mm512_fmadd_ps(tmp13483, _mm512_set1_ps(1.6e+01f), tmp13480);
__m512 tmp13499 = _mm512_fmadd_ps(tmp13503, _mm512_set1_ps(1.6e+01f), tmp13500);
__m512 tmp13486 = _mm512_fmadd_ps(tmp13483, _mm512_set1_ps(4e+00f), tmp13487);
__m512 tmp13506 = _mm512_fmadd_ps(tmp13503, _mm512_set1_ps(4e+00f), tmp13507);
__m512 tmp13492 = _mm512_add_ps(tmp13483, tmp13481);
__m512 tmp13512 = _mm512_add_ps(tmp13503, tmp13501);
__m512 tmp13485 = _mm512_fmadd_ps(tmp13476, _mm512_set1_ps(4e+00f), tmp13477);
__m512 tmp13505 = _mm512_fmadd_ps(tmp13496, _mm512_set1_ps(4e+00f), tmp13497);
__m512 tmp13489 = _mm512_fmadd_ps(tmp13476, _mm512_set1_ps(1.6e+01f), tmp13477);
__m512 tmp13509 = _mm512_fmadd_ps(tmp13496, _mm512_set1_ps(1.6e+01f), tmp13497);
__m512 tmp13474 = _mm512_add_ps(tmp13475, in1882);
__m512 tmp13494 = _mm512_add_ps(tmp13495, in1890);
__m512 tmp13491 = _mm512_add_ps(tmp13492, in1889);
__m512 tmp13511 = _mm512_add_ps(tmp13512, in1897);
__m512 tmp13473 = _mm512_fmadd_ps(tmp13478, _mm512_set1_ps(3.2e+01f), tmp13474);
__m512 tmp13493 = _mm512_fmadd_ps(tmp13498, _mm512_set1_ps(3.2e+01f), tmp13494);
__m512 tmp13484 = _mm512_fmadd_ps(tmp13478, _mm512_set1_ps(8e+00f), tmp13485);
__m512 tmp13504 = _mm512_fmadd_ps(tmp13498, _mm512_set1_ps(8e+00f), tmp13505);
__m512 tmp13490 = _mm512_fmadd_ps(tmp13482, _mm512_set1_ps(3.2e+01f), tmp13491);
__m512 tmp13510 = _mm512_fmadd_ps(tmp13502, _mm512_set1_ps(3.2e+01f), tmp13511);
__m512 tmp13488 = _mm512_fmadd_ps(tmp13478, _mm512_set1_ps(2e+00f), tmp13489);
__m512 tmp13508 = _mm512_fmadd_ps(tmp13498, _mm512_set1_ps(2e+00f), tmp13509);
// Gather the twelve first-pass results (r0..r5 of each set) under
// consecutive names; these variables are overwritten by the permutation below.
__m512 tmp13461 = tmp13473;
__m512 tmp13467 = tmp13493;
__m512 tmp13462 = tmp13479;
__m512 tmp13468 = tmp13499;
__m512 tmp13463 = tmp13484;
__m512 tmp13469 = tmp13504;
__m512 tmp13464 = tmp13486;
__m512 tmp13470 = tmp13506;
__m512 tmp13465 = tmp13488;
__m512 tmp13471 = tmp13508;
__m512 tmp13466 = tmp13490;
__m512 tmp13472 = tmp13510;
// Permute the twelve vectors into sixteen: 32-bit unpack, then 64-bit
// shuffle_ps, then two rounds of 128-bit lane shuffles (136 picks lanes 0,2
// of each operand, 221 picks lanes 1,3).
// NOTE(review): this appears to be a register transpose so the second pass
// can combine along the other axis -- confirm.
__m512 tmp13547 = _mm512_unpacklo_ps(tmp13461, tmp13462);
__m512 tmp13548 = _mm512_unpackhi_ps(tmp13461, tmp13462);
__m512 tmp13549 = _mm512_unpacklo_ps(tmp13463, tmp13464);
__m512 tmp13550 = _mm512_unpackhi_ps(tmp13463, tmp13464);
__m512 tmp13551 = _mm512_unpacklo_ps(tmp13465, tmp13466);
__m512 tmp13552 = _mm512_unpackhi_ps(tmp13465, tmp13466);
__m512 tmp13553 = _mm512_unpacklo_ps(tmp13467, tmp13468);
__m512 tmp13554 = _mm512_unpackhi_ps(tmp13467, tmp13468);
__m512 tmp13555 = _mm512_unpacklo_ps(tmp13469, tmp13470);
__m512 tmp13556 = _mm512_unpackhi_ps(tmp13469, tmp13470);
__m512 tmp13557 = _mm512_unpacklo_ps(tmp13471, tmp13472);
__m512 tmp13558 = _mm512_unpackhi_ps(tmp13471, tmp13472);
__m512 tmp13559 = _mm512_shuffle_ps(tmp13547, tmp13549, 68);
__m512 tmp13560 = _mm512_shuffle_ps(tmp13547, tmp13549, 238);
__m512 tmp13561 = _mm512_shuffle_ps(tmp13548, tmp13550, 68);
__m512 tmp13562 = _mm512_shuffle_ps(tmp13548, tmp13550, 238);
__m512 tmp13563 = _mm512_shuffle_ps(tmp13551, tmp13553, 68);
__m512 tmp13564 = _mm512_shuffle_ps(tmp13551, tmp13553, 238);
__m512 tmp13565 = _mm512_shuffle_ps(tmp13552, tmp13554, 68);
__m512 tmp13566 = _mm512_shuffle_ps(tmp13552, tmp13554, 238);
__m512 tmp13567 = _mm512_shuffle_ps(tmp13555, tmp13557, 68);
__m512 tmp13568 = _mm512_shuffle_ps(tmp13555, tmp13557, 238);
__m512 tmp13569 = _mm512_shuffle_ps(tmp13556, tmp13558, 68);
__m512 tmp13570 = _mm512_shuffle_ps(tmp13556, tmp13558, 238);
__m512 tmp13571 = _mm512_shuffle_f32x4(tmp13559, tmp13563, 136);
__m512 tmp13572 = _mm512_shuffle_f32x4(tmp13559, tmp13563, 221);
__m512 tmp13573 = _mm512_shuffle_f32x4(tmp13560, tmp13564, 136);
__m512 tmp13574 = _mm512_shuffle_f32x4(tmp13560, tmp13564, 221);
__m512 tmp13575 = _mm512_shuffle_f32x4(tmp13561, tmp13565, 136);
__m512 tmp13576 = _mm512_shuffle_f32x4(tmp13561, tmp13565, 221);
__m512 tmp13577 = _mm512_shuffle_f32x4(tmp13562, tmp13566, 136);
__m512 tmp13578 = _mm512_shuffle_f32x4(tmp13562, tmp13566, 221);
__m512 tmp13579 = _mm512_shuffle_f32x4(tmp13567, tmp13567, 136);
__m512 tmp13580 = _mm512_shuffle_f32x4(tmp13567, tmp13567, 221);
__m512 tmp13581 = _mm512_shuffle_f32x4(tmp13568, tmp13568, 136);
__m512 tmp13582 = _mm512_shuffle_f32x4(tmp13568, tmp13568, 221);
__m512 tmp13583 = _mm512_shuffle_f32x4(tmp13569, tmp13569, 136);
__m512 tmp13584 = _mm512_shuffle_f32x4(tmp13569, tmp13569, 221);
__m512 tmp13585 = _mm512_shuffle_f32x4(tmp13570, tmp13570, 136);
__m512 tmp13586 = _mm512_shuffle_f32x4(tmp13570, tmp13570, 221);
tmp13461 = _mm512_shuffle_f32x4(tmp13571, tmp13579, 136);
tmp13469 = _mm512_shuffle_f32x4(tmp13571, tmp13579, 221);
tmp13462 = _mm512_shuffle_f32x4(tmp13573, tmp13581, 136);
tmp13470 = _mm512_shuffle_f32x4(tmp13573, tmp13581, 221);
tmp13463 = _mm512_shuffle_f32x4(tmp13575, tmp13583, 136);
tmp13471 = _mm512_shuffle_f32x4(tmp13575, tmp13583, 221);
tmp13464 = _mm512_shuffle_f32x4(tmp13577, tmp13585, 136);
tmp13472 = _mm512_shuffle_f32x4(tmp13577, tmp13585, 221);
tmp13465 = _mm512_shuffle_f32x4(tmp13572, tmp13580, 136);
__m512 tmp13513 = _mm512_shuffle_f32x4(tmp13572, tmp13580, 221);
tmp13466 = _mm512_shuffle_f32x4(tmp13574, tmp13582, 136);
__m512 tmp13514 = _mm512_shuffle_f32x4(tmp13574, tmp13582, 221);
tmp13467 = _mm512_shuffle_f32x4(tmp13576, tmp13584, 136);
__m512 tmp13515 = _mm512_shuffle_f32x4(tmp13576, tmp13584, 221);
tmp13468 = _mm512_shuffle_f32x4(tmp13578, tmp13586, 136);
__m512 tmp13516 = _mm512_shuffle_f32x4(tmp13578, tmp13586, 221);
// Neither trailing vector is read again: both sets below only produce
// r0..r3, and the trailing term is needed only for r5. The casts mark them
// as deliberately unused.
(void)tmp13468;
(void)tmp13516;
// Second pass: same coefficient pattern as the first pass, r0..r3 only.
//   set 1: tmp13461 leads, tmp13462..tmp13467 are a..f.
//   set 2: tmp13469 leads, tmp13470, tmp13471, tmp13472, tmp13513, tmp13514,
//          tmp13515 are a..f.
__m512 tmp13521 = _mm512_add_ps(tmp13462, tmp13463);
__m512 tmp13536 = _mm512_add_ps(tmp13470, tmp13471);
__m512 tmp13520 = _mm512_add_ps(tmp13464, tmp13465);
__m512 tmp13535 = _mm512_add_ps(tmp13472, tmp13513);
__m512 tmp13526 = _mm512_sub_ps(tmp13464, tmp13465);
__m512 tmp13541 = _mm512_sub_ps(tmp13472, tmp13513);
__m512 tmp13525 = _mm512_sub_ps(tmp13462, tmp13463);
__m512 tmp13540 = _mm512_sub_ps(tmp13470, tmp13471);
__m512 tmp13522 = _mm512_add_ps(tmp13466, tmp13467);
__m512 tmp13537 = _mm512_add_ps(tmp13514, tmp13515);
__m512 tmp13527 = _mm512_sub_ps(tmp13466, tmp13467);
__m512 tmp13542 = _mm512_sub_ps(tmp13514, tmp13515);
__m512 tmp13524 = _mm512_fmadd_ps(tmp13526, _mm512_set1_ps(2e+00f), tmp13525);
__m512 tmp13539 = _mm512_fmadd_ps(tmp13541, _mm512_set1_ps(2e+00f), tmp13540);
__m512 tmp13531 = _mm512_fmadd_ps(tmp13526, _mm512_set1_ps(8e+00f), tmp13525);
__m512 tmp13546 = _mm512_fmadd_ps(tmp13541, _mm512_set1_ps(8e+00f), tmp13540);
__m512 tmp13519 = _mm512_add_ps(tmp13520, tmp13521);
__m512 tmp13534 = _mm512_add_ps(tmp13535, tmp13536);
__m512 tmp13523 = _mm512_fmadd_ps(tmp13527, _mm512_set1_ps(1.6e+01f), tmp13524);
__m512 tmp13538 = _mm512_fmadd_ps(tmp13542, _mm512_set1_ps(1.6e+01f), tmp13539);
__m512 tmp13530 = _mm512_fmadd_ps(tmp13527, _mm512_set1_ps(4e+00f), tmp13531);
__m512 tmp13545 = _mm512_fmadd_ps(tmp13542, _mm512_set1_ps(4e+00f), tmp13546);
__m512 tmp13529 = _mm512_fmadd_ps(tmp13520, _mm512_set1_ps(4e+00f), tmp13521);
__m512 tmp13544 = _mm512_fmadd_ps(tmp13535, _mm512_set1_ps(4e+00f), tmp13536);
__m512 tmp13518 = _mm512_add_ps(tmp13519, tmp13461);
__m512 tmp13533 = _mm512_add_ps(tmp13534, tmp13469);
__m512 tmp13517 = _mm512_fmadd_ps(tmp13522, _mm512_set1_ps(3.2e+01f), tmp13518);
__m512 tmp13532 = _mm512_fmadd_ps(tmp13537, _mm512_set1_ps(3.2e+01f), tmp13533);
__m512 tmp13528 = _mm512_fmadd_ps(tmp13522, _mm512_set1_ps(8e+00f), tmp13529);
__m512 tmp13543 = _mm512_fmadd_ps(tmp13537, _mm512_set1_ps(8e+00f), tmp13544);
// Name the eight results: out1727..out1730 are r0..r3 of set 1,
// out1731..out1734 are r0..r3 of set 2.
__m512 out1727 = tmp13517;
__m512 out1731 = tmp13532;
__m512 out1728 = tmp13523;
__m512 out1732 = tmp13538;
__m512 out1729 = tmp13528;
__m512 out1733 = tmp13543;
__m512 out1730 = tmp13530;
__m512 out1734 = tmp13545;
// Element-wise max with zero (ReLU).
out1727 = _mm512_max_ps(_mm512_setzero_ps(), out1727);
out1731 = _mm512_max_ps(_mm512_setzero_ps(), out1731);
out1728 = _mm512_max_ps(_mm512_setzero_ps(), out1728);
out1732 = _mm512_max_ps(_mm512_setzero_ps(), out1732);
out1729 = _mm512_max_ps(_mm512_setzero_ps(), out1729);
out1733 = _mm512_max_ps(_mm512_setzero_ps(), out1733);
out1730 = _mm512_max_ps(_mm512_setzero_ps(), out1730);
out1734 = _mm512_max_ps(_mm512_setzero_ps(), out1734);
// Masked stores: mask 4095 writes lanes 0..11 of every vector.
// Set 1 goes to base offsets 3736, 3848, 3960, 4072; set 2 goes to the same
// offsets plus 48. Consecutive results within a set are 112 offset units apart.
_mm512_mask_storeu_ps(datPtr26+3736+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1727);
_mm512_mask_storeu_ps(datPtr26+3784+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1731);
_mm512_mask_storeu_ps(datPtr26+3848+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1728);
_mm512_mask_storeu_ps(datPtr26+3896+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1732);
_mm512_mask_storeu_ps(datPtr26+3960+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1729);
_mm512_mask_storeu_ps(datPtr26+4008+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1733);
_mm512_mask_storeu_ps(datPtr26+4072+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1730);
_mm512_mask_storeu_ps(datPtr26+4120+25088*i51+112*toH44+4*toW44+12544*k139+6272*l58, 4095, out1734);
}
}
++j44;
rel22 = 4;
}
ptrdiff_t toH45 = base22+24;
ptrdiff_t toW45 = 24;
ptrdiff_t k140 = 2*w62;
for (; k140 != 2; ++k140) {
ptrdiff_t l59 = 0;
for (; l59 != 1; ++l59) {
__m512 sf993 = _mm512_loadu_ps(sfPtr12+0+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf994 = _mm512_loadu_ps(sfPtr12+64+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1898 = _mm512_shuffle_f32x4(sf993, sf994, 68);
__m512 in1899 = _mm512_shuffle_f32x4(sf993, sf994, 238);
__m512 sf995 = _mm512_loadu_ps(sfPtr12+128+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf996 = _mm512_loadu_ps(sfPtr12+192+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1906 = _mm512_shuffle_f32x4(sf995, sf996, 68);
__m512 in1907 = _mm512_shuffle_f32x4(sf995, sf996, 238);
__m512 sf997 = _mm512_loadu_ps(sfPtr12+12800+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf998 = _mm512_loadu_ps(sfPtr12+12864+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1900 = _mm512_shuffle_f32x4(sf997, sf998, 68);
__m512 in1901 = _mm512_shuffle_f32x4(sf997, sf998, 238);
__m512 sf999 = _mm512_loadu_ps(sfPtr12+12928+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf1000 = _mm512_loadu_ps(sfPtr12+12992+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1908 = _mm512_shuffle_f32x4(sf999, sf1000, 68);
__m512 in1909 = _mm512_shuffle_f32x4(sf999, sf1000, 238);
__m512 sf1001 = _mm512_loadu_ps(sfPtr12+25600+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf1002 = _mm512_loadu_ps(sfPtr12+25664+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1902 = _mm512_shuffle_f32x4(sf1001, sf1002, 68);
__m512 in1903 = _mm512_shuffle_f32x4(sf1001, sf1002, 238);
__m512 sf1003 = _mm512_loadu_ps(sfPtr12+25728+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf1004 = _mm512_loadu_ps(sfPtr12+25792+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1910 = _mm512_shuffle_f32x4(sf1003, sf1004, 68);
__m512 in1911 = _mm512_shuffle_f32x4(sf1003, sf1004, 238);
__m512 sf1005 = _mm512_loadu_ps(sfPtr12+38400+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf1006 = _mm512_loadu_ps(sfPtr12+38464+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1904 = _mm512_shuffle_f32x4(sf1005, sf1006, 68);
__m512 in1905 = _mm512_shuffle_f32x4(sf1005, sf1006, 238);
__m512 sf1007 = _mm512_loadu_ps(sfPtr12+38528+51200*i51+3072*j44+256*k140+256*l59);
__m512 sf1008 = _mm512_loadu_ps(sfPtr12+38592+51200*i51+3072*j44+256*k140+256*l59);
__m512 in1912 = _mm512_shuffle_f32x4(sf1007, sf1008, 68);
__m512 in1913 = _mm512_shuffle_f32x4(sf1007, sf1008, 238);
(void)in1905;
(void)in1913;
__m512 tmp13599 = _mm512_add_ps(in1899, in1900);
__m512 tmp13614 = _mm512_add_ps(in1907, in1908);
__m512 tmp13598 = _mm512_add_ps(in1901, in1902);
__m512 tmp13613 = _mm512_add_ps(in1909, in1910);
__m512 tmp13604 = _mm512_sub_ps(in1901, in1902);
__m512 tmp13619 = _mm512_sub_ps(in1909, in1910);
__m512 tmp13603 = _mm512_sub_ps(in1899, in1900);
__m512 tmp13618 = _mm512_sub_ps(in1907, in1908);
__m512 tmp13600 = _mm512_add_ps(in1903, in1904);
__m512 tmp13615 = _mm512_add_ps(in1911, in1912);
__m512 tmp13605 = _mm512_sub_ps(in1903, in1904);
__m512 tmp13620 = _mm512_sub_ps(in1911, in1912);
__m512 tmp13602 = _mm512_fmadd_ps(tmp13604, _mm512_set1_ps(2e+00f), tmp13603);
__m512 tmp13617 = _mm512_fmadd_ps(tmp13619, _mm512_set1_ps(2e+00f), tmp13618);
__m512 tmp13609 = _mm512_fmadd_ps(tmp13604, _mm512_set1_ps(8e+00f), tmp13603);
__m512 tmp13624 = _mm512_fmadd_ps(tmp13619, _mm512_set1_ps(8e+00f), tmp13618);
__m512 tmp13597 = _mm512_add_ps(tmp13598, tmp13599);
__m512 tmp13612 = _mm512_add_ps(tmp13613, tmp13614);
__m512 tmp13601 = _mm512_fmadd_ps(tmp13605, _mm512_set1_ps(1.6e+01f), tmp13602);
__m512 tmp13616 = _mm512_fmadd_ps(tmp13620, _mm512_set1_ps(1.6e+01f), tmp13617);
__m512 tmp13608 = _mm512_fmadd_ps(tmp13605, _mm512_set1_ps(4e+00f), tmp13609);
__m512 tmp13623 = _mm512_fmadd_ps(tmp13620, _mm512_set1_ps(4e+00f), tmp13624);
__m512 tmp13607 = _mm512_fmadd_ps(tmp13598, _mm512_set1_ps(4e+00f), tmp13599);
__m512 tmp13622 = _mm512_fmadd_ps(tmp13613, _mm512_set1_ps(4e+00f), tmp13614);
__m512 tmp13596 = _mm512_add_ps(tmp13597, in1898);
__m512 tmp13611 = _mm512_add_ps(tmp13612, in1906);
__m512 tmp13595 = _mm512_fmadd_ps(tmp13600, _mm512_set1_ps(3.2e+01f), tmp13596);
__m512 tmp13610 = _mm512_fmadd_ps(tmp13615, _mm512_set1_ps(3.2e+01f), tmp13611);
__m512 tmp13606 = _mm512_fmadd_ps(tmp13600, _mm512_set1_ps(8e+00f), tmp13607);
__m512 tmp13621 = _mm512_fmadd_ps(tmp13615, _mm512_set1_ps(8e+00f), tmp13622);
__m512 tmp13587 = tmp13595;
__m512 tmp13591 = tmp13610;
__m512 tmp13588 = tmp13601;
__m512 tmp13592 = tmp13616;
__m512 tmp13589 = tmp13606;
__m512 tmp13593 = tmp13621;
__m512 tmp13590 = tmp13608;
__m512 tmp13594 = tmp13623;
__m512 tmp13625 = _mm512_setzero_ps();
__m512 tmp13626 = _mm512_setzero_ps();
__m512 tmp13663 = _mm512_unpacklo_ps(tmp13587, tmp13588);
__m512 tmp13664 = _mm512_unpackhi_ps(tmp13587, tmp13588);
__m512 tmp13665 = _mm512_unpacklo_ps(tmp13589, tmp13590);
__m512 tmp13666 = _mm512_unpackhi_ps(tmp13589, tmp13590);
__m512 tmp13667 = _mm512_unpacklo_ps(tmp13625, tmp13626);
__m512 tmp13668 = _mm512_unpackhi_ps(tmp13625, tmp13626);
__m512 tmp13669 = _mm512_unpacklo_ps(tmp13591, tmp13592);
__m512 tmp13670 = _mm512_unpackhi_ps(tmp13591, tmp13592);
__m512 tmp13671 = _mm512_unpacklo_ps(tmp13593, tmp13594);
__m512 tmp13672 = _mm512_unpackhi_ps(tmp13593, tmp13594);
__m512 tmp13673 = _mm512_shuffle_ps(tmp13663, tmp13665, 68);
__m512 tmp13674 = _mm512_shuffle_ps(tmp13663, tmp13665, 238);
__m512 tmp13675 = _mm512_shuffle_ps(tmp13664, tmp13666, 68);
__m512 tmp13676 = _mm512_shuffle_ps(tmp13664, tmp13666, 238);
__m512 tmp13677 = _mm512_shuffle_ps(tmp13667, tmp13669, 68);
__m512 tmp13678 = _mm512_shuffle_ps(tmp13667, tmp13669, 238);
__m512 tmp13679 = _mm512_shuffle_ps(tmp13668, tmp13670, 68);
__m512 tmp13680 = _mm512_shuffle_ps(tmp13668, tmp13670, 238);
__m512 tmp13681 = _mm512_shuffle_ps(tmp13671, tmp13671, 238);
__m512 tmp13682 = _mm512_shuffle_ps(tmp13672, tmp13672, 238);
__m512 tmp13683 = _mm512_shuffle_f32x4(tmp13673, tmp13677, 136);
__m512 tmp13684 = _mm512_shuffle_f32x4(tmp13673, tmp13677, 221);
__m512 tmp13685 = _mm512_shuffle_f32x4(tmp13674, tmp13678, 136);
__m512 tmp13686 = _mm512_shuffle_f32x4(tmp13674, tmp13678, 221);
__m512 tmp13687 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 136);
__m512 tmp13688 = _mm512_shuffle_f32x4(tmp13675, tmp13679, 221);
__m512 tmp13689 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 136);
__m512 tmp13690 = _mm512_shuffle_f32x4(tmp13676, tmp13680, 221);
__m512 tmp13691 = _mm512_shuffle_f32x4(tmp13671, tmp13671, 136);
__m512 tmp13692 = _mm512_shuffle_f32x4(tmp13671, tmp13671, 221);
__m512 tmp13693 = _mm512_shuffle_f32x4(tmp13681, tmp13681, 136);
__m512 tmp13694 = _mm512_shuffle_f32x4(tmp13681, tmp13681, 221);
__m512 tmp13695 = _mm512_shuffle_f32x4(tmp13672, tmp13672, 136);
__m512 tmp13696 = _mm512_shuffle_f32x4(tmp13672, tmp13672, 221);
__m512 tmp13697 = _mm512_shuffle_f32x4(tmp13682, tmp13682, 136);
__m512 tmp13698 = _mm512_shuffle_f32x4(tmp13682, tmp13682, 221);
tmp13587 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 136);
tmp13593 = _mm512_shuffle_f32x4(tmp13683, tmp13691, 221);
tmp13588 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 136);
tmp13594 = _mm512_shuffle_f32x4(tmp13685, tmp13693, 221);
tmp13589 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 136);
__m512 tmp13627 = _mm512_shuffle_f32x4(tmp13687, tmp13695, 221);
tmp13590 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 136);
__m512 tmp13628 = _mm512_shuffle_f32x4(tmp13689, tmp13697, 221);
tmp13625 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 136);
__m512 tmp13629 = _mm512_shuffle_f32x4(tmp13684, tmp13692, 221);
tmp13626 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 136);
__m512 tmp13630 = _mm512_shuffle_f32x4(tmp13686, tmp13694, 221);
tmp13591 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 136);
__m512 tmp13631 = _mm512_shuffle_f32x4(tmp13688, tmp13696, 221);
tmp13592 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 136);
__m512 tmp13632 = _mm512_shuffle_f32x4(tmp13690, tmp13698, 221);
(void)tmp13592;
(void)tmp13632;
__m512 tmp13637 = _mm512_add_ps(tmp13588, tmp13589);
__m512 tmp13652 = _mm512_add_ps(tmp13594, tmp13627);
__m512 tmp13636 = _mm512_add_ps(tmp13590, tmp13625);
__m512 tmp13651 = _mm512_add_ps(tmp13628, tmp13629);
__m512 tmp13642 = _mm512_sub_ps(tmp13590, tmp13625);
__m512 tmp13657 = _mm512_sub_ps(tmp13628, tmp13629);
__m512 tmp13641 = _mm512_sub_ps(tmp13588, tmp13589);
__m512 tmp13656 = _mm512_sub_ps(tmp13594, tmp13627);
__m512 tmp13638 = _mm512_add_ps(tmp13626, tmp13591);
__m512 tmp13653 = _mm512_add_ps(tmp13630, tmp13631);
__m512 tmp13643 = _mm512_sub_ps(tmp13626, tmp13591);
__m512 tmp13658 = _mm512_sub_ps(tmp13630, tmp13631);
__m512 tmp13640 = _mm512_fmadd_ps(tmp13642, _mm512_set1_ps(2e+00f), tmp13641);
__m512 tmp13655 = _mm512_fmadd_ps(tmp13657, _mm512_set1_ps(2e+00f), tmp13656);
__m512 tmp13647 = _mm512_fmadd_ps(tmp13642, _mm512_set1_ps(8e+00f), tmp13641);
__m512 tmp13662 = _mm512_fmadd_ps(tmp13657, _mm512_set1_ps(8e+00f), tmp13656);
__m512 tmp13635 = _mm512_add_ps(tmp13636, tmp13637);
__m512 tmp13650 = _mm512_add_ps(tmp13651, tmp13652);
__m512 tmp13639 = _mm512_fmadd_ps(tmp13643, _mm512_set1_ps(1.6e+01f), tmp13640);
__m512 tmp13654 = _mm512_fmadd_ps(tmp13658, _mm512_set1_ps(1.6e+01f), tmp13655);
__m512 tmp13646 = _mm512_fmadd_ps(tmp13643, _mm512_set1_ps(4e+00f), tmp13647);
__m512 tmp13661 = _mm512_fmadd_ps(tmp13658, _mm512_set1_ps(4e+00f), tmp13662);
__m512 tmp13645 = _mm512_fmadd_ps(tmp13636, _mm512_set1_ps(4e+00f), tmp13637);
__m512 tmp13660 = _mm512_fmadd_ps(tmp13651, _mm512_set1_ps(4e+00f), tmp13652);
__m512 tmp13634 = _mm512_add_ps(tmp13635, tmp13587);
__m512 tmp13649 = _mm512_add_ps(tmp13650, tmp13593);
__m512 tmp13633 = _mm512_fmadd_ps(tmp13638, _mm512_set1_ps(3.2e+01f), tmp13634);
__m512 tmp13648 = _mm512_fmadd_ps(tmp13653, _mm512_set1_ps(3.2e+01f), tmp13649);
__m512 tmp13644 = _mm512_fmadd_ps(tmp13638, _mm512_set1_ps(8e+00f), tmp13645);
__m512 tmp13659 = _mm512_fmadd_ps(tmp13653, _mm512_set1_ps(8e+00f), tmp13660);
__m512 out1735 = tmp13633;
__m512 out1739 = tmp13648;
__m512 out1736 = tmp13639;
__m512 out1740 = tmp13654;
__m512 out1737 = tmp13644;
__m512 out1741 = tmp13659;
__m512 out1738 = tmp13646;
__m512 out1742 = tmp13661;
out1735 = _mm512_max_ps(_mm512_setzero_ps(), out1735);
out1739 = _mm512_max_ps(_mm512_setzero_ps(), out1739);
out1736 = _mm512_max_ps(_mm512_setzero_ps(), out1736);
out1740 = _mm512_max_ps(_mm512_setzero_ps(), out1740);
out1737 = _mm512_max_ps(_mm512_setzero_ps(), out1737);
out1741 = _mm512_max_ps(_mm512_setzero_ps(), out1741);
out1738 = _mm512_max_ps(_mm512_setzero_ps(), out1738);
out1742 = _mm512_max_ps(_mm512_setzero_ps(), out1742);
_mm512_mask_storeu_ps(datPtr26+0+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1735);
_mm512_mask_storeu_ps(datPtr26+6248+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1735);
_mm512_mask_storeu_ps(datPtr26+3136+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1739);
_mm512_mask_storeu_ps(datPtr26+9384+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1739);
_mm512_mask_storeu_ps(datPtr26+112+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1736);
_mm512_mask_storeu_ps(datPtr26+6360+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1736);
_mm512_mask_storeu_ps(datPtr26+3248+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1740);
_mm512_mask_storeu_ps(datPtr26+9496+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1740);
_mm512_mask_storeu_ps(datPtr26+224+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1737);
_mm512_mask_storeu_ps(datPtr26+6472+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1737);
_mm512_mask_storeu_ps(datPtr26+3360+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1741);
_mm512_mask_storeu_ps(datPtr26+9608+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1741);
_mm512_mask_storeu_ps(datPtr26+336+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1738);
_mm512_mask_storeu_ps(datPtr26+6584+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1738);
_mm512_mask_storeu_ps(datPtr26+3472+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 15, out1742);
_mm512_mask_storeu_ps(datPtr26+9720+25088*i51+112*toH45+4*toW45+12544*k140+12544*l59, 960, out1742);
}
}
++j44;
}
}

// Builds a three-dimensional threader task whose hull extents are 1, 1 and 10,
// points it at ResNeXt50ThreeConsumeSums3Callee1 with the tensor pointer
// array as its opaque argument, and runs it on the given team.
static void ResNeXt50ThreeConsumeSums3(ResNeXt50ThreaderTeam1* team55, char** tensors83) {
    ResNeXt50ThreaderTask1 job;

    // Iteration space: nd1 dimensions with the extents stored in hull1[].
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 10;

    // Work function and the payload it receives through the task.
    job.any1 = tensors83;
    job.callee1 = ResNeXt50ThreeConsumeSums3Callee1;

    ResNeXt50ThreaderDo1(team55, &job);
}

// Threader callee that rearranges one slice of filter weights and biases.
//
// task120->any1 is an array of byte pointers:
//   [0] source weights (read as float), [1] source biases (read as float),
//   [2] interleaved (multiplier, addend) float pairs, [3] destination buffer.
// pt65[1] selects the slice g37; the function processes i70 = 2*g37 and 2*g37+1.
//
// For every (i70, j62 in 0..3) it:
//   * loads, for each s72 in 0..15, four 9-float rows of weights,
//   * scales each row by its multiplier from tensors118[2],
//   * runs a fixed linear transform (constants 2 and 4, then per-lane scale
//     factors), converts the result to half precision and stores it at
//     tensors118[3]+2048,
//   * stores four values bias*multiplier+addend at the start of tensors118[3].
//
// NOTE(review): the 9-float rows and the scale factors (products of 1, -2/9,
// 1/90, 1/180) look like a Winograd-style transform of 3x3 kernels — confirm
// against the generator before relying on that interpretation.
static void ResNeXt50ThreeArrangeFilts4Callee1(ResNeXt50ThreaderTask1* task120, int64_t* pt65) {
char** tensors118 = task120->any1;
// b83 and e34 are fixed at 0 in this instance; only g37 varies per call.
ptrdiff_t b83 = 0;
ptrdiff_t g37 = pt65[1];
ptrdiff_t e34 = 0;
// Destination buffer: bias area first, transformed weights start 2048 bytes in.
char*restrict bfPtr16 = tensors118[3]+2048*e34;
char*restrict wfPtr16 = tensors118[3]+2048+25952256*e34;
char*restrict wtPtr19 = tensors118[0]+14256*e34;
char*restrict biasPtr19 = tensors118[1];
char*restrict bnPtr20 = tensors118[2];
// Two consecutive i70 values per task coordinate.
ptrdiff_t i70 = 2*g37;
ptrdiff_t ii52 = i70+1;
for (; i70 <= ii52; ++i70) {
ptrdiff_t j62 = 4*b83;
// Always true here because b83 is 0, so j62 starts at 0.
if (j62 < 4) {
for (; j62 != 4; ++j62) {
ptrdiff_t k172 = 0+1*j62;
ptrdiff_t cut26 = 0;
// Broadcast the multiplier (first float of each pair) for the four rows
// with pair indices 16*i70+4*j62+{0,1,2,3}.
__m512 postMul59 = _mm512_set1_ps(((float*)bnPtr20+(ptrdiff_t)2*(0+16*i70+4*j62))[0]);
__m512 postMul60 = _mm512_set1_ps(((float*)bnPtr20+(ptrdiff_t)2*(1+16*i70+4*j62))[0]);
__m512 postMul61 = _mm512_set1_ps(((float*)bnPtr20+(ptrdiff_t)2*(2+16*i70+4*j62))[0]);
__m512 postMul62 = _mm512_set1_ps(((float*)bnPtr20+(ptrdiff_t)2*(3+16*i70+4*j62))[0]);
ptrdiff_t s72 = 0;
for (; s72 != 16; ++s72) {
// Mask 511 loads 9 floats (remaining lanes zeroed); the four rows are
// 576 bytes apart and s72 advances by 36 bytes (9 floats).
__m512 wt733 = _mm512_maskz_loadu_ps(511, wtPtr19+0+9216*i70+2304*j62+36*s72);
__m512 wt734 = _mm512_maskz_loadu_ps(511, wtPtr19+576+9216*i70+2304*j62+36*s72);
__m512 wt735 = _mm512_maskz_loadu_ps(511, wtPtr19+1152+9216*i70+2304*j62+36*s72);
__m512 wt736 = _mm512_maskz_loadu_ps(511, wtPtr19+1728+9216*i70+2304*j62+36*s72);
// Fold the per-row multiplier into the weights.
wt733 = _mm512_mul_ps(wt733, postMul59);
wt734 = _mm512_mul_ps(wt734, postMul60);
wt735 = _mm512_mul_ps(wt735, postMul61);
wt736 = _mm512_mul_ps(wt736, postMul62);
// Index vectors (listed high lane to low lane) that interleave runs of three
// lanes from the first source (indices 0..15) and the second (16..31);
// pm211 starts at element 0, pm212 at element 6.
__m512i pm211 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm212 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp14275 = _mm512_permutex2var_ps(wt733, pm211, wt735);
__m512 tmp14276 = _mm512_permutex2var_ps(wt734, pm211, wt736);
__m512 tmp14277 = _mm512_permutex2var_ps(wt733, pm212, wt735);
__m512 tmp14278 = _mm512_permutex2var_ps(wt734, pm212, wt736);
__m512 in1914 = _mm512_permutex2var_ps(tmp14275, pm211, tmp14276);
__m512 in1915 = _mm512_permutex2var_ps(tmp14275, pm212, tmp14276);
__m512 in1916 = _mm512_permutex2var_ps(tmp14277, pm211, tmp14278);
// First transform pass: expand the three inputs (in1914, in1915, in1916)
// into eight linear combinations using coefficients 1, 2 and 4.
__m512 tmp14279 = _mm512_fmadd_ps(in1914, _mm512_set1_ps(4e+00f), in1916);
__m512 tmp14280 = _mm512_add_ps(in1914, in1916);
__m512 tmp14281 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(4e+00f), in1914);
__m512 tmp14282 = _mm512_add_ps(in1915, tmp14280);
__m512 tmp14283 = _mm512_fmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp14281);
tmp14281 = _mm512_fnmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp14281);
__m512 tmp14284 = _mm512_fnmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp14279);
tmp14279 = _mm512_fmadd_ps(in1915, _mm512_set1_ps(2e+00f), tmp14279);
tmp14280 = _mm512_sub_ps(tmp14280, in1915);
// Transpose the eight vectors: unpack pairs, shuffle within 128-bit lanes,
// then shuffle 128-bit lanes across registers.
__m512 tmp14301 = _mm512_unpacklo_ps(in1914, tmp14282);
__m512 tmp14302 = _mm512_unpackhi_ps(in1914, tmp14282);
__m512 tmp14303 = _mm512_unpacklo_ps(tmp14280, tmp14283);
__m512 tmp14304 = _mm512_unpackhi_ps(tmp14280, tmp14283);
__m512 tmp14305 = _mm512_unpacklo_ps(tmp14281, tmp14279);
__m512 tmp14306 = _mm512_unpackhi_ps(tmp14281, tmp14279);
__m512 tmp14307 = _mm512_unpacklo_ps(tmp14284, in1916);
__m512 tmp14308 = _mm512_unpackhi_ps(tmp14284, in1916);
__m512 tmp14309 = _mm512_shuffle_ps(tmp14301, tmp14303, 68);
__m512 tmp14310 = _mm512_shuffle_ps(tmp14301, tmp14303, 238);
__m512 tmp14311 = _mm512_shuffle_ps(tmp14302, tmp14304, 68);
__m512 tmp14312 = _mm512_shuffle_ps(tmp14302, tmp14304, 238);
__m512 tmp14313 = _mm512_shuffle_ps(tmp14305, tmp14307, 68);
__m512 tmp14314 = _mm512_shuffle_ps(tmp14305, tmp14307, 238);
__m512 tmp14315 = _mm512_shuffle_ps(tmp14306, tmp14308, 68);
__m512 tmp14316 = _mm512_shuffle_ps(tmp14306, tmp14308, 238);
__m512 tmp14317 = _mm512_shuffle_f32x4(tmp14309, tmp14313, 136);
__m512 tmp14318 = _mm512_shuffle_f32x4(tmp14309, tmp14313, 221);
__m512 tmp14319 = _mm512_shuffle_f32x4(tmp14310, tmp14314, 136);
__m512 tmp14320 = _mm512_shuffle_f32x4(tmp14310, tmp14314, 221);
__m512 tmp14321 = _mm512_shuffle_f32x4(tmp14311, tmp14315, 136);
__m512 tmp14322 = _mm512_shuffle_f32x4(tmp14311, tmp14315, 221);
__m512 tmp14323 = _mm512_shuffle_f32x4(tmp14312, tmp14316, 136);
__m512 tmp14324 = _mm512_shuffle_f32x4(tmp14312, tmp14316, 221);
// Registers from the first pass are reused below as outputs of the transpose.
in1914 = _mm512_shuffle_f32x4(tmp14317, tmp14317, 136);
__m512 tmp14285 = _mm512_shuffle_f32x4(tmp14317, tmp14317, 221);
tmp14282 = _mm512_shuffle_f32x4(tmp14319, tmp14319, 136);
__m512 tmp14286 = _mm512_shuffle_f32x4(tmp14319, tmp14319, 221);
tmp14280 = _mm512_shuffle_f32x4(tmp14321, tmp14321, 136);
__m512 tmp14287 = _mm512_shuffle_f32x4(tmp14321, tmp14321, 221);
tmp14283 = _mm512_shuffle_f32x4(tmp14323, tmp14323, 136);
__m512 tmp14288 = _mm512_shuffle_f32x4(tmp14323, tmp14323, 221);
tmp14281 = _mm512_shuffle_f32x4(tmp14318, tmp14318, 136);
tmp14279 = _mm512_shuffle_f32x4(tmp14320, tmp14320, 136);
tmp14284 = _mm512_shuffle_f32x4(tmp14322, tmp14322, 136);
in1916 = _mm512_shuffle_f32x4(tmp14324, tmp14324, 136);
// Pack pairs of half-filled registers together (low 256 bits of each source).
in1914 = _mm512_shuffle_f32x4(in1914, tmp14283, 68);
tmp14282 = _mm512_shuffle_f32x4(tmp14282, tmp14281, 68);
tmp14280 = _mm512_shuffle_f32x4(tmp14280, tmp14279, 68);
tmp14284 = _mm512_shuffle_f32x4(tmp14284, tmp14286, 68);
in1916 = _mm512_shuffle_f32x4(in1916, tmp14287, 68);
tmp14285 = _mm512_shuffle_f32x4(tmp14285, tmp14288, 68);
// Second transform pass: the same 3-to-8 expansion as above, applied to two
// independent triples: (in1914, tmp14282, tmp14280) and
// (tmp14284, in1916, tmp14285).
__m512 tmp14289 = _mm512_fmadd_ps(in1914, _mm512_set1_ps(4e+00f), tmp14280);
__m512 tmp14295 = _mm512_fmadd_ps(tmp14284, _mm512_set1_ps(4e+00f), tmp14285);
__m512 tmp14290 = _mm512_add_ps(in1914, tmp14280);
__m512 tmp14296 = _mm512_add_ps(tmp14284, tmp14285);
__m512 tmp14291 = _mm512_fmadd_ps(tmp14280, _mm512_set1_ps(4e+00f), in1914);
__m512 tmp14297 = _mm512_fmadd_ps(tmp14285, _mm512_set1_ps(4e+00f), tmp14284);
__m512 tmp14292 = _mm512_add_ps(tmp14282, tmp14290);
__m512 tmp14298 = _mm512_add_ps(in1916, tmp14296);
__m512 tmp14293 = _mm512_fmadd_ps(tmp14282, _mm512_set1_ps(2e+00f), tmp14291);
__m512 tmp14299 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp14297);
tmp14291 = _mm512_fnmadd_ps(tmp14282, _mm512_set1_ps(2e+00f), tmp14291);
tmp14297 = _mm512_fnmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp14297);
__m512 tmp14294 = _mm512_fnmadd_ps(tmp14282, _mm512_set1_ps(2e+00f), tmp14289);
__m512 tmp14300 = _mm512_fnmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp14295);
tmp14289 = _mm512_fmadd_ps(tmp14282, _mm512_set1_ps(2e+00f), tmp14289);
tmp14295 = _mm512_fmadd_ps(in1916, _mm512_set1_ps(2e+00f), tmp14295);
tmp14290 = _mm512_sub_ps(tmp14290, tmp14282);
tmp14296 = _mm512_sub_ps(tmp14296, in1916);
// Per-lane normalisation. Every factor is a product of two values drawn from
// {1, -2.2222222e-01, 1.1111111e-02, 5.5555557e-03}; the same 8-lane pattern
// is repeated in both 256-bit halves of each constant.
in1914 = _mm512_mul_ps(in1914, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp14292 = _mm512_mul_ps(tmp14292, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp14290 = _mm512_mul_ps(tmp14290, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp14293 = _mm512_mul_ps(tmp14293, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp14291 = _mm512_mul_ps(tmp14291, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp14289 = _mm512_mul_ps(tmp14289, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp14294 = _mm512_mul_ps(tmp14294, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp14280 = _mm512_mul_ps(tmp14280, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp14284 = _mm512_mul_ps(tmp14284, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp14298 = _mm512_mul_ps(tmp14298, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp14296 = _mm512_mul_ps(tmp14296, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp14299 = _mm512_mul_ps(tmp14299, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp14297 = _mm512_mul_ps(tmp14297, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp14295 = _mm512_mul_ps(tmp14295, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp14300 = _mm512_mul_ps(tmp14300, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp14285 = _mm512_mul_ps(tmp14285, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup: selector 68 joins the low 256 bits of both sources, 238 the high.
__m512 out1743 = _mm512_shuffle_f32x4(in1914, tmp14292, 68);
__m512 out1747 = _mm512_shuffle_f32x4(in1914, tmp14292, 238);
__m512 out1744 = _mm512_shuffle_f32x4(tmp14290, tmp14293, 68);
__m512 out1748 = _mm512_shuffle_f32x4(tmp14290, tmp14293, 238);
__m512 out1745 = _mm512_shuffle_f32x4(tmp14291, tmp14289, 68);
__m512 out1749 = _mm512_shuffle_f32x4(tmp14291, tmp14289, 238);
__m512 out1746 = _mm512_shuffle_f32x4(tmp14294, tmp14280, 68);
__m512 out1750 = _mm512_shuffle_f32x4(tmp14294, tmp14280, 238);
__m512 out1751 = _mm512_shuffle_f32x4(tmp14284, tmp14298, 68);
__m512 out1755 = _mm512_shuffle_f32x4(tmp14284, tmp14298, 238);
__m512 out1752 = _mm512_shuffle_f32x4(tmp14296, tmp14299, 68);
__m512 out1756 = _mm512_shuffle_f32x4(tmp14296, tmp14299, 238);
__m512 out1753 = _mm512_shuffle_f32x4(tmp14297, tmp14295, 68);
__m512 out1757 = _mm512_shuffle_f32x4(tmp14297, tmp14295, 238);
__m512 out1754 = _mm512_shuffle_f32x4(tmp14300, tmp14285, 68);
__m512 out1758 = _mm512_shuffle_f32x4(tmp14300, tmp14285, 238);
// Byte offsets of four consecutive 32-byte slots; with cut26 == 0 these
// evaluate to 0, 32, 64 and 96.
ptrdiff_t off13 = 32*cut26;
ptrdiff_t off14 = (size_t)(cut26+1)/4*2048+(size_t)(cut26+1)%4*32;
ptrdiff_t off15 = (size_t)(cut26+2)/4*2048+(size_t)(cut26+2)%4*32;
ptrdiff_t off16 = (size_t)(cut26+3)/4*2048+(size_t)(cut26+3)%4*32;
// Convert each vector of 16 floats to 16 half-precision values (256 bits),
// rounding to nearest with exceptions suppressed.
__m512i wf169 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1743, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf170 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1747, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf171 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1751, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf172 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1755, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf173 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1744, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf174 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1748, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf175 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1752, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf176 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1756, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf177 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1745, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf178 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1749, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf179 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1753, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf180 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1757, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf181 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1746, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf182 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1750, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf183 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1754, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf184 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1758, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 writes only the low eight 32-bit lanes, i.e. the 32 bytes that
// hold the 16 half-precision values. Four 8192-byte sections, four slots each.
_mm512_mask_storeu_epi32(wfPtr16+0+32768*i70+2048*k172+off13+128*s72, 255, wf169);
_mm512_mask_storeu_epi32(wfPtr16+0+32768*i70+2048*k172+off14+128*s72, 255, wf170);
_mm512_mask_storeu_epi32(wfPtr16+0+32768*i70+2048*k172+off15+128*s72, 255, wf171);
_mm512_mask_storeu_epi32(wfPtr16+0+32768*i70+2048*k172+off16+128*s72, 255, wf172);
_mm512_mask_storeu_epi32(wfPtr16+8192+32768*i70+2048*k172+off13+128*s72, 255, wf173);
_mm512_mask_storeu_epi32(wfPtr16+8192+32768*i70+2048*k172+off14+128*s72, 255, wf174);
_mm512_mask_storeu_epi32(wfPtr16+8192+32768*i70+2048*k172+off15+128*s72, 255, wf175);
_mm512_mask_storeu_epi32(wfPtr16+8192+32768*i70+2048*k172+off16+128*s72, 255, wf176);
_mm512_mask_storeu_epi32(wfPtr16+16384+32768*i70+2048*k172+off13+128*s72, 255, wf177);
_mm512_mask_storeu_epi32(wfPtr16+16384+32768*i70+2048*k172+off14+128*s72, 255, wf178);
_mm512_mask_storeu_epi32(wfPtr16+16384+32768*i70+2048*k172+off15+128*s72, 255, wf179);
_mm512_mask_storeu_epi32(wfPtr16+16384+32768*i70+2048*k172+off16+128*s72, 255, wf180);
_mm512_mask_storeu_epi32(wfPtr16+24576+32768*i70+2048*k172+off13+128*s72, 255, wf181);
_mm512_mask_storeu_epi32(wfPtr16+24576+32768*i70+2048*k172+off14+128*s72, 255, wf182);
_mm512_mask_storeu_epi32(wfPtr16+24576+32768*i70+2048*k172+off15+128*s72, 255, wf183);
_mm512_mask_storeu_epi32(wfPtr16+24576+32768*i70+2048*k172+off16+128*s72, 255, wf184);
}
// Bias for the same four rows: bias*multiplier + addend.
__m512 bias7 = _mm512_setzero_ps();
// Always taken here because e34 is 0.
if (!e34) {
// Mask 15 loads four floats.
bias7 = _mm512_maskz_loadu_ps(15, biasPtr19-0+64*i70+16*j62);
// Even lanes of the pair array are multipliers, odd lanes are addends.
__m512i pmMul41 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd41 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Mask 255 loads eight floats = four (multiplier, addend) pairs.
__m512 mas13 = _mm512_maskz_loadu_ps(255, bnPtr20+(ptrdiff_t)8*(0+16*i70+4*j62));
__m512 postMul63 = _mm512_permutexvar_ps(pmMul41, mas13);
__m512 postAdd41 = _mm512_permutexvar_ps(pmAdd41, mas13);
bias7 = _mm512_fmadd_ps(bias7, postMul63, postAdd41);
}
// Store the four resulting floats into the bias area of the destination.
_mm512_mask_storeu_ps(bfPtr16-0+64*i70+16*j62, 15, bias7);
}
}
}
}

// Builds a three-dimensional threader task with hull extents 1, 16 and 1,
// points it at ResNeXt50ThreeArrangeFilts4Callee1 with the tensor pointer
// array as its opaque argument, and runs it on the given team. The callee
// picks its slice from coordinate index 1, the dimension whose extent is 16.
static void ResNeXt50ThreeArrangeFilts4(ResNeXt50ThreaderTeam1* team72, char** tensors117) {
    ResNeXt50ThreaderTask1 job;

    // Iteration space: nd1 dimensions with the extents stored in hull1[].
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 16;
    job.hull1[2] = 1;

    // Work function and the payload it receives through the task.
    job.any1 = tensors117;
    job.callee1 = ResNeXt50ThreeArrangeFilts4Callee1;

    ResNeXt50ThreaderDo1(team72, &job);
}

static void ResNeXt50ThreeArrangeDats4Callee1(ResNeXt50ThreaderTask1* task122, int64_t* pt66) {
char** tensors120 = task122->any1;
ptrdiff_t s73 = 0;
ptrdiff_t c59 = 0;
ptrdiff_t g38 = pt66[2];
ptrdiff_t e35 = 0;
char*restrict datPtr38 = tensors120[0]-60+329472*e35;
char*restrict dfPtr16 = tensors120[1]+29196288*e35;
ptrdiff_t i71 = 4*g38;
ptrdiff_t ii53 = i71+3;
for (; i71 <= ii53; ++i71) {
ptrdiff_t j63 = 2*c59;
ptrdiff_t rel25 = j63-0;
ptrdiff_t base25 = 0;
if (rel25 < 1) {
ptrdiff_t h52 = base25+0;
ptrdiff_t w70 = 0;
ptrdiff_t k173 = 0;
for (; k173 != 8; ++k173) {
__m512 dat2482 = _mm512_maskz_loadu_ps(127, datPtr38+340+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512i pm213 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1924 = _mm512_permutexvar_ps(pm213, dat2482);
__m512 dat2483 = _mm512_maskz_loadu_ps(16383, datPtr38+60+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2484 = _mm512_maskz_loadu_ps(127, datPtr38+396+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512i pm214 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1917 = _mm512_permutexvar_ps(pm214, dat2483);
__m512i pm215 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in1925 = _mm512_permutex2var_ps(dat2483, pm215, dat2484);
__m512 dat2485 = _mm512_maskz_loadu_ps(16383, datPtr38+116+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2486 = _mm512_maskz_loadu_ps(127, datPtr38+452+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1918 = _mm512_permutexvar_ps(pm214, dat2485);
__m512 in1926 = _mm512_permutex2var_ps(dat2485, pm215, dat2486);
__m512 dat2487 = _mm512_maskz_loadu_ps(16383, datPtr38+172+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2488 = _mm512_maskz_loadu_ps(127, datPtr38+508+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1919 = _mm512_permutexvar_ps(pm214, dat2487);
__m512 in1927 = _mm512_permutex2var_ps(dat2487, pm215, dat2488);
__m512 dat2489 = _mm512_maskz_loadu_ps(16383, datPtr38+228+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2490 = _mm512_maskz_loadu_ps(127, datPtr38+564+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1920 = _mm512_permutexvar_ps(pm214, dat2489);
__m512 in1928 = _mm512_permutex2var_ps(dat2489, pm215, dat2490);
__m512 dat2491 = _mm512_maskz_loadu_ps(16383, datPtr38+284+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2492 = _mm512_maskz_loadu_ps(127, datPtr38+620+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1921 = _mm512_permutexvar_ps(pm214, dat2491);
__m512 in1929 = _mm512_permutex2var_ps(dat2491, pm215, dat2492);
__m512 dat2493 = _mm512_maskz_loadu_ps(16383, datPtr38+340+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2494 = _mm512_maskz_loadu_ps(127, datPtr38+676+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1922 = _mm512_permutexvar_ps(pm214, dat2493);
__m512 in1930 = _mm512_permutex2var_ps(dat2493, pm215, dat2494);
__m512 dat2495 = _mm512_maskz_loadu_ps(16383, datPtr38+396+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2496 = _mm512_maskz_loadu_ps(127, datPtr38+732+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1923 = _mm512_permutexvar_ps(pm214, dat2495);
__m512 in1931 = _mm512_permutex2var_ps(dat2495, pm215, dat2496);
__m512 tmp14325 = _mm512_add_ps(in1917, in1921);
__m512 tmp14330 = _mm512_add_ps(in1925, in1929);
__m512 tmp14326 = _mm512_sub_ps(in1920, in1918);
__m512 tmp14331 = _mm512_sub_ps(in1928, in1926);
__m512 tmp14327 = _mm512_add_ps(in1918, in1922);
__m512 tmp14332 = _mm512_add_ps(in1926, in1930);
__m512 tmp14328 = _mm512_sub_ps(_mm512_setzero_ps(), in1922);
in1924 = _mm512_sub_ps(in1924, in1930);
tmp14325 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-4.25e+00f), tmp14325);
tmp14330 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-4.25e+00f), tmp14330);
tmp14327 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-4.25e+00f), tmp14327);
tmp14332 = _mm512_fmadd_ps(in1928, _mm512_set1_ps(-4.25e+00f), tmp14332);
tmp14328 = _mm512_fmadd_ps(tmp14326, _mm512_set1_ps(5.25e+00f), tmp14328);
in1924 = _mm512_fmadd_ps(tmp14331, _mm512_set1_ps(5.25e+00f), in1924);
tmp14326 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(2.5e-01f), in1922);
tmp14331 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(2.5e-01f), in1930);
in1918 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(4e+00f), in1922);
in1926 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(4e+00f), in1930);
__m512 tmp14329 = _mm512_sub_ps(tmp14327, tmp14325);
__m512 tmp14333 = _mm512_sub_ps(tmp14332, tmp14330);
tmp14327 = _mm512_add_ps(tmp14325, tmp14327);
tmp14332 = _mm512_add_ps(tmp14330, tmp14332);
tmp14325 = _mm512_fmadd_ps(in1917, _mm512_set1_ps(2.5e-01f), in1921);
tmp14330 = _mm512_fmadd_ps(in1925, _mm512_set1_ps(2.5e-01f), in1929);
tmp14326 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-1.25e+00f), tmp14326);
tmp14331 = _mm512_fmadd_ps(in1928, _mm512_set1_ps(-1.25e+00f), tmp14331);
in1920 = _mm512_fmadd_ps(in1920, _mm512_set1_ps(-5e+00f), in1918);
in1928 = _mm512_fmadd_ps(in1928, _mm512_set1_ps(-5e+00f), in1926);
tmp14325 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-1.25e+00f), tmp14325);
tmp14330 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-1.25e+00f), tmp14330);
in1922 = _mm512_fmadd_ps(tmp14325, _mm512_set1_ps(2e+00f), tmp14326);
in1930 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(2e+00f), tmp14331);
tmp14326 = _mm512_fnmadd_ps(tmp14325, _mm512_set1_ps(2e+00f), tmp14326);
tmp14331 = _mm512_fnmadd_ps(tmp14330, _mm512_set1_ps(2e+00f), tmp14331);
tmp14325 = _mm512_fmadd_ps(in1921, _mm512_set1_ps(2.5e-01f), in1917);
tmp14330 = _mm512_fmadd_ps(in1929, _mm512_set1_ps(2.5e-01f), in1925);
in1917 = _mm512_sub_ps(in1923, in1917);
in1925 = _mm512_sub_ps(in1931, in1925);
tmp14325 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(-1.25e+00f), tmp14325);
tmp14330 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(-1.25e+00f), tmp14330);
in1919 = _mm512_sub_ps(in1919, in1921);
in1927 = _mm512_sub_ps(in1927, in1929);
in1919 = _mm512_fmadd_ps(in1919, _mm512_set1_ps(5.25e+00f), in1917);
in1927 = _mm512_fmadd_ps(in1927, _mm512_set1_ps(5.25e+00f), in1925);
in1918 = _mm512_fmadd_ps(tmp14325, _mm512_set1_ps(2e+00f), in1920);
in1926 = _mm512_fmadd_ps(tmp14330, _mm512_set1_ps(2e+00f), in1928);
in1920 = _mm512_fnmadd_ps(tmp14325, _mm512_set1_ps(2e+00f), in1920);
in1928 = _mm512_fnmadd_ps(tmp14330, _mm512_set1_ps(2e+00f), in1928);
__m512 tmp14342 = _mm512_unpacklo_ps(tmp14328, tmp14327);
__m512 tmp14343 = _mm512_unpackhi_ps(tmp14328, tmp14327);
__m512 tmp14344 = _mm512_unpacklo_ps(tmp14329, in1922);
__m512 tmp14345 = _mm512_unpackhi_ps(tmp14329, in1922);
__m512 tmp14346 = _mm512_unpacklo_ps(tmp14326, in1918);
__m512 tmp14347 = _mm512_unpackhi_ps(tmp14326, in1918);
__m512 tmp14348 = _mm512_unpacklo_ps(in1920, in1919);
__m512 tmp14349 = _mm512_unpackhi_ps(in1920, in1919);
__m512 tmp14350 = _mm512_unpacklo_ps(in1924, tmp14332);
__m512 tmp14351 = _mm512_unpackhi_ps(in1924, tmp14332);
__m512 tmp14352 = _mm512_unpacklo_ps(tmp14333, in1930);
__m512 tmp14353 = _mm512_unpackhi_ps(tmp14333, in1930);
__m512 tmp14354 = _mm512_unpacklo_ps(tmp14331, in1926);
__m512 tmp14355 = _mm512_unpackhi_ps(tmp14331, in1926);
__m512 tmp14356 = _mm512_unpacklo_ps(in1928, in1927);
__m512 tmp14357 = _mm512_unpackhi_ps(in1928, in1927);
__m512 tmp14358 = _mm512_shuffle_ps(tmp14342, tmp14344, 68);
__m512 tmp14359 = _mm512_shuffle_ps(tmp14342, tmp14344, 238);
__m512 tmp14360 = _mm512_shuffle_ps(tmp14343, tmp14345, 68);
__m512 tmp14361 = _mm512_shuffle_ps(tmp14343, tmp14345, 238);
__m512 tmp14362 = _mm512_shuffle_ps(tmp14346, tmp14348, 68);
__m512 tmp14363 = _mm512_shuffle_ps(tmp14346, tmp14348, 238);
__m512 tmp14364 = _mm512_shuffle_ps(tmp14347, tmp14349, 68);
__m512 tmp14365 = _mm512_shuffle_ps(tmp14347, tmp14349, 238);
__m512 tmp14366 = _mm512_shuffle_ps(tmp14350, tmp14352, 68);
__m512 tmp14367 = _mm512_shuffle_ps(tmp14350, tmp14352, 238);
__m512 tmp14368 = _mm512_shuffle_ps(tmp14351, tmp14353, 68);
__m512 tmp14369 = _mm512_shuffle_ps(tmp14351, tmp14353, 238);
__m512 tmp14370 = _mm512_shuffle_ps(tmp14354, tmp14356, 68);
__m512 tmp14371 = _mm512_shuffle_ps(tmp14354, tmp14356, 238);
__m512 tmp14372 = _mm512_shuffle_ps(tmp14355, tmp14357, 68);
__m512 tmp14373 = _mm512_shuffle_ps(tmp14355, tmp14357, 238);
__m512 tmp14374 = _mm512_shuffle_f32x4(tmp14358, tmp14362, 136);
__m512 tmp14375 = _mm512_shuffle_f32x4(tmp14358, tmp14362, 221);
__m512 tmp14376 = _mm512_shuffle_f32x4(tmp14359, tmp14363, 136);
__m512 tmp14377 = _mm512_shuffle_f32x4(tmp14359, tmp14363, 221);
__m512 tmp14378 = _mm512_shuffle_f32x4(tmp14360, tmp14364, 136);
__m512 tmp14379 = _mm512_shuffle_f32x4(tmp14360, tmp14364, 221);
__m512 tmp14380 = _mm512_shuffle_f32x4(tmp14361, tmp14365, 136);
__m512 tmp14381 = _mm512_shuffle_f32x4(tmp14361, tmp14365, 221);
__m512 tmp14382 = _mm512_shuffle_f32x4(tmp14366, tmp14370, 136);
__m512 tmp14383 = _mm512_shuffle_f32x4(tmp14366, tmp14370, 221);
__m512 tmp14384 = _mm512_shuffle_f32x4(tmp14367, tmp14371, 136);
__m512 tmp14385 = _mm512_shuffle_f32x4(tmp14367, tmp14371, 221);
__m512 tmp14386 = _mm512_shuffle_f32x4(tmp14368, tmp14372, 136);
__m512 tmp14387 = _mm512_shuffle_f32x4(tmp14368, tmp14372, 221);
__m512 tmp14388 = _mm512_shuffle_f32x4(tmp14369, tmp14373, 136);
__m512 tmp14389 = _mm512_shuffle_f32x4(tmp14369, tmp14373, 221);
tmp14328 = _mm512_shuffle_f32x4(tmp14374, tmp14382, 136);
in1924 = _mm512_shuffle_f32x4(tmp14374, tmp14382, 221);
tmp14327 = _mm512_shuffle_f32x4(tmp14376, tmp14384, 136);
tmp14332 = _mm512_shuffle_f32x4(tmp14376, tmp14384, 221);
tmp14329 = _mm512_shuffle_f32x4(tmp14378, tmp14386, 136);
tmp14333 = _mm512_shuffle_f32x4(tmp14378, tmp14386, 221);
in1922 = _mm512_shuffle_f32x4(tmp14380, tmp14388, 136);
in1930 = _mm512_shuffle_f32x4(tmp14380, tmp14388, 221);
tmp14326 = _mm512_shuffle_f32x4(tmp14375, tmp14383, 136);
tmp14331 = _mm512_shuffle_f32x4(tmp14375, tmp14383, 221);
in1918 = _mm512_shuffle_f32x4(tmp14377, tmp14385, 136);
in1926 = _mm512_shuffle_f32x4(tmp14377, tmp14385, 221);
in1920 = _mm512_shuffle_f32x4(tmp14379, tmp14387, 136);
in1928 = _mm512_shuffle_f32x4(tmp14379, tmp14387, 221);
in1919 = _mm512_shuffle_f32x4(tmp14381, tmp14389, 136);
in1927 = _mm512_shuffle_f32x4(tmp14381, tmp14389, 221);
__m512 tmp14334 = _mm512_add_ps(tmp14327, in1918);
__m512 tmp14338 = _mm512_add_ps(tmp14332, in1926);
__m512 tmp14335 = _mm512_sub_ps(tmp14326, tmp14329);
__m512 tmp14339 = _mm512_sub_ps(tmp14331, tmp14333);
__m512 tmp14336 = _mm512_add_ps(tmp14329, in1920);
__m512 tmp14340 = _mm512_add_ps(tmp14333, in1928);
tmp14328 = _mm512_sub_ps(tmp14328, in1920);
in1924 = _mm512_sub_ps(in1924, in1928);
tmp14334 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-4.25e+00f), tmp14334);
tmp14338 = _mm512_fmadd_ps(in1930, _mm512_set1_ps(-4.25e+00f), tmp14338);
tmp14336 = _mm512_fmadd_ps(tmp14326, _mm512_set1_ps(-4.25e+00f), tmp14336);
tmp14340 = _mm512_fmadd_ps(tmp14331, _mm512_set1_ps(-4.25e+00f), tmp14340);
tmp14328 = _mm512_fmadd_ps(tmp14335, _mm512_set1_ps(5.25e+00f), tmp14328);
in1924 = _mm512_fmadd_ps(tmp14339, _mm512_set1_ps(5.25e+00f), in1924);
tmp14335 = _mm512_fmadd_ps(tmp14329, _mm512_set1_ps(2.5e-01f), in1920);
tmp14339 = _mm512_fmadd_ps(tmp14333, _mm512_set1_ps(2.5e-01f), in1928);
tmp14329 = _mm512_fmadd_ps(tmp14329, _mm512_set1_ps(4e+00f), in1920);
tmp14333 = _mm512_fmadd_ps(tmp14333, _mm512_set1_ps(4e+00f), in1928);
__m512 tmp14337 = _mm512_sub_ps(tmp14336, tmp14334);
__m512 tmp14341 = _mm512_sub_ps(tmp14340, tmp14338);
tmp14336 = _mm512_add_ps(tmp14334, tmp14336);
tmp14340 = _mm512_add_ps(tmp14338, tmp14340);
tmp14334 = _mm512_fmadd_ps(tmp14327, _mm512_set1_ps(2.5e-01f), in1918);
tmp14338 = _mm512_fmadd_ps(tmp14332, _mm512_set1_ps(2.5e-01f), in1926);
tmp14335 = _mm512_fmadd_ps(tmp14326, _mm512_set1_ps(-1.25e+00f), tmp14335);
tmp14339 = _mm512_fmadd_ps(tmp14331, _mm512_set1_ps(-1.25e+00f), tmp14339);
tmp14326 = _mm512_fmadd_ps(tmp14326, _mm512_set1_ps(-5e+00f), tmp14329);
tmp14331 = _mm512_fmadd_ps(tmp14331, _mm512_set1_ps(-5e+00f), tmp14333);
tmp14334 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-1.25e+00f), tmp14334);
tmp14338 = _mm512_fmadd_ps(in1930, _mm512_set1_ps(-1.25e+00f), tmp14338);
in1920 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(2e+00f), tmp14335);
in1928 = _mm512_fmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14339);
tmp14335 = _mm512_fnmadd_ps(tmp14334, _mm512_set1_ps(2e+00f), tmp14335);
tmp14339 = _mm512_fnmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14339);
tmp14334 = _mm512_fmadd_ps(in1918, _mm512_set1_ps(2.5e-01f), tmp14327);
tmp14338 = _mm512_fmadd_ps(in1926, _mm512_set1_ps(2.5e-01f), tmp14332);
tmp14327 = _mm512_sub_ps(in1919, tmp14327);
tmp14332 = _mm512_sub_ps(in1927, tmp14332);
tmp14334 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(-1.25e+00f), tmp14334);
tmp14338 = _mm512_fmadd_ps(in1930, _mm512_set1_ps(-1.25e+00f), tmp14338);
in1922 = _mm512_sub_ps(in1922, in1918);
in1930 = _mm512_sub_ps(in1930, in1926);
in1922 = _mm512_fmadd_ps(in1922, _mm512_set1_ps(5.25e+00f), tmp14327);
in1930 = _mm512_fmadd_ps(in1930, _mm512_set1_ps(5.25e+00f), tmp14332);
tmp14329 = _mm512_fmadd_ps(tmp14334, _mm512_set1_ps(2e+00f), tmp14326);
tmp14333 = _mm512_fmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14331);
tmp14326 = _mm512_fnmadd_ps(tmp14334, _mm512_set1_ps(2e+00f), tmp14326);
tmp14331 = _mm512_fnmadd_ps(tmp14338, _mm512_set1_ps(2e+00f), tmp14331);
__m512 out1759 = _mm512_shuffle_f32x4(tmp14328, tmp14336, 68);
__m512 out1767 = _mm512_shuffle_f32x4(tmp14328, tmp14336, 238);
__m512 out1760 = _mm512_shuffle_f32x4(tmp14337, in1920, 68);
__m512 out1768 = _mm512_shuffle_f32x4(tmp14337, in1920, 238);
__m512 out1761 = _mm512_shuffle_f32x4(tmp14335, tmp14329, 68);
__m512 out1769 = _mm512_shuffle_f32x4(tmp14335, tmp14329, 238);
__m512 out1762 = _mm512_shuffle_f32x4(tmp14326, in1922, 68);
__m512 out1770 = _mm512_shuffle_f32x4(tmp14326, in1922, 238);
__m512 out1763 = _mm512_shuffle_f32x4(in1924, tmp14340, 68);
__m512 out1771 = _mm512_shuffle_f32x4(in1924, tmp14340, 238);
__m512 out1764 = _mm512_shuffle_f32x4(tmp14341, in1928, 68);
__m512 out1772 = _mm512_shuffle_f32x4(tmp14341, in1928, 238);
__m512 out1765 = _mm512_shuffle_f32x4(tmp14339, tmp14333, 68);
__m512 out1773 = _mm512_shuffle_f32x4(tmp14339, tmp14333, 238);
__m512 out1766 = _mm512_shuffle_f32x4(tmp14331, in1930, 68);
__m512 out1774 = _mm512_shuffle_f32x4(tmp14331, in1930, 238);
_mm512_storeu_ps(dfPtr16+0+36864*i71+6144*j63+6144*s73+768*k173, out1759);
_mm512_storeu_ps(dfPtr16+128+36864*i71+6144*j63+6144*s73+768*k173, out1767);
_mm512_storeu_ps(dfPtr16+64+36864*i71+6144*j63+6144*s73+768*k173, out1763);
_mm512_storeu_ps(dfPtr16+192+36864*i71+6144*j63+6144*s73+768*k173, out1771);
_mm512_storeu_ps(dfPtr16+9216+36864*i71+6144*j63+6144*s73+768*k173, out1760);
_mm512_storeu_ps(dfPtr16+9344+36864*i71+6144*j63+6144*s73+768*k173, out1768);
_mm512_storeu_ps(dfPtr16+9280+36864*i71+6144*j63+6144*s73+768*k173, out1764);
_mm512_storeu_ps(dfPtr16+9408+36864*i71+6144*j63+6144*s73+768*k173, out1772);
_mm512_storeu_ps(dfPtr16+18432+36864*i71+6144*j63+6144*s73+768*k173, out1761);
_mm512_storeu_ps(dfPtr16+18560+36864*i71+6144*j63+6144*s73+768*k173, out1769);
_mm512_storeu_ps(dfPtr16+18496+36864*i71+6144*j63+6144*s73+768*k173, out1765);
_mm512_storeu_ps(dfPtr16+18624+36864*i71+6144*j63+6144*s73+768*k173, out1773);
_mm512_storeu_ps(dfPtr16+27648+36864*i71+6144*j63+6144*s73+768*k173, out1762);
_mm512_storeu_ps(dfPtr16+27776+36864*i71+6144*j63+6144*s73+768*k173, out1770);
_mm512_storeu_ps(dfPtr16+27712+36864*i71+6144*j63+6144*s73+768*k173, out1766);
_mm512_storeu_ps(dfPtr16+27840+36864*i71+6144*j63+6144*s73+768*k173, out1774);
__m512 dat2497 = _mm512_maskz_loadu_ps(511, datPtr38+360+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512i pm216 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1932 = _mm512_permutexvar_ps(pm216, dat2497);
__m512 dat2498 = _mm512_maskz_loadu_ps(511, datPtr38+416+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2499 = _mm512_maskz_loadu_ps(8191, datPtr38+892+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1933 = _mm512_permutexvar_ps(pm216, dat2498);
__m512i pm217 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1940 = _mm512_permutexvar_ps(pm217, dat2499);
__m512 dat2500 = _mm512_maskz_loadu_ps(511, datPtr38+472+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2501 = _mm512_maskz_loadu_ps(8191, datPtr38+948+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1934 = _mm512_permutexvar_ps(pm216, dat2500);
__m512 in1941 = _mm512_permutexvar_ps(pm217, dat2501);
__m512 dat2502 = _mm512_maskz_loadu_ps(511, datPtr38+528+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2503 = _mm512_maskz_loadu_ps(8191, datPtr38+1004+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1935 = _mm512_permutexvar_ps(pm216, dat2502);
__m512 in1942 = _mm512_permutexvar_ps(pm217, dat2503);
__m512 dat2504 = _mm512_maskz_loadu_ps(511, datPtr38+584+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2505 = _mm512_maskz_loadu_ps(8191, datPtr38+1060+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1936 = _mm512_permutexvar_ps(pm216, dat2504);
__m512 in1943 = _mm512_permutexvar_ps(pm217, dat2505);
__m512 dat2506 = _mm512_maskz_loadu_ps(511, datPtr38+640+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2507 = _mm512_maskz_loadu_ps(8191, datPtr38+1116+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1937 = _mm512_permutexvar_ps(pm216, dat2506);
__m512 in1944 = _mm512_permutexvar_ps(pm217, dat2507);
__m512 dat2508 = _mm512_maskz_loadu_ps(511, datPtr38+696+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2509 = _mm512_maskz_loadu_ps(8191, datPtr38+1172+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1938 = _mm512_permutexvar_ps(pm216, dat2508);
__m512 in1945 = _mm512_permutexvar_ps(pm217, dat2509);
__m512 dat2510 = _mm512_maskz_loadu_ps(511, datPtr38+752+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2511 = _mm512_maskz_loadu_ps(8191, datPtr38+1228+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1939 = _mm512_permutexvar_ps(pm216, dat2510);
__m512 in1946 = _mm512_permutexvar_ps(pm217, dat2511);
__m512 tmp14390 = _mm512_add_ps(in1933, in1937);
__m512 tmp14394 = _mm512_add_ps(in1940, in1944);
__m512 tmp14391 = _mm512_sub_ps(in1936, in1934);
__m512 tmp14395 = _mm512_sub_ps(in1943, in1941);
__m512 tmp14392 = _mm512_add_ps(in1934, in1938);
__m512 tmp14396 = _mm512_add_ps(in1941, in1945);
in1932 = _mm512_sub_ps(in1932, in1938);
__m512 tmp14397 = _mm512_sub_ps(_mm512_setzero_ps(), in1945);
tmp14390 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-4.25e+00f), tmp14390);
tmp14394 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-4.25e+00f), tmp14394);
tmp14392 = _mm512_fmadd_ps(in1936, _mm512_set1_ps(-4.25e+00f), tmp14392);
tmp14396 = _mm512_fmadd_ps(in1943, _mm512_set1_ps(-4.25e+00f), tmp14396);
in1932 = _mm512_fmadd_ps(tmp14391, _mm512_set1_ps(5.25e+00f), in1932);
tmp14397 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(5.25e+00f), tmp14397);
tmp14391 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(2.5e-01f), in1938);
tmp14395 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(2.5e-01f), in1945);
in1934 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(4e+00f), in1938);
in1941 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(4e+00f), in1945);
__m512 tmp14393 = _mm512_sub_ps(tmp14392, tmp14390);
__m512 tmp14398 = _mm512_sub_ps(tmp14396, tmp14394);
tmp14392 = _mm512_add_ps(tmp14390, tmp14392);
tmp14396 = _mm512_add_ps(tmp14394, tmp14396);
tmp14390 = _mm512_fmadd_ps(in1933, _mm512_set1_ps(2.5e-01f), in1937);
tmp14394 = _mm512_fmadd_ps(in1940, _mm512_set1_ps(2.5e-01f), in1944);
tmp14391 = _mm512_fmadd_ps(in1936, _mm512_set1_ps(-1.25e+00f), tmp14391);
tmp14395 = _mm512_fmadd_ps(in1943, _mm512_set1_ps(-1.25e+00f), tmp14395);
in1936 = _mm512_fmadd_ps(in1936, _mm512_set1_ps(-5e+00f), in1934);
in1943 = _mm512_fmadd_ps(in1943, _mm512_set1_ps(-5e+00f), in1941);
tmp14390 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-1.25e+00f), tmp14390);
tmp14394 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-1.25e+00f), tmp14394);
in1938 = _mm512_fmadd_ps(tmp14390, _mm512_set1_ps(2e+00f), tmp14391);
in1945 = _mm512_fmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), tmp14395);
tmp14391 = _mm512_fnmadd_ps(tmp14390, _mm512_set1_ps(2e+00f), tmp14391);
tmp14395 = _mm512_fnmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), tmp14395);
tmp14390 = _mm512_fmadd_ps(in1937, _mm512_set1_ps(2.5e-01f), in1933);
tmp14394 = _mm512_fmadd_ps(in1944, _mm512_set1_ps(2.5e-01f), in1940);
in1933 = _mm512_sub_ps(in1939, in1933);
in1940 = _mm512_sub_ps(in1946, in1940);
tmp14390 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(-1.25e+00f), tmp14390);
tmp14394 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(-1.25e+00f), tmp14394);
in1935 = _mm512_sub_ps(in1935, in1937);
in1942 = _mm512_sub_ps(in1942, in1944);
in1935 = _mm512_fmadd_ps(in1935, _mm512_set1_ps(5.25e+00f), in1933);
in1942 = _mm512_fmadd_ps(in1942, _mm512_set1_ps(5.25e+00f), in1940);
in1934 = _mm512_fmadd_ps(tmp14390, _mm512_set1_ps(2e+00f), in1936);
in1941 = _mm512_fmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), in1943);
in1936 = _mm512_fnmadd_ps(tmp14390, _mm512_set1_ps(2e+00f), in1936);
in1943 = _mm512_fnmadd_ps(tmp14394, _mm512_set1_ps(2e+00f), in1943);
__m512 tmp14407 = _mm512_unpacklo_ps(in1932, tmp14392);
__m512 tmp14408 = _mm512_unpackhi_ps(in1932, tmp14392);
__m512 tmp14409 = _mm512_unpacklo_ps(tmp14393, in1938);
__m512 tmp14410 = _mm512_unpackhi_ps(tmp14393, in1938);
__m512 tmp14411 = _mm512_unpacklo_ps(tmp14391, in1934);
__m512 tmp14412 = _mm512_unpackhi_ps(tmp14391, in1934);
__m512 tmp14413 = _mm512_unpacklo_ps(in1936, in1935);
__m512 tmp14414 = _mm512_unpackhi_ps(in1936, in1935);
__m512 tmp14415 = _mm512_unpacklo_ps(tmp14397, tmp14396);
__m512 tmp14416 = _mm512_unpackhi_ps(tmp14397, tmp14396);
__m512 tmp14417 = _mm512_unpacklo_ps(tmp14398, in1945);
__m512 tmp14418 = _mm512_unpackhi_ps(tmp14398, in1945);
__m512 tmp14419 = _mm512_unpacklo_ps(tmp14395, in1941);
__m512 tmp14420 = _mm512_unpackhi_ps(tmp14395, in1941);
__m512 tmp14421 = _mm512_unpacklo_ps(in1943, in1942);
__m512 tmp14422 = _mm512_unpackhi_ps(in1943, in1942);
__m512 tmp14423 = _mm512_shuffle_ps(tmp14407, tmp14409, 68);
__m512 tmp14424 = _mm512_shuffle_ps(tmp14407, tmp14409, 238);
__m512 tmp14425 = _mm512_shuffle_ps(tmp14408, tmp14410, 68);
__m512 tmp14426 = _mm512_shuffle_ps(tmp14408, tmp14410, 238);
__m512 tmp14427 = _mm512_shuffle_ps(tmp14411, tmp14413, 68);
__m512 tmp14428 = _mm512_shuffle_ps(tmp14411, tmp14413, 238);
__m512 tmp14429 = _mm512_shuffle_ps(tmp14412, tmp14414, 68);
__m512 tmp14430 = _mm512_shuffle_ps(tmp14412, tmp14414, 238);
__m512 tmp14431 = _mm512_shuffle_ps(tmp14415, tmp14417, 68);
__m512 tmp14432 = _mm512_shuffle_ps(tmp14415, tmp14417, 238);
__m512 tmp14433 = _mm512_shuffle_ps(tmp14416, tmp14418, 68);
__m512 tmp14434 = _mm512_shuffle_ps(tmp14416, tmp14418, 238);
__m512 tmp14435 = _mm512_shuffle_ps(tmp14419, tmp14421, 68);
__m512 tmp14436 = _mm512_shuffle_ps(tmp14419, tmp14421, 238);
__m512 tmp14437 = _mm512_shuffle_ps(tmp14420, tmp14422, 68);
__m512 tmp14438 = _mm512_shuffle_ps(tmp14420, tmp14422, 238);
__m512 tmp14439 = _mm512_shuffle_f32x4(tmp14423, tmp14427, 136);
__m512 tmp14440 = _mm512_shuffle_f32x4(tmp14423, tmp14427, 221);
__m512 tmp14441 = _mm512_shuffle_f32x4(tmp14424, tmp14428, 136);
__m512 tmp14442 = _mm512_shuffle_f32x4(tmp14424, tmp14428, 221);
__m512 tmp14443 = _mm512_shuffle_f32x4(tmp14425, tmp14429, 136);
__m512 tmp14444 = _mm512_shuffle_f32x4(tmp14425, tmp14429, 221);
__m512 tmp14445 = _mm512_shuffle_f32x4(tmp14426, tmp14430, 136);
__m512 tmp14446 = _mm512_shuffle_f32x4(tmp14426, tmp14430, 221);
__m512 tmp14447 = _mm512_shuffle_f32x4(tmp14431, tmp14435, 136);
__m512 tmp14448 = _mm512_shuffle_f32x4(tmp14431, tmp14435, 221);
__m512 tmp14449 = _mm512_shuffle_f32x4(tmp14432, tmp14436, 136);
__m512 tmp14450 = _mm512_shuffle_f32x4(tmp14432, tmp14436, 221);
__m512 tmp14451 = _mm512_shuffle_f32x4(tmp14433, tmp14437, 136);
__m512 tmp14452 = _mm512_shuffle_f32x4(tmp14433, tmp14437, 221);
__m512 tmp14453 = _mm512_shuffle_f32x4(tmp14434, tmp14438, 136);
__m512 tmp14454 = _mm512_shuffle_f32x4(tmp14434, tmp14438, 221);
in1932 = _mm512_shuffle_f32x4(tmp14439, tmp14447, 136);
tmp14397 = _mm512_shuffle_f32x4(tmp14439, tmp14447, 221);
tmp14392 = _mm512_shuffle_f32x4(tmp14441, tmp14449, 136);
tmp14396 = _mm512_shuffle_f32x4(tmp14441, tmp14449, 221);
tmp14393 = _mm512_shuffle_f32x4(tmp14443, tmp14451, 136);
tmp14398 = _mm512_shuffle_f32x4(tmp14443, tmp14451, 221);
in1938 = _mm512_shuffle_f32x4(tmp14445, tmp14453, 136);
in1945 = _mm512_shuffle_f32x4(tmp14445, tmp14453, 221);
tmp14391 = _mm512_shuffle_f32x4(tmp14440, tmp14448, 136);
tmp14395 = _mm512_shuffle_f32x4(tmp14440, tmp14448, 221);
in1934 = _mm512_shuffle_f32x4(tmp14442, tmp14450, 136);
in1941 = _mm512_shuffle_f32x4(tmp14442, tmp14450, 221);
in1936 = _mm512_shuffle_f32x4(tmp14444, tmp14452, 136);
in1943 = _mm512_shuffle_f32x4(tmp14444, tmp14452, 221);
in1935 = _mm512_shuffle_f32x4(tmp14446, tmp14454, 136);
in1942 = _mm512_shuffle_f32x4(tmp14446, tmp14454, 221);
__m512 tmp14399 = _mm512_add_ps(tmp14392, in1934);
__m512 tmp14403 = _mm512_add_ps(tmp14396, in1941);
__m512 tmp14400 = _mm512_sub_ps(tmp14391, tmp14393);
__m512 tmp14404 = _mm512_sub_ps(tmp14395, tmp14398);
__m512 tmp14401 = _mm512_add_ps(tmp14393, in1936);
__m512 tmp14405 = _mm512_add_ps(tmp14398, in1943);
in1932 = _mm512_sub_ps(in1932, in1936);
tmp14397 = _mm512_sub_ps(tmp14397, in1943);
tmp14399 = _mm512_fmadd_ps(in1938, _mm512_set1_ps(-4.25e+00f), tmp14399);
tmp14403 = _mm512_fmadd_ps(in1945, _mm512_set1_ps(-4.25e+00f), tmp14403);
tmp14401 = _mm512_fmadd_ps(tmp14391, _mm512_set1_ps(-4.25e+00f), tmp14401);
tmp14405 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-4.25e+00f), tmp14405);
in1932 = _mm512_fmadd_ps(tmp14400, _mm512_set1_ps(5.25e+00f), in1932);
tmp14397 = _mm512_fmadd_ps(tmp14404, _mm512_set1_ps(5.25e+00f), tmp14397);
tmp14400 = _mm512_fmadd_ps(tmp14393, _mm512_set1_ps(2.5e-01f), in1936);
tmp14404 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(2.5e-01f), in1943);
tmp14393 = _mm512_fmadd_ps(tmp14393, _mm512_set1_ps(4e+00f), in1936);
tmp14398 = _mm512_fmadd_ps(tmp14398, _mm512_set1_ps(4e+00f), in1943);
__m512 tmp14402 = _mm512_sub_ps(tmp14401, tmp14399);
__m512 tmp14406 = _mm512_sub_ps(tmp14405, tmp14403);
tmp14401 = _mm512_add_ps(tmp14399, tmp14401);
tmp14405 = _mm512_add_ps(tmp14403, tmp14405);
tmp14399 = _mm512_fmadd_ps(tmp14392, _mm512_set1_ps(2.5e-01f), in1934);
tmp14403 = _mm512_fmadd_ps(tmp14396, _mm512_set1_ps(2.5e-01f), in1941);
tmp14400 = _mm512_fmadd_ps(tmp14391, _mm512_set1_ps(-1.25e+00f), tmp14400);
tmp14404 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-1.25e+00f), tmp14404);
tmp14391 = _mm512_fmadd_ps(tmp14391, _mm512_set1_ps(-5e+00f), tmp14393);
tmp14395 = _mm512_fmadd_ps(tmp14395, _mm512_set1_ps(-5e+00f), tmp14398);
tmp14399 = _mm512_fmadd_ps(in1938, _mm512_set1_ps(-1.25e+00f), tmp14399);
tmp14403 = _mm512_fmadd_ps(in1945, _mm512_set1_ps(-1.25e+00f), tmp14403);
in1936 = _mm512_fmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14400);
in1943 = _mm512_fmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14404);
tmp14400 = _mm512_fnmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14400);
tmp14404 = _mm512_fnmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14404);
tmp14399 = _mm512_fmadd_ps(in1934, _mm512_set1_ps(2.5e-01f), tmp14392);
tmp14403 = _mm512_fmadd_ps(in1941, _mm512_set1_ps(2.5e-01f), tmp14396);
tmp14392 = _mm512_sub_ps(in1935, tmp14392);
tmp14396 = _mm512_sub_ps(in1942, tmp14396);
tmp14399 = _mm512_fmadd_ps(in1938, _mm512_set1_ps(-1.25e+00f), tmp14399);
tmp14403 = _mm512_fmadd_ps(in1945, _mm512_set1_ps(-1.25e+00f), tmp14403);
in1938 = _mm512_sub_ps(in1938, in1934);
in1945 = _mm512_sub_ps(in1945, in1941);
in1938 = _mm512_fmadd_ps(in1938, _mm512_set1_ps(5.25e+00f), tmp14392);
in1945 = _mm512_fmadd_ps(in1945, _mm512_set1_ps(5.25e+00f), tmp14396);
tmp14393 = _mm512_fmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14391);
tmp14398 = _mm512_fmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14395);
tmp14391 = _mm512_fnmadd_ps(tmp14399, _mm512_set1_ps(2e+00f), tmp14391);
tmp14395 = _mm512_fnmadd_ps(tmp14403, _mm512_set1_ps(2e+00f), tmp14395);
__m512 out1775 = _mm512_shuffle_f32x4(in1932, tmp14401, 68);
__m512 out1783 = _mm512_shuffle_f32x4(in1932, tmp14401, 238);
__m512 out1776 = _mm512_shuffle_f32x4(tmp14402, in1936, 68);
__m512 out1784 = _mm512_shuffle_f32x4(tmp14402, in1936, 238);
__m512 out1777 = _mm512_shuffle_f32x4(tmp14400, tmp14393, 68);
__m512 out1785 = _mm512_shuffle_f32x4(tmp14400, tmp14393, 238);
__m512 out1778 = _mm512_shuffle_f32x4(tmp14391, in1938, 68);
__m512 out1786 = _mm512_shuffle_f32x4(tmp14391, in1938, 238);
__m512 out1779 = _mm512_shuffle_f32x4(tmp14397, tmp14405, 68);
__m512 out1787 = _mm512_shuffle_f32x4(tmp14397, tmp14405, 238);
__m512 out1780 = _mm512_shuffle_f32x4(tmp14406, in1943, 68);
__m512 out1788 = _mm512_shuffle_f32x4(tmp14406, in1943, 238);
__m512 out1781 = _mm512_shuffle_f32x4(tmp14404, tmp14398, 68);
__m512 out1789 = _mm512_shuffle_f32x4(tmp14404, tmp14398, 238);
__m512 out1782 = _mm512_shuffle_f32x4(tmp14395, in1945, 68);
__m512 out1790 = _mm512_shuffle_f32x4(tmp14395, in1945, 238);
_mm512_storeu_ps(dfPtr16+256+36864*i71+6144*j63+6144*s73+768*k173, out1775);
_mm512_storeu_ps(dfPtr16+384+36864*i71+6144*j63+6144*s73+768*k173, out1783);
_mm512_storeu_ps(dfPtr16+320+36864*i71+6144*j63+6144*s73+768*k173, out1779);
_mm512_storeu_ps(dfPtr16+448+36864*i71+6144*j63+6144*s73+768*k173, out1787);
_mm512_storeu_ps(dfPtr16+9472+36864*i71+6144*j63+6144*s73+768*k173, out1776);
_mm512_storeu_ps(dfPtr16+9600+36864*i71+6144*j63+6144*s73+768*k173, out1784);
_mm512_storeu_ps(dfPtr16+9536+36864*i71+6144*j63+6144*s73+768*k173, out1780);
_mm512_storeu_ps(dfPtr16+9664+36864*i71+6144*j63+6144*s73+768*k173, out1788);
_mm512_storeu_ps(dfPtr16+18688+36864*i71+6144*j63+6144*s73+768*k173, out1777);
_mm512_storeu_ps(dfPtr16+18816+36864*i71+6144*j63+6144*s73+768*k173, out1785);
_mm512_storeu_ps(dfPtr16+18752+36864*i71+6144*j63+6144*s73+768*k173, out1781);
_mm512_storeu_ps(dfPtr16+18880+36864*i71+6144*j63+6144*s73+768*k173, out1789);
_mm512_storeu_ps(dfPtr16+27904+36864*i71+6144*j63+6144*s73+768*k173, out1778);
_mm512_storeu_ps(dfPtr16+28032+36864*i71+6144*j63+6144*s73+768*k173, out1786);
_mm512_storeu_ps(dfPtr16+27968+36864*i71+6144*j63+6144*s73+768*k173, out1782);
_mm512_storeu_ps(dfPtr16+28096+36864*i71+6144*j63+6144*s73+768*k173, out1790);
// Next tile of this k173 iteration: load 15 vectors, run a two-pass transform on
// two sets of eight vectors, and store 16 result vectors to dfPtr16.
// Each masked load zero-fills the lanes outside its mask (16383 = lanes 0-13,
// 7 = lanes 0-2), so permute index 15 always selects a zeroed lane. In the
// two-source permutes, indices 16-22 select lanes 0-6 of the second operand.
// NOTE(review): the address offsets look like byte offsets (stores are spaced 64
// apart) — confirm against the declarations of datPtr38 and dfPtr16.
__m512 dat2512 = _mm512_maskz_loadu_ps(16383, datPtr38+1172+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512i pm218 = _mm512_set_epi32(6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15);
__m512 in1947 = _mm512_permutexvar_ps(pm218, dat2512);
__m512i pm219 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in1955 = _mm512_permutexvar_ps(pm219, dat2512);
__m512 dat2513 = _mm512_maskz_loadu_ps(7, datPtr38+936+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2514 = _mm512_maskz_loadu_ps(16383, datPtr38+1228+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512i pm220 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in1948 = _mm512_permutex2var_ps(dat2513, pm220, dat2514);
__m512 in1956 = _mm512_permutexvar_ps(pm219, dat2514);
__m512 dat2515 = _mm512_maskz_loadu_ps(7, datPtr38+992+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2516 = _mm512_maskz_loadu_ps(16383, datPtr38+1284+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1949 = _mm512_permutex2var_ps(dat2515, pm220, dat2516);
__m512 in1957 = _mm512_permutexvar_ps(pm219, dat2516);
__m512 dat2517 = _mm512_maskz_loadu_ps(7, datPtr38+1048+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2518 = _mm512_maskz_loadu_ps(16383, datPtr38+1340+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1950 = _mm512_permutex2var_ps(dat2517, pm220, dat2518);
__m512 in1958 = _mm512_permutexvar_ps(pm219, dat2518);
__m512 dat2519 = _mm512_maskz_loadu_ps(7, datPtr38+1104+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2520 = _mm512_maskz_loadu_ps(16383, datPtr38+1396+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1951 = _mm512_permutex2var_ps(dat2519, pm220, dat2520);
__m512 in1959 = _mm512_permutexvar_ps(pm219, dat2520);
__m512 dat2521 = _mm512_maskz_loadu_ps(7, datPtr38+1160+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2522 = _mm512_maskz_loadu_ps(16383, datPtr38+1452+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1952 = _mm512_permutex2var_ps(dat2521, pm220, dat2522);
__m512 in1960 = _mm512_permutexvar_ps(pm219, dat2522);
__m512 dat2523 = _mm512_maskz_loadu_ps(7, datPtr38+1216+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2524 = _mm512_maskz_loadu_ps(16383, datPtr38+1508+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1953 = _mm512_permutex2var_ps(dat2523, pm220, dat2524);
__m512 in1961 = _mm512_permutexvar_ps(pm219, dat2524);
__m512 dat2525 = _mm512_maskz_loadu_ps(7, datPtr38+1272+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 dat2526 = _mm512_maskz_loadu_ps(16383, datPtr38+1564+13312*i71+56*h52+4*w70+13312*s73+1664*k173);
__m512 in1954 = _mm512_permutex2var_ps(dat2525, pm220, dat2526);
__m512 in1962 = _mm512_permutexvar_ps(pm219, dat2526);
// First pass: combine the eight vectors of each set (in1947..in1954 and
// in1955..in1962) lane-wise using the constants 5.25, -4.25, 0.25, 4, -1.25, -5
// and 2. Both sets get identical arithmetic, interleaved on alternating lines.
// Inputs are overwritten in place, so statement order matters.
// NOTE(review): the constants look like a Winograd F(6x6, 3x3) input transform — confirm.
__m512 tmp14455 = _mm512_add_ps(in1948, in1952);
__m512 tmp14459 = _mm512_add_ps(in1956, in1960);
__m512 tmp14456 = _mm512_sub_ps(in1951, in1949);
__m512 tmp14460 = _mm512_sub_ps(in1959, in1957);
__m512 tmp14457 = _mm512_add_ps(in1949, in1953);
__m512 tmp14461 = _mm512_add_ps(in1957, in1961);
in1947 = _mm512_sub_ps(in1947, in1953);
in1955 = _mm512_sub_ps(in1955, in1961);
tmp14455 = _mm512_fmadd_ps(in1950, _mm512_set1_ps(-4.25e+00f), tmp14455);
tmp14459 = _mm512_fmadd_ps(in1958, _mm512_set1_ps(-4.25e+00f), tmp14459);
tmp14457 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-4.25e+00f), tmp14457);
tmp14461 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-4.25e+00f), tmp14461);
in1947 = _mm512_fmadd_ps(tmp14456, _mm512_set1_ps(5.25e+00f), in1947);
in1955 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(5.25e+00f), in1955);
tmp14456 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(2.5e-01f), in1953);
tmp14460 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(2.5e-01f), in1961);
in1949 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(4e+00f), in1953);
in1957 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(4e+00f), in1961);
__m512 tmp14458 = _mm512_sub_ps(tmp14457, tmp14455);
__m512 tmp14462 = _mm512_sub_ps(tmp14461, tmp14459);
tmp14457 = _mm512_add_ps(tmp14455, tmp14457);
tmp14461 = _mm512_add_ps(tmp14459, tmp14461);
tmp14455 = _mm512_fmadd_ps(in1948, _mm512_set1_ps(2.5e-01f), in1952);
tmp14459 = _mm512_fmadd_ps(in1956, _mm512_set1_ps(2.5e-01f), in1960);
tmp14456 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-1.25e+00f), tmp14456);
tmp14460 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-1.25e+00f), tmp14460);
in1951 = _mm512_fmadd_ps(in1951, _mm512_set1_ps(-5e+00f), in1949);
in1959 = _mm512_fmadd_ps(in1959, _mm512_set1_ps(-5e+00f), in1957);
tmp14455 = _mm512_fmadd_ps(in1950, _mm512_set1_ps(-1.25e+00f), tmp14455);
tmp14459 = _mm512_fmadd_ps(in1958, _mm512_set1_ps(-1.25e+00f), tmp14459);
in1953 = _mm512_fmadd_ps(tmp14455, _mm512_set1_ps(2e+00f), tmp14456);
in1961 = _mm512_fmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), tmp14460);
tmp14456 = _mm512_fnmadd_ps(tmp14455, _mm512_set1_ps(2e+00f), tmp14456);
tmp14460 = _mm512_fnmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), tmp14460);
tmp14455 = _mm512_fmadd_ps(in1952, _mm512_set1_ps(2.5e-01f), in1948);
tmp14459 = _mm512_fmadd_ps(in1960, _mm512_set1_ps(2.5e-01f), in1956);
in1948 = _mm512_sub_ps(in1954, in1948);
in1956 = _mm512_sub_ps(in1962, in1956);
tmp14455 = _mm512_fmadd_ps(in1950, _mm512_set1_ps(-1.25e+00f), tmp14455);
tmp14459 = _mm512_fmadd_ps(in1958, _mm512_set1_ps(-1.25e+00f), tmp14459);
in1950 = _mm512_sub_ps(in1950, in1952);
in1958 = _mm512_sub_ps(in1958, in1960);
in1950 = _mm512_fmadd_ps(in1950, _mm512_set1_ps(5.25e+00f), in1948);
in1958 = _mm512_fmadd_ps(in1958, _mm512_set1_ps(5.25e+00f), in1956);
in1949 = _mm512_fmadd_ps(tmp14455, _mm512_set1_ps(2e+00f), in1951);
in1957 = _mm512_fmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), in1959);
in1951 = _mm512_fnmadd_ps(tmp14455, _mm512_set1_ps(2e+00f), in1951);
in1959 = _mm512_fnmadd_ps(tmp14459, _mm512_set1_ps(2e+00f), in1959);
// 16x16 transpose of the 16 first-pass results, stage 1: interleave 32-bit
// elements of register pairs within each 128-bit lane.
__m512 tmp14471 = _mm512_unpacklo_ps(in1947, tmp14457);
__m512 tmp14472 = _mm512_unpackhi_ps(in1947, tmp14457);
__m512 tmp14473 = _mm512_unpacklo_ps(tmp14458, in1953);
__m512 tmp14474 = _mm512_unpackhi_ps(tmp14458, in1953);
__m512 tmp14475 = _mm512_unpacklo_ps(tmp14456, in1949);
__m512 tmp14476 = _mm512_unpackhi_ps(tmp14456, in1949);
__m512 tmp14477 = _mm512_unpacklo_ps(in1951, in1950);
__m512 tmp14478 = _mm512_unpackhi_ps(in1951, in1950);
__m512 tmp14479 = _mm512_unpacklo_ps(in1955, tmp14461);
__m512 tmp14480 = _mm512_unpackhi_ps(in1955, tmp14461);
__m512 tmp14481 = _mm512_unpacklo_ps(tmp14462, in1961);
__m512 tmp14482 = _mm512_unpackhi_ps(tmp14462, in1961);
__m512 tmp14483 = _mm512_unpacklo_ps(tmp14460, in1957);
__m512 tmp14484 = _mm512_unpackhi_ps(tmp14460, in1957);
__m512 tmp14485 = _mm512_unpacklo_ps(in1959, in1958);
__m512 tmp14486 = _mm512_unpackhi_ps(in1959, in1958);
// Transpose stage 2: merge 64-bit pairs (imm 68 = low pairs, 238 = high pairs).
__m512 tmp14487 = _mm512_shuffle_ps(tmp14471, tmp14473, 68);
__m512 tmp14488 = _mm512_shuffle_ps(tmp14471, tmp14473, 238);
__m512 tmp14489 = _mm512_shuffle_ps(tmp14472, tmp14474, 68);
__m512 tmp14490 = _mm512_shuffle_ps(tmp14472, tmp14474, 238);
__m512 tmp14491 = _mm512_shuffle_ps(tmp14475, tmp14477, 68);
__m512 tmp14492 = _mm512_shuffle_ps(tmp14475, tmp14477, 238);
__m512 tmp14493 = _mm512_shuffle_ps(tmp14476, tmp14478, 68);
__m512 tmp14494 = _mm512_shuffle_ps(tmp14476, tmp14478, 238);
__m512 tmp14495 = _mm512_shuffle_ps(tmp14479, tmp14481, 68);
__m512 tmp14496 = _mm512_shuffle_ps(tmp14479, tmp14481, 238);
__m512 tmp14497 = _mm512_shuffle_ps(tmp14480, tmp14482, 68);
__m512 tmp14498 = _mm512_shuffle_ps(tmp14480, tmp14482, 238);
__m512 tmp14499 = _mm512_shuffle_ps(tmp14483, tmp14485, 68);
__m512 tmp14500 = _mm512_shuffle_ps(tmp14483, tmp14485, 238);
__m512 tmp14501 = _mm512_shuffle_ps(tmp14484, tmp14486, 68);
__m512 tmp14502 = _mm512_shuffle_ps(tmp14484, tmp14486, 238);
// Transpose stage 3: regroup 128-bit lanes (imm 136 = even lanes, 221 = odd lanes).
__m512 tmp14503 = _mm512_shuffle_f32x4(tmp14487, tmp14491, 136);
__m512 tmp14504 = _mm512_shuffle_f32x4(tmp14487, tmp14491, 221);
__m512 tmp14505 = _mm512_shuffle_f32x4(tmp14488, tmp14492, 136);
__m512 tmp14506 = _mm512_shuffle_f32x4(tmp14488, tmp14492, 221);
__m512 tmp14507 = _mm512_shuffle_f32x4(tmp14489, tmp14493, 136);
__m512 tmp14508 = _mm512_shuffle_f32x4(tmp14489, tmp14493, 221);
__m512 tmp14509 = _mm512_shuffle_f32x4(tmp14490, tmp14494, 136);
__m512 tmp14510 = _mm512_shuffle_f32x4(tmp14490, tmp14494, 221);
__m512 tmp14511 = _mm512_shuffle_f32x4(tmp14495, tmp14499, 136);
__m512 tmp14512 = _mm512_shuffle_f32x4(tmp14495, tmp14499, 221);
__m512 tmp14513 = _mm512_shuffle_f32x4(tmp14496, tmp14500, 136);
__m512 tmp14514 = _mm512_shuffle_f32x4(tmp14496, tmp14500, 221);
__m512 tmp14515 = _mm512_shuffle_f32x4(tmp14497, tmp14501, 136);
__m512 tmp14516 = _mm512_shuffle_f32x4(tmp14497, tmp14501, 221);
__m512 tmp14517 = _mm512_shuffle_f32x4(tmp14498, tmp14502, 136);
__m512 tmp14518 = _mm512_shuffle_f32x4(tmp14498, tmp14502, 221);
// Transpose stage 4: final 128-bit regroup; the transposed vectors are written
// back into the variables that held the first-pass results.
in1947 = _mm512_shuffle_f32x4(tmp14503, tmp14511, 136);
in1955 = _mm512_shuffle_f32x4(tmp14503, tmp14511, 221);
tmp14457 = _mm512_shuffle_f32x4(tmp14505, tmp14513, 136);
tmp14461 = _mm512_shuffle_f32x4(tmp14505, tmp14513, 221);
tmp14458 = _mm512_shuffle_f32x4(tmp14507, tmp14515, 136);
tmp14462 = _mm512_shuffle_f32x4(tmp14507, tmp14515, 221);
in1953 = _mm512_shuffle_f32x4(tmp14509, tmp14517, 136);
in1961 = _mm512_shuffle_f32x4(tmp14509, tmp14517, 221);
tmp14456 = _mm512_shuffle_f32x4(tmp14504, tmp14512, 136);
tmp14460 = _mm512_shuffle_f32x4(tmp14504, tmp14512, 221);
in1949 = _mm512_shuffle_f32x4(tmp14506, tmp14514, 136);
in1957 = _mm512_shuffle_f32x4(tmp14506, tmp14514, 221);
in1951 = _mm512_shuffle_f32x4(tmp14508, tmp14516, 136);
in1959 = _mm512_shuffle_f32x4(tmp14508, tmp14516, 221);
in1950 = _mm512_shuffle_f32x4(tmp14510, tmp14518, 136);
in1958 = _mm512_shuffle_f32x4(tmp14510, tmp14518, 221);
// Second pass: the same sequence of operations and constants as the first
// pass, now applied to the transposed vectors (again two sets, interleaved).
__m512 tmp14463 = _mm512_add_ps(tmp14457, in1949);
__m512 tmp14467 = _mm512_add_ps(tmp14461, in1957);
__m512 tmp14464 = _mm512_sub_ps(tmp14456, tmp14458);
__m512 tmp14468 = _mm512_sub_ps(tmp14460, tmp14462);
__m512 tmp14465 = _mm512_add_ps(tmp14458, in1951);
__m512 tmp14469 = _mm512_add_ps(tmp14462, in1959);
in1947 = _mm512_sub_ps(in1947, in1951);
in1955 = _mm512_sub_ps(in1955, in1959);
tmp14463 = _mm512_fmadd_ps(in1953, _mm512_set1_ps(-4.25e+00f), tmp14463);
tmp14467 = _mm512_fmadd_ps(in1961, _mm512_set1_ps(-4.25e+00f), tmp14467);
tmp14465 = _mm512_fmadd_ps(tmp14456, _mm512_set1_ps(-4.25e+00f), tmp14465);
tmp14469 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-4.25e+00f), tmp14469);
in1947 = _mm512_fmadd_ps(tmp14464, _mm512_set1_ps(5.25e+00f), in1947);
in1955 = _mm512_fmadd_ps(tmp14468, _mm512_set1_ps(5.25e+00f), in1955);
tmp14464 = _mm512_fmadd_ps(tmp14458, _mm512_set1_ps(2.5e-01f), in1951);
tmp14468 = _mm512_fmadd_ps(tmp14462, _mm512_set1_ps(2.5e-01f), in1959);
tmp14458 = _mm512_fmadd_ps(tmp14458, _mm512_set1_ps(4e+00f), in1951);
tmp14462 = _mm512_fmadd_ps(tmp14462, _mm512_set1_ps(4e+00f), in1959);
__m512 tmp14466 = _mm512_sub_ps(tmp14465, tmp14463);
__m512 tmp14470 = _mm512_sub_ps(tmp14469, tmp14467);
tmp14465 = _mm512_add_ps(tmp14463, tmp14465);
tmp14469 = _mm512_add_ps(tmp14467, tmp14469);
tmp14463 = _mm512_fmadd_ps(tmp14457, _mm512_set1_ps(2.5e-01f), in1949);
tmp14467 = _mm512_fmadd_ps(tmp14461, _mm512_set1_ps(2.5e-01f), in1957);
tmp14464 = _mm512_fmadd_ps(tmp14456, _mm512_set1_ps(-1.25e+00f), tmp14464);
tmp14468 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-1.25e+00f), tmp14468);
tmp14456 = _mm512_fmadd_ps(tmp14456, _mm512_set1_ps(-5e+00f), tmp14458);
tmp14460 = _mm512_fmadd_ps(tmp14460, _mm512_set1_ps(-5e+00f), tmp14462);
tmp14463 = _mm512_fmadd_ps(in1953, _mm512_set1_ps(-1.25e+00f), tmp14463);
tmp14467 = _mm512_fmadd_ps(in1961, _mm512_set1_ps(-1.25e+00f), tmp14467);
in1951 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(2e+00f), tmp14464);
in1959 = _mm512_fmadd_ps(tmp14467, _mm512_set1_ps(2e+00f), tmp14468);
tmp14464 = _mm512_fnmadd_ps(tmp14463, _mm512_set1_ps(2e+00f), tmp14464);
tmp14468 = _mm512_fnmadd_ps(tmp14467, _mm512_set1_ps(2e+00f), tmp14468);
tmp14463 = _mm512_fmadd_ps(in1949, _mm512_set1_ps(2.5e-01f), tmp14457);
tmp14467 = _mm512_fmadd_ps(in1957, _mm512_set1_ps(2.5e-01f), tmp14461);
tmp14457 = _mm512_sub_ps(in1950, tmp14457);
tmp14461 = _mm512_sub_ps(in1958, tmp14461);
tmp14463 = _mm512_fmadd_ps(in1953, _mm512_set1_ps(-1.25e+00f), tmp14463);
tmp14467 = _mm512_fmadd_ps(in1961, _mm512_set1_ps(-1.25e+00f), tmp14467);
in1953 = _mm512_sub_ps(in1953, in1949);
in1961 = _mm512_sub_ps(in1961, in1957);
in1953 = _mm512_fmadd_ps(in1953, _mm512_set1_ps(5.25e+00f), tmp14457);
in1961 = _mm512_fmadd_ps(in1961, _mm512_set1_ps(5.25e+00f), tmp14461);
tmp14458 = _mm512_fmadd_ps(tmp14463, _mm512_set1_ps(2e+00f), tmp14456);
tmp14462 = _mm512_fmadd_ps(tmp14467, _mm512_set1_ps(2e+00f), tmp14460);
tmp14456 = _mm512_fnmadd_ps(tmp14463, _mm512_set1_ps(2e+00f), tmp14456);
tmp14460 = _mm512_fnmadd_ps(tmp14467, _mm512_set1_ps(2e+00f), tmp14460);
// Pack pairs of result vectors for storing: imm 68 concatenates the low 256
// bits of both operands, imm 238 concatenates their high 256 bits.
__m512 out1791 = _mm512_shuffle_f32x4(in1947, tmp14465, 68);
__m512 out1799 = _mm512_shuffle_f32x4(in1947, tmp14465, 238);
__m512 out1792 = _mm512_shuffle_f32x4(tmp14466, in1951, 68);
__m512 out1800 = _mm512_shuffle_f32x4(tmp14466, in1951, 238);
__m512 out1793 = _mm512_shuffle_f32x4(tmp14464, tmp14458, 68);
__m512 out1801 = _mm512_shuffle_f32x4(tmp14464, tmp14458, 238);
__m512 out1794 = _mm512_shuffle_f32x4(tmp14456, in1953, 68);
__m512 out1802 = _mm512_shuffle_f32x4(tmp14456, in1953, 238);
__m512 out1795 = _mm512_shuffle_f32x4(in1955, tmp14469, 68);
__m512 out1803 = _mm512_shuffle_f32x4(in1955, tmp14469, 238);
__m512 out1796 = _mm512_shuffle_f32x4(tmp14470, in1959, 68);
__m512 out1804 = _mm512_shuffle_f32x4(tmp14470, in1959, 238);
__m512 out1797 = _mm512_shuffle_f32x4(tmp14468, tmp14462, 68);
__m512 out1805 = _mm512_shuffle_f32x4(tmp14468, tmp14462, 238);
__m512 out1798 = _mm512_shuffle_f32x4(tmp14460, in1961, 68);
__m512 out1806 = _mm512_shuffle_f32x4(tmp14460, in1961, 238);
// Store the 16 packed vectors to dfPtr16 in four groups of four, at base
// offsets 512, 9728, 18944 and 28160 (groups are 9216 apart).
_mm512_storeu_ps(dfPtr16+512+36864*i71+6144*j63+6144*s73+768*k173, out1791);
_mm512_storeu_ps(dfPtr16+640+36864*i71+6144*j63+6144*s73+768*k173, out1799);
_mm512_storeu_ps(dfPtr16+576+36864*i71+6144*j63+6144*s73+768*k173, out1795);
_mm512_storeu_ps(dfPtr16+704+36864*i71+6144*j63+6144*s73+768*k173, out1803);
_mm512_storeu_ps(dfPtr16+9728+36864*i71+6144*j63+6144*s73+768*k173, out1792);
_mm512_storeu_ps(dfPtr16+9856+36864*i71+6144*j63+6144*s73+768*k173, out1800);
_mm512_storeu_ps(dfPtr16+9792+36864*i71+6144*j63+6144*s73+768*k173, out1796);
_mm512_storeu_ps(dfPtr16+9920+36864*i71+6144*j63+6144*s73+768*k173, out1804);
_mm512_storeu_ps(dfPtr16+18944+36864*i71+6144*j63+6144*s73+768*k173, out1793);
_mm512_storeu_ps(dfPtr16+19072+36864*i71+6144*j63+6144*s73+768*k173, out1801);
_mm512_storeu_ps(dfPtr16+19008+36864*i71+6144*j63+6144*s73+768*k173, out1797);
_mm512_storeu_ps(dfPtr16+19136+36864*i71+6144*j63+6144*s73+768*k173, out1805);
_mm512_storeu_ps(dfPtr16+28160+36864*i71+6144*j63+6144*s73+768*k173, out1794);
_mm512_storeu_ps(dfPtr16+28288+36864*i71+6144*j63+6144*s73+768*k173, out1802);
_mm512_storeu_ps(dfPtr16+28224+36864*i71+6144*j63+6144*s73+768*k173, out1798);
_mm512_storeu_ps(dfPtr16+28352+36864*i71+6144*j63+6144*s73+768*k173, out1806);
}
++j63;
rel25 = 1;
}
// h53 and w71 feed the 56*h53 and 4*w71 terms of every load address below.
// k174 runs four iterations, stepping load addresses by 3328 and store
// addresses by 768.
ptrdiff_t h53 = base25+12;
ptrdiff_t w71 = 0;
ptrdiff_t k174 = 0;
for (; k174 != 4; ++k174) {
// Load three pairs of vectors and permute them into two sets of three
// (in1963..in1965 and in1966..in1968). Masked loads zero-fill lanes outside
// the mask (16383 = lanes 0-13, 127 = lanes 0-6), so permute index 15 yields
// zero; in pm222, indices 16-22 select lanes 0-6 of the second operand.
__m512 dat2527 = _mm512_maskz_loadu_ps(16383, datPtr38+4+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2528 = _mm512_maskz_loadu_ps(127, datPtr38+836+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512i pm221 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1963 = _mm512_permutexvar_ps(pm221, dat2527);
__m512i pm222 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 13, 12, 11);
__m512 in1966 = _mm512_permutex2var_ps(dat2527, pm222, dat2528);
__m512 dat2529 = _mm512_maskz_loadu_ps(16383, datPtr38+60+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2530 = _mm512_maskz_loadu_ps(127, datPtr38+892+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1964 = _mm512_permutexvar_ps(pm221, dat2529);
__m512 in1967 = _mm512_permutex2var_ps(dat2529, pm222, dat2530);
__m512 dat2531 = _mm512_maskz_loadu_ps(16383, datPtr38+116+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2532 = _mm512_maskz_loadu_ps(127, datPtr38+948+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1965 = _mm512_permutexvar_ps(pm221, dat2531);
__m512 in1968 = _mm512_permutex2var_ps(dat2531, pm222, dat2532);
// First pass over only three vectors per set, producing eight results per set.
// The self-assignments (x = x) are no-ops, apparently left by the code generator.
// NOTE(review): this looks like an eight-input transform with five inputs
// treated as zero (presumably rows outside the tensor) — confirm.
__m512 tmp14519 = in1964;
__m512 tmp14526 = in1967;
__m512 tmp14520 = _mm512_sub_ps(_mm512_setzero_ps(), in1965);
__m512 tmp14527 = _mm512_sub_ps(_mm512_setzero_ps(), in1968);
__m512 tmp14521 = in1965;
__m512 tmp14528 = in1968;
in1963 = in1963;
in1966 = in1966;
tmp14519 = tmp14519;
tmp14526 = tmp14526;
tmp14521 = tmp14521;
tmp14528 = tmp14528;
in1963 = _mm512_fmadd_ps(tmp14520, _mm512_set1_ps(5.25e+00f), in1963);
in1966 = _mm512_fmadd_ps(tmp14527, _mm512_set1_ps(5.25e+00f), in1966);
tmp14520 = _mm512_mul_ps(in1965, _mm512_set1_ps(2.5e-01f));
tmp14527 = _mm512_mul_ps(in1968, _mm512_set1_ps(2.5e-01f));
in1965 = _mm512_mul_ps(in1965, _mm512_set1_ps(4e+00f));
in1968 = _mm512_mul_ps(in1968, _mm512_set1_ps(4e+00f));
__m512 tmp14522 = _mm512_sub_ps(tmp14521, tmp14519);
__m512 tmp14529 = _mm512_sub_ps(tmp14528, tmp14526);
tmp14521 = _mm512_add_ps(tmp14519, tmp14521);
tmp14528 = _mm512_add_ps(tmp14526, tmp14528);
tmp14519 = _mm512_mul_ps(in1964, _mm512_set1_ps(2.5e-01f));
tmp14526 = _mm512_mul_ps(in1967, _mm512_set1_ps(2.5e-01f));
tmp14520 = tmp14520;
tmp14527 = tmp14527;
__m512 tmp14523 = in1965;
__m512 tmp14530 = in1968;
tmp14519 = tmp14519;
tmp14526 = tmp14526;
__m512 tmp14524 = _mm512_fmadd_ps(tmp14519, _mm512_set1_ps(2e+00f), tmp14520);
__m512 tmp14531 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(2e+00f), tmp14527);
tmp14520 = _mm512_fnmadd_ps(tmp14519, _mm512_set1_ps(2e+00f), tmp14520);
tmp14527 = _mm512_fnmadd_ps(tmp14526, _mm512_set1_ps(2e+00f), tmp14527);
tmp14519 = in1964;
tmp14526 = in1967;
in1964 = _mm512_sub_ps(_mm512_setzero_ps(), in1964);
in1967 = _mm512_sub_ps(_mm512_setzero_ps(), in1967);
tmp14519 = tmp14519;
tmp14526 = tmp14526;
__m512 tmp14525 = in1964;
__m512 tmp14532 = in1967;
in1965 = _mm512_fmadd_ps(tmp14519, _mm512_set1_ps(2e+00f), tmp14523);
in1968 = _mm512_fmadd_ps(tmp14526, _mm512_set1_ps(2e+00f), tmp14530);
tmp14523 = _mm512_fnmadd_ps(tmp14519, _mm512_set1_ps(2e+00f), tmp14523);
tmp14530 = _mm512_fnmadd_ps(tmp14526, _mm512_set1_ps(2e+00f), tmp14530);
// 16x16 transpose of the 16 first-pass results, stage 1: interleave 32-bit
// elements of register pairs within each 128-bit lane.
__m512 tmp14541 = _mm512_unpacklo_ps(in1963, tmp14521);
__m512 tmp14542 = _mm512_unpackhi_ps(in1963, tmp14521);
__m512 tmp14543 = _mm512_unpacklo_ps(tmp14522, tmp14524);
__m512 tmp14544 = _mm512_unpackhi_ps(tmp14522, tmp14524);
__m512 tmp14545 = _mm512_unpacklo_ps(tmp14520, in1965);
__m512 tmp14546 = _mm512_unpackhi_ps(tmp14520, in1965);
__m512 tmp14547 = _mm512_unpacklo_ps(tmp14523, tmp14525);
__m512 tmp14548 = _mm512_unpackhi_ps(tmp14523, tmp14525);
__m512 tmp14549 = _mm512_unpacklo_ps(in1966, tmp14528);
__m512 tmp14550 = _mm512_unpackhi_ps(in1966, tmp14528);
__m512 tmp14551 = _mm512_unpacklo_ps(tmp14529, tmp14531);
__m512 tmp14552 = _mm512_unpackhi_ps(tmp14529, tmp14531);
__m512 tmp14553 = _mm512_unpacklo_ps(tmp14527, in1968);
__m512 tmp14554 = _mm512_unpackhi_ps(tmp14527, in1968);
__m512 tmp14555 = _mm512_unpacklo_ps(tmp14530, tmp14532);
__m512 tmp14556 = _mm512_unpackhi_ps(tmp14530, tmp14532);
// Transpose stage 2: merge 64-bit pairs (imm 68 = low pairs, 238 = high pairs).
__m512 tmp14557 = _mm512_shuffle_ps(tmp14541, tmp14543, 68);
__m512 tmp14558 = _mm512_shuffle_ps(tmp14541, tmp14543, 238);
__m512 tmp14559 = _mm512_shuffle_ps(tmp14542, tmp14544, 68);
__m512 tmp14560 = _mm512_shuffle_ps(tmp14542, tmp14544, 238);
__m512 tmp14561 = _mm512_shuffle_ps(tmp14545, tmp14547, 68);
__m512 tmp14562 = _mm512_shuffle_ps(tmp14545, tmp14547, 238);
__m512 tmp14563 = _mm512_shuffle_ps(tmp14546, tmp14548, 68);
__m512 tmp14564 = _mm512_shuffle_ps(tmp14546, tmp14548, 238);
__m512 tmp14565 = _mm512_shuffle_ps(tmp14549, tmp14551, 68);
__m512 tmp14566 = _mm512_shuffle_ps(tmp14549, tmp14551, 238);
__m512 tmp14567 = _mm512_shuffle_ps(tmp14550, tmp14552, 68);
__m512 tmp14568 = _mm512_shuffle_ps(tmp14550, tmp14552, 238);
__m512 tmp14569 = _mm512_shuffle_ps(tmp14553, tmp14555, 68);
__m512 tmp14570 = _mm512_shuffle_ps(tmp14553, tmp14555, 238);
__m512 tmp14571 = _mm512_shuffle_ps(tmp14554, tmp14556, 68);
__m512 tmp14572 = _mm512_shuffle_ps(tmp14554, tmp14556, 238);
// Transpose stage 3: regroup 128-bit lanes (imm 136 = even lanes, 221 = odd lanes).
__m512 tmp14573 = _mm512_shuffle_f32x4(tmp14557, tmp14561, 136);
__m512 tmp14574 = _mm512_shuffle_f32x4(tmp14557, tmp14561, 221);
__m512 tmp14575 = _mm512_shuffle_f32x4(tmp14558, tmp14562, 136);
__m512 tmp14576 = _mm512_shuffle_f32x4(tmp14558, tmp14562, 221);
__m512 tmp14577 = _mm512_shuffle_f32x4(tmp14559, tmp14563, 136);
__m512 tmp14578 = _mm512_shuffle_f32x4(tmp14559, tmp14563, 221);
__m512 tmp14579 = _mm512_shuffle_f32x4(tmp14560, tmp14564, 136);
__m512 tmp14580 = _mm512_shuffle_f32x4(tmp14560, tmp14564, 221);
__m512 tmp14581 = _mm512_shuffle_f32x4(tmp14565, tmp14569, 136);
__m512 tmp14582 = _mm512_shuffle_f32x4(tmp14565, tmp14569, 221);
__m512 tmp14583 = _mm512_shuffle_f32x4(tmp14566, tmp14570, 136);
__m512 tmp14584 = _mm512_shuffle_f32x4(tmp14566, tmp14570, 221);
__m512 tmp14585 = _mm512_shuffle_f32x4(tmp14567, tmp14571, 136);
__m512 tmp14586 = _mm512_shuffle_f32x4(tmp14567, tmp14571, 221);
__m512 tmp14587 = _mm512_shuffle_f32x4(tmp14568, tmp14572, 136);
__m512 tmp14588 = _mm512_shuffle_f32x4(tmp14568, tmp14572, 221);
// Transpose stage 4: final 128-bit regroup; the transposed vectors are written
// back into the variables that held the first-pass results.
in1963 = _mm512_shuffle_f32x4(tmp14573, tmp14581, 136);
in1966 = _mm512_shuffle_f32x4(tmp14573, tmp14581, 221);
tmp14521 = _mm512_shuffle_f32x4(tmp14575, tmp14583, 136);
tmp14528 = _mm512_shuffle_f32x4(tmp14575, tmp14583, 221);
tmp14522 = _mm512_shuffle_f32x4(tmp14577, tmp14585, 136);
tmp14529 = _mm512_shuffle_f32x4(tmp14577, tmp14585, 221);
tmp14524 = _mm512_shuffle_f32x4(tmp14579, tmp14587, 136);
tmp14531 = _mm512_shuffle_f32x4(tmp14579, tmp14587, 221);
tmp14520 = _mm512_shuffle_f32x4(tmp14574, tmp14582, 136);
tmp14527 = _mm512_shuffle_f32x4(tmp14574, tmp14582, 221);
in1965 = _mm512_shuffle_f32x4(tmp14576, tmp14584, 136);
in1968 = _mm512_shuffle_f32x4(tmp14576, tmp14584, 221);
tmp14523 = _mm512_shuffle_f32x4(tmp14578, tmp14586, 136);
tmp14530 = _mm512_shuffle_f32x4(tmp14578, tmp14586, 221);
tmp14525 = _mm512_shuffle_f32x4(tmp14580, tmp14588, 136);
tmp14532 = _mm512_shuffle_f32x4(tmp14580, tmp14588, 221);
// Second pass: full eight-input combination of the transposed vectors of each
// set, using the constants 5.25, -4.25, 0.25, 4, -1.25, -5 and 2. Values are
// overwritten in place, so statement order matters.
// NOTE(review): the constants look like a Winograd F(6x6, 3x3) input transform — confirm.
__m512 tmp14533 = _mm512_add_ps(tmp14521, in1965);
__m512 tmp14537 = _mm512_add_ps(tmp14528, in1968);
__m512 tmp14534 = _mm512_sub_ps(tmp14520, tmp14522);
__m512 tmp14538 = _mm512_sub_ps(tmp14527, tmp14529);
__m512 tmp14535 = _mm512_add_ps(tmp14522, tmp14523);
__m512 tmp14539 = _mm512_add_ps(tmp14529, tmp14530);
in1963 = _mm512_sub_ps(in1963, tmp14523);
in1966 = _mm512_sub_ps(in1966, tmp14530);
tmp14533 = _mm512_fmadd_ps(tmp14524, _mm512_set1_ps(-4.25e+00f), tmp14533);
tmp14537 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-4.25e+00f), tmp14537);
tmp14535 = _mm512_fmadd_ps(tmp14520, _mm512_set1_ps(-4.25e+00f), tmp14535);
tmp14539 = _mm512_fmadd_ps(tmp14527, _mm512_set1_ps(-4.25e+00f), tmp14539);
in1963 = _mm512_fmadd_ps(tmp14534, _mm512_set1_ps(5.25e+00f), in1963);
in1966 = _mm512_fmadd_ps(tmp14538, _mm512_set1_ps(5.25e+00f), in1966);
tmp14534 = _mm512_fmadd_ps(tmp14522, _mm512_set1_ps(2.5e-01f), tmp14523);
tmp14538 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(2.5e-01f), tmp14530);
tmp14522 = _mm512_fmadd_ps(tmp14522, _mm512_set1_ps(4e+00f), tmp14523);
tmp14529 = _mm512_fmadd_ps(tmp14529, _mm512_set1_ps(4e+00f), tmp14530);
__m512 tmp14536 = _mm512_sub_ps(tmp14535, tmp14533);
__m512 tmp14540 = _mm512_sub_ps(tmp14539, tmp14537);
tmp14535 = _mm512_add_ps(tmp14533, tmp14535);
tmp14539 = _mm512_add_ps(tmp14537, tmp14539);
tmp14533 = _mm512_fmadd_ps(tmp14521, _mm512_set1_ps(2.5e-01f), in1965);
tmp14537 = _mm512_fmadd_ps(tmp14528, _mm512_set1_ps(2.5e-01f), in1968);
tmp14534 = _mm512_fmadd_ps(tmp14520, _mm512_set1_ps(-1.25e+00f), tmp14534);
tmp14538 = _mm512_fmadd_ps(tmp14527, _mm512_set1_ps(-1.25e+00f), tmp14538);
tmp14520 = _mm512_fmadd_ps(tmp14520, _mm512_set1_ps(-5e+00f), tmp14522);
tmp14527 = _mm512_fmadd_ps(tmp14527, _mm512_set1_ps(-5e+00f), tmp14529);
tmp14533 = _mm512_fmadd_ps(tmp14524, _mm512_set1_ps(-1.25e+00f), tmp14533);
tmp14537 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-1.25e+00f), tmp14537);
tmp14523 = _mm512_fmadd_ps(tmp14533, _mm512_set1_ps(2e+00f), tmp14534);
tmp14530 = _mm512_fmadd_ps(tmp14537, _mm512_set1_ps(2e+00f), tmp14538);
tmp14534 = _mm512_fnmadd_ps(tmp14533, _mm512_set1_ps(2e+00f), tmp14534);
tmp14538 = _mm512_fnmadd_ps(tmp14537, _mm512_set1_ps(2e+00f), tmp14538);
tmp14533 = _mm512_fmadd_ps(in1965, _mm512_set1_ps(2.5e-01f), tmp14521);
tmp14537 = _mm512_fmadd_ps(in1968, _mm512_set1_ps(2.5e-01f), tmp14528);
tmp14521 = _mm512_sub_ps(tmp14525, tmp14521);
tmp14528 = _mm512_sub_ps(tmp14532, tmp14528);
tmp14533 = _mm512_fmadd_ps(tmp14524, _mm512_set1_ps(-1.25e+00f), tmp14533);
tmp14537 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(-1.25e+00f), tmp14537);
tmp14524 = _mm512_sub_ps(tmp14524, in1965);
tmp14531 = _mm512_sub_ps(tmp14531, in1968);
tmp14524 = _mm512_fmadd_ps(tmp14524, _mm512_set1_ps(5.25e+00f), tmp14521);
tmp14531 = _mm512_fmadd_ps(tmp14531, _mm512_set1_ps(5.25e+00f), tmp14528);
tmp14522 = _mm512_fmadd_ps(tmp14533, _mm512_set1_ps(2e+00f), tmp14520);
tmp14529 = _mm512_fmadd_ps(tmp14537, _mm512_set1_ps(2e+00f), tmp14527);
tmp14520 = _mm512_fnmadd_ps(tmp14533, _mm512_set1_ps(2e+00f), tmp14520);
tmp14527 = _mm512_fnmadd_ps(tmp14537, _mm512_set1_ps(2e+00f), tmp14527);
// Pack pairs of result vectors for storing: imm 68 concatenates the low 256
// bits of both operands, imm 238 concatenates their high 256 bits.
__m512 out1807 = _mm512_shuffle_f32x4(in1963, tmp14535, 68);
__m512 out1815 = _mm512_shuffle_f32x4(in1963, tmp14535, 238);
__m512 out1808 = _mm512_shuffle_f32x4(tmp14536, tmp14523, 68);
__m512 out1816 = _mm512_shuffle_f32x4(tmp14536, tmp14523, 238);
__m512 out1809 = _mm512_shuffle_f32x4(tmp14534, tmp14522, 68);
__m512 out1817 = _mm512_shuffle_f32x4(tmp14534, tmp14522, 238);
__m512 out1810 = _mm512_shuffle_f32x4(tmp14520, tmp14524, 68);
__m512 out1818 = _mm512_shuffle_f32x4(tmp14520, tmp14524, 238);
__m512 out1811 = _mm512_shuffle_f32x4(in1966, tmp14539, 68);
__m512 out1819 = _mm512_shuffle_f32x4(in1966, tmp14539, 238);
__m512 out1812 = _mm512_shuffle_f32x4(tmp14540, tmp14530, 68);
__m512 out1820 = _mm512_shuffle_f32x4(tmp14540, tmp14530, 238);
__m512 out1813 = _mm512_shuffle_f32x4(tmp14538, tmp14529, 68);
__m512 out1821 = _mm512_shuffle_f32x4(tmp14538, tmp14529, 238);
__m512 out1814 = _mm512_shuffle_f32x4(tmp14527, tmp14531, 68);
__m512 out1822 = _mm512_shuffle_f32x4(tmp14527, tmp14531, 238);
// Store the 16 packed vectors to dfPtr16 in four groups of four, at base
// offsets 0, 9216, 18432 and 27648 (groups are 9216 apart).
_mm512_storeu_ps(dfPtr16+0+36864*i71+6144*j63+3072*s73+768*k174, out1807);
_mm512_storeu_ps(dfPtr16+128+36864*i71+6144*j63+3072*s73+768*k174, out1815);
_mm512_storeu_ps(dfPtr16+64+36864*i71+6144*j63+3072*s73+768*k174, out1811);
_mm512_storeu_ps(dfPtr16+192+36864*i71+6144*j63+3072*s73+768*k174, out1819);
_mm512_storeu_ps(dfPtr16+9216+36864*i71+6144*j63+3072*s73+768*k174, out1808);
_mm512_storeu_ps(dfPtr16+9344+36864*i71+6144*j63+3072*s73+768*k174, out1816);
_mm512_storeu_ps(dfPtr16+9280+36864*i71+6144*j63+3072*s73+768*k174, out1812);
_mm512_storeu_ps(dfPtr16+9408+36864*i71+6144*j63+3072*s73+768*k174, out1820);
_mm512_storeu_ps(dfPtr16+18432+36864*i71+6144*j63+3072*s73+768*k174, out1809);
_mm512_storeu_ps(dfPtr16+18560+36864*i71+6144*j63+3072*s73+768*k174, out1817);
_mm512_storeu_ps(dfPtr16+18496+36864*i71+6144*j63+3072*s73+768*k174, out1813);
_mm512_storeu_ps(dfPtr16+18624+36864*i71+6144*j63+3072*s73+768*k174, out1821);
_mm512_storeu_ps(dfPtr16+27648+36864*i71+6144*j63+3072*s73+768*k174, out1810);
_mm512_storeu_ps(dfPtr16+27776+36864*i71+6144*j63+3072*s73+768*k174, out1818);
_mm512_storeu_ps(dfPtr16+27712+36864*i71+6144*j63+3072*s73+768*k174, out1814);
_mm512_storeu_ps(dfPtr16+27840+36864*i71+6144*j63+3072*s73+768*k174, out1822);
__m512 dat2533 = _mm512_maskz_loadu_ps(511, datPtr38+856+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2534 = _mm512_maskz_loadu_ps(8191, datPtr38+1668+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512i pm223 = _mm512_set_epi32(15, 15, 15, 15, 15, 8, 7, 6, 7, 6, 5, 4, 3, 2, 1, 0);
__m512 in1969 = _mm512_permutexvar_ps(pm223, dat2533);
__m512i pm224 = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in1972 = _mm512_permutexvar_ps(pm224, dat2534);
__m512 dat2535 = _mm512_maskz_loadu_ps(511, datPtr38+912+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2536 = _mm512_maskz_loadu_ps(8191, datPtr38+1724+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1970 = _mm512_permutexvar_ps(pm223, dat2535);
__m512 in1973 = _mm512_permutexvar_ps(pm224, dat2536);
__m512 dat2537 = _mm512_maskz_loadu_ps(511, datPtr38+968+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2538 = _mm512_maskz_loadu_ps(8191, datPtr38+1780+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1971 = _mm512_permutexvar_ps(pm223, dat2537);
__m512 in1974 = _mm512_permutexvar_ps(pm224, dat2538);
__m512 tmp14589 = in1970;
__m512 tmp14596 = in1973;
__m512 tmp14590 = _mm512_sub_ps(_mm512_setzero_ps(), in1971);
__m512 tmp14597 = _mm512_sub_ps(_mm512_setzero_ps(), in1974);
__m512 tmp14591 = in1971;
__m512 tmp14598 = in1974;
in1969 = in1969;
in1972 = in1972;
tmp14589 = tmp14589;
tmp14596 = tmp14596;
tmp14591 = tmp14591;
tmp14598 = tmp14598;
in1969 = _mm512_fmadd_ps(tmp14590, _mm512_set1_ps(5.25e+00f), in1969);
in1972 = _mm512_fmadd_ps(tmp14597, _mm512_set1_ps(5.25e+00f), in1972);
tmp14590 = _mm512_mul_ps(in1971, _mm512_set1_ps(2.5e-01f));
tmp14597 = _mm512_mul_ps(in1974, _mm512_set1_ps(2.5e-01f));
in1971 = _mm512_mul_ps(in1971, _mm512_set1_ps(4e+00f));
in1974 = _mm512_mul_ps(in1974, _mm512_set1_ps(4e+00f));
__m512 tmp14592 = _mm512_sub_ps(tmp14591, tmp14589);
__m512 tmp14599 = _mm512_sub_ps(tmp14598, tmp14596);
tmp14591 = _mm512_add_ps(tmp14589, tmp14591);
tmp14598 = _mm512_add_ps(tmp14596, tmp14598);
tmp14589 = _mm512_mul_ps(in1970, _mm512_set1_ps(2.5e-01f));
tmp14596 = _mm512_mul_ps(in1973, _mm512_set1_ps(2.5e-01f));
tmp14590 = tmp14590;
tmp14597 = tmp14597;
__m512 tmp14593 = in1971;
__m512 tmp14600 = in1974;
tmp14589 = tmp14589;
tmp14596 = tmp14596;
__m512 tmp14594 = _mm512_fmadd_ps(tmp14589, _mm512_set1_ps(2e+00f), tmp14590);
__m512 tmp14601 = _mm512_fmadd_ps(tmp14596, _mm512_set1_ps(2e+00f), tmp14597);
tmp14590 = _mm512_fnmadd_ps(tmp14589, _mm512_set1_ps(2e+00f), tmp14590);
tmp14597 = _mm512_fnmadd_ps(tmp14596, _mm512_set1_ps(2e+00f), tmp14597);
tmp14589 = in1970;
tmp14596 = in1973;
in1970 = _mm512_sub_ps(_mm512_setzero_ps(), in1970);
in1973 = _mm512_sub_ps(_mm512_setzero_ps(), in1973);
tmp14589 = tmp14589;
tmp14596 = tmp14596;
__m512 tmp14595 = in1970;
__m512 tmp14602 = in1973;
in1971 = _mm512_fmadd_ps(tmp14589, _mm512_set1_ps(2e+00f), tmp14593);
in1974 = _mm512_fmadd_ps(tmp14596, _mm512_set1_ps(2e+00f), tmp14600);
tmp14593 = _mm512_fnmadd_ps(tmp14589, _mm512_set1_ps(2e+00f), tmp14593);
tmp14600 = _mm512_fnmadd_ps(tmp14596, _mm512_set1_ps(2e+00f), tmp14600);
__m512 tmp14611 = _mm512_unpacklo_ps(in1969, tmp14591);
__m512 tmp14612 = _mm512_unpackhi_ps(in1969, tmp14591);
__m512 tmp14613 = _mm512_unpacklo_ps(tmp14592, tmp14594);
__m512 tmp14614 = _mm512_unpackhi_ps(tmp14592, tmp14594);
__m512 tmp14615 = _mm512_unpacklo_ps(tmp14590, in1971);
__m512 tmp14616 = _mm512_unpackhi_ps(tmp14590, in1971);
__m512 tmp14617 = _mm512_unpacklo_ps(tmp14593, tmp14595);
__m512 tmp14618 = _mm512_unpackhi_ps(tmp14593, tmp14595);
__m512 tmp14619 = _mm512_unpacklo_ps(in1972, tmp14598);
__m512 tmp14620 = _mm512_unpackhi_ps(in1972, tmp14598);
__m512 tmp14621 = _mm512_unpacklo_ps(tmp14599, tmp14601);
__m512 tmp14622 = _mm512_unpackhi_ps(tmp14599, tmp14601);
__m512 tmp14623 = _mm512_unpacklo_ps(tmp14597, in1974);
__m512 tmp14624 = _mm512_unpackhi_ps(tmp14597, in1974);
__m512 tmp14625 = _mm512_unpacklo_ps(tmp14600, tmp14602);
__m512 tmp14626 = _mm512_unpackhi_ps(tmp14600, tmp14602);
__m512 tmp14627 = _mm512_shuffle_ps(tmp14611, tmp14613, 68);
__m512 tmp14628 = _mm512_shuffle_ps(tmp14611, tmp14613, 238);
__m512 tmp14629 = _mm512_shuffle_ps(tmp14612, tmp14614, 68);
__m512 tmp14630 = _mm512_shuffle_ps(tmp14612, tmp14614, 238);
__m512 tmp14631 = _mm512_shuffle_ps(tmp14615, tmp14617, 68);
__m512 tmp14632 = _mm512_shuffle_ps(tmp14615, tmp14617, 238);
__m512 tmp14633 = _mm512_shuffle_ps(tmp14616, tmp14618, 68);
__m512 tmp14634 = _mm512_shuffle_ps(tmp14616, tmp14618, 238);
__m512 tmp14635 = _mm512_shuffle_ps(tmp14619, tmp14621, 68);
__m512 tmp14636 = _mm512_shuffle_ps(tmp14619, tmp14621, 238);
__m512 tmp14637 = _mm512_shuffle_ps(tmp14620, tmp14622, 68);
__m512 tmp14638 = _mm512_shuffle_ps(tmp14620, tmp14622, 238);
__m512 tmp14639 = _mm512_shuffle_ps(tmp14623, tmp14625, 68);
__m512 tmp14640 = _mm512_shuffle_ps(tmp14623, tmp14625, 238);
__m512 tmp14641 = _mm512_shuffle_ps(tmp14624, tmp14626, 68);
__m512 tmp14642 = _mm512_shuffle_ps(tmp14624, tmp14626, 238);
__m512 tmp14643 = _mm512_shuffle_f32x4(tmp14627, tmp14631, 136);
__m512 tmp14644 = _mm512_shuffle_f32x4(tmp14627, tmp14631, 221);
__m512 tmp14645 = _mm512_shuffle_f32x4(tmp14628, tmp14632, 136);
__m512 tmp14646 = _mm512_shuffle_f32x4(tmp14628, tmp14632, 221);
__m512 tmp14647 = _mm512_shuffle_f32x4(tmp14629, tmp14633, 136);
__m512 tmp14648 = _mm512_shuffle_f32x4(tmp14629, tmp14633, 221);
__m512 tmp14649 = _mm512_shuffle_f32x4(tmp14630, tmp14634, 136);
__m512 tmp14650 = _mm512_shuffle_f32x4(tmp14630, tmp14634, 221);
__m512 tmp14651 = _mm512_shuffle_f32x4(tmp14635, tmp14639, 136);
__m512 tmp14652 = _mm512_shuffle_f32x4(tmp14635, tmp14639, 221);
__m512 tmp14653 = _mm512_shuffle_f32x4(tmp14636, tmp14640, 136);
__m512 tmp14654 = _mm512_shuffle_f32x4(tmp14636, tmp14640, 221);
__m512 tmp14655 = _mm512_shuffle_f32x4(tmp14637, tmp14641, 136);
__m512 tmp14656 = _mm512_shuffle_f32x4(tmp14637, tmp14641, 221);
__m512 tmp14657 = _mm512_shuffle_f32x4(tmp14638, tmp14642, 136);
__m512 tmp14658 = _mm512_shuffle_f32x4(tmp14638, tmp14642, 221);
in1969 = _mm512_shuffle_f32x4(tmp14643, tmp14651, 136);
in1972 = _mm512_shuffle_f32x4(tmp14643, tmp14651, 221);
tmp14591 = _mm512_shuffle_f32x4(tmp14645, tmp14653, 136);
tmp14598 = _mm512_shuffle_f32x4(tmp14645, tmp14653, 221);
tmp14592 = _mm512_shuffle_f32x4(tmp14647, tmp14655, 136);
tmp14599 = _mm512_shuffle_f32x4(tmp14647, tmp14655, 221);
tmp14594 = _mm512_shuffle_f32x4(tmp14649, tmp14657, 136);
tmp14601 = _mm512_shuffle_f32x4(tmp14649, tmp14657, 221);
tmp14590 = _mm512_shuffle_f32x4(tmp14644, tmp14652, 136);
tmp14597 = _mm512_shuffle_f32x4(tmp14644, tmp14652, 221);
in1971 = _mm512_shuffle_f32x4(tmp14646, tmp14654, 136);
in1974 = _mm512_shuffle_f32x4(tmp14646, tmp14654, 221);
tmp14593 = _mm512_shuffle_f32x4(tmp14648, tmp14656, 136);
tmp14600 = _mm512_shuffle_f32x4(tmp14648, tmp14656, 221);
tmp14595 = _mm512_shuffle_f32x4(tmp14650, tmp14658, 136);
tmp14602 = _mm512_shuffle_f32x4(tmp14650, tmp14658, 221);
__m512 tmp14603 = _mm512_add_ps(tmp14591, in1971);
__m512 tmp14607 = _mm512_add_ps(tmp14598, in1974);
__m512 tmp14604 = _mm512_sub_ps(tmp14590, tmp14592);
__m512 tmp14608 = _mm512_sub_ps(tmp14597, tmp14599);
__m512 tmp14605 = _mm512_add_ps(tmp14592, tmp14593);
__m512 tmp14609 = _mm512_add_ps(tmp14599, tmp14600);
in1969 = _mm512_sub_ps(in1969, tmp14593);
in1972 = _mm512_sub_ps(in1972, tmp14600);
tmp14603 = _mm512_fmadd_ps(tmp14594, _mm512_set1_ps(-4.25e+00f), tmp14603);
tmp14607 = _mm512_fmadd_ps(tmp14601, _mm512_set1_ps(-4.25e+00f), tmp14607);
tmp14605 = _mm512_fmadd_ps(tmp14590, _mm512_set1_ps(-4.25e+00f), tmp14605);
tmp14609 = _mm512_fmadd_ps(tmp14597, _mm512_set1_ps(-4.25e+00f), tmp14609);
in1969 = _mm512_fmadd_ps(tmp14604, _mm512_set1_ps(5.25e+00f), in1969);
in1972 = _mm512_fmadd_ps(tmp14608, _mm512_set1_ps(5.25e+00f), in1972);
tmp14604 = _mm512_fmadd_ps(tmp14592, _mm512_set1_ps(2.5e-01f), tmp14593);
tmp14608 = _mm512_fmadd_ps(tmp14599, _mm512_set1_ps(2.5e-01f), tmp14600);
tmp14592 = _mm512_fmadd_ps(tmp14592, _mm512_set1_ps(4e+00f), tmp14593);
tmp14599 = _mm512_fmadd_ps(tmp14599, _mm512_set1_ps(4e+00f), tmp14600);
__m512 tmp14606 = _mm512_sub_ps(tmp14605, tmp14603);
__m512 tmp14610 = _mm512_sub_ps(tmp14609, tmp14607);
tmp14605 = _mm512_add_ps(tmp14603, tmp14605);
tmp14609 = _mm512_add_ps(tmp14607, tmp14609);
tmp14603 = _mm512_fmadd_ps(tmp14591, _mm512_set1_ps(2.5e-01f), in1971);
tmp14607 = _mm512_fmadd_ps(tmp14598, _mm512_set1_ps(2.5e-01f), in1974);
tmp14604 = _mm512_fmadd_ps(tmp14590, _mm512_set1_ps(-1.25e+00f), tmp14604);
tmp14608 = _mm512_fmadd_ps(tmp14597, _mm512_set1_ps(-1.25e+00f), tmp14608);
tmp14590 = _mm512_fmadd_ps(tmp14590, _mm512_set1_ps(-5e+00f), tmp14592);
tmp14597 = _mm512_fmadd_ps(tmp14597, _mm512_set1_ps(-5e+00f), tmp14599);
tmp14603 = _mm512_fmadd_ps(tmp14594, _mm512_set1_ps(-1.25e+00f), tmp14603);
tmp14607 = _mm512_fmadd_ps(tmp14601, _mm512_set1_ps(-1.25e+00f), tmp14607);
tmp14593 = _mm512_fmadd_ps(tmp14603, _mm512_set1_ps(2e+00f), tmp14604);
tmp14600 = _mm512_fmadd_ps(tmp14607, _mm512_set1_ps(2e+00f), tmp14608);
tmp14604 = _mm512_fnmadd_ps(tmp14603, _mm512_set1_ps(2e+00f), tmp14604);
tmp14608 = _mm512_fnmadd_ps(tmp14607, _mm512_set1_ps(2e+00f), tmp14608);
tmp14603 = _mm512_fmadd_ps(in1971, _mm512_set1_ps(2.5e-01f), tmp14591);
tmp14607 = _mm512_fmadd_ps(in1974, _mm512_set1_ps(2.5e-01f), tmp14598);
tmp14591 = _mm512_sub_ps(tmp14595, tmp14591);
tmp14598 = _mm512_sub_ps(tmp14602, tmp14598);
tmp14603 = _mm512_fmadd_ps(tmp14594, _mm512_set1_ps(-1.25e+00f), tmp14603);
tmp14607 = _mm512_fmadd_ps(tmp14601, _mm512_set1_ps(-1.25e+00f), tmp14607);
tmp14594 = _mm512_sub_ps(tmp14594, in1971);
tmp14601 = _mm512_sub_ps(tmp14601, in1974);
tmp14594 = _mm512_fmadd_ps(tmp14594, _mm512_set1_ps(5.25e+00f), tmp14591);
tmp14601 = _mm512_fmadd_ps(tmp14601, _mm512_set1_ps(5.25e+00f), tmp14598);
tmp14592 = _mm512_fmadd_ps(tmp14603, _mm512_set1_ps(2e+00f), tmp14590);
tmp14599 = _mm512_fmadd_ps(tmp14607, _mm512_set1_ps(2e+00f), tmp14597);
tmp14590 = _mm512_fnmadd_ps(tmp14603, _mm512_set1_ps(2e+00f), tmp14590);
tmp14597 = _mm512_fnmadd_ps(tmp14607, _mm512_set1_ps(2e+00f), tmp14597);
__m512 out1823 = _mm512_shuffle_f32x4(in1969, tmp14605, 68);
__m512 out1831 = _mm512_shuffle_f32x4(in1969, tmp14605, 238);
__m512 out1824 = _mm512_shuffle_f32x4(tmp14606, tmp14593, 68);
__m512 out1832 = _mm512_shuffle_f32x4(tmp14606, tmp14593, 238);
__m512 out1825 = _mm512_shuffle_f32x4(tmp14604, tmp14592, 68);
__m512 out1833 = _mm512_shuffle_f32x4(tmp14604, tmp14592, 238);
__m512 out1826 = _mm512_shuffle_f32x4(tmp14590, tmp14594, 68);
__m512 out1834 = _mm512_shuffle_f32x4(tmp14590, tmp14594, 238);
__m512 out1827 = _mm512_shuffle_f32x4(in1972, tmp14609, 68);
__m512 out1835 = _mm512_shuffle_f32x4(in1972, tmp14609, 238);
__m512 out1828 = _mm512_shuffle_f32x4(tmp14610, tmp14600, 68);
__m512 out1836 = _mm512_shuffle_f32x4(tmp14610, tmp14600, 238);
__m512 out1829 = _mm512_shuffle_f32x4(tmp14608, tmp14599, 68);
__m512 out1837 = _mm512_shuffle_f32x4(tmp14608, tmp14599, 238);
__m512 out1830 = _mm512_shuffle_f32x4(tmp14597, tmp14601, 68);
__m512 out1838 = _mm512_shuffle_f32x4(tmp14597, tmp14601, 238);
_mm512_storeu_ps(dfPtr16+256+36864*i71+6144*j63+3072*s73+768*k174, out1823);
_mm512_storeu_ps(dfPtr16+384+36864*i71+6144*j63+3072*s73+768*k174, out1831);
_mm512_storeu_ps(dfPtr16+320+36864*i71+6144*j63+3072*s73+768*k174, out1827);
_mm512_storeu_ps(dfPtr16+448+36864*i71+6144*j63+3072*s73+768*k174, out1835);
_mm512_storeu_ps(dfPtr16+9472+36864*i71+6144*j63+3072*s73+768*k174, out1824);
_mm512_storeu_ps(dfPtr16+9600+36864*i71+6144*j63+3072*s73+768*k174, out1832);
_mm512_storeu_ps(dfPtr16+9536+36864*i71+6144*j63+3072*s73+768*k174, out1828);
_mm512_storeu_ps(dfPtr16+9664+36864*i71+6144*j63+3072*s73+768*k174, out1836);
_mm512_storeu_ps(dfPtr16+18688+36864*i71+6144*j63+3072*s73+768*k174, out1825);
_mm512_storeu_ps(dfPtr16+18816+36864*i71+6144*j63+3072*s73+768*k174, out1833);
_mm512_storeu_ps(dfPtr16+18752+36864*i71+6144*j63+3072*s73+768*k174, out1829);
_mm512_storeu_ps(dfPtr16+18880+36864*i71+6144*j63+3072*s73+768*k174, out1837);
_mm512_storeu_ps(dfPtr16+27904+36864*i71+6144*j63+3072*s73+768*k174, out1826);
_mm512_storeu_ps(dfPtr16+28032+36864*i71+6144*j63+3072*s73+768*k174, out1834);
_mm512_storeu_ps(dfPtr16+27968+36864*i71+6144*j63+3072*s73+768*k174, out1830);
_mm512_storeu_ps(dfPtr16+28096+36864*i71+6144*j63+3072*s73+768*k174, out1838);
__m512 dat2539 = _mm512_maskz_loadu_ps(7, datPtr38+1712+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2540 = _mm512_maskz_loadu_ps(16383, datPtr38+2500+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512i pm225 = _mm512_set_epi32(22, 21, 20, 19, 18, 17, 16, 15, 15, 15, 15, 15, 15, 2, 1, 0);
__m512 in1975 = _mm512_permutex2var_ps(dat2539, pm225, dat2540);
__m512i pm226 = _mm512_set_epi32(15, 15, 15, 15, 15, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 5);
__m512 in1978 = _mm512_permutexvar_ps(pm226, dat2540);
__m512 dat2541 = _mm512_maskz_loadu_ps(7, datPtr38+1768+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2542 = _mm512_maskz_loadu_ps(16383, datPtr38+2556+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1976 = _mm512_permutex2var_ps(dat2541, pm225, dat2542);
__m512 in1979 = _mm512_permutexvar_ps(pm226, dat2542);
__m512 dat2543 = _mm512_maskz_loadu_ps(7, datPtr38+1824+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 dat2544 = _mm512_maskz_loadu_ps(16383, datPtr38+2612+13312*i71+56*h53+4*w71+13312*s73+3328*k174);
__m512 in1977 = _mm512_permutex2var_ps(dat2543, pm225, dat2544);
__m512 in1980 = _mm512_permutexvar_ps(pm226, dat2544);
__m512 tmp14659 = in1976;
__m512 tmp14666 = in1979;
__m512 tmp14660 = _mm512_sub_ps(_mm512_setzero_ps(), in1977);
__m512 tmp14667 = _mm512_sub_ps(_mm512_setzero_ps(), in1980);
__m512 tmp14661 = in1977;
__m512 tmp14668 = in1980;
in1975 = in1975;
in1978 = in1978;
tmp14659 = tmp14659;
tmp14666 = tmp14666;
tmp14661 = tmp14661;
tmp14668 = tmp14668;
in1975 = _mm512_fmadd_ps(tmp14660, _mm512_set1_ps(5.25e+00f), in1975);
in1978 = _mm512_fmadd_ps(tmp14667, _mm512_set1_ps(5.25e+00f), in1978);
tmp14660 = _mm512_mul_ps(in1977, _mm512_set1_ps(2.5e-01f));
tmp14667 = _mm512_mul_ps(in1980, _mm512_set1_ps(2.5e-01f));
in1977 = _mm512_mul_ps(in1977, _mm512_set1_ps(4e+00f));
in1980 = _mm512_mul_ps(in1980, _mm512_set1_ps(4e+00f));
__m512 tmp14662 = _mm512_sub_ps(tmp14661, tmp14659);
__m512 tmp14669 = _mm512_sub_ps(tmp14668, tmp14666);
tmp14661 = _mm512_add_ps(tmp14659, tmp14661);
tmp14668 = _mm512_add_ps(tmp14666, tmp14668);
tmp14659 = _mm512_mul_ps(in1976, _mm512_set1_ps(2.5e-01f));
tmp14666 = _mm512_mul_ps(in1979, _mm512_set1_ps(2.5e-01f));
tmp14660 = tmp14660;
tmp14667 = tmp14667;
__m512 tmp14663 = in1977;
__m512 tmp14670 = in1980;
tmp14659 = tmp14659;
tmp14666 = tmp14666;
__m512 tmp14664 = _mm512_fmadd_ps(tmp14659, _mm512_set1_ps(2e+00f), tmp14660);
__m512 tmp14671 = _mm512_fmadd_ps(tmp14666, _mm512_set1_ps(2e+00f), tmp14667);
tmp14660 = _mm512_fnmadd_ps(tmp14659, _mm512_set1_ps(2e+00f), tmp14660);
tmp14667 = _mm512_fnmadd_ps(tmp14666, _mm512_set1_ps(2e+00f), tmp14667);
tmp14659 = in1976;
tmp14666 = in1979;
in1976 = _mm512_sub_ps(_mm512_setzero_ps(), in1976);
in1979 = _mm512_sub_ps(_mm512_setzero_ps(), in1979);
tmp14659 = tmp14659;
tmp14666 = tmp14666;
__m512 tmp14665 = in1976;
__m512 tmp14672 = in1979;
in1977 = _mm512_fmadd_ps(tmp14659, _mm512_set1_ps(2e+00f), tmp14663);
in1980 = _mm512_fmadd_ps(tmp14666, _mm512_set1_ps(2e+00f), tmp14670);
tmp14663 = _mm512_fnmadd_ps(tmp14659, _mm512_set1_ps(2e+00f), tmp14663);
tmp14670 = _mm512_fnmadd_ps(tmp14666, _mm512_set1_ps(2e+00f), tmp14670);
__m512 tmp14681 = _mm512_unpacklo_ps(in1975, tmp14661);
__m512 tmp14682 = _mm512_unpackhi_ps(in1975, tmp14661);
__m512 tmp14683 = _mm512_unpacklo_ps(tmp14662, tmp14664);
__m512 tmp14684 = _mm512_unpackhi_ps(tmp14662, tmp14664);
__m512 tmp14685 = _mm512_unpacklo_ps(tmp14660, in1977);
__m512 tmp14686 = _mm512_unpackhi_ps(tmp14660, in1977);
__m512 tmp14687 = _mm512_unpacklo_ps(tmp14663, tmp14665);
__m512 tmp14688 = _mm512_unpackhi_ps(tmp14663, tmp14665);
__m512 tmp14689 = _mm512_unpacklo_ps(in1978, tmp14668);
__m512 tmp14690 = _mm512_unpackhi_ps(in1978, tmp14668);
__m512 tmp14691 = _mm512_unpacklo_ps(tmp14669, tmp14671);
__m512 tmp14692 = _mm512_unpackhi_ps(tmp14669, tmp14671);
__m512 tmp14693 = _mm512_unpacklo_ps(tmp14667, in1980);
__m512 tmp14694 = _mm512_unpackhi_ps(tmp14667, in1980);
__m512 tmp14695 = _mm512_unpacklo_ps(tmp14670, tmp14672);
__m512 tmp14696 = _mm512_unpackhi_ps(tmp14670, tmp14672);
__m512 tmp14697 = _mm512_shuffle_ps(tmp14681, tmp14683, 68);
__m512 tmp14698 = _mm512_shuffle_ps(tmp14681, tmp14683, 238);
__m512 tmp14699 = _mm512_shuffle_ps(tmp14682, tmp14684, 68);
__m512 tmp14700 = _mm512_shuffle_ps(tmp14682, tmp14684, 238);
__m512 tmp14701 = _mm512_shuffle_ps(tmp14685, tmp14687, 68);
__m512 tmp14702 = _mm512_shuffle_ps(tmp14685, tmp14687, 238);
__m512 tmp14703 = _mm512_shuffle_ps(tmp14686, tmp14688, 68);
__m512 tmp14704 = _mm512_shuffle_ps(tmp14686, tmp14688, 238);
__m512 tmp14705 = _mm512_shuffle_ps(tmp14689, tmp14691, 68);
__m512 tmp14706 = _mm512_shuffle_ps(tmp14689, tmp14691, 238);
__m512 tmp14707 = _mm512_shuffle_ps(tmp14690, tmp14692, 68);
__m512 tmp14708 = _mm512_shuffle_ps(tmp14690, tmp14692, 238);
__m512 tmp14709 = _mm512_shuffle_ps(tmp14693, tmp14695, 68);
__m512 tmp14710 = _mm512_shuffle_ps(tmp14693, tmp14695, 238);
__m512 tmp14711 = _mm512_shuffle_ps(tmp14694, tmp14696, 68);
__m512 tmp14712 = _mm512_shuffle_ps(tmp14694, tmp14696, 238);
__m512 tmp14713 = _mm512_shuffle_f32x4(tmp14697, tmp14701, 136);
__m512 tmp14714 = _mm512_shuffle_f32x4(tmp14697, tmp14701, 221);
__m512 tmp14715 = _mm512_shuffle_f32x4(tmp14698, tmp14702, 136);
__m512 tmp14716 = _mm512_shuffle_f32x4(tmp14698, tmp14702, 221);
__m512 tmp14717 = _mm512_shuffle_f32x4(tmp14699, tmp14703, 136);
__m512 tmp14718 = _mm512_shuffle_f32x4(tmp14699, tmp14703, 221);
__m512 tmp14719 = _mm512_shuffle_f32x4(tmp14700, tmp14704, 136);
__m512 tmp14720 = _mm512_shuffle_f32x4(tmp14700, tmp14704, 221);
__m512 tmp14721 = _mm512_shuffle_f32x4(tmp14705, tmp14709, 136);
__m512 tmp14722 = _mm512_shuffle_f32x4(tmp14705, tmp14709, 221);
__m512 tmp14723 = _mm512_shuffle_f32x4(tmp14706, tmp14710, 136);
__m512 tmp14724 = _mm512_shuffle_f32x4(tmp14706, tmp14710, 221);
__m512 tmp14725 = _mm512_shuffle_f32x4(tmp14707, tmp14711, 136);
__m512 tmp14726 = _mm512_shuffle_f32x4(tmp14707, tmp14711, 221);
__m512 tmp14727 = _mm512_shuffle_f32x4(tmp14708, tmp14712, 136);
__m512 tmp14728 = _mm512_shuffle_f32x4(tmp14708, tmp14712, 221);
in1975 = _mm512_shuffle_f32x4(tmp14713, tmp14721, 136);
in1978 = _mm512_shuffle_f32x4(tmp14713, tmp14721, 221);
tmp14661 = _mm512_shuffle_f32x4(tmp14715, tmp14723, 136);
tmp14668 = _mm512_shuffle_f32x4(tmp14715, tmp14723, 221);
tmp14662 = _mm512_shuffle_f32x4(tmp14717, tmp14725, 136);
tmp14669 = _mm512_shuffle_f32x4(tmp14717, tmp14725, 221);
tmp14664 = _mm512_shuffle_f32x4(tmp14719, tmp14727, 136);
tmp14671 = _mm512_shuffle_f32x4(tmp14719, tmp14727, 221);
tmp14660 = _mm512_shuffle_f32x4(tmp14714, tmp14722, 136);
tmp14667 = _mm512_shuffle_f32x4(tmp14714, tmp14722, 221);
in1977 = _mm512_shuffle_f32x4(tmp14716, tmp14724, 136);
in1980 = _mm512_shuffle_f32x4(tmp14716, tmp14724, 221);
tmp14663 = _mm512_shuffle_f32x4(tmp14718, tmp14726, 136);
tmp14670 = _mm512_shuffle_f32x4(tmp14718, tmp14726, 221);
tmp14665 = _mm512_shuffle_f32x4(tmp14720, tmp14728, 136);
tmp14672 = _mm512_shuffle_f32x4(tmp14720, tmp14728, 221);
__m512 tmp14673 = _mm512_add_ps(tmp14661, in1977);
__m512 tmp14677 = _mm512_add_ps(tmp14668, in1980);
__m512 tmp14674 = _mm512_sub_ps(tmp14660, tmp14662);
__m512 tmp14678 = _mm512_sub_ps(tmp14667, tmp14669);
__m512 tmp14675 = _mm512_add_ps(tmp14662, tmp14663);
__m512 tmp14679 = _mm512_add_ps(tmp14669, tmp14670);
in1975 = _mm512_sub_ps(in1975, tmp14663);
in1978 = _mm512_sub_ps(in1978, tmp14670);
tmp14673 = _mm512_fmadd_ps(tmp14664, _mm512_set1_ps(-4.25e+00f), tmp14673);
tmp14677 = _mm512_fmadd_ps(tmp14671, _mm512_set1_ps(-4.25e+00f), tmp14677);
tmp14675 = _mm512_fmadd_ps(tmp14660, _mm512_set1_ps(-4.25e+00f), tmp14675);
tmp14679 = _mm512_fmadd_ps(tmp14667, _mm512_set1_ps(-4.25e+00f), tmp14679);
in1975 = _mm512_fmadd_ps(tmp14674, _mm512_set1_ps(5.25e+00f), in1975);
in1978 = _mm512_fmadd_ps(tmp14678, _mm512_set1_ps(5.25e+00f), in1978);
tmp14674 = _mm512_fmadd_ps(tmp14662, _mm512_set1_ps(2.5e-01f), tmp14663);
tmp14678 = _mm512_fmadd_ps(tmp14669, _mm512_set1_ps(2.5e-01f), tmp14670);
tmp14662 = _mm512_fmadd_ps(tmp14662, _mm512_set1_ps(4e+00f), tmp14663);
tmp14669 = _mm512_fmadd_ps(tmp14669, _mm512_set1_ps(4e+00f), tmp14670);
__m512 tmp14676 = _mm512_sub_ps(tmp14675, tmp14673);
__m512 tmp14680 = _mm512_sub_ps(tmp14679, tmp14677);
tmp14675 = _mm512_add_ps(tmp14673, tmp14675);
tmp14679 = _mm512_add_ps(tmp14677, tmp14679);
tmp14673 = _mm512_fmadd_ps(tmp14661, _mm512_set1_ps(2.5e-01f), in1977);
tmp14677 = _mm512_fmadd_ps(tmp14668, _mm512_set1_ps(2.5e-01f), in1980);
tmp14674 = _mm512_fmadd_ps(tmp14660, _mm512_set1_ps(-1.25e+00f), tmp14674);
tmp14678 = _mm512_fmadd_ps(tmp14667, _mm512_set1_ps(-1.25e+00f), tmp14678);
tmp14660 = _mm512_fmadd_ps(tmp14660, _mm512_set1_ps(-5e+00f), tmp14662);
tmp14667 = _mm512_fmadd_ps(tmp14667, _mm512_set1_ps(-5e+00f), tmp14669);
tmp14673 = _mm512_fmadd_ps(tmp14664, _mm512_set1_ps(-1.25e+00f), tmp14673);
tmp14677 = _mm512_fmadd_ps(tmp14671, _mm512_set1_ps(-1.25e+00f), tmp14677);
tmp14663 = _mm512_fmadd_ps(tmp14673, _mm512_set1_ps(2e+00f), tmp14674);
tmp14670 = _mm512_fmadd_ps(tmp14677, _mm512_set1_ps(2e+00f), tmp14678);
tmp14674 = _mm512_fnmadd_ps(tmp14673, _mm512_set1_ps(2e+00f), tmp14674);
tmp14678 = _mm512_fnmadd_ps(tmp14677, _mm512_set1_ps(2e+00f), tmp14678);
tmp14673 = _mm512_fmadd_ps(in1977, _mm512_set1_ps(2.5e-01f), tmp14661);
tmp14677 = _mm512_fmadd_ps(in1980, _mm512_set1_ps(2.5e-01f), tmp14668);
tmp14661 = _mm512_sub_ps(tmp14665, tmp14661);
tmp14668 = _mm512_sub_ps(tmp14672, tmp14668);
tmp14673 = _mm512_fmadd_ps(tmp14664, _mm512_set1_ps(-1.25e+00f), tmp14673);
tmp14677 = _mm512_fmadd_ps(tmp14671, _mm512_set1_ps(-1.25e+00f), tmp14677);
tmp14664 = _mm512_sub_ps(tmp14664, in1977);
tmp14671 = _mm512_sub_ps(tmp14671, in1980);
tmp14664 = _mm512_fmadd_ps(tmp14664, _mm512_set1_ps(5.25e+00f), tmp14661);
tmp14671 = _mm512_fmadd_ps(tmp14671, _mm512_set1_ps(5.25e+00f), tmp14668);
tmp14662 = _mm512_fmadd_ps(tmp14673, _mm512_set1_ps(2e+00f), tmp14660);
tmp14669 = _mm512_fmadd_ps(tmp14677, _mm512_set1_ps(2e+00f), tmp14667);
tmp14660 = _mm512_fnmadd_ps(tmp14673, _mm512_set1_ps(2e+00f), tmp14660);
tmp14667 = _mm512_fnmadd_ps(tmp14677, _mm512_set1_ps(2e+00f), tmp14667);
__m512 out1839 = _mm512_shuffle_f32x4(in1975, tmp14675, 68);
__m512 out1847 = _mm512_shuffle_f32x4(in1975, tmp14675, 238);
__m512 out1840 = _mm512_shuffle_f32x4(tmp14676, tmp14663, 68);
__m512 out1848 = _mm512_shuffle_f32x4(tmp14676, tmp14663, 238);
__m512 out1841 = _mm512_shuffle_f32x4(tmp14674, tmp14662, 68);
__m512 out1849 = _mm512_shuffle_f32x4(tmp14674, tmp14662, 238);
__m512 out1842 = _mm512_shuffle_f32x4(tmp14660, tmp14664, 68);
__m512 out1850 = _mm512_shuffle_f32x4(tmp14660, tmp14664, 238);
__m512 out1843 = _mm512_shuffle_f32x4(in1978, tmp14679, 68);
__m512 out1851 = _mm512_shuffle_f32x4(in1978, tmp14679, 238);
__m512 out1844 = _mm512_shuffle_f32x4(tmp14680, tmp14670, 68);
__m512 out1852 = _mm512_shuffle_f32x4(tmp14680, tmp14670, 238);
__m512 out1845 = _mm512_shuffle_f32x4(tmp14678, tmp14669, 68);
__m512 out1853 = _mm512_shuffle_f32x4(tmp14678, tmp14669, 238);
__m512 out1846 = _mm512_shuffle_f32x4(tmp14667, tmp14671, 68);
__m512 out1854 = _mm512_shuffle_f32x4(tmp14667, tmp14671, 238);
_mm512_storeu_ps(dfPtr16+512+36864*i71+6144*j63+3072*s73+768*k174, out1839);
_mm512_storeu_ps(dfPtr16+640+36864*i71+6144*j63+3072*s73+768*k174, out1847);
_mm512_storeu_ps(dfPtr16+576+36864*i71+6144*j63+3072*s73+768*k174, out1843);
_mm512_storeu_ps(dfPtr16+704+36864*i71+6144*j63+3072*s73+768*k174, out1851);
_mm512_storeu_ps(dfPtr16+9728+36864*i71+6144*j63+3072*s73+768*k174, out1840);
_mm512_storeu_ps(dfPtr16+9856+36864*i71+6144*j63+3072*s73+768*k174, out1848);
_mm512_storeu_ps(dfPtr16+9792+36864*i71+6144*j63+3072*s73+768*k174, out1844);
_mm512_storeu_ps(dfPtr16+9920+36864*i71+6144*j63+3072*s73+768*k174, out1852);
_mm512_storeu_ps(dfPtr16+18944+36864*i71+6144*j63+3072*s73+768*k174, out1841);
_mm512_storeu_ps(dfPtr16+19072+36864*i71+6144*j63+3072*s73+768*k174, out1849);
_mm512_storeu_ps(dfPtr16+19008+36864*i71+6144*j63+3072*s73+768*k174, out1845);
_mm512_storeu_ps(dfPtr16+19136+36864*i71+6144*j63+3072*s73+768*k174, out1853);
_mm512_storeu_ps(dfPtr16+28160+36864*i71+6144*j63+3072*s73+768*k174, out1842);
_mm512_storeu_ps(dfPtr16+28288+36864*i71+6144*j63+3072*s73+768*k174, out1850);
_mm512_storeu_ps(dfPtr16+28224+36864*i71+6144*j63+3072*s73+768*k174, out1846);
_mm512_storeu_ps(dfPtr16+28352+36864*i71+6144*j63+3072*s73+768*k174, out1854);
}
++j63;
}
}

static void ResNeXt50ThreeArrangeDats4(ResNeXt50ThreaderTeam1* team73, char** tensors119) {
    /*
     * Launches ResNeXt50ThreeArrangeDats4Callee1 on the given threader team.
     *
     * team73:     threader team handed straight to ResNeXt50ThreaderDo1.
     * tensors119: tensor pointer table, passed to the callee through any1.
     *
     * The task descriptor declares 4 dimensions with hull values {1, 1, 8, 1}.
     * NOTE(review): hull1[] is presumably the extent of each task-grid
     * dimension -- confirm against ResNeXt50ThreaderDo1.
     */
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[3] = 1;
    job.hull1[2] = 8;
    job.hull1[1] = 1;
    job.hull1[0] = 1;
    job.any1 = tensors119;
    job.callee1 = ResNeXt50ThreeArrangeDats4Callee1;
    ResNeXt50ThreaderDo1(team73, &job);
}

static void ResNeXt50ThreeProduceSums4Callee1(ResNeXt50ThreaderTask1* task124, int64_t* pt67) {
    /*
     * Multiply-accumulate kernel for one task coordinate.
     *
     * task124->any1 points at a pointer pair whose first entry is the tensor
     * table. Only pt67[3] is read from the coordinate vector; it selects the
     * slice ("grp" below) of every buffer that this call works on.
     *
     *   tensors[0]        : float biases, 64 bytes per grp, 16 bytes per quad
     *   tensors[0] + 2048 : half-precision weights, widened with cvtph
     *   tensors[1]        : float data vectors (read)
     *   tensors[2]        : float sum vectors (written)
     *
     * For each of 4 tiles and 4 quads, four weight rows are combined with a
     * block of data columns over 16 accumulation steps. Every tile is handled
     * in two phases: a wide phase with 6 data columns, then a narrow phase
     * with 3 data columns located 6144 bytes further into the tile.
     *
     * The index multipliers that the original spelled out as variables fixed
     * at zero are folded away here; all byte offsets are unchanged.
     *
     * NOTE(review): presumably the transformed-domain multiply stage of a
     * Winograd-style convolution -- confirm against the generator.
     */
    void** args = task124->any1;
    char** tensors = args[0];
    const ptrdiff_t grp = pt67[3];
    char*restrict biasPtr = tensors[0];
    char*restrict weightPtr = tensors[0]+2048;
    char*restrict dataPtr = tensors[1];
    char*restrict sumPtr = tensors[2];

    for (ptrdiff_t tile = 0; tile < 4; ++tile) {
        const char* wfTile = weightPtr+32768*grp+8192*tile;
        const char* dfTile = dataPtr+36864*grp+9216*tile;
        char* sfTile = sumPtr+36864*grp+9216*tile;

        /* ---- Wide phase: 4 weight rows x 6 data columns per quad. ---- */
        for (ptrdiff_t quad = 0; quad < 4; ++quad) {
            __m512 acc00, acc10, acc20, acc30;
            if (__builtin_expect(tile == 0, 0)) {
                /* Tile 0 only: seed each row with its bias in lane 9 (mask 0x200), zero elsewhere. */
                const char* bias = biasPtr+64*grp+16*quad;
                acc00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+0)));
                acc10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+4)));
                acc20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+8)));
                acc30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+12)));
            } else {
                acc00 = _mm512_setzero_ps();
                acc10 = _mm512_setzero_ps();
                acc20 = _mm512_setzero_ps();
                acc30 = _mm512_setzero_ps();
            }
            /* Every column of a row starts from that row's seed. */
            __m512 acc01 = acc00, acc02 = acc00, acc03 = acc00, acc04 = acc00, acc05 = acc00;
            __m512 acc11 = acc10, acc12 = acc10, acc13 = acc10, acc14 = acc10, acc15 = acc10;
            __m512 acc21 = acc20, acc22 = acc20, acc23 = acc20, acc24 = acc20, acc25 = acc20;
            __m512 acc31 = acc30, acc32 = acc30, acc33 = acc30, acc34 = acc30, acc35 = acc30;

            const char* wfQuad = wfTile+2048*quad;
            for (ptrdiff_t step = 0; step != 16; ++step) {
                const char* wfStep = wfQuad+128*step;
                const char* dfStep = dfTile+384*step;
                const __m512 d0 = _mm512_loadu_ps(dfStep+0);
                const __m512 d1 = _mm512_loadu_ps(dfStep+64);
                const __m512 d2 = _mm512_loadu_ps(dfStep+128);
                const __m512 d3 = _mm512_loadu_ps(dfStep+192);
                const __m512 d4 = _mm512_loadu_ps(dfStep+256);
                const __m512 d5 = _mm512_loadu_ps(dfStep+320);
                /* Each 64-byte load carries two rows of 16 half-precision weights. */
                const __m512i pack01 = _mm512_maskz_loadu_epi32(0xFFFF, wfStep+0);
                const __m512i pack23 = _mm512_maskz_loadu_epi32(0xFFFF, wfStep+64);
                __m512 w;

                w = _mm512_cvtph_ps(_mm512_castsi512_si256(pack01));
                acc00 = _mm512_fmadd_ps(w, d0, acc00);
                acc01 = _mm512_fmadd_ps(w, d1, acc01);
                acc02 = _mm512_fmadd_ps(w, d2, acc02);
                acc03 = _mm512_fmadd_ps(w, d3, acc03);
                acc04 = _mm512_fmadd_ps(w, d4, acc04);
                acc05 = _mm512_fmadd_ps(w, d5, acc05);

                w = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(pack01, 1));
                acc10 = _mm512_fmadd_ps(w, d0, acc10);
                acc11 = _mm512_fmadd_ps(w, d1, acc11);
                acc12 = _mm512_fmadd_ps(w, d2, acc12);
                acc13 = _mm512_fmadd_ps(w, d3, acc13);
                acc14 = _mm512_fmadd_ps(w, d4, acc14);
                acc15 = _mm512_fmadd_ps(w, d5, acc15);

                w = _mm512_cvtph_ps(_mm512_castsi512_si256(pack23));
                acc20 = _mm512_fmadd_ps(w, d0, acc20);
                acc21 = _mm512_fmadd_ps(w, d1, acc21);
                acc22 = _mm512_fmadd_ps(w, d2, acc22);
                acc23 = _mm512_fmadd_ps(w, d3, acc23);
                acc24 = _mm512_fmadd_ps(w, d4, acc24);
                acc25 = _mm512_fmadd_ps(w, d5, acc25);

                w = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(pack23, 1));
                acc30 = _mm512_fmadd_ps(w, d0, acc30);
                acc31 = _mm512_fmadd_ps(w, d1, acc31);
                acc32 = _mm512_fmadd_ps(w, d2, acc32);
                acc33 = _mm512_fmadd_ps(w, d3, acc33);
                acc34 = _mm512_fmadd_ps(w, d4, acc34);
                acc35 = _mm512_fmadd_ps(w, d5, acc35);
            }

            /* Row-major write-out: 384 bytes per row, 64 bytes per column. */
            char* sfQuad = sfTile+1536*quad;
            _mm512_storeu_ps(sfQuad+0, acc00);
            _mm512_storeu_ps(sfQuad+64, acc01);
            _mm512_storeu_ps(sfQuad+128, acc02);
            _mm512_storeu_ps(sfQuad+192, acc03);
            _mm512_storeu_ps(sfQuad+256, acc04);
            _mm512_storeu_ps(sfQuad+320, acc05);
            _mm512_storeu_ps(sfQuad+384, acc10);
            _mm512_storeu_ps(sfQuad+448, acc11);
            _mm512_storeu_ps(sfQuad+512, acc12);
            _mm512_storeu_ps(sfQuad+576, acc13);
            _mm512_storeu_ps(sfQuad+640, acc14);
            _mm512_storeu_ps(sfQuad+704, acc15);
            _mm512_storeu_ps(sfQuad+768, acc20);
            _mm512_storeu_ps(sfQuad+832, acc21);
            _mm512_storeu_ps(sfQuad+896, acc22);
            _mm512_storeu_ps(sfQuad+960, acc23);
            _mm512_storeu_ps(sfQuad+1024, acc24);
            _mm512_storeu_ps(sfQuad+1088, acc25);
            _mm512_storeu_ps(sfQuad+1152, acc30);
            _mm512_storeu_ps(sfQuad+1216, acc31);
            _mm512_storeu_ps(sfQuad+1280, acc32);
            _mm512_storeu_ps(sfQuad+1344, acc33);
            _mm512_storeu_ps(sfQuad+1408, acc34);
            _mm512_storeu_ps(sfQuad+1472, acc35);
        }

        /* ---- Narrow phase: 4 weight rows x 3 data columns per quad. ---- */
        /* Data and sums for this phase sit 6144 bytes past the tile base. */
        const char* dfTail = dfTile+6144;
        char* sfTail = sfTile+6144;
        for (ptrdiff_t quad = 0; quad < 4; ++quad) {
            __m512 acc00, acc10, acc20, acc30;
            if (__builtin_expect(tile == 0, 0)) {
                /* Same bias seeding as the wide phase. */
                const char* bias = biasPtr+64*grp+16*quad;
                acc00 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+0)));
                acc10 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+4)));
                acc20 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+8)));
                acc30 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 0x200, _mm512_set1_ps(*(const float*)(bias+12)));
            } else {
                acc00 = _mm512_setzero_ps();
                acc10 = _mm512_setzero_ps();
                acc20 = _mm512_setzero_ps();
                acc30 = _mm512_setzero_ps();
            }
            __m512 acc01 = acc00, acc02 = acc00;
            __m512 acc11 = acc10, acc12 = acc10;
            __m512 acc21 = acc20, acc22 = acc20;
            __m512 acc31 = acc30, acc32 = acc30;

            const char* wfQuad = wfTile+2048*quad;
            for (ptrdiff_t step = 0; step != 16; ++step) {
                const char* wfStep = wfQuad+128*step;
                const char* dfStep = dfTail+192*step;
                const __m512 d0 = _mm512_loadu_ps(dfStep+0);
                const __m512 d1 = _mm512_loadu_ps(dfStep+64);
                const __m512 d2 = _mm512_loadu_ps(dfStep+128);
                const __m512i pack01 = _mm512_maskz_loadu_epi32(0xFFFF, wfStep+0);
                const __m512i pack23 = _mm512_maskz_loadu_epi32(0xFFFF, wfStep+64);
                __m512 w;

                w = _mm512_cvtph_ps(_mm512_castsi512_si256(pack01));
                acc00 = _mm512_fmadd_ps(w, d0, acc00);
                acc01 = _mm512_fmadd_ps(w, d1, acc01);
                acc02 = _mm512_fmadd_ps(w, d2, acc02);

                w = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(pack01, 1));
                acc10 = _mm512_fmadd_ps(w, d0, acc10);
                acc11 = _mm512_fmadd_ps(w, d1, acc11);
                acc12 = _mm512_fmadd_ps(w, d2, acc12);

                w = _mm512_cvtph_ps(_mm512_castsi512_si256(pack23));
                acc20 = _mm512_fmadd_ps(w, d0, acc20);
                acc21 = _mm512_fmadd_ps(w, d1, acc21);
                acc22 = _mm512_fmadd_ps(w, d2, acc22);

                w = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(pack23, 1));
                acc30 = _mm512_fmadd_ps(w, d0, acc30);
                acc31 = _mm512_fmadd_ps(w, d1, acc31);
                acc32 = _mm512_fmadd_ps(w, d2, acc32);
            }

            /* Row-major write-out: 192 bytes per row, 64 bytes per column. */
            char* sfQuad = sfTail+768*quad;
            _mm512_storeu_ps(sfQuad+0, acc00);
            _mm512_storeu_ps(sfQuad+64, acc01);
            _mm512_storeu_ps(sfQuad+128, acc02);
            _mm512_storeu_ps(sfQuad+192, acc10);
            _mm512_storeu_ps(sfQuad+256, acc11);
            _mm512_storeu_ps(sfQuad+320, acc12);
            _mm512_storeu_ps(sfQuad+384, acc20);
            _mm512_storeu_ps(sfQuad+448, acc21);
            _mm512_storeu_ps(sfQuad+512, acc22);
            _mm512_storeu_ps(sfQuad+576, acc30);
            _mm512_storeu_ps(sfQuad+640, acc31);
            _mm512_storeu_ps(sfQuad+704, acc32);
        }
    }
}

static void ResNeXt50ThreeProduceSums4(ResNeXt50ThreaderTeam1* team74, char** tensors121) {
    /*
     * Launches ResNeXt50ThreeProduceSums4Callee1 on the given threader team.
     *
     * team74:     threader team handed straight to ResNeXt50ThreaderDo1.
     * tensors121: tensor pointer table; it travels to the callee as the first
     *             entry of a two-slot pointer pair (the second slot is null).
     *
     * The task descriptor declares 4 dimensions with hull values
     * {1, 1, 1, 32}; the callee reads only coordinate index 3.
     * NOTE(review): hull1[] is presumably the extent of each task-grid
     * dimension -- confirm against ResNeXt50ThreaderDo1.
     */
    void* args[2];
    args[0] = tensors121;
    args[1] = 0;
    ResNeXt50ThreaderTask1 job;
    job.nd1 = 4;
    job.hull1[3] = 32;
    job.hull1[2] = 1;
    job.hull1[1] = 1;
    job.hull1[0] = 1;
    job.any1 = args;
    job.callee1 = ResNeXt50ThreeProduceSums4Callee1;
    ResNeXt50ThreaderDo1(team74, &job);
}

static void ResNeXt50ThreeConsumeSums4Callee1(ResNeXt50ThreaderTask1* task126, int64_t* pt68) {
char** tensors124 = task126->any1;
ptrdiff_t w73 = 0;
ptrdiff_t d25 = 0;
ptrdiff_t g40 = pt68[2];
char*restrict sfPtr17 = tensors124[0];
char*restrict datPtr39 = tensors124[1];
ptrdiff_t i73 = 4*g40;
ptrdiff_t ii54 = i73+3;
for (; i73 <= ii54; ++i73) {
ptrdiff_t j65 = 2*d25;
ptrdiff_t rel26 = j65-0;
ptrdiff_t base26 = 0;
if (rel26 < 1) {
ptrdiff_t toH47 = base26+0;
ptrdiff_t toW47 = 0;
ptrdiff_t k176 = 4*w73;
for (; k176 != 4; ++k176) {
ptrdiff_t l74 = 0;
for (; l74 != 2; ++l74) {
__m512 sf1009 = _mm512_loadu_ps(sfPtr17+0+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1010 = _mm512_loadu_ps(sfPtr17+128+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1981 = _mm512_shuffle_f32x4(sf1009, sf1010, 68);
__m512 in1982 = _mm512_shuffle_f32x4(sf1009, sf1010, 238);
__m512 sf1011 = _mm512_loadu_ps(sfPtr17+64+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1012 = _mm512_loadu_ps(sfPtr17+192+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1989 = _mm512_shuffle_f32x4(sf1011, sf1012, 68);
__m512 in1990 = _mm512_shuffle_f32x4(sf1011, sf1012, 238);
__m512 sf1013 = _mm512_loadu_ps(sfPtr17+9216+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1014 = _mm512_loadu_ps(sfPtr17+9344+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1983 = _mm512_shuffle_f32x4(sf1013, sf1014, 68);
__m512 in1984 = _mm512_shuffle_f32x4(sf1013, sf1014, 238);
__m512 sf1015 = _mm512_loadu_ps(sfPtr17+9280+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1016 = _mm512_loadu_ps(sfPtr17+9408+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1991 = _mm512_shuffle_f32x4(sf1015, sf1016, 68);
__m512 in1992 = _mm512_shuffle_f32x4(sf1015, sf1016, 238);
__m512 sf1017 = _mm512_loadu_ps(sfPtr17+18432+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1018 = _mm512_loadu_ps(sfPtr17+18560+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1985 = _mm512_shuffle_f32x4(sf1017, sf1018, 68);
__m512 in1986 = _mm512_shuffle_f32x4(sf1017, sf1018, 238);
__m512 sf1019 = _mm512_loadu_ps(sfPtr17+18496+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1020 = _mm512_loadu_ps(sfPtr17+18624+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1993 = _mm512_shuffle_f32x4(sf1019, sf1020, 68);
__m512 in1994 = _mm512_shuffle_f32x4(sf1019, sf1020, 238);
__m512 sf1021 = _mm512_loadu_ps(sfPtr17+27648+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1022 = _mm512_loadu_ps(sfPtr17+27776+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1987 = _mm512_shuffle_f32x4(sf1021, sf1022, 68);
__m512 in1988 = _mm512_shuffle_f32x4(sf1021, sf1022, 238);
__m512 sf1023 = _mm512_loadu_ps(sfPtr17+27712+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1024 = _mm512_loadu_ps(sfPtr17+27840+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1995 = _mm512_shuffle_f32x4(sf1023, sf1024, 68);
__m512 in1996 = _mm512_shuffle_f32x4(sf1023, sf1024, 238);
__m512 tmp14745 = _mm512_add_ps(in1982, in1983);
__m512 tmp14765 = _mm512_add_ps(in1990, in1991);
__m512 tmp14744 = _mm512_add_ps(in1984, in1985);
__m512 tmp14764 = _mm512_add_ps(in1992, in1993);
__m512 tmp14750 = _mm512_sub_ps(in1984, in1985);
__m512 tmp14770 = _mm512_sub_ps(in1992, in1993);
__m512 tmp14749 = _mm512_sub_ps(in1982, in1983);
__m512 tmp14769 = _mm512_sub_ps(in1990, in1991);
__m512 tmp14746 = _mm512_add_ps(in1986, in1987);
__m512 tmp14766 = _mm512_add_ps(in1994, in1995);
__m512 tmp14751 = _mm512_sub_ps(in1986, in1987);
__m512 tmp14771 = _mm512_sub_ps(in1994, in1995);
__m512 tmp14748 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(2e+00f), tmp14749);
__m512 tmp14768 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(2e+00f), tmp14769);
__m512 tmp14755 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(8e+00f), tmp14749);
__m512 tmp14775 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(8e+00f), tmp14769);
__m512 tmp14743 = _mm512_add_ps(tmp14744, tmp14745);
__m512 tmp14763 = _mm512_add_ps(tmp14764, tmp14765);
__m512 tmp14747 = _mm512_fmadd_ps(tmp14751, _mm512_set1_ps(1.6e+01f), tmp14748);
__m512 tmp14767 = _mm512_fmadd_ps(tmp14771, _mm512_set1_ps(1.6e+01f), tmp14768);
__m512 tmp14754 = _mm512_fmadd_ps(tmp14751, _mm512_set1_ps(4e+00f), tmp14755);
__m512 tmp14774 = _mm512_fmadd_ps(tmp14771, _mm512_set1_ps(4e+00f), tmp14775);
__m512 tmp14760 = _mm512_add_ps(tmp14751, tmp14749);
__m512 tmp14780 = _mm512_add_ps(tmp14771, tmp14769);
__m512 tmp14753 = _mm512_fmadd_ps(tmp14744, _mm512_set1_ps(4e+00f), tmp14745);
__m512 tmp14773 = _mm512_fmadd_ps(tmp14764, _mm512_set1_ps(4e+00f), tmp14765);
__m512 tmp14757 = _mm512_fmadd_ps(tmp14744, _mm512_set1_ps(1.6e+01f), tmp14745);
__m512 tmp14777 = _mm512_fmadd_ps(tmp14764, _mm512_set1_ps(1.6e+01f), tmp14765);
__m512 tmp14742 = _mm512_add_ps(tmp14743, in1981);
__m512 tmp14762 = _mm512_add_ps(tmp14763, in1989);
__m512 tmp14759 = _mm512_add_ps(tmp14760, in1988);
__m512 tmp14779 = _mm512_add_ps(tmp14780, in1996);
__m512 tmp14741 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(3.2e+01f), tmp14742);
__m512 tmp14761 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(3.2e+01f), tmp14762);
__m512 tmp14752 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(8e+00f), tmp14753);
__m512 tmp14772 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(8e+00f), tmp14773);
__m512 tmp14758 = _mm512_fmadd_ps(tmp14750, _mm512_set1_ps(3.2e+01f), tmp14759);
__m512 tmp14778 = _mm512_fmadd_ps(tmp14770, _mm512_set1_ps(3.2e+01f), tmp14779);
__m512 tmp14756 = _mm512_fmadd_ps(tmp14746, _mm512_set1_ps(2e+00f), tmp14757);
__m512 tmp14776 = _mm512_fmadd_ps(tmp14766, _mm512_set1_ps(2e+00f), tmp14777);
__m512 tmp14729 = tmp14741;
__m512 tmp14735 = tmp14761;
__m512 tmp14730 = tmp14747;
__m512 tmp14736 = tmp14767;
__m512 tmp14731 = tmp14752;
__m512 tmp14737 = tmp14772;
__m512 tmp14732 = tmp14754;
__m512 tmp14738 = tmp14774;
__m512 tmp14733 = tmp14756;
__m512 tmp14739 = tmp14776;
__m512 tmp14734 = tmp14758;
__m512 tmp14740 = tmp14778;
__m512 tmp14825 = _mm512_unpacklo_ps(tmp14729, tmp14730);
__m512 tmp14826 = _mm512_unpackhi_ps(tmp14729, tmp14730);
__m512 tmp14827 = _mm512_unpacklo_ps(tmp14731, tmp14732);
__m512 tmp14828 = _mm512_unpackhi_ps(tmp14731, tmp14732);
__m512 tmp14829 = _mm512_unpacklo_ps(tmp14733, tmp14734);
__m512 tmp14830 = _mm512_unpackhi_ps(tmp14733, tmp14734);
__m512 tmp14831 = _mm512_unpacklo_ps(tmp14735, tmp14736);
__m512 tmp14832 = _mm512_unpackhi_ps(tmp14735, tmp14736);
__m512 tmp14833 = _mm512_unpacklo_ps(tmp14737, tmp14738);
__m512 tmp14834 = _mm512_unpackhi_ps(tmp14737, tmp14738);
__m512 tmp14835 = _mm512_unpacklo_ps(tmp14739, tmp14740);
__m512 tmp14836 = _mm512_unpackhi_ps(tmp14739, tmp14740);
__m512 tmp14837 = _mm512_shuffle_ps(tmp14825, tmp14827, 68);
__m512 tmp14838 = _mm512_shuffle_ps(tmp14825, tmp14827, 238);
__m512 tmp14839 = _mm512_shuffle_ps(tmp14826, tmp14828, 68);
__m512 tmp14840 = _mm512_shuffle_ps(tmp14826, tmp14828, 238);
__m512 tmp14841 = _mm512_shuffle_ps(tmp14829, tmp14831, 68);
__m512 tmp14842 = _mm512_shuffle_ps(tmp14829, tmp14831, 238);
__m512 tmp14843 = _mm512_shuffle_ps(tmp14830, tmp14832, 68);
__m512 tmp14844 = _mm512_shuffle_ps(tmp14830, tmp14832, 238);
__m512 tmp14845 = _mm512_shuffle_ps(tmp14833, tmp14835, 68);
__m512 tmp14846 = _mm512_shuffle_ps(tmp14833, tmp14835, 238);
__m512 tmp14847 = _mm512_shuffle_ps(tmp14834, tmp14836, 68);
__m512 tmp14848 = _mm512_shuffle_ps(tmp14834, tmp14836, 238);
__m512 tmp14849 = _mm512_shuffle_f32x4(tmp14837, tmp14841, 136);
__m512 tmp14850 = _mm512_shuffle_f32x4(tmp14837, tmp14841, 221);
__m512 tmp14851 = _mm512_shuffle_f32x4(tmp14838, tmp14842, 136);
__m512 tmp14852 = _mm512_shuffle_f32x4(tmp14838, tmp14842, 221);
__m512 tmp14853 = _mm512_shuffle_f32x4(tmp14839, tmp14843, 136);
__m512 tmp14854 = _mm512_shuffle_f32x4(tmp14839, tmp14843, 221);
__m512 tmp14855 = _mm512_shuffle_f32x4(tmp14840, tmp14844, 136);
__m512 tmp14856 = _mm512_shuffle_f32x4(tmp14840, tmp14844, 221);
__m512 tmp14857 = _mm512_shuffle_f32x4(tmp14845, tmp14845, 136);
__m512 tmp14858 = _mm512_shuffle_f32x4(tmp14845, tmp14845, 221);
__m512 tmp14859 = _mm512_shuffle_f32x4(tmp14846, tmp14846, 136);
__m512 tmp14860 = _mm512_shuffle_f32x4(tmp14846, tmp14846, 221);
__m512 tmp14861 = _mm512_shuffle_f32x4(tmp14847, tmp14847, 136);
__m512 tmp14862 = _mm512_shuffle_f32x4(tmp14847, tmp14847, 221);
__m512 tmp14863 = _mm512_shuffle_f32x4(tmp14848, tmp14848, 136);
__m512 tmp14864 = _mm512_shuffle_f32x4(tmp14848, tmp14848, 221);
tmp14729 = _mm512_shuffle_f32x4(tmp14849, tmp14857, 136);
tmp14737 = _mm512_shuffle_f32x4(tmp14849, tmp14857, 221);
tmp14730 = _mm512_shuffle_f32x4(tmp14851, tmp14859, 136);
tmp14738 = _mm512_shuffle_f32x4(tmp14851, tmp14859, 221);
tmp14731 = _mm512_shuffle_f32x4(tmp14853, tmp14861, 136);
tmp14739 = _mm512_shuffle_f32x4(tmp14853, tmp14861, 221);
tmp14732 = _mm512_shuffle_f32x4(tmp14855, tmp14863, 136);
tmp14740 = _mm512_shuffle_f32x4(tmp14855, tmp14863, 221);
tmp14733 = _mm512_shuffle_f32x4(tmp14850, tmp14858, 136);
__m512 tmp14781 = _mm512_shuffle_f32x4(tmp14850, tmp14858, 221);
tmp14734 = _mm512_shuffle_f32x4(tmp14852, tmp14860, 136);
__m512 tmp14782 = _mm512_shuffle_f32x4(tmp14852, tmp14860, 221);
tmp14735 = _mm512_shuffle_f32x4(tmp14854, tmp14862, 136);
__m512 tmp14783 = _mm512_shuffle_f32x4(tmp14854, tmp14862, 221);
tmp14736 = _mm512_shuffle_f32x4(tmp14856, tmp14864, 136);
__m512 tmp14784 = _mm512_shuffle_f32x4(tmp14856, tmp14864, 221);
__m512 tmp14789 = _mm512_add_ps(tmp14730, tmp14731);
__m512 tmp14809 = _mm512_add_ps(tmp14738, tmp14739);
__m512 tmp14788 = _mm512_add_ps(tmp14732, tmp14733);
__m512 tmp14808 = _mm512_add_ps(tmp14740, tmp14781);
__m512 tmp14794 = _mm512_sub_ps(tmp14732, tmp14733);
__m512 tmp14814 = _mm512_sub_ps(tmp14740, tmp14781);
__m512 tmp14793 = _mm512_sub_ps(tmp14730, tmp14731);
__m512 tmp14813 = _mm512_sub_ps(tmp14738, tmp14739);
__m512 tmp14790 = _mm512_add_ps(tmp14734, tmp14735);
__m512 tmp14810 = _mm512_add_ps(tmp14782, tmp14783);
__m512 tmp14795 = _mm512_sub_ps(tmp14734, tmp14735);
__m512 tmp14815 = _mm512_sub_ps(tmp14782, tmp14783);
__m512 tmp14792 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(2e+00f), tmp14793);
__m512 tmp14812 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(2e+00f), tmp14813);
__m512 tmp14799 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(8e+00f), tmp14793);
__m512 tmp14819 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(8e+00f), tmp14813);
__m512 tmp14787 = _mm512_add_ps(tmp14788, tmp14789);
__m512 tmp14807 = _mm512_add_ps(tmp14808, tmp14809);
__m512 tmp14791 = _mm512_fmadd_ps(tmp14795, _mm512_set1_ps(1.6e+01f), tmp14792);
__m512 tmp14811 = _mm512_fmadd_ps(tmp14815, _mm512_set1_ps(1.6e+01f), tmp14812);
__m512 tmp14798 = _mm512_fmadd_ps(tmp14795, _mm512_set1_ps(4e+00f), tmp14799);
__m512 tmp14818 = _mm512_fmadd_ps(tmp14815, _mm512_set1_ps(4e+00f), tmp14819);
__m512 tmp14804 = _mm512_add_ps(tmp14795, tmp14793);
__m512 tmp14824 = _mm512_add_ps(tmp14815, tmp14813);
__m512 tmp14797 = _mm512_fmadd_ps(tmp14788, _mm512_set1_ps(4e+00f), tmp14789);
__m512 tmp14817 = _mm512_fmadd_ps(tmp14808, _mm512_set1_ps(4e+00f), tmp14809);
__m512 tmp14801 = _mm512_fmadd_ps(tmp14788, _mm512_set1_ps(1.6e+01f), tmp14789);
__m512 tmp14821 = _mm512_fmadd_ps(tmp14808, _mm512_set1_ps(1.6e+01f), tmp14809);
__m512 tmp14786 = _mm512_add_ps(tmp14787, tmp14729);
__m512 tmp14806 = _mm512_add_ps(tmp14807, tmp14737);
__m512 tmp14803 = _mm512_add_ps(tmp14804, tmp14736);
__m512 tmp14823 = _mm512_add_ps(tmp14824, tmp14784);
__m512 tmp14785 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(3.2e+01f), tmp14786);
__m512 tmp14805 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(3.2e+01f), tmp14806);
__m512 tmp14796 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(8e+00f), tmp14797);
__m512 tmp14816 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(8e+00f), tmp14817);
__m512 tmp14802 = _mm512_fmadd_ps(tmp14794, _mm512_set1_ps(3.2e+01f), tmp14803);
__m512 tmp14822 = _mm512_fmadd_ps(tmp14814, _mm512_set1_ps(3.2e+01f), tmp14823);
__m512 tmp14800 = _mm512_fmadd_ps(tmp14790, _mm512_set1_ps(2e+00f), tmp14801);
__m512 tmp14820 = _mm512_fmadd_ps(tmp14810, _mm512_set1_ps(2e+00f), tmp14821);
__m512 out1855 = tmp14785;
__m512 out1861 = tmp14805;
__m512 out1856 = tmp14791;
__m512 out1862 = tmp14811;
__m512 out1857 = tmp14796;
__m512 out1863 = tmp14816;
__m512 out1858 = tmp14798;
__m512 out1864 = tmp14818;
__m512 out1859 = tmp14800;
__m512 out1865 = tmp14820;
__m512 out1860 = tmp14802;
__m512 out1866 = tmp14822;
out1855 = _mm512_max_ps(_mm512_setzero_ps(), out1855);
out1861 = _mm512_max_ps(_mm512_setzero_ps(), out1861);
out1856 = _mm512_max_ps(_mm512_setzero_ps(), out1856);
out1862 = _mm512_max_ps(_mm512_setzero_ps(), out1862);
out1857 = _mm512_max_ps(_mm512_setzero_ps(), out1857);
out1863 = _mm512_max_ps(_mm512_setzero_ps(), out1863);
out1858 = _mm512_max_ps(_mm512_setzero_ps(), out1858);
out1864 = _mm512_max_ps(_mm512_setzero_ps(), out1864);
out1859 = _mm512_max_ps(_mm512_setzero_ps(), out1859);
out1865 = _mm512_max_ps(_mm512_setzero_ps(), out1865);
out1860 = _mm512_max_ps(_mm512_setzero_ps(), out1860);
out1866 = _mm512_max_ps(_mm512_setzero_ps(), out1866);
_mm512_mask_storeu_ps(datPtr39+0+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1855);
_mm512_mask_storeu_ps(datPtr39+48+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1861);
_mm512_mask_storeu_ps(datPtr39+312+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1861);
_mm512_mask_storeu_ps(datPtr39+56+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1856);
_mm512_mask_storeu_ps(datPtr39+104+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1862);
_mm512_mask_storeu_ps(datPtr39+368+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1862);
_mm512_mask_storeu_ps(datPtr39+112+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1857);
_mm512_mask_storeu_ps(datPtr39+160+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1863);
_mm512_mask_storeu_ps(datPtr39+424+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1863);
_mm512_mask_storeu_ps(datPtr39+168+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1858);
_mm512_mask_storeu_ps(datPtr39+216+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1864);
_mm512_mask_storeu_ps(datPtr39+480+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1864);
_mm512_mask_storeu_ps(datPtr39+224+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1859);
_mm512_mask_storeu_ps(datPtr39+272+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1865);
_mm512_mask_storeu_ps(datPtr39+536+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1865);
_mm512_mask_storeu_ps(datPtr39+280+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1860);
_mm512_mask_storeu_ps(datPtr39+328+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1866);
_mm512_mask_storeu_ps(datPtr39+592+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4032, out1866);
__m512 sf1025 = _mm512_loadu_ps(sfPtr17+256+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1026 = _mm512_loadu_ps(sfPtr17+384+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1997 = _mm512_shuffle_f32x4(sf1025, sf1026, 68);
__m512 in1998 = _mm512_shuffle_f32x4(sf1025, sf1026, 238);
__m512 sf1027 = _mm512_loadu_ps(sfPtr17+320+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1028 = _mm512_loadu_ps(sfPtr17+448+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2005 = _mm512_shuffle_f32x4(sf1027, sf1028, 68);
__m512 in2006 = _mm512_shuffle_f32x4(sf1027, sf1028, 238);
__m512 sf1029 = _mm512_loadu_ps(sfPtr17+9472+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1030 = _mm512_loadu_ps(sfPtr17+9600+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in1999 = _mm512_shuffle_f32x4(sf1029, sf1030, 68);
__m512 in2000 = _mm512_shuffle_f32x4(sf1029, sf1030, 238);
__m512 sf1031 = _mm512_loadu_ps(sfPtr17+9536+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1032 = _mm512_loadu_ps(sfPtr17+9664+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2007 = _mm512_shuffle_f32x4(sf1031, sf1032, 68);
__m512 in2008 = _mm512_shuffle_f32x4(sf1031, sf1032, 238);
__m512 sf1033 = _mm512_loadu_ps(sfPtr17+18688+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1034 = _mm512_loadu_ps(sfPtr17+18816+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2001 = _mm512_shuffle_f32x4(sf1033, sf1034, 68);
__m512 in2002 = _mm512_shuffle_f32x4(sf1033, sf1034, 238);
__m512 sf1035 = _mm512_loadu_ps(sfPtr17+18752+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1036 = _mm512_loadu_ps(sfPtr17+18880+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2009 = _mm512_shuffle_f32x4(sf1035, sf1036, 68);
__m512 in2010 = _mm512_shuffle_f32x4(sf1035, sf1036, 238);
__m512 sf1037 = _mm512_loadu_ps(sfPtr17+27904+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1038 = _mm512_loadu_ps(sfPtr17+28032+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2003 = _mm512_shuffle_f32x4(sf1037, sf1038, 68);
__m512 in2004 = _mm512_shuffle_f32x4(sf1037, sf1038, 238);
__m512 sf1039 = _mm512_loadu_ps(sfPtr17+27968+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1040 = _mm512_loadu_ps(sfPtr17+28096+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2011 = _mm512_shuffle_f32x4(sf1039, sf1040, 68);
__m512 in2012 = _mm512_shuffle_f32x4(sf1039, sf1040, 238);
__m512 tmp14881 = _mm512_add_ps(in1998, in1999);
__m512 tmp14901 = _mm512_add_ps(in2006, in2007);
__m512 tmp14880 = _mm512_add_ps(in2000, in2001);
__m512 tmp14900 = _mm512_add_ps(in2008, in2009);
__m512 tmp14886 = _mm512_sub_ps(in2000, in2001);
__m512 tmp14906 = _mm512_sub_ps(in2008, in2009);
__m512 tmp14885 = _mm512_sub_ps(in1998, in1999);
__m512 tmp14905 = _mm512_sub_ps(in2006, in2007);
__m512 tmp14882 = _mm512_add_ps(in2002, in2003);
__m512 tmp14902 = _mm512_add_ps(in2010, in2011);
__m512 tmp14887 = _mm512_sub_ps(in2002, in2003);
__m512 tmp14907 = _mm512_sub_ps(in2010, in2011);
__m512 tmp14884 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(2e+00f), tmp14885);
__m512 tmp14904 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(2e+00f), tmp14905);
__m512 tmp14891 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(8e+00f), tmp14885);
__m512 tmp14911 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(8e+00f), tmp14905);
__m512 tmp14879 = _mm512_add_ps(tmp14880, tmp14881);
__m512 tmp14899 = _mm512_add_ps(tmp14900, tmp14901);
__m512 tmp14883 = _mm512_fmadd_ps(tmp14887, _mm512_set1_ps(1.6e+01f), tmp14884);
__m512 tmp14903 = _mm512_fmadd_ps(tmp14907, _mm512_set1_ps(1.6e+01f), tmp14904);
__m512 tmp14890 = _mm512_fmadd_ps(tmp14887, _mm512_set1_ps(4e+00f), tmp14891);
__m512 tmp14910 = _mm512_fmadd_ps(tmp14907, _mm512_set1_ps(4e+00f), tmp14911);
__m512 tmp14896 = _mm512_add_ps(tmp14887, tmp14885);
__m512 tmp14916 = _mm512_add_ps(tmp14907, tmp14905);
__m512 tmp14889 = _mm512_fmadd_ps(tmp14880, _mm512_set1_ps(4e+00f), tmp14881);
__m512 tmp14909 = _mm512_fmadd_ps(tmp14900, _mm512_set1_ps(4e+00f), tmp14901);
__m512 tmp14893 = _mm512_fmadd_ps(tmp14880, _mm512_set1_ps(1.6e+01f), tmp14881);
__m512 tmp14913 = _mm512_fmadd_ps(tmp14900, _mm512_set1_ps(1.6e+01f), tmp14901);
__m512 tmp14878 = _mm512_add_ps(tmp14879, in1997);
__m512 tmp14898 = _mm512_add_ps(tmp14899, in2005);
__m512 tmp14895 = _mm512_add_ps(tmp14896, in2004);
__m512 tmp14915 = _mm512_add_ps(tmp14916, in2012);
__m512 tmp14877 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(3.2e+01f), tmp14878);
__m512 tmp14897 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(3.2e+01f), tmp14898);
__m512 tmp14888 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(8e+00f), tmp14889);
__m512 tmp14908 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(8e+00f), tmp14909);
__m512 tmp14894 = _mm512_fmadd_ps(tmp14886, _mm512_set1_ps(3.2e+01f), tmp14895);
__m512 tmp14914 = _mm512_fmadd_ps(tmp14906, _mm512_set1_ps(3.2e+01f), tmp14915);
__m512 tmp14892 = _mm512_fmadd_ps(tmp14882, _mm512_set1_ps(2e+00f), tmp14893);
__m512 tmp14912 = _mm512_fmadd_ps(tmp14902, _mm512_set1_ps(2e+00f), tmp14913);
__m512 tmp14865 = tmp14877;
__m512 tmp14871 = tmp14897;
__m512 tmp14866 = tmp14883;
__m512 tmp14872 = tmp14903;
__m512 tmp14867 = tmp14888;
__m512 tmp14873 = tmp14908;
__m512 tmp14868 = tmp14890;
__m512 tmp14874 = tmp14910;
__m512 tmp14869 = tmp14892;
__m512 tmp14875 = tmp14912;
__m512 tmp14870 = tmp14894;
__m512 tmp14876 = tmp14914;
__m512 tmp14961 = _mm512_unpacklo_ps(tmp14865, tmp14866);
__m512 tmp14962 = _mm512_unpackhi_ps(tmp14865, tmp14866);
__m512 tmp14963 = _mm512_unpacklo_ps(tmp14867, tmp14868);
__m512 tmp14964 = _mm512_unpackhi_ps(tmp14867, tmp14868);
__m512 tmp14965 = _mm512_unpacklo_ps(tmp14869, tmp14870);
__m512 tmp14966 = _mm512_unpackhi_ps(tmp14869, tmp14870);
__m512 tmp14967 = _mm512_unpacklo_ps(tmp14871, tmp14872);
__m512 tmp14968 = _mm512_unpackhi_ps(tmp14871, tmp14872);
__m512 tmp14969 = _mm512_unpacklo_ps(tmp14873, tmp14874);
__m512 tmp14970 = _mm512_unpackhi_ps(tmp14873, tmp14874);
__m512 tmp14971 = _mm512_unpacklo_ps(tmp14875, tmp14876);
__m512 tmp14972 = _mm512_unpackhi_ps(tmp14875, tmp14876);
__m512 tmp14973 = _mm512_shuffle_ps(tmp14961, tmp14963, 68);
__m512 tmp14974 = _mm512_shuffle_ps(tmp14961, tmp14963, 238);
__m512 tmp14975 = _mm512_shuffle_ps(tmp14962, tmp14964, 68);
__m512 tmp14976 = _mm512_shuffle_ps(tmp14962, tmp14964, 238);
__m512 tmp14977 = _mm512_shuffle_ps(tmp14965, tmp14967, 68);
__m512 tmp14978 = _mm512_shuffle_ps(tmp14965, tmp14967, 238);
__m512 tmp14979 = _mm512_shuffle_ps(tmp14966, tmp14968, 68);
__m512 tmp14980 = _mm512_shuffle_ps(tmp14966, tmp14968, 238);
__m512 tmp14981 = _mm512_shuffle_ps(tmp14969, tmp14971, 68);
__m512 tmp14982 = _mm512_shuffle_ps(tmp14969, tmp14971, 238);
__m512 tmp14983 = _mm512_shuffle_ps(tmp14970, tmp14972, 68);
__m512 tmp14984 = _mm512_shuffle_ps(tmp14970, tmp14972, 238);
__m512 tmp14985 = _mm512_shuffle_f32x4(tmp14973, tmp14977, 136);
__m512 tmp14986 = _mm512_shuffle_f32x4(tmp14973, tmp14977, 221);
__m512 tmp14987 = _mm512_shuffle_f32x4(tmp14974, tmp14978, 136);
__m512 tmp14988 = _mm512_shuffle_f32x4(tmp14974, tmp14978, 221);
__m512 tmp14989 = _mm512_shuffle_f32x4(tmp14975, tmp14979, 136);
__m512 tmp14990 = _mm512_shuffle_f32x4(tmp14975, tmp14979, 221);
__m512 tmp14991 = _mm512_shuffle_f32x4(tmp14976, tmp14980, 136);
__m512 tmp14992 = _mm512_shuffle_f32x4(tmp14976, tmp14980, 221);
__m512 tmp14993 = _mm512_shuffle_f32x4(tmp14981, tmp14981, 136);
__m512 tmp14994 = _mm512_shuffle_f32x4(tmp14981, tmp14981, 221);
__m512 tmp14995 = _mm512_shuffle_f32x4(tmp14982, tmp14982, 136);
__m512 tmp14996 = _mm512_shuffle_f32x4(tmp14982, tmp14982, 221);
__m512 tmp14997 = _mm512_shuffle_f32x4(tmp14983, tmp14983, 136);
__m512 tmp14998 = _mm512_shuffle_f32x4(tmp14983, tmp14983, 221);
__m512 tmp14999 = _mm512_shuffle_f32x4(tmp14984, tmp14984, 136);
__m512 tmp15000 = _mm512_shuffle_f32x4(tmp14984, tmp14984, 221);
tmp14865 = _mm512_shuffle_f32x4(tmp14985, tmp14993, 136);
tmp14873 = _mm512_shuffle_f32x4(tmp14985, tmp14993, 221);
tmp14866 = _mm512_shuffle_f32x4(tmp14987, tmp14995, 136);
tmp14874 = _mm512_shuffle_f32x4(tmp14987, tmp14995, 221);
tmp14867 = _mm512_shuffle_f32x4(tmp14989, tmp14997, 136);
tmp14875 = _mm512_shuffle_f32x4(tmp14989, tmp14997, 221);
tmp14868 = _mm512_shuffle_f32x4(tmp14991, tmp14999, 136);
tmp14876 = _mm512_shuffle_f32x4(tmp14991, tmp14999, 221);
tmp14869 = _mm512_shuffle_f32x4(tmp14986, tmp14994, 136);
__m512 tmp14917 = _mm512_shuffle_f32x4(tmp14986, tmp14994, 221);
tmp14870 = _mm512_shuffle_f32x4(tmp14988, tmp14996, 136);
__m512 tmp14918 = _mm512_shuffle_f32x4(tmp14988, tmp14996, 221);
tmp14871 = _mm512_shuffle_f32x4(tmp14990, tmp14998, 136);
__m512 tmp14919 = _mm512_shuffle_f32x4(tmp14990, tmp14998, 221);
tmp14872 = _mm512_shuffle_f32x4(tmp14992, tmp15000, 136);
__m512 tmp14920 = _mm512_shuffle_f32x4(tmp14992, tmp15000, 221);
__m512 tmp14925 = _mm512_add_ps(tmp14866, tmp14867);
__m512 tmp14945 = _mm512_add_ps(tmp14874, tmp14875);
__m512 tmp14924 = _mm512_add_ps(tmp14868, tmp14869);
__m512 tmp14944 = _mm512_add_ps(tmp14876, tmp14917);
__m512 tmp14930 = _mm512_sub_ps(tmp14868, tmp14869);
__m512 tmp14950 = _mm512_sub_ps(tmp14876, tmp14917);
__m512 tmp14929 = _mm512_sub_ps(tmp14866, tmp14867);
__m512 tmp14949 = _mm512_sub_ps(tmp14874, tmp14875);
__m512 tmp14926 = _mm512_add_ps(tmp14870, tmp14871);
__m512 tmp14946 = _mm512_add_ps(tmp14918, tmp14919);
__m512 tmp14931 = _mm512_sub_ps(tmp14870, tmp14871);
__m512 tmp14951 = _mm512_sub_ps(tmp14918, tmp14919);
__m512 tmp14928 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(2e+00f), tmp14929);
__m512 tmp14948 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(2e+00f), tmp14949);
__m512 tmp14935 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(8e+00f), tmp14929);
__m512 tmp14955 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(8e+00f), tmp14949);
__m512 tmp14923 = _mm512_add_ps(tmp14924, tmp14925);
__m512 tmp14943 = _mm512_add_ps(tmp14944, tmp14945);
__m512 tmp14927 = _mm512_fmadd_ps(tmp14931, _mm512_set1_ps(1.6e+01f), tmp14928);
__m512 tmp14947 = _mm512_fmadd_ps(tmp14951, _mm512_set1_ps(1.6e+01f), tmp14948);
__m512 tmp14934 = _mm512_fmadd_ps(tmp14931, _mm512_set1_ps(4e+00f), tmp14935);
__m512 tmp14954 = _mm512_fmadd_ps(tmp14951, _mm512_set1_ps(4e+00f), tmp14955);
__m512 tmp14940 = _mm512_add_ps(tmp14931, tmp14929);
__m512 tmp14960 = _mm512_add_ps(tmp14951, tmp14949);
__m512 tmp14933 = _mm512_fmadd_ps(tmp14924, _mm512_set1_ps(4e+00f), tmp14925);
__m512 tmp14953 = _mm512_fmadd_ps(tmp14944, _mm512_set1_ps(4e+00f), tmp14945);
__m512 tmp14937 = _mm512_fmadd_ps(tmp14924, _mm512_set1_ps(1.6e+01f), tmp14925);
__m512 tmp14957 = _mm512_fmadd_ps(tmp14944, _mm512_set1_ps(1.6e+01f), tmp14945);
__m512 tmp14922 = _mm512_add_ps(tmp14923, tmp14865);
__m512 tmp14942 = _mm512_add_ps(tmp14943, tmp14873);
__m512 tmp14939 = _mm512_add_ps(tmp14940, tmp14872);
__m512 tmp14959 = _mm512_add_ps(tmp14960, tmp14920);
__m512 tmp14921 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(3.2e+01f), tmp14922);
__m512 tmp14941 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(3.2e+01f), tmp14942);
__m512 tmp14932 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(8e+00f), tmp14933);
__m512 tmp14952 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(8e+00f), tmp14953);
__m512 tmp14938 = _mm512_fmadd_ps(tmp14930, _mm512_set1_ps(3.2e+01f), tmp14939);
__m512 tmp14958 = _mm512_fmadd_ps(tmp14950, _mm512_set1_ps(3.2e+01f), tmp14959);
__m512 tmp14936 = _mm512_fmadd_ps(tmp14926, _mm512_set1_ps(2e+00f), tmp14937);
__m512 tmp14956 = _mm512_fmadd_ps(tmp14946, _mm512_set1_ps(2e+00f), tmp14957);
__m512 out1867 = tmp14921;
__m512 out1873 = tmp14941;
__m512 out1868 = tmp14927;
__m512 out1874 = tmp14947;
__m512 out1869 = tmp14932;
__m512 out1875 = tmp14952;
__m512 out1870 = tmp14934;
__m512 out1876 = tmp14954;
__m512 out1871 = tmp14936;
__m512 out1877 = tmp14956;
__m512 out1872 = tmp14938;
__m512 out1878 = tmp14958;
out1867 = _mm512_max_ps(_mm512_setzero_ps(), out1867);
out1873 = _mm512_max_ps(_mm512_setzero_ps(), out1873);
out1868 = _mm512_max_ps(_mm512_setzero_ps(), out1868);
out1874 = _mm512_max_ps(_mm512_setzero_ps(), out1874);
out1869 = _mm512_max_ps(_mm512_setzero_ps(), out1869);
out1875 = _mm512_max_ps(_mm512_setzero_ps(), out1875);
out1870 = _mm512_max_ps(_mm512_setzero_ps(), out1870);
out1876 = _mm512_max_ps(_mm512_setzero_ps(), out1876);
out1871 = _mm512_max_ps(_mm512_setzero_ps(), out1871);
out1877 = _mm512_max_ps(_mm512_setzero_ps(), out1877);
out1872 = _mm512_max_ps(_mm512_setzero_ps(), out1872);
out1878 = _mm512_max_ps(_mm512_setzero_ps(), out1878);
_mm512_mask_storeu_ps(datPtr39+360+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1867);
_mm512_mask_storeu_ps(datPtr39+832+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1873);
_mm512_mask_storeu_ps(datPtr39+416+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1868);
_mm512_mask_storeu_ps(datPtr39+888+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1874);
_mm512_mask_storeu_ps(datPtr39+472+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1869);
_mm512_mask_storeu_ps(datPtr39+944+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1875);
_mm512_mask_storeu_ps(datPtr39+528+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1870);
_mm512_mask_storeu_ps(datPtr39+1000+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1876);
_mm512_mask_storeu_ps(datPtr39+584+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1871);
_mm512_mask_storeu_ps(datPtr39+1056+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1877);
_mm512_mask_storeu_ps(datPtr39+640+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 255, out1872);
_mm512_mask_storeu_ps(datPtr39+1112+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1878);
__m512 sf1041 = _mm512_loadu_ps(sfPtr17+512+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1042 = _mm512_loadu_ps(sfPtr17+576+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2013 = _mm512_shuffle_f32x4(sf1042, sf1041, 68);
__m512 in2014 = _mm512_shuffle_f32x4(sf1042, sf1041, 238);
__m512 sf1043 = _mm512_loadu_ps(sfPtr17+640+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1044 = _mm512_loadu_ps(sfPtr17+704+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2021 = _mm512_shuffle_f32x4(sf1043, sf1044, 68);
__m512 in2022 = _mm512_shuffle_f32x4(sf1043, sf1044, 238);
__m512 sf1045 = _mm512_loadu_ps(sfPtr17+9728+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1046 = _mm512_loadu_ps(sfPtr17+9792+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2015 = _mm512_shuffle_f32x4(sf1046, sf1045, 68);
__m512 in2016 = _mm512_shuffle_f32x4(sf1046, sf1045, 238);
__m512 sf1047 = _mm512_loadu_ps(sfPtr17+9856+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1048 = _mm512_loadu_ps(sfPtr17+9920+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2023 = _mm512_shuffle_f32x4(sf1047, sf1048, 68);
__m512 in2024 = _mm512_shuffle_f32x4(sf1047, sf1048, 238);
__m512 sf1049 = _mm512_loadu_ps(sfPtr17+18944+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1050 = _mm512_loadu_ps(sfPtr17+19008+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2017 = _mm512_shuffle_f32x4(sf1050, sf1049, 68);
__m512 in2018 = _mm512_shuffle_f32x4(sf1050, sf1049, 238);
__m512 sf1051 = _mm512_loadu_ps(sfPtr17+19072+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1052 = _mm512_loadu_ps(sfPtr17+19136+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2025 = _mm512_shuffle_f32x4(sf1051, sf1052, 68);
__m512 in2026 = _mm512_shuffle_f32x4(sf1051, sf1052, 238);
__m512 sf1053 = _mm512_loadu_ps(sfPtr17+28160+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1054 = _mm512_loadu_ps(sfPtr17+28224+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2019 = _mm512_shuffle_f32x4(sf1054, sf1053, 68);
__m512 in2020 = _mm512_shuffle_f32x4(sf1054, sf1053, 238);
__m512 sf1055 = _mm512_loadu_ps(sfPtr17+28288+36864*i73+6144*j65+1536*k176+768*l74);
__m512 sf1056 = _mm512_loadu_ps(sfPtr17+28352+36864*i73+6144*j65+1536*k176+768*l74);
__m512 in2027 = _mm512_shuffle_f32x4(sf1055, sf1056, 68);
__m512 in2028 = _mm512_shuffle_f32x4(sf1055, sf1056, 238);
// First-pass combination of two independent sets of eight vectors
// (in2013..in2020 and in2021..in2028); the two sets are interleaved line by line.
// For the first set, with
//   s1 = in2014+in2015, s2 = in2016+in2017, s3 = in2018+in2019,
//   d1 = in2014-in2015, d2 = in2016-in2017, d3 = in2018-in2019,
// the six results are
//   tmp15013 = in2013 + s1 + s2 + 32*s3
//   tmp15019 = d1 + 2*d2 + 16*d3
//   tmp15024 = s1 + 4*s2 + 8*s3
//   tmp15026 = d1 + 8*d2 + 4*d3
//   tmp15028 = s1 + 16*s2 + 2*s3
//   tmp15030 = d1 + 32*d2 + d3 + in2020
// The second set produces tmp15033, tmp15039, tmp15044, tmp15046, tmp15048,
// tmp15050 from in2021..in2028 in the same way.
// NOTE(review): the coefficients look like a scaled Winograd-style 8-to-6
// output transform -- confirm against the generator.
__m512 tmp15017 = _mm512_add_ps(in2014, in2015);
__m512 tmp15037 = _mm512_add_ps(in2022, in2023);
__m512 tmp15016 = _mm512_add_ps(in2016, in2017);
__m512 tmp15036 = _mm512_add_ps(in2024, in2025);
__m512 tmp15022 = _mm512_sub_ps(in2016, in2017);
__m512 tmp15042 = _mm512_sub_ps(in2024, in2025);
__m512 tmp15021 = _mm512_sub_ps(in2014, in2015);
__m512 tmp15041 = _mm512_sub_ps(in2022, in2023);
__m512 tmp15018 = _mm512_add_ps(in2018, in2019);
__m512 tmp15038 = _mm512_add_ps(in2026, in2027);
__m512 tmp15023 = _mm512_sub_ps(in2018, in2019);
__m512 tmp15043 = _mm512_sub_ps(in2026, in2027);
__m512 tmp15020 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(2e+00f), tmp15021);
__m512 tmp15040 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(2e+00f), tmp15041);
__m512 tmp15027 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(8e+00f), tmp15021);
__m512 tmp15047 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(8e+00f), tmp15041);
__m512 tmp15015 = _mm512_add_ps(tmp15016, tmp15017);
__m512 tmp15035 = _mm512_add_ps(tmp15036, tmp15037);
__m512 tmp15019 = _mm512_fmadd_ps(tmp15023, _mm512_set1_ps(1.6e+01f), tmp15020);
__m512 tmp15039 = _mm512_fmadd_ps(tmp15043, _mm512_set1_ps(1.6e+01f), tmp15040);
__m512 tmp15026 = _mm512_fmadd_ps(tmp15023, _mm512_set1_ps(4e+00f), tmp15027);
__m512 tmp15046 = _mm512_fmadd_ps(tmp15043, _mm512_set1_ps(4e+00f), tmp15047);
__m512 tmp15032 = _mm512_add_ps(tmp15023, tmp15021);
__m512 tmp15052 = _mm512_add_ps(tmp15043, tmp15041);
__m512 tmp15025 = _mm512_fmadd_ps(tmp15016, _mm512_set1_ps(4e+00f), tmp15017);
__m512 tmp15045 = _mm512_fmadd_ps(tmp15036, _mm512_set1_ps(4e+00f), tmp15037);
__m512 tmp15029 = _mm512_fmadd_ps(tmp15016, _mm512_set1_ps(1.6e+01f), tmp15017);
__m512 tmp15049 = _mm512_fmadd_ps(tmp15036, _mm512_set1_ps(1.6e+01f), tmp15037);
__m512 tmp15014 = _mm512_add_ps(tmp15015, in2013);
__m512 tmp15034 = _mm512_add_ps(tmp15035, in2021);
__m512 tmp15031 = _mm512_add_ps(tmp15032, in2020);
__m512 tmp15051 = _mm512_add_ps(tmp15052, in2028);
__m512 tmp15013 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(3.2e+01f), tmp15014);
__m512 tmp15033 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(3.2e+01f), tmp15034);
__m512 tmp15024 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(8e+00f), tmp15025);
__m512 tmp15044 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(8e+00f), tmp15045);
__m512 tmp15030 = _mm512_fmadd_ps(tmp15022, _mm512_set1_ps(3.2e+01f), tmp15031);
__m512 tmp15050 = _mm512_fmadd_ps(tmp15042, _mm512_set1_ps(3.2e+01f), tmp15051);
__m512 tmp15028 = _mm512_fmadd_ps(tmp15018, _mm512_set1_ps(2e+00f), tmp15029);
__m512 tmp15048 = _mm512_fmadd_ps(tmp15038, _mm512_set1_ps(2e+00f), tmp15049);
// Copies the twelve first-pass results into tmp15001..tmp15012, then runs them
// through a fixed shuffle network: 32-bit unpacklo/unpackhi, _mm512_shuffle_ps
// (imm 68 / 238) and _mm512_shuffle_f32x4 (imm 136 / 221).
// The final stage overwrites tmp15001..tmp15012 and defines tmp15053..tmp15056,
// leaving sixteen vectors for the second pass.
// NOTE(review): this looks like a transpose, so that the second pass can apply
// the same combination along the other axis -- confirm against the generator.
__m512 tmp15001 = tmp15013;
__m512 tmp15007 = tmp15033;
__m512 tmp15002 = tmp15019;
__m512 tmp15008 = tmp15039;
__m512 tmp15003 = tmp15024;
__m512 tmp15009 = tmp15044;
__m512 tmp15004 = tmp15026;
__m512 tmp15010 = tmp15046;
__m512 tmp15005 = tmp15028;
__m512 tmp15011 = tmp15048;
__m512 tmp15006 = tmp15030;
__m512 tmp15012 = tmp15050;
// Stage 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp15097 = _mm512_unpacklo_ps(tmp15001, tmp15002);
__m512 tmp15098 = _mm512_unpackhi_ps(tmp15001, tmp15002);
__m512 tmp15099 = _mm512_unpacklo_ps(tmp15003, tmp15004);
__m512 tmp15100 = _mm512_unpackhi_ps(tmp15003, tmp15004);
__m512 tmp15101 = _mm512_unpacklo_ps(tmp15005, tmp15006);
__m512 tmp15102 = _mm512_unpackhi_ps(tmp15005, tmp15006);
__m512 tmp15103 = _mm512_unpacklo_ps(tmp15007, tmp15008);
__m512 tmp15104 = _mm512_unpackhi_ps(tmp15007, tmp15008);
__m512 tmp15105 = _mm512_unpacklo_ps(tmp15009, tmp15010);
__m512 tmp15106 = _mm512_unpackhi_ps(tmp15009, tmp15010);
__m512 tmp15107 = _mm512_unpacklo_ps(tmp15011, tmp15012);
__m512 tmp15108 = _mm512_unpackhi_ps(tmp15011, tmp15012);
// Stage 2: combine 64-bit pairs within each 128-bit lane.
__m512 tmp15109 = _mm512_shuffle_ps(tmp15097, tmp15099, 68);
__m512 tmp15110 = _mm512_shuffle_ps(tmp15097, tmp15099, 238);
__m512 tmp15111 = _mm512_shuffle_ps(tmp15098, tmp15100, 68);
__m512 tmp15112 = _mm512_shuffle_ps(tmp15098, tmp15100, 238);
__m512 tmp15113 = _mm512_shuffle_ps(tmp15101, tmp15103, 68);
__m512 tmp15114 = _mm512_shuffle_ps(tmp15101, tmp15103, 238);
__m512 tmp15115 = _mm512_shuffle_ps(tmp15102, tmp15104, 68);
__m512 tmp15116 = _mm512_shuffle_ps(tmp15102, tmp15104, 238);
__m512 tmp15117 = _mm512_shuffle_ps(tmp15105, tmp15107, 68);
__m512 tmp15118 = _mm512_shuffle_ps(tmp15105, tmp15107, 238);
__m512 tmp15119 = _mm512_shuffle_ps(tmp15106, tmp15108, 68);
__m512 tmp15120 = _mm512_shuffle_ps(tmp15106, tmp15108, 238);
// Stage 3: pick even (136) or odd (221) 128-bit lanes from each source.
__m512 tmp15121 = _mm512_shuffle_f32x4(tmp15109, tmp15113, 136);
__m512 tmp15122 = _mm512_shuffle_f32x4(tmp15109, tmp15113, 221);
__m512 tmp15123 = _mm512_shuffle_f32x4(tmp15110, tmp15114, 136);
__m512 tmp15124 = _mm512_shuffle_f32x4(tmp15110, tmp15114, 221);
__m512 tmp15125 = _mm512_shuffle_f32x4(tmp15111, tmp15115, 136);
__m512 tmp15126 = _mm512_shuffle_f32x4(tmp15111, tmp15115, 221);
__m512 tmp15127 = _mm512_shuffle_f32x4(tmp15112, tmp15116, 136);
__m512 tmp15128 = _mm512_shuffle_f32x4(tmp15112, tmp15116, 221);
__m512 tmp15129 = _mm512_shuffle_f32x4(tmp15117, tmp15117, 136);
__m512 tmp15130 = _mm512_shuffle_f32x4(tmp15117, tmp15117, 221);
__m512 tmp15131 = _mm512_shuffle_f32x4(tmp15118, tmp15118, 136);
__m512 tmp15132 = _mm512_shuffle_f32x4(tmp15118, tmp15118, 221);
__m512 tmp15133 = _mm512_shuffle_f32x4(tmp15119, tmp15119, 136);
__m512 tmp15134 = _mm512_shuffle_f32x4(tmp15119, tmp15119, 221);
__m512 tmp15135 = _mm512_shuffle_f32x4(tmp15120, tmp15120, 136);
__m512 tmp15136 = _mm512_shuffle_f32x4(tmp15120, tmp15120, 221);
// Stage 4: same lane selection again, writing the sixteen second-pass inputs.
tmp15001 = _mm512_shuffle_f32x4(tmp15121, tmp15129, 136);
tmp15009 = _mm512_shuffle_f32x4(tmp15121, tmp15129, 221);
tmp15002 = _mm512_shuffle_f32x4(tmp15123, tmp15131, 136);
tmp15010 = _mm512_shuffle_f32x4(tmp15123, tmp15131, 221);
tmp15003 = _mm512_shuffle_f32x4(tmp15125, tmp15133, 136);
tmp15011 = _mm512_shuffle_f32x4(tmp15125, tmp15133, 221);
tmp15004 = _mm512_shuffle_f32x4(tmp15127, tmp15135, 136);
tmp15012 = _mm512_shuffle_f32x4(tmp15127, tmp15135, 221);
tmp15005 = _mm512_shuffle_f32x4(tmp15122, tmp15130, 136);
__m512 tmp15053 = _mm512_shuffle_f32x4(tmp15122, tmp15130, 221);
tmp15006 = _mm512_shuffle_f32x4(tmp15124, tmp15132, 136);
__m512 tmp15054 = _mm512_shuffle_f32x4(tmp15124, tmp15132, 221);
tmp15007 = _mm512_shuffle_f32x4(tmp15126, tmp15134, 136);
__m512 tmp15055 = _mm512_shuffle_f32x4(tmp15126, tmp15134, 221);
tmp15008 = _mm512_shuffle_f32x4(tmp15128, tmp15136, 136);
__m512 tmp15056 = _mm512_shuffle_f32x4(tmp15128, tmp15136, 221);
// Second-pass combination over the shuffled vectors, again for two interleaved
// sets: (tmp15001..tmp15008) and (tmp15009..tmp15012, tmp15053..tmp15056).
// For the first set, with
//   s1 = tmp15002+tmp15003, s2 = tmp15004+tmp15005, s3 = tmp15006+tmp15007,
//   d1 = tmp15002-tmp15003, d2 = tmp15004-tmp15005, d3 = tmp15006-tmp15007,
// the six results are
//   tmp15057 = tmp15001 + s1 + s2 + 32*s3
//   tmp15063 = d1 + 2*d2 + 16*d3
//   tmp15068 = s1 + 4*s2 + 8*s3
//   tmp15070 = d1 + 8*d2 + 4*d3
//   tmp15072 = s1 + 16*s2 + 2*s3
//   tmp15074 = d1 + 32*d2 + d3 + tmp15008
// The second set yields tmp15077, tmp15083, tmp15088, tmp15090, tmp15092,
// tmp15094 in the same way.
__m512 tmp15061 = _mm512_add_ps(tmp15002, tmp15003);
__m512 tmp15081 = _mm512_add_ps(tmp15010, tmp15011);
__m512 tmp15060 = _mm512_add_ps(tmp15004, tmp15005);
__m512 tmp15080 = _mm512_add_ps(tmp15012, tmp15053);
__m512 tmp15066 = _mm512_sub_ps(tmp15004, tmp15005);
__m512 tmp15086 = _mm512_sub_ps(tmp15012, tmp15053);
__m512 tmp15065 = _mm512_sub_ps(tmp15002, tmp15003);
__m512 tmp15085 = _mm512_sub_ps(tmp15010, tmp15011);
__m512 tmp15062 = _mm512_add_ps(tmp15006, tmp15007);
__m512 tmp15082 = _mm512_add_ps(tmp15054, tmp15055);
__m512 tmp15067 = _mm512_sub_ps(tmp15006, tmp15007);
__m512 tmp15087 = _mm512_sub_ps(tmp15054, tmp15055);
__m512 tmp15064 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(2e+00f), tmp15065);
__m512 tmp15084 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(2e+00f), tmp15085);
__m512 tmp15071 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(8e+00f), tmp15065);
__m512 tmp15091 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(8e+00f), tmp15085);
__m512 tmp15059 = _mm512_add_ps(tmp15060, tmp15061);
__m512 tmp15079 = _mm512_add_ps(tmp15080, tmp15081);
__m512 tmp15063 = _mm512_fmadd_ps(tmp15067, _mm512_set1_ps(1.6e+01f), tmp15064);
__m512 tmp15083 = _mm512_fmadd_ps(tmp15087, _mm512_set1_ps(1.6e+01f), tmp15084);
__m512 tmp15070 = _mm512_fmadd_ps(tmp15067, _mm512_set1_ps(4e+00f), tmp15071);
__m512 tmp15090 = _mm512_fmadd_ps(tmp15087, _mm512_set1_ps(4e+00f), tmp15091);
__m512 tmp15076 = _mm512_add_ps(tmp15067, tmp15065);
__m512 tmp15096 = _mm512_add_ps(tmp15087, tmp15085);
__m512 tmp15069 = _mm512_fmadd_ps(tmp15060, _mm512_set1_ps(4e+00f), tmp15061);
__m512 tmp15089 = _mm512_fmadd_ps(tmp15080, _mm512_set1_ps(4e+00f), tmp15081);
__m512 tmp15073 = _mm512_fmadd_ps(tmp15060, _mm512_set1_ps(1.6e+01f), tmp15061);
__m512 tmp15093 = _mm512_fmadd_ps(tmp15080, _mm512_set1_ps(1.6e+01f), tmp15081);
__m512 tmp15058 = _mm512_add_ps(tmp15059, tmp15001);
__m512 tmp15078 = _mm512_add_ps(tmp15079, tmp15009);
__m512 tmp15075 = _mm512_add_ps(tmp15076, tmp15008);
__m512 tmp15095 = _mm512_add_ps(tmp15096, tmp15056);
__m512 tmp15057 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(3.2e+01f), tmp15058);
__m512 tmp15077 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(3.2e+01f), tmp15078);
__m512 tmp15068 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(8e+00f), tmp15069);
__m512 tmp15088 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(8e+00f), tmp15089);
__m512 tmp15074 = _mm512_fmadd_ps(tmp15066, _mm512_set1_ps(3.2e+01f), tmp15075);
__m512 tmp15094 = _mm512_fmadd_ps(tmp15086, _mm512_set1_ps(3.2e+01f), tmp15095);
__m512 tmp15072 = _mm512_fmadd_ps(tmp15062, _mm512_set1_ps(2e+00f), tmp15073);
__m512 tmp15092 = _mm512_fmadd_ps(tmp15082, _mm512_set1_ps(2e+00f), tmp15093);
// Names the twelve second-pass results out1879..out1890, clamps each one to be
// non-negative with max(0, x), and writes them to datPtr39 with masked stores.
// Store masks: 4095 writes lanes 0-11, 3 writes lanes 0-1, 192 writes lanes 6-7.
// out1885..out1890 are each stored once (mask 4095); out1879..out1884 are each
// stored twice, lanes 0-1 at one address and lanes 6-7 at another.
// NOTE(review): the offsets read like byte offsets (56 per toH47 step, 4 per
// toW47 step); datPtr39's pointer type is not visible here -- confirm.
__m512 out1885 = tmp15057;
__m512 out1879 = tmp15077;
__m512 out1886 = tmp15063;
__m512 out1880 = tmp15083;
__m512 out1887 = tmp15068;
__m512 out1881 = tmp15088;
__m512 out1888 = tmp15070;
__m512 out1882 = tmp15090;
__m512 out1889 = tmp15072;
__m512 out1883 = tmp15092;
__m512 out1890 = tmp15074;
__m512 out1884 = tmp15094;
// Clamp negatives to zero.
out1885 = _mm512_max_ps(_mm512_setzero_ps(), out1885);
out1879 = _mm512_max_ps(_mm512_setzero_ps(), out1879);
out1886 = _mm512_max_ps(_mm512_setzero_ps(), out1886);
out1880 = _mm512_max_ps(_mm512_setzero_ps(), out1880);
out1887 = _mm512_max_ps(_mm512_setzero_ps(), out1887);
out1881 = _mm512_max_ps(_mm512_setzero_ps(), out1881);
out1888 = _mm512_max_ps(_mm512_setzero_ps(), out1888);
out1882 = _mm512_max_ps(_mm512_setzero_ps(), out1882);
out1889 = _mm512_max_ps(_mm512_setzero_ps(), out1889);
out1883 = _mm512_max_ps(_mm512_setzero_ps(), out1883);
out1890 = _mm512_max_ps(_mm512_setzero_ps(), out1890);
out1884 = _mm512_max_ps(_mm512_setzero_ps(), out1884);
// Six groups of three stores; consecutive groups are 56 apart in offset.
_mm512_mask_storeu_ps(datPtr39+1168+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1885);
_mm512_mask_storeu_ps(datPtr39+880+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1879);
_mm512_mask_storeu_ps(datPtr39+1192+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1879);
_mm512_mask_storeu_ps(datPtr39+1224+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1886);
_mm512_mask_storeu_ps(datPtr39+936+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1880);
_mm512_mask_storeu_ps(datPtr39+1248+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1880);
_mm512_mask_storeu_ps(datPtr39+1280+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1887);
_mm512_mask_storeu_ps(datPtr39+992+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1881);
_mm512_mask_storeu_ps(datPtr39+1304+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1881);
_mm512_mask_storeu_ps(datPtr39+1336+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1888);
_mm512_mask_storeu_ps(datPtr39+1048+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1882);
_mm512_mask_storeu_ps(datPtr39+1360+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1882);
_mm512_mask_storeu_ps(datPtr39+1392+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1889);
_mm512_mask_storeu_ps(datPtr39+1104+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1883);
_mm512_mask_storeu_ps(datPtr39+1416+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1883);
_mm512_mask_storeu_ps(datPtr39+1448+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 4095, out1890);
_mm512_mask_storeu_ps(datPtr39+1160+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 3, out1884);
_mm512_mask_storeu_ps(datPtr39+1472+13312*i73+56*toH47+4*toW47+3328*k176+1664*l74, 192, out1884);
}
}
++j65;
rel26 = 1;
}
// Setup for the loop that follows: its store addresses use 56*toH48 and
// 4*toW48, and k177 is its counter, starting at 4*w73 and running until it
// equals 4.
ptrdiff_t toH48 = base26+12;
ptrdiff_t toW48 = 0;
ptrdiff_t k177 = 4*w73;
for (; k177 != 4; ++k177) {
ptrdiff_t l75 = 0;
for (; l75 != 1; ++l75) {
// Loads sixteen vectors from sfPtr17 and regroups their 256-bit halves into
// in2029..in2044. _mm512_shuffle_f32x4(a, b, 68) yields {low half of a, low
// half of b}; with 238 it yields {high half of a, high half of b}.
// The four load groups start at offsets 0, 9216, 18432 and 27648; within a
// group the paired loads sit at +0/+128 and +64/+192.
// NOTE(review): offsets appear to be in bytes (64 = one 512-bit vector);
// sfPtr17's pointer type is not visible here -- confirm.
__m512 sf1057 = _mm512_loadu_ps(sfPtr17+0+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1058 = _mm512_loadu_ps(sfPtr17+128+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2029 = _mm512_shuffle_f32x4(sf1057, sf1058, 68);
__m512 in2030 = _mm512_shuffle_f32x4(sf1057, sf1058, 238);
__m512 sf1059 = _mm512_loadu_ps(sfPtr17+64+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1060 = _mm512_loadu_ps(sfPtr17+192+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2037 = _mm512_shuffle_f32x4(sf1059, sf1060, 68);
__m512 in2038 = _mm512_shuffle_f32x4(sf1059, sf1060, 238);
__m512 sf1061 = _mm512_loadu_ps(sfPtr17+9216+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1062 = _mm512_loadu_ps(sfPtr17+9344+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2031 = _mm512_shuffle_f32x4(sf1061, sf1062, 68);
__m512 in2032 = _mm512_shuffle_f32x4(sf1061, sf1062, 238);
__m512 sf1063 = _mm512_loadu_ps(sfPtr17+9280+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1064 = _mm512_loadu_ps(sfPtr17+9408+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2039 = _mm512_shuffle_f32x4(sf1063, sf1064, 68);
__m512 in2040 = _mm512_shuffle_f32x4(sf1063, sf1064, 238);
__m512 sf1065 = _mm512_loadu_ps(sfPtr17+18432+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1066 = _mm512_loadu_ps(sfPtr17+18560+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2033 = _mm512_shuffle_f32x4(sf1065, sf1066, 68);
__m512 in2034 = _mm512_shuffle_f32x4(sf1065, sf1066, 238);
__m512 sf1067 = _mm512_loadu_ps(sfPtr17+18496+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1068 = _mm512_loadu_ps(sfPtr17+18624+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2041 = _mm512_shuffle_f32x4(sf1067, sf1068, 68);
__m512 in2042 = _mm512_shuffle_f32x4(sf1067, sf1068, 238);
__m512 sf1069 = _mm512_loadu_ps(sfPtr17+27648+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1070 = _mm512_loadu_ps(sfPtr17+27776+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2035 = _mm512_shuffle_f32x4(sf1069, sf1070, 68);
__m512 in2036 = _mm512_shuffle_f32x4(sf1069, sf1070, 238);
__m512 sf1071 = _mm512_loadu_ps(sfPtr17+27712+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1072 = _mm512_loadu_ps(sfPtr17+27840+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2043 = _mm512_shuffle_f32x4(sf1071, sf1072, 68);
__m512 in2044 = _mm512_shuffle_f32x4(sf1071, sf1072, 238);
// First-pass combination of two independent sets of eight vectors
// (in2029..in2036 and in2037..in2044); the two sets are interleaved line by line.
// For the first set, with
//   s1 = in2030+in2031, s2 = in2032+in2033, s3 = in2034+in2035,
//   d1 = in2030-in2031, d2 = in2032-in2033, d3 = in2034-in2035,
// the six results are
//   tmp15149 = in2029 + s1 + s2 + 32*s3
//   tmp15155 = d1 + 2*d2 + 16*d3
//   tmp15160 = s1 + 4*s2 + 8*s3
//   tmp15162 = d1 + 8*d2 + 4*d3
//   tmp15164 = s1 + 16*s2 + 2*s3
//   tmp15166 = d1 + 32*d2 + d3 + in2036
// The second set produces tmp15169, tmp15175, tmp15180, tmp15182, tmp15184,
// tmp15186 from in2037..in2044 in the same way.
// NOTE(review): the coefficients look like a scaled Winograd-style 8-to-6
// output transform -- confirm against the generator.
__m512 tmp15153 = _mm512_add_ps(in2030, in2031);
__m512 tmp15173 = _mm512_add_ps(in2038, in2039);
__m512 tmp15152 = _mm512_add_ps(in2032, in2033);
__m512 tmp15172 = _mm512_add_ps(in2040, in2041);
__m512 tmp15158 = _mm512_sub_ps(in2032, in2033);
__m512 tmp15178 = _mm512_sub_ps(in2040, in2041);
__m512 tmp15157 = _mm512_sub_ps(in2030, in2031);
__m512 tmp15177 = _mm512_sub_ps(in2038, in2039);
__m512 tmp15154 = _mm512_add_ps(in2034, in2035);
__m512 tmp15174 = _mm512_add_ps(in2042, in2043);
__m512 tmp15159 = _mm512_sub_ps(in2034, in2035);
__m512 tmp15179 = _mm512_sub_ps(in2042, in2043);
__m512 tmp15156 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(2e+00f), tmp15157);
__m512 tmp15176 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(2e+00f), tmp15177);
__m512 tmp15163 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(8e+00f), tmp15157);
__m512 tmp15183 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(8e+00f), tmp15177);
__m512 tmp15151 = _mm512_add_ps(tmp15152, tmp15153);
__m512 tmp15171 = _mm512_add_ps(tmp15172, tmp15173);
__m512 tmp15155 = _mm512_fmadd_ps(tmp15159, _mm512_set1_ps(1.6e+01f), tmp15156);
__m512 tmp15175 = _mm512_fmadd_ps(tmp15179, _mm512_set1_ps(1.6e+01f), tmp15176);
__m512 tmp15162 = _mm512_fmadd_ps(tmp15159, _mm512_set1_ps(4e+00f), tmp15163);
__m512 tmp15182 = _mm512_fmadd_ps(tmp15179, _mm512_set1_ps(4e+00f), tmp15183);
__m512 tmp15168 = _mm512_add_ps(tmp15159, tmp15157);
__m512 tmp15188 = _mm512_add_ps(tmp15179, tmp15177);
__m512 tmp15161 = _mm512_fmadd_ps(tmp15152, _mm512_set1_ps(4e+00f), tmp15153);
__m512 tmp15181 = _mm512_fmadd_ps(tmp15172, _mm512_set1_ps(4e+00f), tmp15173);
__m512 tmp15165 = _mm512_fmadd_ps(tmp15152, _mm512_set1_ps(1.6e+01f), tmp15153);
__m512 tmp15185 = _mm512_fmadd_ps(tmp15172, _mm512_set1_ps(1.6e+01f), tmp15173);
__m512 tmp15150 = _mm512_add_ps(tmp15151, in2029);
__m512 tmp15170 = _mm512_add_ps(tmp15171, in2037);
__m512 tmp15167 = _mm512_add_ps(tmp15168, in2036);
__m512 tmp15187 = _mm512_add_ps(tmp15188, in2044);
__m512 tmp15149 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(3.2e+01f), tmp15150);
__m512 tmp15169 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(3.2e+01f), tmp15170);
__m512 tmp15160 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(8e+00f), tmp15161);
__m512 tmp15180 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(8e+00f), tmp15181);
__m512 tmp15166 = _mm512_fmadd_ps(tmp15158, _mm512_set1_ps(3.2e+01f), tmp15167);
__m512 tmp15186 = _mm512_fmadd_ps(tmp15178, _mm512_set1_ps(3.2e+01f), tmp15187);
__m512 tmp15164 = _mm512_fmadd_ps(tmp15154, _mm512_set1_ps(2e+00f), tmp15165);
__m512 tmp15184 = _mm512_fmadd_ps(tmp15174, _mm512_set1_ps(2e+00f), tmp15185);
// Copies the twelve first-pass results into tmp15137..tmp15148, then runs them
// through a fixed shuffle network: 32-bit unpacklo/unpackhi, _mm512_shuffle_ps
// (imm 68 / 238) and _mm512_shuffle_f32x4 (imm 136 / 221).
// The final stage overwrites tmp15137..tmp15148 and defines tmp15189..tmp15192.
// NOTE(review): this looks like a transpose, so that the second pass can apply
// the same combination along the other axis -- confirm against the generator.
__m512 tmp15137 = tmp15149;
__m512 tmp15143 = tmp15169;
__m512 tmp15138 = tmp15155;
__m512 tmp15144 = tmp15175;
__m512 tmp15139 = tmp15160;
__m512 tmp15145 = tmp15180;
__m512 tmp15140 = tmp15162;
__m512 tmp15146 = tmp15182;
__m512 tmp15141 = tmp15164;
__m512 tmp15147 = tmp15184;
__m512 tmp15142 = tmp15166;
__m512 tmp15148 = tmp15186;
// Stage 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp15215 = _mm512_unpacklo_ps(tmp15137, tmp15138);
__m512 tmp15216 = _mm512_unpackhi_ps(tmp15137, tmp15138);
__m512 tmp15217 = _mm512_unpacklo_ps(tmp15139, tmp15140);
__m512 tmp15218 = _mm512_unpackhi_ps(tmp15139, tmp15140);
__m512 tmp15219 = _mm512_unpacklo_ps(tmp15141, tmp15142);
__m512 tmp15220 = _mm512_unpackhi_ps(tmp15141, tmp15142);
__m512 tmp15221 = _mm512_unpacklo_ps(tmp15143, tmp15144);
__m512 tmp15222 = _mm512_unpackhi_ps(tmp15143, tmp15144);
__m512 tmp15223 = _mm512_unpacklo_ps(tmp15145, tmp15146);
__m512 tmp15224 = _mm512_unpackhi_ps(tmp15145, tmp15146);
__m512 tmp15225 = _mm512_unpacklo_ps(tmp15147, tmp15148);
__m512 tmp15226 = _mm512_unpackhi_ps(tmp15147, tmp15148);
// Stage 2: combine 64-bit pairs within each 128-bit lane.
__m512 tmp15227 = _mm512_shuffle_ps(tmp15215, tmp15217, 68);
__m512 tmp15228 = _mm512_shuffle_ps(tmp15215, tmp15217, 238);
__m512 tmp15229 = _mm512_shuffle_ps(tmp15216, tmp15218, 68);
__m512 tmp15230 = _mm512_shuffle_ps(tmp15216, tmp15218, 238);
__m512 tmp15231 = _mm512_shuffle_ps(tmp15219, tmp15221, 68);
__m512 tmp15232 = _mm512_shuffle_ps(tmp15219, tmp15221, 238);
__m512 tmp15233 = _mm512_shuffle_ps(tmp15220, tmp15222, 68);
__m512 tmp15234 = _mm512_shuffle_ps(tmp15220, tmp15222, 238);
__m512 tmp15235 = _mm512_shuffle_ps(tmp15223, tmp15225, 68);
__m512 tmp15236 = _mm512_shuffle_ps(tmp15223, tmp15225, 238);
__m512 tmp15237 = _mm512_shuffle_ps(tmp15224, tmp15226, 68);
__m512 tmp15238 = _mm512_shuffle_ps(tmp15224, tmp15226, 238);
// Stage 3: pick even (136) or odd (221) 128-bit lanes from each source.
__m512 tmp15239 = _mm512_shuffle_f32x4(tmp15227, tmp15231, 136);
__m512 tmp15240 = _mm512_shuffle_f32x4(tmp15227, tmp15231, 221);
__m512 tmp15241 = _mm512_shuffle_f32x4(tmp15228, tmp15232, 136);
__m512 tmp15242 = _mm512_shuffle_f32x4(tmp15228, tmp15232, 221);
__m512 tmp15243 = _mm512_shuffle_f32x4(tmp15229, tmp15233, 136);
__m512 tmp15244 = _mm512_shuffle_f32x4(tmp15229, tmp15233, 221);
__m512 tmp15245 = _mm512_shuffle_f32x4(tmp15230, tmp15234, 136);
__m512 tmp15246 = _mm512_shuffle_f32x4(tmp15230, tmp15234, 221);
__m512 tmp15247 = _mm512_shuffle_f32x4(tmp15235, tmp15235, 136);
__m512 tmp15248 = _mm512_shuffle_f32x4(tmp15235, tmp15235, 221);
__m512 tmp15249 = _mm512_shuffle_f32x4(tmp15236, tmp15236, 136);
__m512 tmp15250 = _mm512_shuffle_f32x4(tmp15236, tmp15236, 221);
__m512 tmp15251 = _mm512_shuffle_f32x4(tmp15237, tmp15237, 136);
__m512 tmp15252 = _mm512_shuffle_f32x4(tmp15237, tmp15237, 221);
__m512 tmp15253 = _mm512_shuffle_f32x4(tmp15238, tmp15238, 136);
__m512 tmp15254 = _mm512_shuffle_f32x4(tmp15238, tmp15238, 221);
// Stage 4: same lane selection again, writing the second-pass inputs.
tmp15137 = _mm512_shuffle_f32x4(tmp15239, tmp15247, 136);
tmp15145 = _mm512_shuffle_f32x4(tmp15239, tmp15247, 221);
tmp15138 = _mm512_shuffle_f32x4(tmp15241, tmp15249, 136);
tmp15146 = _mm512_shuffle_f32x4(tmp15241, tmp15249, 221);
tmp15139 = _mm512_shuffle_f32x4(tmp15243, tmp15251, 136);
tmp15147 = _mm512_shuffle_f32x4(tmp15243, tmp15251, 221);
tmp15140 = _mm512_shuffle_f32x4(tmp15245, tmp15253, 136);
tmp15148 = _mm512_shuffle_f32x4(tmp15245, tmp15253, 221);
tmp15141 = _mm512_shuffle_f32x4(tmp15240, tmp15248, 136);
__m512 tmp15189 = _mm512_shuffle_f32x4(tmp15240, tmp15248, 221);
tmp15142 = _mm512_shuffle_f32x4(tmp15242, tmp15250, 136);
__m512 tmp15190 = _mm512_shuffle_f32x4(tmp15242, tmp15250, 221);
tmp15143 = _mm512_shuffle_f32x4(tmp15244, tmp15252, 136);
__m512 tmp15191 = _mm512_shuffle_f32x4(tmp15244, tmp15252, 221);
tmp15144 = _mm512_shuffle_f32x4(tmp15246, tmp15254, 136);
__m512 tmp15192 = _mm512_shuffle_f32x4(tmp15246, tmp15254, 221);
// tmp15144 and tmp15192 are not read by the combination that follows; the
// casts presumably exist to silence unused-value warnings.
(void)tmp15144;
(void)tmp15192;
// Reduced second-pass combination: only the first two of the six combinations
// used in the first pass are computed, for two interleaved sets of seven vectors.
// First set:
//   tmp15193 = tmp15137 + (tmp15138+tmp15139) + (tmp15140+tmp15141)
//              + 32*(tmp15142+tmp15143)
//   tmp15199 = (tmp15138-tmp15139) + 2*(tmp15140-tmp15141)
//              + 16*(tmp15142-tmp15143)
// Second set: tmp15204 and tmp15210, computed the same way from tmp15145,
// tmp15146, tmp15147, tmp15148, tmp15189, tmp15190, tmp15191.
__m512 tmp15197 = _mm512_add_ps(tmp15138, tmp15139);
__m512 tmp15208 = _mm512_add_ps(tmp15146, tmp15147);
__m512 tmp15196 = _mm512_add_ps(tmp15140, tmp15141);
__m512 tmp15207 = _mm512_add_ps(tmp15148, tmp15189);
__m512 tmp15202 = _mm512_sub_ps(tmp15140, tmp15141);
__m512 tmp15213 = _mm512_sub_ps(tmp15148, tmp15189);
__m512 tmp15201 = _mm512_sub_ps(tmp15138, tmp15139);
__m512 tmp15212 = _mm512_sub_ps(tmp15146, tmp15147);
__m512 tmp15198 = _mm512_add_ps(tmp15142, tmp15143);
__m512 tmp15209 = _mm512_add_ps(tmp15190, tmp15191);
__m512 tmp15203 = _mm512_sub_ps(tmp15142, tmp15143);
__m512 tmp15214 = _mm512_sub_ps(tmp15190, tmp15191);
__m512 tmp15200 = _mm512_fmadd_ps(tmp15202, _mm512_set1_ps(2e+00f), tmp15201);
__m512 tmp15211 = _mm512_fmadd_ps(tmp15213, _mm512_set1_ps(2e+00f), tmp15212);
__m512 tmp15195 = _mm512_add_ps(tmp15196, tmp15197);
__m512 tmp15206 = _mm512_add_ps(tmp15207, tmp15208);
__m512 tmp15199 = _mm512_fmadd_ps(tmp15203, _mm512_set1_ps(1.6e+01f), tmp15200);
__m512 tmp15210 = _mm512_fmadd_ps(tmp15214, _mm512_set1_ps(1.6e+01f), tmp15211);
__m512 tmp15194 = _mm512_add_ps(tmp15195, tmp15137);
__m512 tmp15205 = _mm512_add_ps(tmp15206, tmp15145);
__m512 tmp15193 = _mm512_fmadd_ps(tmp15198, _mm512_set1_ps(3.2e+01f), tmp15194);
__m512 tmp15204 = _mm512_fmadd_ps(tmp15209, _mm512_set1_ps(3.2e+01f), tmp15205);
// Names the four second-pass results out1891..out1894, clamps each one to be
// non-negative with max(0, x), and writes them to datPtr39 with masked stores.
// Store masks: 4095 writes lanes 0-11, 3 writes lanes 0-1, 4032 writes lanes 6-11.
// out1893 and out1894 are each split across two addresses (lanes 0-1 and 6-11).
// NOTE(review): the offsets read like byte offsets (56 per toH48 step, 4 per
// toW48 step); datPtr39's pointer type is not visible here -- confirm.
__m512 out1891 = tmp15193;
__m512 out1893 = tmp15204;
__m512 out1892 = tmp15199;
__m512 out1894 = tmp15210;
out1891 = _mm512_max_ps(_mm512_setzero_ps(), out1891);
out1893 = _mm512_max_ps(_mm512_setzero_ps(), out1893);
out1892 = _mm512_max_ps(_mm512_setzero_ps(), out1892);
out1894 = _mm512_max_ps(_mm512_setzero_ps(), out1894);
_mm512_mask_storeu_ps(datPtr39+0+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1891);
_mm512_mask_storeu_ps(datPtr39+48+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 3, out1893);
_mm512_mask_storeu_ps(datPtr39+808+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4032, out1893);
_mm512_mask_storeu_ps(datPtr39+56+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1892);
_mm512_mask_storeu_ps(datPtr39+104+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 3, out1894);
_mm512_mask_storeu_ps(datPtr39+864+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4032, out1894);
// Loads sixteen vectors from sfPtr17 and regroups their 256-bit halves into
// in2045..in2060. _mm512_shuffle_f32x4(a, b, 68) yields {low half of a, low
// half of b}; with 238 it yields {high half of a, high half of b}.
// The four load groups start at offsets 256, 9472, 18688 and 27904; within a
// group the paired loads sit at +0/+128 and +64/+192.
// NOTE(review): offsets appear to be in bytes (64 = one 512-bit vector);
// sfPtr17's pointer type is not visible here -- confirm.
__m512 sf1073 = _mm512_loadu_ps(sfPtr17+256+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1074 = _mm512_loadu_ps(sfPtr17+384+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2045 = _mm512_shuffle_f32x4(sf1073, sf1074, 68);
__m512 in2046 = _mm512_shuffle_f32x4(sf1073, sf1074, 238);
__m512 sf1075 = _mm512_loadu_ps(sfPtr17+320+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1076 = _mm512_loadu_ps(sfPtr17+448+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2053 = _mm512_shuffle_f32x4(sf1075, sf1076, 68);
__m512 in2054 = _mm512_shuffle_f32x4(sf1075, sf1076, 238);
__m512 sf1077 = _mm512_loadu_ps(sfPtr17+9472+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1078 = _mm512_loadu_ps(sfPtr17+9600+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2047 = _mm512_shuffle_f32x4(sf1077, sf1078, 68);
__m512 in2048 = _mm512_shuffle_f32x4(sf1077, sf1078, 238);
__m512 sf1079 = _mm512_loadu_ps(sfPtr17+9536+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1080 = _mm512_loadu_ps(sfPtr17+9664+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2055 = _mm512_shuffle_f32x4(sf1079, sf1080, 68);
__m512 in2056 = _mm512_shuffle_f32x4(sf1079, sf1080, 238);
__m512 sf1081 = _mm512_loadu_ps(sfPtr17+18688+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1082 = _mm512_loadu_ps(sfPtr17+18816+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2049 = _mm512_shuffle_f32x4(sf1081, sf1082, 68);
__m512 in2050 = _mm512_shuffle_f32x4(sf1081, sf1082, 238);
__m512 sf1083 = _mm512_loadu_ps(sfPtr17+18752+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1084 = _mm512_loadu_ps(sfPtr17+18880+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2057 = _mm512_shuffle_f32x4(sf1083, sf1084, 68);
__m512 in2058 = _mm512_shuffle_f32x4(sf1083, sf1084, 238);
__m512 sf1085 = _mm512_loadu_ps(sfPtr17+27904+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1086 = _mm512_loadu_ps(sfPtr17+28032+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2051 = _mm512_shuffle_f32x4(sf1085, sf1086, 68);
__m512 in2052 = _mm512_shuffle_f32x4(sf1085, sf1086, 238);
__m512 sf1087 = _mm512_loadu_ps(sfPtr17+27968+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1088 = _mm512_loadu_ps(sfPtr17+28096+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2059 = _mm512_shuffle_f32x4(sf1087, sf1088, 68);
__m512 in2060 = _mm512_shuffle_f32x4(sf1087, sf1088, 238);
// First-pass combination of two independent sets of eight vectors
// (in2045..in2052 and in2053..in2060); the two sets are interleaved line by line.
// For the first set, with
//   s1 = in2046+in2047, s2 = in2048+in2049, s3 = in2050+in2051,
//   d1 = in2046-in2047, d2 = in2048-in2049, d3 = in2050-in2051,
// the six results are
//   tmp15267 = in2045 + s1 + s2 + 32*s3
//   tmp15273 = d1 + 2*d2 + 16*d3
//   tmp15278 = s1 + 4*s2 + 8*s3
//   tmp15280 = d1 + 8*d2 + 4*d3
//   tmp15282 = s1 + 16*s2 + 2*s3
//   tmp15284 = d1 + 32*d2 + d3 + in2052
// The second set produces tmp15287, tmp15293, tmp15298, tmp15300, tmp15302,
// tmp15304 from in2053..in2060 in the same way.
// NOTE(review): the coefficients look like a scaled Winograd-style 8-to-6
// output transform -- confirm against the generator.
__m512 tmp15271 = _mm512_add_ps(in2046, in2047);
__m512 tmp15291 = _mm512_add_ps(in2054, in2055);
__m512 tmp15270 = _mm512_add_ps(in2048, in2049);
__m512 tmp15290 = _mm512_add_ps(in2056, in2057);
__m512 tmp15276 = _mm512_sub_ps(in2048, in2049);
__m512 tmp15296 = _mm512_sub_ps(in2056, in2057);
__m512 tmp15275 = _mm512_sub_ps(in2046, in2047);
__m512 tmp15295 = _mm512_sub_ps(in2054, in2055);
__m512 tmp15272 = _mm512_add_ps(in2050, in2051);
__m512 tmp15292 = _mm512_add_ps(in2058, in2059);
__m512 tmp15277 = _mm512_sub_ps(in2050, in2051);
__m512 tmp15297 = _mm512_sub_ps(in2058, in2059);
__m512 tmp15274 = _mm512_fmadd_ps(tmp15276, _mm512_set1_ps(2e+00f), tmp15275);
__m512 tmp15294 = _mm512_fmadd_ps(tmp15296, _mm512_set1_ps(2e+00f), tmp15295);
__m512 tmp15281 = _mm512_fmadd_ps(tmp15276, _mm512_set1_ps(8e+00f), tmp15275);
__m512 tmp15301 = _mm512_fmadd_ps(tmp15296, _mm512_set1_ps(8e+00f), tmp15295);
__m512 tmp15269 = _mm512_add_ps(tmp15270, tmp15271);
__m512 tmp15289 = _mm512_add_ps(tmp15290, tmp15291);
__m512 tmp15273 = _mm512_fmadd_ps(tmp15277, _mm512_set1_ps(1.6e+01f), tmp15274);
__m512 tmp15293 = _mm512_fmadd_ps(tmp15297, _mm512_set1_ps(1.6e+01f), tmp15294);
__m512 tmp15280 = _mm512_fmadd_ps(tmp15277, _mm512_set1_ps(4e+00f), tmp15281);
__m512 tmp15300 = _mm512_fmadd_ps(tmp15297, _mm512_set1_ps(4e+00f), tmp15301);
__m512 tmp15286 = _mm512_add_ps(tmp15277, tmp15275);
__m512 tmp15306 = _mm512_add_ps(tmp15297, tmp15295);
__m512 tmp15279 = _mm512_fmadd_ps(tmp15270, _mm512_set1_ps(4e+00f), tmp15271);
__m512 tmp15299 = _mm512_fmadd_ps(tmp15290, _mm512_set1_ps(4e+00f), tmp15291);
__m512 tmp15283 = _mm512_fmadd_ps(tmp15270, _mm512_set1_ps(1.6e+01f), tmp15271);
__m512 tmp15303 = _mm512_fmadd_ps(tmp15290, _mm512_set1_ps(1.6e+01f), tmp15291);
__m512 tmp15268 = _mm512_add_ps(tmp15269, in2045);
__m512 tmp15288 = _mm512_add_ps(tmp15289, in2053);
__m512 tmp15285 = _mm512_add_ps(tmp15286, in2052);
__m512 tmp15305 = _mm512_add_ps(tmp15306, in2060);
__m512 tmp15267 = _mm512_fmadd_ps(tmp15272, _mm512_set1_ps(3.2e+01f), tmp15268);
__m512 tmp15287 = _mm512_fmadd_ps(tmp15292, _mm512_set1_ps(3.2e+01f), tmp15288);
__m512 tmp15278 = _mm512_fmadd_ps(tmp15272, _mm512_set1_ps(8e+00f), tmp15279);
__m512 tmp15298 = _mm512_fmadd_ps(tmp15292, _mm512_set1_ps(8e+00f), tmp15299);
__m512 tmp15284 = _mm512_fmadd_ps(tmp15276, _mm512_set1_ps(3.2e+01f), tmp15285);
__m512 tmp15304 = _mm512_fmadd_ps(tmp15296, _mm512_set1_ps(3.2e+01f), tmp15305);
__m512 tmp15282 = _mm512_fmadd_ps(tmp15272, _mm512_set1_ps(2e+00f), tmp15283);
__m512 tmp15302 = _mm512_fmadd_ps(tmp15292, _mm512_set1_ps(2e+00f), tmp15303);
// Copies the twelve first-pass results into tmp15255..tmp15266, then runs them
// through a fixed shuffle network: 32-bit unpacklo/unpackhi, _mm512_shuffle_ps
// (imm 68 / 238) and _mm512_shuffle_f32x4 (imm 136 / 221).
// The final stage overwrites tmp15255..tmp15266 and defines tmp15307..tmp15310.
// NOTE(review): this looks like a transpose, so that the second pass can apply
// the same combination along the other axis -- confirm against the generator.
__m512 tmp15255 = tmp15267;
__m512 tmp15261 = tmp15287;
__m512 tmp15256 = tmp15273;
__m512 tmp15262 = tmp15293;
__m512 tmp15257 = tmp15278;
__m512 tmp15263 = tmp15298;
__m512 tmp15258 = tmp15280;
__m512 tmp15264 = tmp15300;
__m512 tmp15259 = tmp15282;
__m512 tmp15265 = tmp15302;
__m512 tmp15260 = tmp15284;
__m512 tmp15266 = tmp15304;
// Stage 1: interleave 32-bit elements of adjacent vector pairs.
__m512 tmp15333 = _mm512_unpacklo_ps(tmp15255, tmp15256);
__m512 tmp15334 = _mm512_unpackhi_ps(tmp15255, tmp15256);
__m512 tmp15335 = _mm512_unpacklo_ps(tmp15257, tmp15258);
__m512 tmp15336 = _mm512_unpackhi_ps(tmp15257, tmp15258);
__m512 tmp15337 = _mm512_unpacklo_ps(tmp15259, tmp15260);
__m512 tmp15338 = _mm512_unpackhi_ps(tmp15259, tmp15260);
__m512 tmp15339 = _mm512_unpacklo_ps(tmp15261, tmp15262);
__m512 tmp15340 = _mm512_unpackhi_ps(tmp15261, tmp15262);
__m512 tmp15341 = _mm512_unpacklo_ps(tmp15263, tmp15264);
__m512 tmp15342 = _mm512_unpackhi_ps(tmp15263, tmp15264);
__m512 tmp15343 = _mm512_unpacklo_ps(tmp15265, tmp15266);
__m512 tmp15344 = _mm512_unpackhi_ps(tmp15265, tmp15266);
// Stage 2: combine 64-bit pairs within each 128-bit lane.
__m512 tmp15345 = _mm512_shuffle_ps(tmp15333, tmp15335, 68);
__m512 tmp15346 = _mm512_shuffle_ps(tmp15333, tmp15335, 238);
__m512 tmp15347 = _mm512_shuffle_ps(tmp15334, tmp15336, 68);
__m512 tmp15348 = _mm512_shuffle_ps(tmp15334, tmp15336, 238);
__m512 tmp15349 = _mm512_shuffle_ps(tmp15337, tmp15339, 68);
__m512 tmp15350 = _mm512_shuffle_ps(tmp15337, tmp15339, 238);
__m512 tmp15351 = _mm512_shuffle_ps(tmp15338, tmp15340, 68);
__m512 tmp15352 = _mm512_shuffle_ps(tmp15338, tmp15340, 238);
__m512 tmp15353 = _mm512_shuffle_ps(tmp15341, tmp15343, 68);
__m512 tmp15354 = _mm512_shuffle_ps(tmp15341, tmp15343, 238);
__m512 tmp15355 = _mm512_shuffle_ps(tmp15342, tmp15344, 68);
__m512 tmp15356 = _mm512_shuffle_ps(tmp15342, tmp15344, 238);
// Stage 3: pick even (136) or odd (221) 128-bit lanes from each source.
__m512 tmp15357 = _mm512_shuffle_f32x4(tmp15345, tmp15349, 136);
__m512 tmp15358 = _mm512_shuffle_f32x4(tmp15345, tmp15349, 221);
__m512 tmp15359 = _mm512_shuffle_f32x4(tmp15346, tmp15350, 136);
__m512 tmp15360 = _mm512_shuffle_f32x4(tmp15346, tmp15350, 221);
__m512 tmp15361 = _mm512_shuffle_f32x4(tmp15347, tmp15351, 136);
__m512 tmp15362 = _mm512_shuffle_f32x4(tmp15347, tmp15351, 221);
__m512 tmp15363 = _mm512_shuffle_f32x4(tmp15348, tmp15352, 136);
__m512 tmp15364 = _mm512_shuffle_f32x4(tmp15348, tmp15352, 221);
__m512 tmp15365 = _mm512_shuffle_f32x4(tmp15353, tmp15353, 136);
__m512 tmp15366 = _mm512_shuffle_f32x4(tmp15353, tmp15353, 221);
__m512 tmp15367 = _mm512_shuffle_f32x4(tmp15354, tmp15354, 136);
__m512 tmp15368 = _mm512_shuffle_f32x4(tmp15354, tmp15354, 221);
__m512 tmp15369 = _mm512_shuffle_f32x4(tmp15355, tmp15355, 136);
__m512 tmp15370 = _mm512_shuffle_f32x4(tmp15355, tmp15355, 221);
__m512 tmp15371 = _mm512_shuffle_f32x4(tmp15356, tmp15356, 136);
__m512 tmp15372 = _mm512_shuffle_f32x4(tmp15356, tmp15356, 221);
// Stage 4: same lane selection again, writing the second-pass inputs.
tmp15255 = _mm512_shuffle_f32x4(tmp15357, tmp15365, 136);
tmp15263 = _mm512_shuffle_f32x4(tmp15357, tmp15365, 221);
tmp15256 = _mm512_shuffle_f32x4(tmp15359, tmp15367, 136);
tmp15264 = _mm512_shuffle_f32x4(tmp15359, tmp15367, 221);
tmp15257 = _mm512_shuffle_f32x4(tmp15361, tmp15369, 136);
tmp15265 = _mm512_shuffle_f32x4(tmp15361, tmp15369, 221);
tmp15258 = _mm512_shuffle_f32x4(tmp15363, tmp15371, 136);
tmp15266 = _mm512_shuffle_f32x4(tmp15363, tmp15371, 221);
tmp15259 = _mm512_shuffle_f32x4(tmp15358, tmp15366, 136);
__m512 tmp15307 = _mm512_shuffle_f32x4(tmp15358, tmp15366, 221);
tmp15260 = _mm512_shuffle_f32x4(tmp15360, tmp15368, 136);
__m512 tmp15308 = _mm512_shuffle_f32x4(tmp15360, tmp15368, 221);
tmp15261 = _mm512_shuffle_f32x4(tmp15362, tmp15370, 136);
__m512 tmp15309 = _mm512_shuffle_f32x4(tmp15362, tmp15370, 221);
tmp15262 = _mm512_shuffle_f32x4(tmp15364, tmp15372, 136);
__m512 tmp15310 = _mm512_shuffle_f32x4(tmp15364, tmp15372, 221);
// tmp15262 and tmp15310 are not read by the combination that follows; the
// casts presumably exist to silence unused-value warnings.
(void)tmp15262;
(void)tmp15310;
// Reduced second-pass combination: only the first two of the six combinations
// used in the first pass are computed, for two interleaved sets of seven vectors.
// First set:
//   tmp15311 = tmp15255 + (tmp15256+tmp15257) + (tmp15258+tmp15259)
//              + 32*(tmp15260+tmp15261)
//   tmp15317 = (tmp15256-tmp15257) + 2*(tmp15258-tmp15259)
//              + 16*(tmp15260-tmp15261)
// Second set: tmp15322 and tmp15328, computed the same way from tmp15263,
// tmp15264, tmp15265, tmp15266, tmp15307, tmp15308, tmp15309.
__m512 tmp15315 = _mm512_add_ps(tmp15256, tmp15257);
__m512 tmp15326 = _mm512_add_ps(tmp15264, tmp15265);
__m512 tmp15314 = _mm512_add_ps(tmp15258, tmp15259);
__m512 tmp15325 = _mm512_add_ps(tmp15266, tmp15307);
__m512 tmp15320 = _mm512_sub_ps(tmp15258, tmp15259);
__m512 tmp15331 = _mm512_sub_ps(tmp15266, tmp15307);
__m512 tmp15319 = _mm512_sub_ps(tmp15256, tmp15257);
__m512 tmp15330 = _mm512_sub_ps(tmp15264, tmp15265);
__m512 tmp15316 = _mm512_add_ps(tmp15260, tmp15261);
__m512 tmp15327 = _mm512_add_ps(tmp15308, tmp15309);
__m512 tmp15321 = _mm512_sub_ps(tmp15260, tmp15261);
__m512 tmp15332 = _mm512_sub_ps(tmp15308, tmp15309);
__m512 tmp15318 = _mm512_fmadd_ps(tmp15320, _mm512_set1_ps(2e+00f), tmp15319);
__m512 tmp15329 = _mm512_fmadd_ps(tmp15331, _mm512_set1_ps(2e+00f), tmp15330);
__m512 tmp15313 = _mm512_add_ps(tmp15314, tmp15315);
__m512 tmp15324 = _mm512_add_ps(tmp15325, tmp15326);
__m512 tmp15317 = _mm512_fmadd_ps(tmp15321, _mm512_set1_ps(1.6e+01f), tmp15318);
__m512 tmp15328 = _mm512_fmadd_ps(tmp15332, _mm512_set1_ps(1.6e+01f), tmp15329);
__m512 tmp15312 = _mm512_add_ps(tmp15313, tmp15255);
__m512 tmp15323 = _mm512_add_ps(tmp15324, tmp15263);
__m512 tmp15311 = _mm512_fmadd_ps(tmp15316, _mm512_set1_ps(3.2e+01f), tmp15312);
__m512 tmp15322 = _mm512_fmadd_ps(tmp15327, _mm512_set1_ps(3.2e+01f), tmp15323);
// Names the four second-pass results out1895..out1898, clamps each one to be
// non-negative with max(0, x), and writes them to datPtr39 with masked stores.
// Store masks: 255 writes lanes 0-7, 4095 writes lanes 0-11.
// NOTE(review): the offsets read like byte offsets (56 per toH48 step, 4 per
// toW48 step); datPtr39's pointer type is not visible here -- confirm.
__m512 out1895 = tmp15311;
__m512 out1897 = tmp15322;
__m512 out1896 = tmp15317;
__m512 out1898 = tmp15328;
out1895 = _mm512_max_ps(_mm512_setzero_ps(), out1895);
out1897 = _mm512_max_ps(_mm512_setzero_ps(), out1897);
out1896 = _mm512_max_ps(_mm512_setzero_ps(), out1896);
out1898 = _mm512_max_ps(_mm512_setzero_ps(), out1898);
_mm512_mask_storeu_ps(datPtr39+856+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 255, out1895);
_mm512_mask_storeu_ps(datPtr39+1664+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1897);
_mm512_mask_storeu_ps(datPtr39+912+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 255, out1896);
_mm512_mask_storeu_ps(datPtr39+1720+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1898);
__m512 sf1089 = _mm512_loadu_ps(sfPtr17+512+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1090 = _mm512_loadu_ps(sfPtr17+576+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2061 = _mm512_shuffle_f32x4(sf1090, sf1089, 68);
__m512 in2062 = _mm512_shuffle_f32x4(sf1090, sf1089, 238);
__m512 sf1091 = _mm512_loadu_ps(sfPtr17+640+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1092 = _mm512_loadu_ps(sfPtr17+704+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2069 = _mm512_shuffle_f32x4(sf1091, sf1092, 68);
__m512 in2070 = _mm512_shuffle_f32x4(sf1091, sf1092, 238);
__m512 sf1093 = _mm512_loadu_ps(sfPtr17+9728+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1094 = _mm512_loadu_ps(sfPtr17+9792+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2063 = _mm512_shuffle_f32x4(sf1094, sf1093, 68);
__m512 in2064 = _mm512_shuffle_f32x4(sf1094, sf1093, 238);
__m512 sf1095 = _mm512_loadu_ps(sfPtr17+9856+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1096 = _mm512_loadu_ps(sfPtr17+9920+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2071 = _mm512_shuffle_f32x4(sf1095, sf1096, 68);
__m512 in2072 = _mm512_shuffle_f32x4(sf1095, sf1096, 238);
__m512 sf1097 = _mm512_loadu_ps(sfPtr17+18944+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1098 = _mm512_loadu_ps(sfPtr17+19008+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2065 = _mm512_shuffle_f32x4(sf1098, sf1097, 68);
__m512 in2066 = _mm512_shuffle_f32x4(sf1098, sf1097, 238);
__m512 sf1099 = _mm512_loadu_ps(sfPtr17+19072+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1100 = _mm512_loadu_ps(sfPtr17+19136+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2073 = _mm512_shuffle_f32x4(sf1099, sf1100, 68);
__m512 in2074 = _mm512_shuffle_f32x4(sf1099, sf1100, 238);
__m512 sf1101 = _mm512_loadu_ps(sfPtr17+28160+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1102 = _mm512_loadu_ps(sfPtr17+28224+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2067 = _mm512_shuffle_f32x4(sf1102, sf1101, 68);
__m512 in2068 = _mm512_shuffle_f32x4(sf1102, sf1101, 238);
__m512 sf1103 = _mm512_loadu_ps(sfPtr17+28288+36864*i73+6144*j65+768*k177+768*l75);
__m512 sf1104 = _mm512_loadu_ps(sfPtr17+28352+36864*i73+6144*j65+768*k177+768*l75);
__m512 in2075 = _mm512_shuffle_f32x4(sf1103, sf1104, 68);
__m512 in2076 = _mm512_shuffle_f32x4(sf1103, sf1104, 238);
__m512 tmp15389 = _mm512_add_ps(in2062, in2063);
__m512 tmp15409 = _mm512_add_ps(in2070, in2071);
__m512 tmp15388 = _mm512_add_ps(in2064, in2065);
__m512 tmp15408 = _mm512_add_ps(in2072, in2073);
__m512 tmp15394 = _mm512_sub_ps(in2064, in2065);
__m512 tmp15414 = _mm512_sub_ps(in2072, in2073);
__m512 tmp15393 = _mm512_sub_ps(in2062, in2063);
__m512 tmp15413 = _mm512_sub_ps(in2070, in2071);
__m512 tmp15390 = _mm512_add_ps(in2066, in2067);
__m512 tmp15410 = _mm512_add_ps(in2074, in2075);
__m512 tmp15395 = _mm512_sub_ps(in2066, in2067);
__m512 tmp15415 = _mm512_sub_ps(in2074, in2075);
__m512 tmp15392 = _mm512_fmadd_ps(tmp15394, _mm512_set1_ps(2e+00f), tmp15393);
__m512 tmp15412 = _mm512_fmadd_ps(tmp15414, _mm512_set1_ps(2e+00f), tmp15413);
__m512 tmp15399 = _mm512_fmadd_ps(tmp15394, _mm512_set1_ps(8e+00f), tmp15393);
__m512 tmp15419 = _mm512_fmadd_ps(tmp15414, _mm512_set1_ps(8e+00f), tmp15413);
__m512 tmp15387 = _mm512_add_ps(tmp15388, tmp15389);
__m512 tmp15407 = _mm512_add_ps(tmp15408, tmp15409);
__m512 tmp15391 = _mm512_fmadd_ps(tmp15395, _mm512_set1_ps(1.6e+01f), tmp15392);
__m512 tmp15411 = _mm512_fmadd_ps(tmp15415, _mm512_set1_ps(1.6e+01f), tmp15412);
__m512 tmp15398 = _mm512_fmadd_ps(tmp15395, _mm512_set1_ps(4e+00f), tmp15399);
__m512 tmp15418 = _mm512_fmadd_ps(tmp15415, _mm512_set1_ps(4e+00f), tmp15419);
__m512 tmp15404 = _mm512_add_ps(tmp15395, tmp15393);
__m512 tmp15424 = _mm512_add_ps(tmp15415, tmp15413);
__m512 tmp15397 = _mm512_fmadd_ps(tmp15388, _mm512_set1_ps(4e+00f), tmp15389);
__m512 tmp15417 = _mm512_fmadd_ps(tmp15408, _mm512_set1_ps(4e+00f), tmp15409);
__m512 tmp15401 = _mm512_fmadd_ps(tmp15388, _mm512_set1_ps(1.6e+01f), tmp15389);
__m512 tmp15421 = _mm512_fmadd_ps(tmp15408, _mm512_set1_ps(1.6e+01f), tmp15409);
__m512 tmp15386 = _mm512_add_ps(tmp15387, in2061);
__m512 tmp15406 = _mm512_add_ps(tmp15407, in2069);
__m512 tmp15403 = _mm512_add_ps(tmp15404, in2068);
__m512 tmp15423 = _mm512_add_ps(tmp15424, in2076);
__m512 tmp15385 = _mm512_fmadd_ps(tmp15390, _mm512_set1_ps(3.2e+01f), tmp15386);
__m512 tmp15405 = _mm512_fmadd_ps(tmp15410, _mm512_set1_ps(3.2e+01f), tmp15406);
__m512 tmp15396 = _mm512_fmadd_ps(tmp15390, _mm512_set1_ps(8e+00f), tmp15397);
__m512 tmp15416 = _mm512_fmadd_ps(tmp15410, _mm512_set1_ps(8e+00f), tmp15417);
__m512 tmp15402 = _mm512_fmadd_ps(tmp15394, _mm512_set1_ps(3.2e+01f), tmp15403);
__m512 tmp15422 = _mm512_fmadd_ps(tmp15414, _mm512_set1_ps(3.2e+01f), tmp15423);
__m512 tmp15400 = _mm512_fmadd_ps(tmp15390, _mm512_set1_ps(2e+00f), tmp15401);
__m512 tmp15420 = _mm512_fmadd_ps(tmp15410, _mm512_set1_ps(2e+00f), tmp15421);
__m512 tmp15373 = tmp15385;
__m512 tmp15379 = tmp15405;
__m512 tmp15374 = tmp15391;
__m512 tmp15380 = tmp15411;
__m512 tmp15375 = tmp15396;
__m512 tmp15381 = tmp15416;
__m512 tmp15376 = tmp15398;
__m512 tmp15382 = tmp15418;
__m512 tmp15377 = tmp15400;
__m512 tmp15383 = tmp15420;
__m512 tmp15378 = tmp15402;
__m512 tmp15384 = tmp15422;
__m512 tmp15451 = _mm512_unpacklo_ps(tmp15373, tmp15374);
__m512 tmp15452 = _mm512_unpackhi_ps(tmp15373, tmp15374);
__m512 tmp15453 = _mm512_unpacklo_ps(tmp15375, tmp15376);
__m512 tmp15454 = _mm512_unpackhi_ps(tmp15375, tmp15376);
__m512 tmp15455 = _mm512_unpacklo_ps(tmp15377, tmp15378);
__m512 tmp15456 = _mm512_unpackhi_ps(tmp15377, tmp15378);
__m512 tmp15457 = _mm512_unpacklo_ps(tmp15379, tmp15380);
__m512 tmp15458 = _mm512_unpackhi_ps(tmp15379, tmp15380);
__m512 tmp15459 = _mm512_unpacklo_ps(tmp15381, tmp15382);
__m512 tmp15460 = _mm512_unpackhi_ps(tmp15381, tmp15382);
__m512 tmp15461 = _mm512_unpacklo_ps(tmp15383, tmp15384);
__m512 tmp15462 = _mm512_unpackhi_ps(tmp15383, tmp15384);
__m512 tmp15463 = _mm512_shuffle_ps(tmp15451, tmp15453, 68);
__m512 tmp15464 = _mm512_shuffle_ps(tmp15451, tmp15453, 238);
__m512 tmp15465 = _mm512_shuffle_ps(tmp15452, tmp15454, 68);
__m512 tmp15466 = _mm512_shuffle_ps(tmp15452, tmp15454, 238);
__m512 tmp15467 = _mm512_shuffle_ps(tmp15455, tmp15457, 68);
__m512 tmp15468 = _mm512_shuffle_ps(tmp15455, tmp15457, 238);
__m512 tmp15469 = _mm512_shuffle_ps(tmp15456, tmp15458, 68);
__m512 tmp15470 = _mm512_shuffle_ps(tmp15456, tmp15458, 238);
__m512 tmp15471 = _mm512_shuffle_ps(tmp15459, tmp15461, 68);
__m512 tmp15472 = _mm512_shuffle_ps(tmp15459, tmp15461, 238);
__m512 tmp15473 = _mm512_shuffle_ps(tmp15460, tmp15462, 68);
__m512 tmp15474 = _mm512_shuffle_ps(tmp15460, tmp15462, 238);
__m512 tmp15475 = _mm512_shuffle_f32x4(tmp15463, tmp15467, 136);
__m512 tmp15476 = _mm512_shuffle_f32x4(tmp15463, tmp15467, 221);
__m512 tmp15477 = _mm512_shuffle_f32x4(tmp15464, tmp15468, 136);
__m512 tmp15478 = _mm512_shuffle_f32x4(tmp15464, tmp15468, 221);
__m512 tmp15479 = _mm512_shuffle_f32x4(tmp15465, tmp15469, 136);
__m512 tmp15480 = _mm512_shuffle_f32x4(tmp15465, tmp15469, 221);
__m512 tmp15481 = _mm512_shuffle_f32x4(tmp15466, tmp15470, 136);
__m512 tmp15482 = _mm512_shuffle_f32x4(tmp15466, tmp15470, 221);
__m512 tmp15483 = _mm512_shuffle_f32x4(tmp15471, tmp15471, 136);
__m512 tmp15484 = _mm512_shuffle_f32x4(tmp15471, tmp15471, 221);
__m512 tmp15485 = _mm512_shuffle_f32x4(tmp15472, tmp15472, 136);
__m512 tmp15486 = _mm512_shuffle_f32x4(tmp15472, tmp15472, 221);
__m512 tmp15487 = _mm512_shuffle_f32x4(tmp15473, tmp15473, 136);
__m512 tmp15488 = _mm512_shuffle_f32x4(tmp15473, tmp15473, 221);
__m512 tmp15489 = _mm512_shuffle_f32x4(tmp15474, tmp15474, 136);
__m512 tmp15490 = _mm512_shuffle_f32x4(tmp15474, tmp15474, 221);
tmp15373 = _mm512_shuffle_f32x4(tmp15475, tmp15483, 136);
tmp15381 = _mm512_shuffle_f32x4(tmp15475, tmp15483, 221);
tmp15374 = _mm512_shuffle_f32x4(tmp15477, tmp15485, 136);
tmp15382 = _mm512_shuffle_f32x4(tmp15477, tmp15485, 221);
tmp15375 = _mm512_shuffle_f32x4(tmp15479, tmp15487, 136);
tmp15383 = _mm512_shuffle_f32x4(tmp15479, tmp15487, 221);
tmp15376 = _mm512_shuffle_f32x4(tmp15481, tmp15489, 136);
tmp15384 = _mm512_shuffle_f32x4(tmp15481, tmp15489, 221);
tmp15377 = _mm512_shuffle_f32x4(tmp15476, tmp15484, 136);
__m512 tmp15425 = _mm512_shuffle_f32x4(tmp15476, tmp15484, 221);
tmp15378 = _mm512_shuffle_f32x4(tmp15478, tmp15486, 136);
__m512 tmp15426 = _mm512_shuffle_f32x4(tmp15478, tmp15486, 221);
tmp15379 = _mm512_shuffle_f32x4(tmp15480, tmp15488, 136);
__m512 tmp15427 = _mm512_shuffle_f32x4(tmp15480, tmp15488, 221);
tmp15380 = _mm512_shuffle_f32x4(tmp15482, tmp15490, 136);
__m512 tmp15428 = _mm512_shuffle_f32x4(tmp15482, tmp15490, 221);
(void)tmp15380;
(void)tmp15428;
__m512 tmp15433 = _mm512_add_ps(tmp15374, tmp15375);
__m512 tmp15444 = _mm512_add_ps(tmp15382, tmp15383);
__m512 tmp15432 = _mm512_add_ps(tmp15376, tmp15377);
__m512 tmp15443 = _mm512_add_ps(tmp15384, tmp15425);
__m512 tmp15438 = _mm512_sub_ps(tmp15376, tmp15377);
__m512 tmp15449 = _mm512_sub_ps(tmp15384, tmp15425);
__m512 tmp15437 = _mm512_sub_ps(tmp15374, tmp15375);
__m512 tmp15448 = _mm512_sub_ps(tmp15382, tmp15383);
__m512 tmp15434 = _mm512_add_ps(tmp15378, tmp15379);
__m512 tmp15445 = _mm512_add_ps(tmp15426, tmp15427);
__m512 tmp15439 = _mm512_sub_ps(tmp15378, tmp15379);
__m512 tmp15450 = _mm512_sub_ps(tmp15426, tmp15427);
__m512 tmp15436 = _mm512_fmadd_ps(tmp15438, _mm512_set1_ps(2e+00f), tmp15437);
__m512 tmp15447 = _mm512_fmadd_ps(tmp15449, _mm512_set1_ps(2e+00f), tmp15448);
__m512 tmp15431 = _mm512_add_ps(tmp15432, tmp15433);
__m512 tmp15442 = _mm512_add_ps(tmp15443, tmp15444);
__m512 tmp15435 = _mm512_fmadd_ps(tmp15439, _mm512_set1_ps(1.6e+01f), tmp15436);
__m512 tmp15446 = _mm512_fmadd_ps(tmp15450, _mm512_set1_ps(1.6e+01f), tmp15447);
__m512 tmp15430 = _mm512_add_ps(tmp15431, tmp15373);
__m512 tmp15441 = _mm512_add_ps(tmp15442, tmp15381);
__m512 tmp15429 = _mm512_fmadd_ps(tmp15434, _mm512_set1_ps(3.2e+01f), tmp15430);
__m512 tmp15440 = _mm512_fmadd_ps(tmp15445, _mm512_set1_ps(3.2e+01f), tmp15441);
__m512 out1901 = tmp15429;
__m512 out1899 = tmp15440;
__m512 out1902 = tmp15435;
__m512 out1900 = tmp15446;
out1901 = _mm512_max_ps(_mm512_setzero_ps(), out1901);
out1899 = _mm512_max_ps(_mm512_setzero_ps(), out1899);
out1902 = _mm512_max_ps(_mm512_setzero_ps(), out1902);
out1900 = _mm512_max_ps(_mm512_setzero_ps(), out1900);
_mm512_mask_storeu_ps(datPtr39+2496+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1901);
_mm512_mask_storeu_ps(datPtr39+1712+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 3, out1899);
_mm512_mask_storeu_ps(datPtr39+2520+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 192, out1899);
_mm512_mask_storeu_ps(datPtr39+2552+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 4095, out1902);
_mm512_mask_storeu_ps(datPtr39+1768+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 3, out1900);
_mm512_mask_storeu_ps(datPtr39+2576+13312*i73+56*toH48+4*toW48+3328*k177+3328*l75, 192, out1900);
}
}
++j65;
}
}

// Dispatches ResNeXt50ThreeConsumeSums4Callee1 through the threader.
//
// team75:     thread team handed unchanged to ResNeXt50ThreaderDo1.
// tensors123: table of tensor base pointers; stored in the task's `any1`
//             slot so the callee can retrieve it.
//
// The task is described as 3-dimensional with hull extents {1, 1, 8}.
// NOTE(review): presumably the threader invokes the callee once per
// coordinate inside that hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50ThreeConsumeSums4(ResNeXt50ThreaderTeam1* team75, char** tensors123) {
    ResNeXt50ThreaderTask1 job;

    // Shape of the work grid.
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 1;
    job.hull1[2] = 8;

    // Payload and entry point.
    job.any1 = tensors123;
    job.callee1 = ResNeXt50ThreeConsumeSums4Callee1;

    ResNeXt50ThreaderDo1(team75, &job);
}

// Worker for ResNeXt50ThreeArrangeFilts5: reads raw float32 filter weights,
// scales them by per-channel multipliers, runs them through a two-pass
// linear transform, converts the result to half precision and writes it to
// the arranged filter buffer. It also writes the per-channel fused bias
// (bias * multiplier + addend).
//
// task164: task descriptor; only `any1` is read, as a char** table of
//          tensor base pointers:
//            [0] input weights            (wtPtr26)
//            [1] input bias               (biasPtr25)
//            [2] interleaved (multiplier, addend) float pairs, one pair per
//                output channel           (bnPtr27)
//                NOTE(review): presumably folded batch-norm parameters --
//                confirm against the generator.
//            [3] output: fused bias in the first 4096 bytes (bfPtr21),
//                arranged half-precision filters after that (wfPtr21)
// pt87:    work coordinates; pt87[0] selects a block of four `j` indices,
//          pt87[1] selects the group index `i`.
static void ResNeXt50ThreeArrangeFilts5Callee1(ResNeXt50ThreaderTask1* task164, int64_t* pt87) {
char** tensors160 = task164->any1;
ptrdiff_t b92 = pt87[0];
ptrdiff_t g51 = pt87[1];
// e50 is fixed at 0 here, so every "*e50" stride term below vanishes.
ptrdiff_t e50 = 0;
char*restrict bfPtr21 = tensors160[3]+4096*e50;
char*restrict wfPtr21 = tensors160[3]+4096+65011712*e50;
char*restrict wtPtr26 = tensors160[0]+17856*e50;
char*restrict biasPtr25 = tensors160[1];
char*restrict bnPtr27 = tensors160[2];
ptrdiff_t i96 = 1*g51;
// This call handles j87 = 4*b92 .. 4*b92+3 (jj71 is the last one), and
// never goes past j87 == 8.
ptrdiff_t j87 = 4*b92;
ptrdiff_t jj71 = j87+3;
if (j87 < 8) {
for (; j87 != 8; ++j87) {
ptrdiff_t k208 = 0+1*j87;
ptrdiff_t cut37 = 0;
// Broadcast the multiplier (even-indexed float of each pair) for the four
// output channels 32*i96+4*j87+{0,1,2,3}.
__m512 postMul82 = _mm512_set1_ps(((float*)bnPtr27+(ptrdiff_t)2*(0+32*i96+4*j87))[0]);
__m512 postMul83 = _mm512_set1_ps(((float*)bnPtr27+(ptrdiff_t)2*(1+32*i96+4*j87))[0]);
__m512 postMul84 = _mm512_set1_ps(((float*)bnPtr27+(ptrdiff_t)2*(2+32*i96+4*j87))[0]);
__m512 postMul85 = _mm512_set1_ps(((float*)bnPtr27+(ptrdiff_t)2*(3+32*i96+4*j87))[0]);
ptrdiff_t s97 = 0;
// 32 iterations; each consumes one 36-byte (9-float) record per channel.
// NOTE(review): presumably s97 is the input channel within the group and the
// 9 floats are the taps of a 3x3 filter -- confirm.
for (; s97 != 32; ++s97) {
// Mask 511 loads exactly 9 floats (remaining lanes zeroed). The four loads
// are 1152 bytes (= 32 records) apart: one record per output channel.
__m512 wt1145 = _mm512_maskz_loadu_ps(511, wtPtr26+0+36864*i96+4608*j87+36*s97);
__m512 wt1146 = _mm512_maskz_loadu_ps(511, wtPtr26+1152+36864*i96+4608*j87+36*s97);
__m512 wt1147 = _mm512_maskz_loadu_ps(511, wtPtr26+2304+36864*i96+4608*j87+36*s97);
__m512 wt1148 = _mm512_maskz_loadu_ps(511, wtPtr26+3456+36864*i96+4608*j87+36*s97);
// Apply the per-channel multiplier to the raw weights.
wt1145 = _mm512_mul_ps(wt1145, postMul82);
wt1146 = _mm512_mul_ps(wt1146, postMul83);
wt1147 = _mm512_mul_ps(wt1147, postMul84);
wt1148 = _mm512_mul_ps(wt1148, postMul85);
// Two-source permutes that interleave groups of three lanes from each
// source: pm234 starts at lanes 0..2 / 16..18, pm235 at lanes 6..8 / 22..24.
__m512i pm234 = _mm512_set_epi32(22, 8, 7, 6, 21, 20, 19, 5, 4, 3, 18, 17, 16, 2, 1, 0);
__m512i pm235 = _mm512_set_epi32(28, 14, 13, 12, 27, 26, 25, 11, 10, 9, 24, 23, 22, 8, 7, 6);
__m512 tmp16457 = _mm512_permutex2var_ps(wt1145, pm234, wt1147);
__m512 tmp16458 = _mm512_permutex2var_ps(wt1146, pm234, wt1148);
__m512 tmp16459 = _mm512_permutex2var_ps(wt1145, pm235, wt1147);
__m512 tmp16460 = _mm512_permutex2var_ps(wt1146, pm235, wt1148);
__m512 in2077 = _mm512_permutex2var_ps(tmp16457, pm234, tmp16458);
__m512 in2078 = _mm512_permutex2var_ps(tmp16457, pm235, tmp16458);
__m512 in2079 = _mm512_permutex2var_ps(tmp16459, pm234, tmp16460);
// First transform pass: expands the three inputs (in2077, in2078, in2079)
// into eight linear combinations with coefficients 1, 2 and 4.
// NOTE(review): this, the transpose and the second pass below look like a
// separable Winograd-style filter transform -- confirm against the generator.
__m512 tmp16461 = _mm512_fmadd_ps(in2077, _mm512_set1_ps(4e+00f), in2079);
__m512 tmp16462 = _mm512_add_ps(in2077, in2079);
__m512 tmp16463 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(4e+00f), in2077);
__m512 tmp16464 = _mm512_add_ps(in2078, tmp16462);
__m512 tmp16465 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(2e+00f), tmp16463);
tmp16463 = _mm512_fnmadd_ps(in2078, _mm512_set1_ps(2e+00f), tmp16463);
__m512 tmp16466 = _mm512_fnmadd_ps(in2078, _mm512_set1_ps(2e+00f), tmp16461);
tmp16461 = _mm512_fmadd_ps(in2078, _mm512_set1_ps(2e+00f), tmp16461);
tmp16462 = _mm512_sub_ps(tmp16462, in2078);
// Transpose the eight first-pass vectors using unpack / shuffle_ps /
// shuffle_f32x4 stages.
__m512 tmp16483 = _mm512_unpacklo_ps(in2077, tmp16464);
__m512 tmp16484 = _mm512_unpackhi_ps(in2077, tmp16464);
__m512 tmp16485 = _mm512_unpacklo_ps(tmp16462, tmp16465);
__m512 tmp16486 = _mm512_unpackhi_ps(tmp16462, tmp16465);
__m512 tmp16487 = _mm512_unpacklo_ps(tmp16463, tmp16461);
__m512 tmp16488 = _mm512_unpackhi_ps(tmp16463, tmp16461);
__m512 tmp16489 = _mm512_unpacklo_ps(tmp16466, in2079);
__m512 tmp16490 = _mm512_unpackhi_ps(tmp16466, in2079);
__m512 tmp16491 = _mm512_shuffle_ps(tmp16483, tmp16485, 68);
__m512 tmp16492 = _mm512_shuffle_ps(tmp16483, tmp16485, 238);
__m512 tmp16493 = _mm512_shuffle_ps(tmp16484, tmp16486, 68);
__m512 tmp16494 = _mm512_shuffle_ps(tmp16484, tmp16486, 238);
__m512 tmp16495 = _mm512_shuffle_ps(tmp16487, tmp16489, 68);
__m512 tmp16496 = _mm512_shuffle_ps(tmp16487, tmp16489, 238);
__m512 tmp16497 = _mm512_shuffle_ps(tmp16488, tmp16490, 68);
__m512 tmp16498 = _mm512_shuffle_ps(tmp16488, tmp16490, 238);
__m512 tmp16499 = _mm512_shuffle_f32x4(tmp16491, tmp16495, 136);
__m512 tmp16500 = _mm512_shuffle_f32x4(tmp16491, tmp16495, 221);
__m512 tmp16501 = _mm512_shuffle_f32x4(tmp16492, tmp16496, 136);
__m512 tmp16502 = _mm512_shuffle_f32x4(tmp16492, tmp16496, 221);
__m512 tmp16503 = _mm512_shuffle_f32x4(tmp16493, tmp16497, 136);
__m512 tmp16504 = _mm512_shuffle_f32x4(tmp16493, tmp16497, 221);
__m512 tmp16505 = _mm512_shuffle_f32x4(tmp16494, tmp16498, 136);
__m512 tmp16506 = _mm512_shuffle_f32x4(tmp16494, tmp16498, 221);
// Existing variables are reused to hold the transposed rows.
in2077 = _mm512_shuffle_f32x4(tmp16499, tmp16499, 136);
__m512 tmp16467 = _mm512_shuffle_f32x4(tmp16499, tmp16499, 221);
tmp16464 = _mm512_shuffle_f32x4(tmp16501, tmp16501, 136);
__m512 tmp16468 = _mm512_shuffle_f32x4(tmp16501, tmp16501, 221);
tmp16462 = _mm512_shuffle_f32x4(tmp16503, tmp16503, 136);
__m512 tmp16469 = _mm512_shuffle_f32x4(tmp16503, tmp16503, 221);
tmp16465 = _mm512_shuffle_f32x4(tmp16505, tmp16505, 136);
__m512 tmp16470 = _mm512_shuffle_f32x4(tmp16505, tmp16505, 221);
tmp16463 = _mm512_shuffle_f32x4(tmp16500, tmp16500, 136);
tmp16461 = _mm512_shuffle_f32x4(tmp16502, tmp16502, 136);
tmp16466 = _mm512_shuffle_f32x4(tmp16504, tmp16504, 136);
in2079 = _mm512_shuffle_f32x4(tmp16506, tmp16506, 136);
// Immediate 68 packs the low 256 bits of both operands into one register,
// so each vector below carries two rows through the second pass at once.
in2077 = _mm512_shuffle_f32x4(in2077, tmp16465, 68);
tmp16464 = _mm512_shuffle_f32x4(tmp16464, tmp16463, 68);
tmp16462 = _mm512_shuffle_f32x4(tmp16462, tmp16461, 68);
tmp16466 = _mm512_shuffle_f32x4(tmp16466, tmp16468, 68);
in2079 = _mm512_shuffle_f32x4(in2079, tmp16469, 68);
tmp16467 = _mm512_shuffle_f32x4(tmp16467, tmp16470, 68);
// Second transform pass: the same 1/2/4 combinations as the first pass,
// applied to two independent triples: (in2077, tmp16464, tmp16462) and
// (tmp16466, in2079, tmp16467).
__m512 tmp16471 = _mm512_fmadd_ps(in2077, _mm512_set1_ps(4e+00f), tmp16462);
__m512 tmp16477 = _mm512_fmadd_ps(tmp16466, _mm512_set1_ps(4e+00f), tmp16467);
__m512 tmp16472 = _mm512_add_ps(in2077, tmp16462);
__m512 tmp16478 = _mm512_add_ps(tmp16466, tmp16467);
__m512 tmp16473 = _mm512_fmadd_ps(tmp16462, _mm512_set1_ps(4e+00f), in2077);
__m512 tmp16479 = _mm512_fmadd_ps(tmp16467, _mm512_set1_ps(4e+00f), tmp16466);
__m512 tmp16474 = _mm512_add_ps(tmp16464, tmp16472);
__m512 tmp16480 = _mm512_add_ps(in2079, tmp16478);
__m512 tmp16475 = _mm512_fmadd_ps(tmp16464, _mm512_set1_ps(2e+00f), tmp16473);
__m512 tmp16481 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(2e+00f), tmp16479);
tmp16473 = _mm512_fnmadd_ps(tmp16464, _mm512_set1_ps(2e+00f), tmp16473);
tmp16479 = _mm512_fnmadd_ps(in2079, _mm512_set1_ps(2e+00f), tmp16479);
__m512 tmp16476 = _mm512_fnmadd_ps(tmp16464, _mm512_set1_ps(2e+00f), tmp16471);
__m512 tmp16482 = _mm512_fnmadd_ps(in2079, _mm512_set1_ps(2e+00f), tmp16477);
tmp16471 = _mm512_fmadd_ps(tmp16464, _mm512_set1_ps(2e+00f), tmp16471);
tmp16477 = _mm512_fmadd_ps(in2079, _mm512_set1_ps(2e+00f), tmp16477);
tmp16472 = _mm512_sub_ps(tmp16472, tmp16464);
tmp16478 = _mm512_sub_ps(tmp16478, in2079);
// Scale every transformed value by a per-lane constant. The same 8-lane
// pattern is repeated in both 256-bit halves of each table.
in2077 = _mm512_mul_ps(in2077, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16474 = _mm512_mul_ps(tmp16474, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16472 = _mm512_mul_ps(tmp16472, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16475 = _mm512_mul_ps(tmp16475, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16473 = _mm512_mul_ps(tmp16473, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16471 = _mm512_mul_ps(tmp16471, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16476 = _mm512_mul_ps(tmp16476, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16462 = _mm512_mul_ps(tmp16462, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16466 = _mm512_mul_ps(tmp16466, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
tmp16480 = _mm512_mul_ps(tmp16480, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16478 = _mm512_mul_ps(tmp16478, _mm512_set_ps(-2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f, -2.2222222e-01f, -1.234568e-03f, -1.234568e-03f, -2.469136e-03f, -2.469136e-03f, 4.9382716e-02f, 4.9382716e-02f, -2.2222222e-01f));
tmp16481 = _mm512_mul_ps(tmp16481, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16479 = _mm512_mul_ps(tmp16479, _mm512_set_ps(1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f, 1.1111111e-02f, 6.1728395e-05f, 6.1728395e-05f, 1.2345679e-04f, 1.2345679e-04f, -2.469136e-03f, -2.469136e-03f, 1.1111111e-02f));
tmp16477 = _mm512_mul_ps(tmp16477, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16482 = _mm512_mul_ps(tmp16482, _mm512_set_ps(5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f, 5.5555557e-03f, 3.0864197e-05f, 3.0864197e-05f, 6.1728395e-05f, 6.1728395e-05f, -1.234568e-03f, -1.234568e-03f, 5.5555557e-03f));
tmp16467 = _mm512_mul_ps(tmp16467, _mm512_set_ps(1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f, 1e+00f, 5.5555557e-03f, 5.5555557e-03f, 1.1111111e-02f, 1.1111111e-02f, -2.2222222e-01f, -2.2222222e-01f, 1e+00f));
// Regroup pairs of scaled vectors: immediate 68 joins the low 256-bit halves
// of the two operands, 238 joins their high 256-bit halves.
__m512 out1903 = _mm512_shuffle_f32x4(in2077, tmp16474, 68);
__m512 out1907 = _mm512_shuffle_f32x4(in2077, tmp16474, 238);
__m512 out1904 = _mm512_shuffle_f32x4(tmp16472, tmp16475, 68);
__m512 out1908 = _mm512_shuffle_f32x4(tmp16472, tmp16475, 238);
__m512 out1905 = _mm512_shuffle_f32x4(tmp16473, tmp16471, 68);
__m512 out1909 = _mm512_shuffle_f32x4(tmp16473, tmp16471, 238);
__m512 out1906 = _mm512_shuffle_f32x4(tmp16476, tmp16462, 68);
__m512 out1910 = _mm512_shuffle_f32x4(tmp16476, tmp16462, 238);
__m512 out1911 = _mm512_shuffle_f32x4(tmp16466, tmp16480, 68);
__m512 out1915 = _mm512_shuffle_f32x4(tmp16466, tmp16480, 238);
__m512 out1912 = _mm512_shuffle_f32x4(tmp16478, tmp16481, 68);
__m512 out1916 = _mm512_shuffle_f32x4(tmp16478, tmp16481, 238);
__m512 out1913 = _mm512_shuffle_f32x4(tmp16479, tmp16477, 68);
__m512 out1917 = _mm512_shuffle_f32x4(tmp16479, tmp16477, 238);
__m512 out1914 = _mm512_shuffle_f32x4(tmp16482, tmp16467, 68);
__m512 out1918 = _mm512_shuffle_f32x4(tmp16482, tmp16467, 238);
// Byte offsets of four 32-byte slots. With cut37 == 0 these evaluate to
// 0, 32, 64 and 96, filling one 128-byte record per s97.
ptrdiff_t off17 = 32*cut37;
ptrdiff_t off18 = (size_t)(cut37+1)/4*4096+(size_t)(cut37+1)%4*32;
ptrdiff_t off19 = (size_t)(cut37+2)/4*4096+(size_t)(cut37+2)%4*32;
ptrdiff_t off20 = (size_t)(cut37+3)/4*4096+(size_t)(cut37+3)%4*32;
// Convert each 16-float vector to 16 half-precision values (256 bits),
// rounding to nearest. The cast only widens the register type; just the
// low 256 bits are stored below.
__m512i wf225 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1903, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf226 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1907, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf227 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1911, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf228 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1915, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf229 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1904, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf230 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1908, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf231 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1912, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf232 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1916, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf233 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1905, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf234 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1909, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf235 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1913, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf236 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1917, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf237 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1906, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf238 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1910, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf239 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1914, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
__m512i wf240 = _mm512_castsi256_si512(_mm512_cvtps_ph(out1918, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
// Mask 255 stores eight 32-bit lanes (32 bytes = 16 halves) per call.
// Output is laid out as four 32768-byte planes, each indexed by
// group (131072*i96), j (4096*k208), s97 (128 bytes) and slot (off17..off20).
_mm512_mask_storeu_epi32(wfPtr21+0+131072*i96+4096*k208+off17+128*s97, 255, wf225);
_mm512_mask_storeu_epi32(wfPtr21+0+131072*i96+4096*k208+off18+128*s97, 255, wf226);
_mm512_mask_storeu_epi32(wfPtr21+0+131072*i96+4096*k208+off19+128*s97, 255, wf227);
_mm512_mask_storeu_epi32(wfPtr21+0+131072*i96+4096*k208+off20+128*s97, 255, wf228);
_mm512_mask_storeu_epi32(wfPtr21+32768+131072*i96+4096*k208+off17+128*s97, 255, wf229);
_mm512_mask_storeu_epi32(wfPtr21+32768+131072*i96+4096*k208+off18+128*s97, 255, wf230);
_mm512_mask_storeu_epi32(wfPtr21+32768+131072*i96+4096*k208+off19+128*s97, 255, wf231);
_mm512_mask_storeu_epi32(wfPtr21+32768+131072*i96+4096*k208+off20+128*s97, 255, wf232);
_mm512_mask_storeu_epi32(wfPtr21+65536+131072*i96+4096*k208+off17+128*s97, 255, wf233);
_mm512_mask_storeu_epi32(wfPtr21+65536+131072*i96+4096*k208+off18+128*s97, 255, wf234);
_mm512_mask_storeu_epi32(wfPtr21+65536+131072*i96+4096*k208+off19+128*s97, 255, wf235);
_mm512_mask_storeu_epi32(wfPtr21+65536+131072*i96+4096*k208+off20+128*s97, 255, wf236);
_mm512_mask_storeu_epi32(wfPtr21+98304+131072*i96+4096*k208+off17+128*s97, 255, wf237);
_mm512_mask_storeu_epi32(wfPtr21+98304+131072*i96+4096*k208+off18+128*s97, 255, wf238);
_mm512_mask_storeu_epi32(wfPtr21+98304+131072*i96+4096*k208+off19+128*s97, 255, wf239);
_mm512_mask_storeu_epi32(wfPtr21+98304+131072*i96+4096*k208+off20+128*s97, 255, wf240);
}
// Fused bias for the same four output channels: bias * multiplier + addend.
__m512 bias9 = _mm512_setzero_ps();
// Always taken here because e50 is 0.
if (!e50) {
// Mask 15 loads four floats.
bias9 = _mm512_maskz_loadu_ps(15, biasPtr25-0+128*i96+16*j87);
// De-interleave the (multiplier, addend) pairs: even lanes are the
// multipliers, odd lanes the addends.
__m512i pmMul58 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd58 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
// Mask 255 loads eight floats = four pairs.
__m512 mas16 = _mm512_maskz_loadu_ps(255, bnPtr27+(ptrdiff_t)8*(0+32*i96+4*j87));
__m512 postMul86 = _mm512_permutexvar_ps(pmMul58, mas16);
__m512 postAdd58 = _mm512_permutexvar_ps(pmAdd58, mas16);
bias9 = _mm512_fmadd_ps(bias9, postMul86, postAdd58);
}
// Write the four fused bias floats to the head of the output tensor.
_mm512_mask_storeu_ps(bfPtr21-0+128*i96+16*j87, 15, bias9);
// Stop once the last j index assigned to this call has been processed.
if (j87 >= jj71) return;
}
}
}

// Dispatches ResNeXt50ThreeArrangeFilts5Callee1 through the threader.
//
// team92:     thread team handed unchanged to ResNeXt50ThreaderDo1.
// tensors159: table of tensor base pointers; stored in the task's `any1`
//             slot, which is where the callee reads it from.
//
// The task is described as 3-dimensional with hull extents {2, 32, 1}.
// The callee reads coordinates 0 and 1 only.
// NOTE(review): presumably the threader invokes the callee once per
// coordinate inside that hull -- confirm against ResNeXt50ThreaderDo1.
static void ResNeXt50ThreeArrangeFilts5(ResNeXt50ThreaderTeam1* team92, char** tensors159) {
    ResNeXt50ThreaderTask1 job;

    // Shape of the work grid.
    job.nd1 = 3;
    job.hull1[0] = 2;
    job.hull1[1] = 32;
    job.hull1[2] = 1;

    // Payload and entry point.
    job.any1 = tensors159;
    job.callee1 = ResNeXt50ThreeArrangeFilts5Callee1;

    ResNeXt50ThreaderDo1(team92, &job);
}

// Task callee that rearranges input data into a transformed layout.
//
// task166->any1 is an array of byte pointers: [0] is read from, [1] is written to.
// pt88[2] selects a block g52; the function handles the four consecutive indices
// i97 = 4*g52 .. 4*g52+3. For every i97 and every k209 in 0..31 it:
//   1. loads masked 7-float rows (row pitch 28 bytes) and permutes their lanes,
//   2. applies a fixed linear combination across the rows,
//   3. rearranges lanes with an unpack/shuffle network,
//   4. applies the same linear combination a second time,
//   5. stores sixteen 64-byte vectors to the output buffer.
// NOTE(review): judging by the name and the coefficients (5.25, -4.25, -1.25,
// 0.25, 2, 4, -5) this looks like a Winograd-style input transform for a 3x3
// convolution, with step 3 acting as a transpose — confirm against the generator.
static void ResNeXt50ThreeArrangeDats5Callee1(ResNeXt50ThreaderTask1* task166, int64_t* pt88) {
char** tensors162 = task166->any1;
// s98, c79 and e51 are fixed at zero in this specialization; they only appear
// as factors in the address arithmetic below.
ptrdiff_t s98 = 0;
ptrdiff_t c79 = 0;
ptrdiff_t g52 = pt88[2];
ptrdiff_t e51 = 0;
// The source pointer is biased back by 32 bytes (28 + 4); the smallest load
// offset used below is +32, which lands on tensors162[0] itself.
// NOTE(review): presumably this encodes one row and one column of padding — confirm.
char*restrict datPtr54 = tensors162[0]-32+158720*e51;
char*restrict dfPtr21 = tensors162[1]+16252928*e51;
ptrdiff_t i97 = 4*g52;
ptrdiff_t ii73 = i97+3;
for (; i97 <= ii73; ++i97) {
ptrdiff_t j88 = 1*c79;
// rel29 is not read anywhere in this function.
ptrdiff_t rel29 = j88-0;
ptrdiff_t base29 = 0;
ptrdiff_t h56 = base29+0;
ptrdiff_t w83 = 0;
ptrdiff_t k209 = 0;
for (; k209 != 32; ++k209) {
// Every load uses mask 127 (lanes 0..6); maskz zeroes lanes 7..15.
// in2087/in2088 re-read the same addresses (+172, +200) as dat2710/dat2711 below.
// NOTE(review): presumably rows shared with a second, overlapping tile — confirm.
__m512 dat2703 = _mm512_maskz_loadu_ps(127, datPtr54+172+10240*i97+28*h56+4*w83+10240*s98+320*k209);
// Lane permutation applied to every loaded row:
//   lane 0      <- source lane 15 (zero, because of the masked load)
//   lanes 1..7  <- source lanes 0..6
//   lanes 8, 9  <- source lanes 5, 6
//   lanes 10..15 <- source lane 15 (zero)
__m512i pm236 = _mm512_set_epi32(15, 15, 15, 15, 15, 15, 6, 5, 6, 5, 4, 3, 2, 1, 0, 15);
__m512 in2087 = _mm512_permutexvar_ps(pm236, dat2703);
__m512 dat2704 = _mm512_maskz_loadu_ps(127, datPtr54+32+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 dat2705 = _mm512_maskz_loadu_ps(127, datPtr54+200+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2080 = _mm512_permutexvar_ps(pm236, dat2704);
__m512 in2088 = _mm512_permutexvar_ps(pm236, dat2705);
// Remaining rows at byte offsets 60, 88, 116, 144, 172, 200 (28 bytes apart).
__m512 dat2706 = _mm512_maskz_loadu_ps(127, datPtr54+60+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2081 = _mm512_permutexvar_ps(pm236, dat2706);
__m512 dat2707 = _mm512_maskz_loadu_ps(127, datPtr54+88+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2082 = _mm512_permutexvar_ps(pm236, dat2707);
__m512 dat2708 = _mm512_maskz_loadu_ps(127, datPtr54+116+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2083 = _mm512_permutexvar_ps(pm236, dat2708);
__m512 dat2709 = _mm512_maskz_loadu_ps(127, datPtr54+144+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2084 = _mm512_permutexvar_ps(pm236, dat2709);
__m512 dat2710 = _mm512_maskz_loadu_ps(127, datPtr54+172+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2085 = _mm512_permutexvar_ps(pm236, dat2710);
__m512 dat2711 = _mm512_maskz_loadu_ps(127, datPtr54+200+10240*i97+28*h56+4*w83+10240*s98+320*k209);
__m512 in2086 = _mm512_permutexvar_ps(pm236, dat2711);
// First pass: fixed linear combinations across the loaded rows.
// Two chains are interleaved: one over in2080..in2086 and one over
// in2087/in2088. Self-assignments such as "in2087 = in2087;" are no-ops.
// Registers are reused heavily, so statement order matters.
__m512 tmp16507 = _mm512_add_ps(in2080, in2084);
__m512 tmp16512 = in2088;
__m512 tmp16508 = _mm512_sub_ps(in2083, in2081);
__m512 tmp16509 = _mm512_add_ps(in2081, in2085);
__m512 tmp16510 = _mm512_sub_ps(_mm512_setzero_ps(), in2085);
in2087 = in2087;
tmp16507 = _mm512_fmadd_ps(in2082, _mm512_set1_ps(-4.25e+00f), tmp16507);
tmp16512 = tmp16512;
tmp16509 = _mm512_fmadd_ps(in2083, _mm512_set1_ps(-4.25e+00f), tmp16509);
tmp16510 = _mm512_fmadd_ps(tmp16508, _mm512_set1_ps(5.25e+00f), tmp16510);
in2087 = in2087;
tmp16508 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(2.5e-01f), in2085);
in2081 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(4e+00f), in2085);
__m512 tmp16511 = _mm512_sub_ps(tmp16509, tmp16507);
__m512 tmp16513 = _mm512_sub_ps(_mm512_setzero_ps(), tmp16512);
tmp16509 = _mm512_add_ps(tmp16507, tmp16509);
__m512 tmp16514 = tmp16512;
tmp16507 = _mm512_fmadd_ps(in2080, _mm512_set1_ps(2.5e-01f), in2084);
tmp16512 = _mm512_mul_ps(in2088, _mm512_set1_ps(2.5e-01f));
tmp16508 = _mm512_fmadd_ps(in2083, _mm512_set1_ps(-1.25e+00f), tmp16508);
in2083 = _mm512_fmadd_ps(in2083, _mm512_set1_ps(-5e+00f), in2081);
tmp16507 = _mm512_fmadd_ps(in2082, _mm512_set1_ps(-1.25e+00f), tmp16507);
tmp16512 = tmp16512;
in2085 = _mm512_fmadd_ps(tmp16507, _mm512_set1_ps(2e+00f), tmp16508);
__m512 tmp16515 = _mm512_mul_ps(tmp16512, _mm512_set1_ps(2e+00f));
tmp16508 = _mm512_fnmadd_ps(tmp16507, _mm512_set1_ps(2e+00f), tmp16508);
__m512 tmp16516 = _mm512_fnmadd_ps(tmp16512, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp16507 = _mm512_fmadd_ps(in2084, _mm512_set1_ps(2.5e-01f), in2080);
tmp16512 = in2088;
in2080 = _mm512_sub_ps(in2086, in2080);
in2088 = _mm512_sub_ps(_mm512_setzero_ps(), in2088);
tmp16507 = _mm512_fmadd_ps(in2082, _mm512_set1_ps(-1.25e+00f), tmp16507);
tmp16512 = tmp16512;
in2082 = _mm512_sub_ps(in2082, in2084);
in2082 = _mm512_fmadd_ps(in2082, _mm512_set1_ps(5.25e+00f), in2080);
__m512 tmp16517 = in2088;
in2081 = _mm512_fmadd_ps(tmp16507, _mm512_set1_ps(2e+00f), in2083);
__m512 tmp16518 = _mm512_mul_ps(tmp16512, _mm512_set1_ps(2e+00f));
in2083 = _mm512_fnmadd_ps(tmp16507, _mm512_set1_ps(2e+00f), in2083);
__m512 tmp16519 = _mm512_fnmadd_ps(tmp16512, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Lane rearrangement of the sixteen first-pass results, in four stages:
// unpacklo/unpackhi (32-bit interleave), shuffle_ps (64-bit pairs), then two
// rounds of shuffle_f32x4 (128-bit blocks).
// NOTE(review): this has the shape of a 16x16 transpose — confirm.
__m512 tmp16533 = _mm512_unpacklo_ps(tmp16510, tmp16509);
__m512 tmp16534 = _mm512_unpackhi_ps(tmp16510, tmp16509);
__m512 tmp16535 = _mm512_unpacklo_ps(tmp16511, in2085);
__m512 tmp16536 = _mm512_unpackhi_ps(tmp16511, in2085);
__m512 tmp16537 = _mm512_unpacklo_ps(tmp16508, in2081);
__m512 tmp16538 = _mm512_unpackhi_ps(tmp16508, in2081);
__m512 tmp16539 = _mm512_unpacklo_ps(in2083, in2082);
__m512 tmp16540 = _mm512_unpackhi_ps(in2083, in2082);
__m512 tmp16541 = _mm512_unpacklo_ps(in2087, tmp16514);
__m512 tmp16542 = _mm512_unpackhi_ps(in2087, tmp16514);
__m512 tmp16543 = _mm512_unpacklo_ps(tmp16513, tmp16515);
__m512 tmp16544 = _mm512_unpackhi_ps(tmp16513, tmp16515);
__m512 tmp16545 = _mm512_unpacklo_ps(tmp16516, tmp16518);
__m512 tmp16546 = _mm512_unpackhi_ps(tmp16516, tmp16518);
__m512 tmp16547 = _mm512_unpacklo_ps(tmp16519, tmp16517);
__m512 tmp16548 = _mm512_unpackhi_ps(tmp16519, tmp16517);
__m512 tmp16549 = _mm512_shuffle_ps(tmp16533, tmp16535, 68);
__m512 tmp16550 = _mm512_shuffle_ps(tmp16533, tmp16535, 238);
__m512 tmp16551 = _mm512_shuffle_ps(tmp16534, tmp16536, 68);
__m512 tmp16552 = _mm512_shuffle_ps(tmp16534, tmp16536, 238);
__m512 tmp16553 = _mm512_shuffle_ps(tmp16537, tmp16539, 68);
__m512 tmp16554 = _mm512_shuffle_ps(tmp16537, tmp16539, 238);
__m512 tmp16555 = _mm512_shuffle_ps(tmp16538, tmp16540, 68);
__m512 tmp16556 = _mm512_shuffle_ps(tmp16538, tmp16540, 238);
__m512 tmp16557 = _mm512_shuffle_ps(tmp16541, tmp16543, 68);
__m512 tmp16558 = _mm512_shuffle_ps(tmp16541, tmp16543, 238);
__m512 tmp16559 = _mm512_shuffle_ps(tmp16542, tmp16544, 68);
__m512 tmp16560 = _mm512_shuffle_ps(tmp16542, tmp16544, 238);
__m512 tmp16561 = _mm512_shuffle_ps(tmp16545, tmp16547, 68);
__m512 tmp16562 = _mm512_shuffle_ps(tmp16545, tmp16547, 238);
__m512 tmp16563 = _mm512_shuffle_ps(tmp16546, tmp16548, 68);
__m512 tmp16564 = _mm512_shuffle_ps(tmp16546, tmp16548, 238);
__m512 tmp16565 = _mm512_shuffle_f32x4(tmp16549, tmp16553, 136);
__m512 tmp16566 = _mm512_shuffle_f32x4(tmp16549, tmp16553, 221);
__m512 tmp16567 = _mm512_shuffle_f32x4(tmp16550, tmp16554, 136);
__m512 tmp16568 = _mm512_shuffle_f32x4(tmp16550, tmp16554, 221);
__m512 tmp16569 = _mm512_shuffle_f32x4(tmp16551, tmp16555, 136);
__m512 tmp16570 = _mm512_shuffle_f32x4(tmp16551, tmp16555, 221);
__m512 tmp16571 = _mm512_shuffle_f32x4(tmp16552, tmp16556, 136);
__m512 tmp16572 = _mm512_shuffle_f32x4(tmp16552, tmp16556, 221);
__m512 tmp16573 = _mm512_shuffle_f32x4(tmp16557, tmp16561, 136);
__m512 tmp16574 = _mm512_shuffle_f32x4(tmp16557, tmp16561, 221);
__m512 tmp16575 = _mm512_shuffle_f32x4(tmp16558, tmp16562, 136);
__m512 tmp16576 = _mm512_shuffle_f32x4(tmp16558, tmp16562, 221);
__m512 tmp16577 = _mm512_shuffle_f32x4(tmp16559, tmp16563, 136);
__m512 tmp16578 = _mm512_shuffle_f32x4(tmp16559, tmp16563, 221);
__m512 tmp16579 = _mm512_shuffle_f32x4(tmp16560, tmp16564, 136);
__m512 tmp16580 = _mm512_shuffle_f32x4(tmp16560, tmp16564, 221);
// Only ten of the rearranged vectors are kept; they are written back into the
// first-pass register names.
tmp16510 = _mm512_shuffle_f32x4(tmp16565, tmp16573, 136);
in2087 = _mm512_shuffle_f32x4(tmp16565, tmp16573, 221);
tmp16509 = _mm512_shuffle_f32x4(tmp16567, tmp16575, 136);
tmp16514 = _mm512_shuffle_f32x4(tmp16567, tmp16575, 221);
tmp16511 = _mm512_shuffle_f32x4(tmp16569, tmp16577, 136);
in2085 = _mm512_shuffle_f32x4(tmp16571, tmp16579, 136);
tmp16508 = _mm512_shuffle_f32x4(tmp16566, tmp16574, 136);
in2081 = _mm512_shuffle_f32x4(tmp16568, tmp16576, 136);
in2083 = _mm512_shuffle_f32x4(tmp16570, tmp16578, 136);
in2082 = _mm512_shuffle_f32x4(tmp16572, tmp16580, 136);
// tmp16510 is not read by the second pass; the cast marks it as deliberately unused.
(void)tmp16510;
// Second pass: the same linear combinations as the first pass, applied to the
// rearranged vectors.
__m512 tmp16520 = _mm512_add_ps(tmp16509, in2081);
__m512 tmp16525 = tmp16514;
__m512 tmp16521 = _mm512_sub_ps(tmp16508, tmp16511);
__m512 tmp16522 = _mm512_add_ps(tmp16511, in2083);
__m512 tmp16523 = _mm512_sub_ps(_mm512_setzero_ps(), in2083);
in2087 = in2087;
tmp16520 = _mm512_fmadd_ps(in2085, _mm512_set1_ps(-4.25e+00f), tmp16520);
tmp16525 = tmp16525;
tmp16522 = _mm512_fmadd_ps(tmp16508, _mm512_set1_ps(-4.25e+00f), tmp16522);
tmp16523 = _mm512_fmadd_ps(tmp16521, _mm512_set1_ps(5.25e+00f), tmp16523);
in2087 = in2087;
tmp16521 = _mm512_fmadd_ps(tmp16511, _mm512_set1_ps(2.5e-01f), in2083);
tmp16511 = _mm512_fmadd_ps(tmp16511, _mm512_set1_ps(4e+00f), in2083);
__m512 tmp16524 = _mm512_sub_ps(tmp16522, tmp16520);
__m512 tmp16526 = _mm512_sub_ps(_mm512_setzero_ps(), tmp16525);
tmp16522 = _mm512_add_ps(tmp16520, tmp16522);
__m512 tmp16527 = tmp16525;
tmp16520 = _mm512_fmadd_ps(tmp16509, _mm512_set1_ps(2.5e-01f), in2081);
tmp16525 = _mm512_mul_ps(tmp16514, _mm512_set1_ps(2.5e-01f));
tmp16521 = _mm512_fmadd_ps(tmp16508, _mm512_set1_ps(-1.25e+00f), tmp16521);
tmp16508 = _mm512_fmadd_ps(tmp16508, _mm512_set1_ps(-5e+00f), tmp16511);
tmp16520 = _mm512_fmadd_ps(in2085, _mm512_set1_ps(-1.25e+00f), tmp16520);
tmp16525 = tmp16525;
in2083 = _mm512_fmadd_ps(tmp16520, _mm512_set1_ps(2e+00f), tmp16521);
__m512 tmp16528 = _mm512_mul_ps(tmp16525, _mm512_set1_ps(2e+00f));
tmp16521 = _mm512_fnmadd_ps(tmp16520, _mm512_set1_ps(2e+00f), tmp16521);
__m512 tmp16529 = _mm512_fnmadd_ps(tmp16525, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
tmp16520 = _mm512_fmadd_ps(in2081, _mm512_set1_ps(2.5e-01f), tmp16509);
tmp16525 = tmp16514;
tmp16509 = _mm512_sub_ps(in2082, tmp16509);
tmp16514 = _mm512_sub_ps(_mm512_setzero_ps(), tmp16514);
tmp16520 = _mm512_fmadd_ps(in2085, _mm512_set1_ps(-1.25e+00f), tmp16520);
tmp16525 = tmp16525;
in2085 = _mm512_sub_ps(in2085, in2081);
in2085 = _mm512_fmadd_ps(in2085, _mm512_set1_ps(5.25e+00f), tmp16509);
__m512 tmp16530 = tmp16514;
tmp16511 = _mm512_fmadd_ps(tmp16520, _mm512_set1_ps(2e+00f), tmp16508);
__m512 tmp16531 = _mm512_mul_ps(tmp16525, _mm512_set1_ps(2e+00f));
tmp16508 = _mm512_fnmadd_ps(tmp16520, _mm512_set1_ps(2e+00f), tmp16508);
__m512 tmp16532 = _mm512_fnmadd_ps(tmp16525, _mm512_set1_ps(2e+00f), _mm512_setzero_ps());
// Pair up results by 256-bit halves: immediate 68 takes the low halves of both
// operands, 238 takes the high halves.
__m512 out1919 = _mm512_shuffle_f32x4(tmp16523, tmp16522, 68);
__m512 out1927 = _mm512_shuffle_f32x4(tmp16523, tmp16522, 238);
__m512 out1920 = _mm512_shuffle_f32x4(tmp16524, in2083, 68);
__m512 out1928 = _mm512_shuffle_f32x4(tmp16524, in2083, 238);
__m512 out1921 = _mm512_shuffle_f32x4(tmp16521, tmp16511, 68);
__m512 out1929 = _mm512_shuffle_f32x4(tmp16521, tmp16511, 238);
__m512 out1922 = _mm512_shuffle_f32x4(tmp16508, in2085, 68);
__m512 out1930 = _mm512_shuffle_f32x4(tmp16508, in2085, 238);
__m512 out1923 = _mm512_shuffle_f32x4(in2087, tmp16527, 68);
__m512 out1931 = _mm512_shuffle_f32x4(in2087, tmp16527, 238);
__m512 out1924 = _mm512_shuffle_f32x4(tmp16526, tmp16528, 68);
__m512 out1932 = _mm512_shuffle_f32x4(tmp16526, tmp16528, 238);
__m512 out1925 = _mm512_shuffle_f32x4(tmp16529, tmp16531, 68);
__m512 out1933 = _mm512_shuffle_f32x4(tmp16529, tmp16531, 238);
__m512 out1926 = _mm512_shuffle_f32x4(tmp16532, tmp16530, 68);
__m512 out1934 = _mm512_shuffle_f32x4(tmp16532, tmp16530, 238);
// Output layout: four 8192-byte planes (bases 0, 8192, 16384, 24576), each
// receiving four 64-byte vectors at +0, +64, +128, +192, with 256 bytes per
// k209 step and 32768 bytes per i97.
_mm512_storeu_ps(dfPtr21+0+32768*i97+12288*j88+8192*s98+256*k209, out1919);
_mm512_storeu_ps(dfPtr21+128+32768*i97+12288*j88+8192*s98+256*k209, out1927);
_mm512_storeu_ps(dfPtr21+64+32768*i97+12288*j88+8192*s98+256*k209, out1923);
_mm512_storeu_ps(dfPtr21+192+32768*i97+12288*j88+8192*s98+256*k209, out1931);
_mm512_storeu_ps(dfPtr21+8192+32768*i97+12288*j88+8192*s98+256*k209, out1920);
_mm512_storeu_ps(dfPtr21+8320+32768*i97+12288*j88+8192*s98+256*k209, out1928);
_mm512_storeu_ps(dfPtr21+8256+32768*i97+12288*j88+8192*s98+256*k209, out1924);
_mm512_storeu_ps(dfPtr21+8384+32768*i97+12288*j88+8192*s98+256*k209, out1932);
_mm512_storeu_ps(dfPtr21+16384+32768*i97+12288*j88+8192*s98+256*k209, out1921);
_mm512_storeu_ps(dfPtr21+16512+32768*i97+12288*j88+8192*s98+256*k209, out1929);
_mm512_storeu_ps(dfPtr21+16448+32768*i97+12288*j88+8192*s98+256*k209, out1925);
_mm512_storeu_ps(dfPtr21+16576+32768*i97+12288*j88+8192*s98+256*k209, out1933);
_mm512_storeu_ps(dfPtr21+24576+32768*i97+12288*j88+8192*s98+256*k209, out1922);
_mm512_storeu_ps(dfPtr21+24704+32768*i97+12288*j88+8192*s98+256*k209, out1930);
_mm512_storeu_ps(dfPtr21+24640+32768*i97+12288*j88+8192*s98+256*k209, out1926);
_mm512_storeu_ps(dfPtr21+24768+32768*i97+12288*j88+8192*s98+256*k209, out1934);
}
// j88 is re-declared at the top of the next i97 iteration, so this increment
// has no observable effect.
++j88;
}
}

// Packages ResNeXt50ThreeArrangeDats5Callee1 as a 4-dimensional task with
// hull extents 1 x 1 x 8 x 1 and runs it on the given team through
// ResNeXt50ThreaderDo1. tensors161 is forwarded unchanged as the task's
// opaque argument (any1).
static void ResNeXt50ThreeArrangeDats5(ResNeXt50ThreaderTeam1* team93, char** tensors161) {
    static const int extents[4] = {1, 1, 8, 1};
    ResNeXt50ThreaderTask1 job;
    int axis;
    job.any1 = tensors161;
    job.callee1 = ResNeXt50ThreeArrangeDats5Callee1;
    job.nd1 = 4;
    // Copy the per-axis extents into the task's hull.
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNeXt50ThreaderDo1(team93, &job);
}

// Task callee that accumulates products of arranged weights and arranged data.
//
// task168->any1 is a two-element pointer array; only element [0] (an array of
// byte pointers) is read here. From that array:
//   [0] holds a 4096-byte bias region followed by half-precision weights,
//   [1] holds the arranged data (read with the same 32768/8192/256 strides
//       that the arrange-data stage uses when writing),
//   [2] receives the sums.
// pt89[3] selects i98 and pt89[2] selects the pair j89 = 2*f72, 2*f72+1.
// For each j89 and each l90 in 0..7 the function runs 32 FMA steps into
// sixteen accumulators (4 weight vectors x 4 data vectors) and stores them.
// NOTE(review): presumably the element-wise multiply-accumulate stage of a
// Winograd-style convolution — confirm against the generator.
static void ResNeXt50ThreeProduceSums5Callee1(ResNeXt50ThreaderTask1* task168, int64_t* pt89) {
void** pair42 = task168->any1;
char** tensors164 = pair42[0];
// e52, d34 and w84 are fixed at zero in this specialization.
ptrdiff_t e52 = 0;
ptrdiff_t g53 = pt89[3];
ptrdiff_t f72 = pt89[2];
ptrdiff_t d34 = 0;
ptrdiff_t w84 = 0;
// Bias and weights share tensors164[0]; the weights begin 4096 bytes in.
char*restrict bfPtr22 = tensors164[0]+4096*e52;
char*restrict wfPtr22 = tensors164[0]+4096+65011712*e52;
char*restrict dfPtr22 = tensors164[1]+16252928*e52;
char*restrict sfPtr21 = tensors164[2];
ptrdiff_t i98 = 1*g53;
ptrdiff_t j89 = 2*f72;
ptrdiff_t jj72 = j89+1;
for (; j89 <= jj72; ++j89) {
ptrdiff_t k210 = 1*d34;
ptrdiff_t l90 = 8*w84;
for (; l90 != 8; ++l90) {
__m512 sum877;
__m512 sum881;
__m512 sum885;
__m512 sum889;
// Only the j89 == 0 slice is seeded with bias: mask 512 (bit 9) places one
// scalar float in lane 9 and leaves every other lane zero. Each of the four
// seeds reads a different float (byte offsets 0, 4, 8, 12).
if (__builtin_expect(!j89, 0)) {
sum877 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr22+0+128*i98+16*l90)));
sum881 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr22+4+128*i98+16*l90)));
sum885 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr22+8+128*i98+16*l90)));
sum889 = _mm512_mask_mov_ps(_mm512_setzero_ps(), 512, _mm512_set1_ps(*(float*)(bfPtr22+12+128*i98+16*l90)));
} else {
sum877 = _mm512_setzero_ps();
sum881 = _mm512_setzero_ps();
sum885 = _mm512_setzero_ps();
sum889 = _mm512_setzero_ps();
}
// Each seed is replicated into the three other accumulators that share its
// weight vector (one accumulator per data vector df965..df968).
__m512 sum878 = sum877;
__m512 sum879 = sum877;
__m512 sum880 = sum877;
__m512 sum882 = sum881;
__m512 sum883 = sum881;
__m512 sum884 = sum881;
__m512 sum886 = sum885;
__m512 sum887 = sum885;
__m512 sum888 = sum885;
__m512 sum890 = sum889;
__m512 sum891 = sum889;
__m512 sum892 = sum889;
ptrdiff_t b93 = 0;
for (; b93 != 32; ++b93) {
// Mask 65535 enables all 16 lanes, so this is a full 64-byte load. The low
// 256 bits convert to wf241 and the high 256 bits to wf242 (16 half-precision
// values each, widened to float).
__m512i wfs81 = _mm512_maskz_loadu_epi32(65535, wfPtr22+0+131072*i98+32768*j89+4096*l90+128*b93);
__m512 wf241 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs81));
// Four consecutive 64-byte data vectors, reused against all four weight vectors.
__m512 df965 = _mm512_loadu_ps(dfPtr22+0+32768*i98+8192*j89+12288*k210+256*b93);
sum877 = _mm512_fmadd_ps(wf241, df965, sum877);
__m512 df966 = _mm512_loadu_ps(dfPtr22+64+32768*i98+8192*j89+12288*k210+256*b93);
sum878 = _mm512_fmadd_ps(wf241, df966, sum878);
__m512 df967 = _mm512_loadu_ps(dfPtr22+128+32768*i98+8192*j89+12288*k210+256*b93);
sum879 = _mm512_fmadd_ps(wf241, df967, sum879);
__m512 df968 = _mm512_loadu_ps(dfPtr22+192+32768*i98+8192*j89+12288*k210+256*b93);
sum880 = _mm512_fmadd_ps(wf241, df968, sum880);
__m512 wf242 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs81, 1));
sum881 = _mm512_fmadd_ps(wf242, df965, sum881);
sum882 = _mm512_fmadd_ps(wf242, df966, sum882);
sum883 = _mm512_fmadd_ps(wf242, df967, sum883);
sum884 = _mm512_fmadd_ps(wf242, df968, sum884);
// Second 64-byte weight load (+64) supplies wf243 and wf244 in the same way.
__m512i wfs82 = _mm512_maskz_loadu_epi32(65535, wfPtr22+64+131072*i98+32768*j89+4096*l90+128*b93);
__m512 wf243 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfs82));
sum885 = _mm512_fmadd_ps(wf243, df965, sum885);
sum886 = _mm512_fmadd_ps(wf243, df966, sum886);
sum887 = _mm512_fmadd_ps(wf243, df967, sum887);
sum888 = _mm512_fmadd_ps(wf243, df968, sum888);
__m512 wf244 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfs82, 1));
sum889 = _mm512_fmadd_ps(wf244, df965, sum889);
sum890 = _mm512_fmadd_ps(wf244, df966, sum890);
sum891 = _mm512_fmadd_ps(wf244, df967, sum891);
sum892 = _mm512_fmadd_ps(wf244, df968, sum892);
}
// The sixteen accumulators are stored contiguously (64 bytes apart), filling
// one 1024-byte slot per l90.
_mm512_storeu_ps(sfPtr21+0+32768*i98+8192*j89+12288*k210+1024*l90, sum877);
_mm512_storeu_ps(sfPtr21+64+32768*i98+8192*j89+12288*k210+1024*l90, sum878);
_mm512_storeu_ps(sfPtr21+128+32768*i98+8192*j89+12288*k210+1024*l90, sum879);
_mm512_storeu_ps(sfPtr21+192+32768*i98+8192*j89+12288*k210+1024*l90, sum880);
_mm512_storeu_ps(sfPtr21+256+32768*i98+8192*j89+12288*k210+1024*l90, sum881);
_mm512_storeu_ps(sfPtr21+320+32768*i98+8192*j89+12288*k210+1024*l90, sum882);
_mm512_storeu_ps(sfPtr21+384+32768*i98+8192*j89+12288*k210+1024*l90, sum883);
_mm512_storeu_ps(sfPtr21+448+32768*i98+8192*j89+12288*k210+1024*l90, sum884);
_mm512_storeu_ps(sfPtr21+512+32768*i98+8192*j89+12288*k210+1024*l90, sum885);
_mm512_storeu_ps(sfPtr21+576+32768*i98+8192*j89+12288*k210+1024*l90, sum886);
_mm512_storeu_ps(sfPtr21+640+32768*i98+8192*j89+12288*k210+1024*l90, sum887);
_mm512_storeu_ps(sfPtr21+704+32768*i98+8192*j89+12288*k210+1024*l90, sum888);
_mm512_storeu_ps(sfPtr21+768+32768*i98+8192*j89+12288*k210+1024*l90, sum889);
_mm512_storeu_ps(sfPtr21+832+32768*i98+8192*j89+12288*k210+1024*l90, sum890);
_mm512_storeu_ps(sfPtr21+896+32768*i98+8192*j89+12288*k210+1024*l90, sum891);
_mm512_storeu_ps(sfPtr21+960+32768*i98+8192*j89+12288*k210+1024*l90, sum892);
}
}
}

// Packages ResNeXt50ThreeProduceSums5Callee1 as a 4-dimensional task with
// hull extents 1 x 1 x 2 x 32 and runs it on the given team through
// ResNeXt50ThreaderDo1. The task's opaque argument (any1) is a two-slot
// pointer array: slot 0 carries tensors163, slot 1 is null.
static void ResNeXt50ThreeProduceSums5(ResNeXt50ThreaderTeam1* team94, char** tensors163) {
    static const int extents[4] = {1, 1, 2, 32};
    void* args[2];
    ResNeXt50ThreaderTask1 job;
    int axis;
    args[0] = tensors163;
    args[1] = 0;
    job.any1 = args;
    job.callee1 = ResNeXt50ThreeProduceSums5Callee1;
    job.nd1 = 4;
    // Copy the per-axis extents into the task's hull.
    for (axis = 0; axis < 4; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNeXt50ThreaderDo1(team94, &job);
}

// Task callee that turns accumulated sums back into output rows.
//
// task170->any1 is an array of byte pointers: [0] is the sums buffer (read),
// [1] is the destination (written). pt90[2] selects a block g54; the function
// handles i99 = 4*g54 .. 4*g54+3. For every i99, k211 in 0..7 and l91 in 0..3 it:
//   1. loads sixteen 64-byte vectors and regroups their 256-bit halves,
//   2. applies a fixed linear combination (coefficients 2, 4, 8, 16, 32),
//   3. rearranges lanes with an unpack/shuffle network,
//   4. applies the same linear combination again,
//   5. clamps negatives to zero with a max against zero,
//   6. stores seven masked 7-float rows at a 28-byte row pitch.
// NOTE(review): presumably a Winograd-style output transform with a fused
// ReLU — confirm against the generator.
static void ResNeXt50ThreeConsumeSums5Callee1(ResNeXt50ThreaderTask1* task170, int64_t* pt90) {
char** tensors166 = task170->any1;
// w85 and d35 are fixed at zero in this specialization.
ptrdiff_t w85 = 0;
ptrdiff_t d35 = 0;
ptrdiff_t g54 = pt90[2];
char*restrict sfPtr22 = tensors166[0];
char*restrict datPtr55 = tensors166[1];
ptrdiff_t i99 = 4*g54;
ptrdiff_t ii74 = i99+3;
for (; i99 <= ii74; ++i99) {
ptrdiff_t j90 = 1*d35;
// rel30 is not read anywhere in this function.
ptrdiff_t rel30 = j90-0;
ptrdiff_t base30 = 0;
ptrdiff_t toH50 = base30+0;
ptrdiff_t toW50 = 0;
ptrdiff_t k211 = 8*w85;
for (; k211 != 8; ++k211) {
ptrdiff_t l91 = 0;
for (; l91 != 4; ++l91) {
// Loads come in pairs 128 bytes apart from four 8192-byte planes (bases 0,
// 8192, 16384, 24576). shuffle_f32x4 with 68 joins the low 256-bit halves of
// a pair; 238 joins the high halves.
__m512 sf1105 = _mm512_loadu_ps(sfPtr22+0+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1106 = _mm512_loadu_ps(sfPtr22+128+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2089 = _mm512_shuffle_f32x4(sf1105, sf1106, 68);
__m512 in2090 = _mm512_shuffle_f32x4(sf1105, sf1106, 238);
__m512 sf1107 = _mm512_loadu_ps(sfPtr22+64+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1108 = _mm512_loadu_ps(sfPtr22+192+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2097 = _mm512_shuffle_f32x4(sf1107, sf1108, 68);
__m512 in2098 = _mm512_shuffle_f32x4(sf1107, sf1108, 238);
__m512 sf1109 = _mm512_loadu_ps(sfPtr22+8192+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1110 = _mm512_loadu_ps(sfPtr22+8320+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2091 = _mm512_shuffle_f32x4(sf1109, sf1110, 68);
__m512 in2092 = _mm512_shuffle_f32x4(sf1109, sf1110, 238);
__m512 sf1111 = _mm512_loadu_ps(sfPtr22+8256+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1112 = _mm512_loadu_ps(sfPtr22+8384+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2099 = _mm512_shuffle_f32x4(sf1111, sf1112, 68);
__m512 in2100 = _mm512_shuffle_f32x4(sf1111, sf1112, 238);
__m512 sf1113 = _mm512_loadu_ps(sfPtr22+16384+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1114 = _mm512_loadu_ps(sfPtr22+16512+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2093 = _mm512_shuffle_f32x4(sf1113, sf1114, 68);
__m512 in2094 = _mm512_shuffle_f32x4(sf1113, sf1114, 238);
__m512 sf1115 = _mm512_loadu_ps(sfPtr22+16448+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1116 = _mm512_loadu_ps(sfPtr22+16576+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2101 = _mm512_shuffle_f32x4(sf1115, sf1116, 68);
__m512 in2102 = _mm512_shuffle_f32x4(sf1115, sf1116, 238);
__m512 sf1117 = _mm512_loadu_ps(sfPtr22+24576+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1118 = _mm512_loadu_ps(sfPtr22+24704+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2095 = _mm512_shuffle_f32x4(sf1117, sf1118, 68);
__m512 in2096 = _mm512_shuffle_f32x4(sf1117, sf1118, 238);
__m512 sf1119 = _mm512_loadu_ps(sfPtr22+24640+32768*i99+12288*j90+1024*k211+256*l91);
__m512 sf1120 = _mm512_loadu_ps(sfPtr22+24768+32768*i99+12288*j90+1024*k211+256*l91);
__m512 in2103 = _mm512_shuffle_f32x4(sf1119, sf1120, 68);
__m512 in2104 = _mm512_shuffle_f32x4(sf1119, sf1120, 238);
// in2104 is not used by the arithmetic below; the cast marks it as deliberately unused.
(void)in2104;
// First pass: sums and differences of neighbouring inputs, combined with
// powers of two. in2089..in2096 produce six results (tmp16588, tmp16594,
// tmp16599, tmp16601, tmp16603, tmp16605); in2097..in2103 produce one (tmp16608).
__m512 tmp16592 = _mm512_add_ps(in2090, in2091);
__m512 tmp16612 = _mm512_add_ps(in2098, in2099);
__m512 tmp16591 = _mm512_add_ps(in2092, in2093);
__m512 tmp16611 = _mm512_add_ps(in2100, in2101);
__m512 tmp16597 = _mm512_sub_ps(in2092, in2093);
__m512 tmp16596 = _mm512_sub_ps(in2090, in2091);
__m512 tmp16593 = _mm512_add_ps(in2094, in2095);
__m512 tmp16613 = _mm512_add_ps(in2102, in2103);
__m512 tmp16598 = _mm512_sub_ps(in2094, in2095);
__m512 tmp16595 = _mm512_fmadd_ps(tmp16597, _mm512_set1_ps(2e+00f), tmp16596);
__m512 tmp16602 = _mm512_fmadd_ps(tmp16597, _mm512_set1_ps(8e+00f), tmp16596);
__m512 tmp16590 = _mm512_add_ps(tmp16591, tmp16592);
__m512 tmp16610 = _mm512_add_ps(tmp16611, tmp16612);
__m512 tmp16594 = _mm512_fmadd_ps(tmp16598, _mm512_set1_ps(1.6e+01f), tmp16595);
__m512 tmp16601 = _mm512_fmadd_ps(tmp16598, _mm512_set1_ps(4e+00f), tmp16602);
__m512 tmp16607 = _mm512_add_ps(tmp16598, tmp16596);
__m512 tmp16600 = _mm512_fmadd_ps(tmp16591, _mm512_set1_ps(4e+00f), tmp16592);
__m512 tmp16604 = _mm512_fmadd_ps(tmp16591, _mm512_set1_ps(1.6e+01f), tmp16592);
__m512 tmp16589 = _mm512_add_ps(tmp16590, in2089);
__m512 tmp16609 = _mm512_add_ps(tmp16610, in2097);
__m512 tmp16606 = _mm512_add_ps(tmp16607, in2096);
__m512 tmp16588 = _mm512_fmadd_ps(tmp16593, _mm512_set1_ps(3.2e+01f), tmp16589);
__m512 tmp16608 = _mm512_fmadd_ps(tmp16613, _mm512_set1_ps(3.2e+01f), tmp16609);
__m512 tmp16599 = _mm512_fmadd_ps(tmp16593, _mm512_set1_ps(8e+00f), tmp16600);
__m512 tmp16605 = _mm512_fmadd_ps(tmp16597, _mm512_set1_ps(3.2e+01f), tmp16606);
__m512 tmp16603 = _mm512_fmadd_ps(tmp16593, _mm512_set1_ps(2e+00f), tmp16604);
// Collect the seven first-pass results in a fixed order for the rearrangement.
__m512 tmp16581 = tmp16588;
__m512 tmp16587 = tmp16608;
__m512 tmp16582 = tmp16594;
__m512 tmp16583 = tmp16599;
__m512 tmp16584 = tmp16601;
__m512 tmp16585 = tmp16603;
__m512 tmp16586 = tmp16605;
// Lane rearrangement: unpacklo/unpackhi, then shuffle_ps, then two rounds of
// shuffle_f32x4. Only seven vectors feed it, so tmp16587 is paired with itself.
// NOTE(review): this has the shape of a transpose — confirm.
__m512 tmp16649 = _mm512_unpacklo_ps(tmp16581, tmp16582);
__m512 tmp16650 = _mm512_unpackhi_ps(tmp16581, tmp16582);
__m512 tmp16651 = _mm512_unpacklo_ps(tmp16583, tmp16584);
__m512 tmp16652 = _mm512_unpackhi_ps(tmp16583, tmp16584);
__m512 tmp16653 = _mm512_unpacklo_ps(tmp16585, tmp16586);
__m512 tmp16654 = _mm512_unpackhi_ps(tmp16585, tmp16586);
__m512 tmp16655 = _mm512_unpacklo_ps(tmp16587, tmp16587);
__m512 tmp16656 = _mm512_unpackhi_ps(tmp16587, tmp16587);
__m512 tmp16657 = _mm512_shuffle_ps(tmp16649, tmp16651, 68);
__m512 tmp16658 = _mm512_shuffle_ps(tmp16649, tmp16651, 238);
__m512 tmp16659 = _mm512_shuffle_ps(tmp16650, tmp16652, 68);
__m512 tmp16660 = _mm512_shuffle_ps(tmp16650, tmp16652, 238);
__m512 tmp16661 = _mm512_shuffle_ps(tmp16653, tmp16655, 68);
__m512 tmp16662 = _mm512_shuffle_ps(tmp16653, tmp16655, 238);
__m512 tmp16663 = _mm512_shuffle_ps(tmp16654, tmp16656, 68);
__m512 tmp16664 = _mm512_shuffle_ps(tmp16654, tmp16656, 238);
__m512 tmp16665 = _mm512_shuffle_f32x4(tmp16657, tmp16661, 136);
__m512 tmp16666 = _mm512_shuffle_f32x4(tmp16657, tmp16661, 221);
__m512 tmp16667 = _mm512_shuffle_f32x4(tmp16658, tmp16662, 136);
__m512 tmp16668 = _mm512_shuffle_f32x4(tmp16658, tmp16662, 221);
__m512 tmp16669 = _mm512_shuffle_f32x4(tmp16659, tmp16663, 136);
__m512 tmp16670 = _mm512_shuffle_f32x4(tmp16659, tmp16663, 221);
__m512 tmp16671 = _mm512_shuffle_f32x4(tmp16660, tmp16664, 136);
__m512 tmp16672 = _mm512_shuffle_f32x4(tmp16660, tmp16664, 221);
// The final round shuffles each vector with itself: 136 selects 128-bit
// blocks 0 and 2, 221 selects blocks 1 and 3.
tmp16581 = _mm512_shuffle_f32x4(tmp16665, tmp16665, 136);
__m512 tmp16615 = _mm512_shuffle_f32x4(tmp16665, tmp16665, 221);
tmp16582 = _mm512_shuffle_f32x4(tmp16667, tmp16667, 136);
__m512 tmp16616 = _mm512_shuffle_f32x4(tmp16667, tmp16667, 221);
tmp16583 = _mm512_shuffle_f32x4(tmp16669, tmp16669, 136);
__m512 tmp16617 = _mm512_shuffle_f32x4(tmp16669, tmp16669, 221);
tmp16584 = _mm512_shuffle_f32x4(tmp16671, tmp16671, 136);
__m512 tmp16618 = _mm512_shuffle_f32x4(tmp16671, tmp16671, 221);
tmp16585 = _mm512_shuffle_f32x4(tmp16666, tmp16666, 136);
__m512 tmp16619 = _mm512_shuffle_f32x4(tmp16666, tmp16666, 221);
tmp16586 = _mm512_shuffle_f32x4(tmp16668, tmp16668, 136);
__m512 tmp16620 = _mm512_shuffle_f32x4(tmp16668, tmp16668, 221);
tmp16587 = _mm512_shuffle_f32x4(tmp16670, tmp16670, 136);
__m512 tmp16621 = _mm512_shuffle_f32x4(tmp16670, tmp16670, 221);
__m512 tmp16614 = _mm512_shuffle_f32x4(tmp16672, tmp16672, 136);
__m512 tmp16622 = _mm512_shuffle_f32x4(tmp16672, tmp16672, 221);
// tmp16622 is not used by the second pass; the cast marks it as deliberately unused.
(void)tmp16622;
// Second pass: the same linear combinations as the first pass, applied to the
// rearranged vectors.
__m512 tmp16627 = _mm512_add_ps(tmp16582, tmp16583);
__m512 tmp16647 = _mm512_add_ps(tmp16616, tmp16617);
__m512 tmp16626 = _mm512_add_ps(tmp16584, tmp16585);
__m512 tmp16646 = _mm512_add_ps(tmp16618, tmp16619);
__m512 tmp16632 = _mm512_sub_ps(tmp16584, tmp16585);
__m512 tmp16631 = _mm512_sub_ps(tmp16582, tmp16583);
__m512 tmp16628 = _mm512_add_ps(tmp16586, tmp16587);
__m512 tmp16648 = _mm512_add_ps(tmp16620, tmp16621);
__m512 tmp16633 = _mm512_sub_ps(tmp16586, tmp16587);
__m512 tmp16630 = _mm512_fmadd_ps(tmp16632, _mm512_set1_ps(2e+00f), tmp16631);
__m512 tmp16637 = _mm512_fmadd_ps(tmp16632, _mm512_set1_ps(8e+00f), tmp16631);
__m512 tmp16625 = _mm512_add_ps(tmp16626, tmp16627);
__m512 tmp16645 = _mm512_add_ps(tmp16646, tmp16647);
__m512 tmp16629 = _mm512_fmadd_ps(tmp16633, _mm512_set1_ps(1.6e+01f), tmp16630);
__m512 tmp16636 = _mm512_fmadd_ps(tmp16633, _mm512_set1_ps(4e+00f), tmp16637);
__m512 tmp16642 = _mm512_add_ps(tmp16633, tmp16631);
__m512 tmp16635 = _mm512_fmadd_ps(tmp16626, _mm512_set1_ps(4e+00f), tmp16627);
__m512 tmp16639 = _mm512_fmadd_ps(tmp16626, _mm512_set1_ps(1.6e+01f), tmp16627);
__m512 tmp16624 = _mm512_add_ps(tmp16625, tmp16581);
__m512 tmp16644 = _mm512_add_ps(tmp16645, tmp16615);
__m512 tmp16641 = _mm512_add_ps(tmp16642, tmp16614);
__m512 tmp16623 = _mm512_fmadd_ps(tmp16628, _mm512_set1_ps(3.2e+01f), tmp16624);
__m512 tmp16643 = _mm512_fmadd_ps(tmp16648, _mm512_set1_ps(3.2e+01f), tmp16644);
__m512 tmp16634 = _mm512_fmadd_ps(tmp16628, _mm512_set1_ps(8e+00f), tmp16635);
__m512 tmp16640 = _mm512_fmadd_ps(tmp16632, _mm512_set1_ps(3.2e+01f), tmp16641);
__m512 tmp16638 = _mm512_fmadd_ps(tmp16628, _mm512_set1_ps(2e+00f), tmp16639);
__m512 out1935 = tmp16623;
__m512 out1941 = tmp16643;
__m512 out1936 = tmp16629;
__m512 out1937 = tmp16634;
__m512 out1938 = tmp16636;
__m512 out1939 = tmp16638;
__m512 out1940 = tmp16640;
// Clamp every output at zero (max against a zero vector).
out1935 = _mm512_max_ps(_mm512_setzero_ps(), out1935);
out1941 = _mm512_max_ps(_mm512_setzero_ps(), out1941);
out1936 = _mm512_max_ps(_mm512_setzero_ps(), out1936);
out1937 = _mm512_max_ps(_mm512_setzero_ps(), out1937);
out1938 = _mm512_max_ps(_mm512_setzero_ps(), out1938);
out1939 = _mm512_max_ps(_mm512_setzero_ps(), out1939);
out1940 = _mm512_max_ps(_mm512_setzero_ps(), out1940);
// Seven rows at byte offsets 0, 28, ..., 168; mask 127 writes only lanes 0..6
// (7 floats = 28 bytes), so adjacent rows do not overlap. out1941 is the row
// at offset 168.
_mm512_mask_storeu_ps(datPtr55+0+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1935);
_mm512_mask_storeu_ps(datPtr55+168+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1941);
_mm512_mask_storeu_ps(datPtr55+28+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1936);
_mm512_mask_storeu_ps(datPtr55+56+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1937);
_mm512_mask_storeu_ps(datPtr55+84+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1938);
_mm512_mask_storeu_ps(datPtr55+112+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1939);
_mm512_mask_storeu_ps(datPtr55+140+10240*i99+28*toH50+4*toW50+1280*k211+320*l91, 127, out1940);
}
}
// j90 is re-declared at the top of the next i99 iteration, so this increment
// has no observable effect.
++j90;
}
}

// Packages ResNeXt50ThreeConsumeSums5Callee1 as a 3-dimensional task with
// hull extents 1 x 1 x 8 and runs it on the given team through
// ResNeXt50ThreaderDo1. tensors165 is forwarded unchanged as the task's
// opaque argument (any1).
static void ResNeXt50ThreeConsumeSums5(ResNeXt50ThreaderTeam1* team95, char** tensors165) {
    static const int extents[3] = {1, 1, 8};
    ResNeXt50ThreaderTask1 job;
    int axis;
    job.any1 = tensors165;
    job.callee1 = ResNeXt50ThreeConsumeSums5Callee1;
    job.nd1 = 3;
    // Copy the per-axis extents into the task's hull.
    for (axis = 0; axis < 3; ++axis) {
        job.hull1[axis] = extents[axis];
    }
    ResNeXt50ThreaderDo1(team95, &job);
}

// Task callee that rewrites one convolution's weights and biases into the
// packed layout stored in tensors2[3].
// NOTE(review): the fft*/wf* naming suggests the weights are converted to a
// frequency-domain form; this is inferred from names only -- confirm
// against the generator.
//
// task4->any1 is a char* array used as follows:
//   [0] source weights, read as rows of 7 floats (lane mask 127)
//   [1] source biases, two floats read per j1 iteration
//   [2] float pairs; element [0] of a pair scales the weights, and the pair
//       is applied to the bias as (multiplier, addend)
//   [3] destination: a 256-byte bias area (32 iterations x 8 bytes) followed
//       by the transformed weights stored as half-precision values
// pt7 (the task coordinate) is unused: b2, g2 and e1 are fixed at zero, so a
// single invocation covers all 32 values of j1.
static void ResNeXt50StriderArrangeFilts1Callee1(ResNeXt50ThreaderTask1* task4, int64_t* pt7) {
char** tensors2 = task4->any1;
ptrdiff_t b2 = 0;
ptrdiff_t g2 = 0;
ptrdiff_t e1 = 0;
(void)pt7;
char*restrict bfPtr1 = tensors2[3]+256*e1;
char*restrict wfPtr1 = tensors2[3]+256+12976128*e1;
char*restrict wtPtr1 = tensors2[0]+77616*e1;
char*restrict biasPtr1 = tensors2[1];
char*restrict bnPtr1 = tensors2[2];
ptrdiff_t i5 = 1*g2;
ptrdiff_t j1 = 32*b2;
if (j1 < 32) {
// Each j1 handles two consecutive entries, 2*j1 and 2*j1+1
// (presumably two output channels -- TODO confirm).
for (; j1 != 32; ++j1) {
// Broadcast the first float of each entry's pair in bnPtr1; used below as
// the weight multiplier.
__m512 postMul1 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(0+64*i5+2*j1))[0]);
__m512 postMul2 = _mm512_set1_ps(((float*)bnPtr1+(ptrdiff_t)2*(1+64*i5+2*j1))[0]);
// k1 advances the source by 196 bytes (49 floats) per step
// (presumably one 7x7 filter plane per input channel -- TODO confirm).
for (ptrdiff_t k1 = 0; k1 < 3; ++k1) {
// ---- Entry 2*j1 ----
// Load seven rows 28 bytes apart; mask 127 keeps lanes 0..6 and zeroes
// lanes 7..15.
__m512 wt1 = _mm512_maskz_loadu_ps(127, wtPtr1+0+37632*i5+1176*j1+196*k1);
__m512 wt2 = _mm512_maskz_loadu_ps(127, wtPtr1+28+37632*i5+1176*j1+196*k1);
__m512 wt3 = _mm512_maskz_loadu_ps(127, wtPtr1+56+37632*i5+1176*j1+196*k1);
__m512 wt4 = _mm512_maskz_loadu_ps(127, wtPtr1+84+37632*i5+1176*j1+196*k1);
__m512 wt5 = _mm512_maskz_loadu_ps(127, wtPtr1+112+37632*i5+1176*j1+196*k1);
__m512 wt6 = _mm512_maskz_loadu_ps(127, wtPtr1+140+37632*i5+1176*j1+196*k1);
__m512 wt7 = _mm512_maskz_loadu_ps(127, wtPtr1+168+37632*i5+1176*j1+196*k1);
// Scale every row by this entry's multiplier.
wt1 = _mm512_mul_ps(postMul1, wt1);
wt2 = _mm512_mul_ps(postMul1, wt2);
wt3 = _mm512_mul_ps(postMul1, wt3);
wt4 = _mm512_mul_ps(postMul1, wt4);
wt5 = _mm512_mul_ps(postMul1, wt5);
wt6 = _mm512_mul_ps(postMul1, wt6);
wt7 = _mm512_mul_ps(postMul1, wt7);
// Transform stage (fft1..fft168). Two interleaved chains run in parallel:
// one fed by wt1/wt3/wt5/wt7, the other by wt2/wt4/wt6 plus an all-zero
// vector. NOTE(review): looks like a real-input FFT of the zero-padded
// block -- confirm with the generator.
// First butterflies across rows; every partner operand here is zero.
__m512 fft1 = _mm512_add_ps(wt1, _mm512_setzero_ps());
__m512 fft89 = _mm512_add_ps(wt2, _mm512_setzero_ps());
__m512 fft2 = _mm512_sub_ps(wt1, _mm512_setzero_ps());
__m512 fft90 = _mm512_sub_ps(wt2, _mm512_setzero_ps());
__m512 fft3 = _mm512_add_ps(wt3, _mm512_setzero_ps());
__m512 fft91 = _mm512_add_ps(wt4, _mm512_setzero_ps());
__m512 fft4 = _mm512_sub_ps(wt3, _mm512_setzero_ps());
__m512 fft92 = _mm512_sub_ps(wt4, _mm512_setzero_ps());
__m512 fft5 = _mm512_add_ps(wt5, _mm512_setzero_ps());
__m512 fft93 = _mm512_add_ps(wt6, _mm512_setzero_ps());
__m512 fft6 = _mm512_sub_ps(wt5, _mm512_setzero_ps());
__m512 fft94 = _mm512_sub_ps(wt6, _mm512_setzero_ps());
__m512 fft7 = _mm512_add_ps(wt7, _mm512_setzero_ps());
__m512 fft95 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft8 = _mm512_sub_ps(wt7, _mm512_setzero_ps());
__m512 fft96 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9 = _mm512_add_ps(fft1, fft5);
__m512 fft97 = _mm512_add_ps(fft89, fft93);
__m512 fft10 = _mm512_sub_ps(fft1, fft5);
__m512 fft98 = _mm512_sub_ps(fft89, fft93);
__m512 fft11 = _mm512_add_ps(fft3, fft7);
__m512 fft99 = _mm512_add_ps(fft91, fft95);
__m512 fft12 = _mm512_sub_ps(fft7, fft3);
__m512 fft100 = _mm512_sub_ps(fft95, fft91);
__m512 fft13 = _mm512_sub_ps(fft4, fft8);
__m512 fft101 = _mm512_sub_ps(fft92, fft96);
__m512 fft14 = _mm512_add_ps(fft4, fft8);
__m512 fft102 = _mm512_add_ps(fft92, fft96);
__m512 fft15 = _mm512_add_ps(fft9, fft11);
__m512 fft103 = _mm512_add_ps(fft97, fft99);
__m512 fft16 = _mm512_sub_ps(fft9, fft11);
__m512 fft104 = _mm512_sub_ps(fft97, fft99);
// 7.0710677e-01f is sqrt(2)/2 rounded to float.
__m512 fft17 = _mm512_fmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft105 = _mm512_fmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft18 = _mm512_fnmsub_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft106 = _mm512_fnmsub_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
__m512 fft19 = _mm512_fnmadd_ps(fft13, _mm512_set1_ps(7.0710677e-01f), fft2);
__m512 fft107 = _mm512_fnmadd_ps(fft101, _mm512_set1_ps(7.0710677e-01f), fft90);
__m512 fft20 = _mm512_fnmadd_ps(fft14, _mm512_set1_ps(7.0710677e-01f), fft6);
__m512 fft108 = _mm512_fnmadd_ps(fft102, _mm512_set1_ps(7.0710677e-01f), fft94);
// Within-register butterfly: shuffle immediate 78 swaps the two 256-bit
// halves; fft21 is +1 in lanes 0..7 and -1 in lanes 8..15.
__m512 fft21 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft22 = _mm512_fmadd_ps(fft15, fft21, _mm512_shuffle_f32x4(fft15, fft15, 78));
__m512 fft109 = _mm512_fmadd_ps(fft103, fft21, _mm512_shuffle_f32x4(fft103, fft103, 78));
__m512 fft23 = _mm512_fmadd_ps(fft16, fft21, _mm512_shuffle_f32x4(fft16, fft16, 78));
__m512 fft110 = _mm512_fmadd_ps(fft104, fft21, _mm512_shuffle_f32x4(fft104, fft104, 78));
__m512 fft24 = _mm512_fmadd_ps(fft17, fft21, _mm512_shuffle_f32x4(fft17, fft17, 78));
__m512 fft111 = _mm512_fmadd_ps(fft105, fft21, _mm512_shuffle_f32x4(fft105, fft105, 78));
__m512 fft25 = _mm512_fmadd_ps(fft18, fft21, _mm512_shuffle_f32x4(fft18, fft18, 78));
__m512 fft112 = _mm512_fmadd_ps(fft106, fft21, _mm512_shuffle_f32x4(fft106, fft106, 78));
__m512 fft26 = _mm512_fmadd_ps(fft10, fft21, _mm512_shuffle_f32x4(fft10, fft10, 78));
__m512 fft113 = _mm512_fmadd_ps(fft98, fft21, _mm512_shuffle_f32x4(fft98, fft98, 78));
__m512 fft27 = _mm512_fmadd_ps(fft12, fft21, _mm512_shuffle_f32x4(fft12, fft12, 78));
__m512 fft114 = _mm512_fmadd_ps(fft100, fft21, _mm512_shuffle_f32x4(fft100, fft100, 78));
__m512 fft28 = _mm512_fmadd_ps(fft19, fft21, _mm512_shuffle_f32x4(fft19, fft19, 78));
__m512 fft115 = _mm512_fmadd_ps(fft107, fft21, _mm512_shuffle_f32x4(fft107, fft107, 78));
__m512 fft29 = _mm512_fmadd_ps(fft20, fft21, _mm512_shuffle_f32x4(fft20, fft20, 78));
__m512 fft116 = _mm512_fmadd_ps(fft108, fft21, _mm512_shuffle_f32x4(fft108, fft108, 78));
// Per-lane factors applied to the upper lanes (lanes 0..7 are multiplied
// by 1 in fft30 and by 0 in fft39, so they pass through unchanged).
// NOTE(review): these look like twiddle factors -- confirm.
__m512 fft30 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft31 = _mm512_mul_ps(fft22, fft30);
__m512 fft117 = _mm512_mul_ps(fft109, fft30);
__m512 fft32 = _mm512_mul_ps(fft23, fft30);
__m512 fft118 = _mm512_mul_ps(fft110, fft30);
__m512 fft33 = _mm512_mul_ps(fft24, fft30);
__m512 fft119 = _mm512_mul_ps(fft111, fft30);
__m512 fft34 = _mm512_mul_ps(fft25, fft30);
__m512 fft120 = _mm512_mul_ps(fft112, fft30);
__m512 fft35 = _mm512_mul_ps(fft26, fft30);
__m512 fft121 = _mm512_mul_ps(fft113, fft30);
__m512 fft36 = _mm512_mul_ps(fft27, fft30);
__m512 fft122 = _mm512_mul_ps(fft114, fft30);
__m512 fft37 = _mm512_mul_ps(fft28, fft30);
__m512 fft123 = _mm512_mul_ps(fft115, fft30);
__m512 fft38 = _mm512_mul_ps(fft29, fft30);
__m512 fft124 = _mm512_mul_ps(fft116, fft30);
__m512 fft39 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft40 = _mm512_fmadd_ps(fft23, fft39, fft31);
__m512 fft125 = _mm512_fmadd_ps(fft110, fft39, fft117);
__m512 fft41 = _mm512_fnmadd_ps(fft22, fft39, fft32);
__m512 fft126 = _mm512_fnmadd_ps(fft109, fft39, fft118);
__m512 fft42 = _mm512_fmadd_ps(fft25, fft39, fft33);
__m512 fft127 = _mm512_fmadd_ps(fft112, fft39, fft119);
__m512 fft43 = _mm512_fnmadd_ps(fft24, fft39, fft34);
__m512 fft128 = _mm512_fnmadd_ps(fft111, fft39, fft120);
__m512 fft44 = _mm512_fmadd_ps(fft27, fft39, fft35);
__m512 fft129 = _mm512_fmadd_ps(fft114, fft39, fft121);
__m512 fft45 = _mm512_fnmadd_ps(fft26, fft39, fft36);
__m512 fft130 = _mm512_fnmadd_ps(fft113, fft39, fft122);
__m512 fft46 = _mm512_fmadd_ps(fft29, fft39, fft37);
__m512 fft131 = _mm512_fmadd_ps(fft116, fft39, fft123);
__m512 fft47 = _mm512_fnmadd_ps(fft28, fft39, fft38);
__m512 fft132 = _mm512_fnmadd_ps(fft115, fft39, fft124);
// Next butterfly level: shuffle immediate 177 swaps adjacent 128-bit
// blocks; fft48 alternates +1/-1 per block of four lanes.
__m512 fft48 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft49 = _mm512_fmadd_ps(fft40, fft48, _mm512_shuffle_f32x4(fft40, fft40, 177));
__m512 fft133 = _mm512_fmadd_ps(fft125, fft48, _mm512_shuffle_f32x4(fft125, fft125, 177));
__m512 fft50 = _mm512_fmadd_ps(fft41, fft48, _mm512_shuffle_f32x4(fft41, fft41, 177));
__m512 fft134 = _mm512_fmadd_ps(fft126, fft48, _mm512_shuffle_f32x4(fft126, fft126, 177));
__m512 fft51 = _mm512_fmadd_ps(fft42, fft48, _mm512_shuffle_f32x4(fft42, fft42, 177));
__m512 fft135 = _mm512_fmadd_ps(fft127, fft48, _mm512_shuffle_f32x4(fft127, fft127, 177));
__m512 fft52 = _mm512_fmadd_ps(fft43, fft48, _mm512_shuffle_f32x4(fft43, fft43, 177));
__m512 fft136 = _mm512_fmadd_ps(fft128, fft48, _mm512_shuffle_f32x4(fft128, fft128, 177));
__m512 fft53 = _mm512_fmadd_ps(fft44, fft48, _mm512_shuffle_f32x4(fft44, fft44, 177));
__m512 fft137 = _mm512_fmadd_ps(fft129, fft48, _mm512_shuffle_f32x4(fft129, fft129, 177));
__m512 fft54 = _mm512_fmadd_ps(fft45, fft48, _mm512_shuffle_f32x4(fft45, fft45, 177));
__m512 fft138 = _mm512_fmadd_ps(fft130, fft48, _mm512_shuffle_f32x4(fft130, fft130, 177));
__m512 fft55 = _mm512_fmadd_ps(fft46, fft48, _mm512_shuffle_f32x4(fft46, fft46, 177));
__m512 fft139 = _mm512_fmadd_ps(fft131, fft48, _mm512_shuffle_f32x4(fft131, fft131, 177));
__m512 fft56 = _mm512_fmadd_ps(fft47, fft48, _mm512_shuffle_f32x4(fft47, fft47, 177));
__m512 fft140 = _mm512_fmadd_ps(fft132, fft48, _mm512_shuffle_f32x4(fft132, fft132, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15: those lanes are
// exchanged between the paired vectors, with a negation (0 - x) on one side.
__m512 fft57 = _mm512_mask_mov_ps(fft49, 49344, fft50);
__m512 fft141 = _mm512_mask_mov_ps(fft133, 49344, fft134);
__m512 fft58 = _mm512_mask_sub_ps(fft50, 49344, _mm512_setzero_ps(), fft49);
__m512 fft142 = _mm512_mask_sub_ps(fft134, 49344, _mm512_setzero_ps(), fft133);
__m512 fft59 = _mm512_mask_mov_ps(fft51, 49344, fft52);
__m512 fft143 = _mm512_mask_mov_ps(fft135, 49344, fft136);
__m512 fft60 = _mm512_mask_sub_ps(fft52, 49344, _mm512_setzero_ps(), fft51);
__m512 fft144 = _mm512_mask_sub_ps(fft136, 49344, _mm512_setzero_ps(), fft135);
__m512 fft61 = _mm512_mask_mov_ps(fft53, 49344, fft54);
__m512 fft145 = _mm512_mask_mov_ps(fft137, 49344, fft138);
__m512 fft62 = _mm512_mask_sub_ps(fft54, 49344, _mm512_setzero_ps(), fft53);
__m512 fft146 = _mm512_mask_sub_ps(fft138, 49344, _mm512_setzero_ps(), fft137);
__m512 fft63 = _mm512_mask_mov_ps(fft55, 49344, fft56);
__m512 fft147 = _mm512_mask_mov_ps(fft139, 49344, fft140);
__m512 fft64 = _mm512_mask_sub_ps(fft56, 49344, _mm512_setzero_ps(), fft55);
__m512 fft148 = _mm512_mask_sub_ps(fft140, 49344, _mm512_setzero_ps(), fft139);
// Last butterfly level: _mm512_shuffle_ps with immediate 78 swaps the lane
// pairs inside each 128-bit block; fft65 alternates +1/-1 per pair.
__m512 fft65 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft66 = _mm512_fmadd_ps(fft57, fft65, _mm512_shuffle_ps(fft57, fft57, 78));
__m512 fft149 = _mm512_fmadd_ps(fft141, fft65, _mm512_shuffle_ps(fft141, fft141, 78));
__m512 fft67 = _mm512_fmadd_ps(fft58, fft65, _mm512_shuffle_ps(fft58, fft58, 78));
__m512 fft150 = _mm512_fmadd_ps(fft142, fft65, _mm512_shuffle_ps(fft142, fft142, 78));
__m512 fft68 = _mm512_fmadd_ps(fft59, fft65, _mm512_shuffle_ps(fft59, fft59, 78));
__m512 fft151 = _mm512_fmadd_ps(fft143, fft65, _mm512_shuffle_ps(fft143, fft143, 78));
__m512 fft69 = _mm512_fmadd_ps(fft60, fft65, _mm512_shuffle_ps(fft60, fft60, 78));
__m512 fft152 = _mm512_fmadd_ps(fft144, fft65, _mm512_shuffle_ps(fft144, fft144, 78));
__m512 fft70 = _mm512_fmadd_ps(fft61, fft65, _mm512_shuffle_ps(fft61, fft61, 78));
__m512 fft153 = _mm512_fmadd_ps(fft145, fft65, _mm512_shuffle_ps(fft145, fft145, 78));
__m512 fft71 = _mm512_fmadd_ps(fft62, fft65, _mm512_shuffle_ps(fft62, fft62, 78));
__m512 fft154 = _mm512_fmadd_ps(fft146, fft65, _mm512_shuffle_ps(fft146, fft146, 78));
__m512 fft72 = _mm512_fmadd_ps(fft63, fft65, _mm512_shuffle_ps(fft63, fft63, 78));
__m512 fft155 = _mm512_fmadd_ps(fft147, fft65, _mm512_shuffle_ps(fft147, fft147, 78));
__m512 fft73 = _mm512_fmadd_ps(fft64, fft65, _mm512_shuffle_ps(fft64, fft64, 78));
__m512 fft156 = _mm512_fmadd_ps(fft148, fft65, _mm512_shuffle_ps(fft148, fft148, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft66/fft67 and fft149/fft150): permute, combine, blend by mask, then
// halve the lanes selected by mask 64764 (0xFCFC, i.e. all but lanes 0, 1,
// 8 and 9). NOTE(review): purpose inferred to be untangling packed
// real-FFT outputs -- confirm.
__m512i fft74 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft75 = _mm512_permutexvar_ps(fft74, fft66);
__m512 fft157 = _mm512_permutexvar_ps(fft74, fft149);
__m512i fft76 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft77 = _mm512_permutexvar_ps(fft76, fft66);
__m512 fft158 = _mm512_permutexvar_ps(fft76, fft149);
__m512 fft78 = _mm512_permutexvar_ps(fft74, fft67);
__m512 fft159 = _mm512_permutexvar_ps(fft74, fft150);
__m512 fft79 = _mm512_permutexvar_ps(fft76, fft67);
__m512 fft160 = _mm512_permutexvar_ps(fft76, fft150);
__m512 fft80 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft81 = _mm512_fmadd_ps(fft75, fft80, fft77);
__m512 fft161 = _mm512_fmadd_ps(fft157, fft80, fft158);
__m512 fft82 = _mm512_fnmadd_ps(fft79, fft80, fft78);
__m512 fft162 = _mm512_fnmadd_ps(fft160, fft80, fft159);
__m512 fft83 = _mm512_mask_mov_ps(fft79, 21845, fft81);
__m512 fft163 = _mm512_mask_mov_ps(fft160, 21845, fft161);
__m512 fft84 = _mm512_mask_mov_ps(fft75, 43176, fft81);
__m512 fft164 = _mm512_mask_mov_ps(fft157, 43176, fft161);
__m512 fft85 = _mm512_mask_mov_ps(fft83, 43176, fft82);
__m512 fft165 = _mm512_mask_mov_ps(fft163, 43176, fft162);
__m512 fft86 = _mm512_mask_mov_ps(fft84, 22102, fft82);
__m512 fft166 = _mm512_mask_mov_ps(fft164, 22102, fft162);
__m512 fft87 = _mm512_mask_mul_ps(fft85, 64764, fft85, _mm512_set1_ps(5e-01f));
__m512 fft167 = _mm512_mask_mul_ps(fft165, 64764, fft165, _mm512_set1_ps(5e-01f));
__m512 fft88 = _mm512_mask_mul_ps(fft86, 64764, fft86, _mm512_set1_ps(5e-01f));
__m512 fft168 = _mm512_mask_mul_ps(fft166, 64764, fft166, _mm512_set1_ps(5e-01f));
// Collect the sixteen result vectors for this entry.
__m512 wf1 = fft87;
__m512 wf9 = fft167;
__m512 wf2 = fft88;
__m512 wf10 = fft168;
__m512 wf3 = fft68;
__m512 wf11 = fft151;
__m512 wf4 = fft69;
__m512 wf12 = fft152;
__m512 wf5 = fft70;
__m512 wf13 = fft153;
__m512 wf6 = fft71;
__m512 wf14 = fft154;
__m512 wf7 = fft72;
__m512 wf15 = fft155;
__m512 wf8 = fft73;
__m512 wf16 = fft156;
// Split the entry index 2*j1 into destination coordinates:
// c1 = index/4, m1 = (index%4)/2, f2 = index%2.
ptrdiff_t c1 = (size_t)(0+2*j1)/4;
ptrdiff_t m1 = (size_t)(0+2*j1)%4/2;
ptrdiff_t f2 = (size_t)(0+2*j1)%2;
// eo1 gathers even-indexed lanes into the low half and odd-indexed lanes
// into the high half.
__m512i eo1 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every pair below: convert two float vectors to
// half precision, pack them into one 512-bit register, then write it in
// two masked pieces -- mask 3855 (0x0F0F) at the first address and mask
// 61680 (0xF0F0) at an address 24560 bytes further on.
wf3 = _mm512_permutexvar_ps(eo1, wf3);
wf4 = _mm512_permutexvar_ps(eo1, wf4);
__m512i wfs1 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf3, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs1 = _mm512_inserti64x4(wfs1, _mm512_cvtps_ph(wf4, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6144+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs1);
_mm512_mask_storeu_epi32(wfPtr1+30704+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs1);
wf11 = _mm512_permutexvar_ps(eo1, wf11);
wf12 = _mm512_permutexvar_ps(eo1, wf12);
__m512i wfs2 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf11, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs2 = _mm512_inserti64x4(wfs2, _mm512_cvtps_ph(wf12, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55296+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs2);
_mm512_mask_storeu_epi32(wfPtr1+79856+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs2);
wf5 = _mm512_permutexvar_ps(eo1, wf5);
wf6 = _mm512_permutexvar_ps(eo1, wf6);
__m512i wfs3 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf5, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs3 = _mm512_inserti64x4(wfs3, _mm512_cvtps_ph(wf6, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+12288+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs3);
_mm512_mask_storeu_epi32(wfPtr1+36848+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs3);
wf13 = _mm512_permutexvar_ps(eo1, wf13);
wf14 = _mm512_permutexvar_ps(eo1, wf14);
__m512i wfs4 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf13, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs4 = _mm512_inserti64x4(wfs4, _mm512_cvtps_ph(wf14, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+61440+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs4);
_mm512_mask_storeu_epi32(wfPtr1+86000+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs4);
wf7 = _mm512_permutexvar_ps(eo1, wf7);
wf8 = _mm512_permutexvar_ps(eo1, wf8);
__m512i wfs5 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf7, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs5 = _mm512_inserti64x4(wfs5, _mm512_cvtps_ph(wf8, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+18432+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs5);
_mm512_mask_storeu_epi32(wfPtr1+42992+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs5);
wf15 = _mm512_permutexvar_ps(eo1, wf15);
wf16 = _mm512_permutexvar_ps(eo1, wf16);
__m512i wfs6 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf15, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs6 = _mm512_inserti64x4(wfs6, _mm512_cvtps_ph(wf16, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+67584+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs6);
_mm512_mask_storeu_epi32(wfPtr1+92144+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs6);
// wf1/wf2 and wf9/wf10 are stored without the eo1 permutation.
__m512i wfs7 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf1, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs7 = _mm512_inserti64x4(wfs7, _mm512_cvtps_ph(wf2, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs7);
_mm512_mask_storeu_epi32(wfPtr1+24560+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs7);
__m512i wfs8 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf9, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs8 = _mm512_inserti64x4(wfs8, _mm512_cvtps_ph(wf10, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+49152+24576*i5+384*c1+128*k1+64*m1+16*f2, 3855, wfs8);
_mm512_mask_storeu_epi32(wfPtr1+73712+24576*i5+384*c1+128*k1+64*m1+16*f2, 61680, wfs8);
// ---- Entry 2*j1+1 ----
// Same sequence as above; the source is 588 bytes further on and the rows
// are scaled by postMul2.
__m512 wt8 = _mm512_maskz_loadu_ps(127, wtPtr1+588+37632*i5+1176*j1+196*k1);
__m512 wt9 = _mm512_maskz_loadu_ps(127, wtPtr1+616+37632*i5+1176*j1+196*k1);
__m512 wt10 = _mm512_maskz_loadu_ps(127, wtPtr1+644+37632*i5+1176*j1+196*k1);
__m512 wt11 = _mm512_maskz_loadu_ps(127, wtPtr1+672+37632*i5+1176*j1+196*k1);
__m512 wt12 = _mm512_maskz_loadu_ps(127, wtPtr1+700+37632*i5+1176*j1+196*k1);
__m512 wt13 = _mm512_maskz_loadu_ps(127, wtPtr1+728+37632*i5+1176*j1+196*k1);
__m512 wt14 = _mm512_maskz_loadu_ps(127, wtPtr1+756+37632*i5+1176*j1+196*k1);
wt8 = _mm512_mul_ps(postMul2, wt8);
wt9 = _mm512_mul_ps(postMul2, wt9);
wt10 = _mm512_mul_ps(postMul2, wt10);
wt11 = _mm512_mul_ps(postMul2, wt11);
wt12 = _mm512_mul_ps(postMul2, wt12);
wt13 = _mm512_mul_ps(postMul2, wt13);
wt14 = _mm512_mul_ps(postMul2, wt14);
// Transform stage for the second entry (fft169..fft336), statement for
// statement the same shape as fft1..fft168 above.
__m512 fft169 = _mm512_add_ps(wt8, _mm512_setzero_ps());
__m512 fft257 = _mm512_add_ps(wt9, _mm512_setzero_ps());
__m512 fft170 = _mm512_sub_ps(wt8, _mm512_setzero_ps());
__m512 fft258 = _mm512_sub_ps(wt9, _mm512_setzero_ps());
__m512 fft171 = _mm512_add_ps(wt10, _mm512_setzero_ps());
__m512 fft259 = _mm512_add_ps(wt11, _mm512_setzero_ps());
__m512 fft172 = _mm512_sub_ps(wt10, _mm512_setzero_ps());
__m512 fft260 = _mm512_sub_ps(wt11, _mm512_setzero_ps());
__m512 fft173 = _mm512_add_ps(wt12, _mm512_setzero_ps());
__m512 fft261 = _mm512_add_ps(wt13, _mm512_setzero_ps());
__m512 fft174 = _mm512_sub_ps(wt12, _mm512_setzero_ps());
__m512 fft262 = _mm512_sub_ps(wt13, _mm512_setzero_ps());
__m512 fft175 = _mm512_add_ps(wt14, _mm512_setzero_ps());
__m512 fft263 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft176 = _mm512_sub_ps(wt14, _mm512_setzero_ps());
__m512 fft264 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft177 = _mm512_add_ps(fft169, fft173);
__m512 fft265 = _mm512_add_ps(fft257, fft261);
__m512 fft178 = _mm512_sub_ps(fft169, fft173);
__m512 fft266 = _mm512_sub_ps(fft257, fft261);
__m512 fft179 = _mm512_add_ps(fft171, fft175);
__m512 fft267 = _mm512_add_ps(fft259, fft263);
__m512 fft180 = _mm512_sub_ps(fft175, fft171);
__m512 fft268 = _mm512_sub_ps(fft263, fft259);
__m512 fft181 = _mm512_sub_ps(fft172, fft176);
__m512 fft269 = _mm512_sub_ps(fft260, fft264);
__m512 fft182 = _mm512_add_ps(fft172, fft176);
__m512 fft270 = _mm512_add_ps(fft260, fft264);
__m512 fft183 = _mm512_add_ps(fft177, fft179);
__m512 fft271 = _mm512_add_ps(fft265, fft267);
__m512 fft184 = _mm512_sub_ps(fft177, fft179);
__m512 fft272 = _mm512_sub_ps(fft265, fft267);
__m512 fft185 = _mm512_fmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft273 = _mm512_fmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft186 = _mm512_fnmsub_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft274 = _mm512_fnmsub_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft187 = _mm512_fnmadd_ps(fft181, _mm512_set1_ps(7.0710677e-01f), fft170);
__m512 fft275 = _mm512_fnmadd_ps(fft269, _mm512_set1_ps(7.0710677e-01f), fft258);
__m512 fft188 = _mm512_fnmadd_ps(fft182, _mm512_set1_ps(7.0710677e-01f), fft174);
__m512 fft276 = _mm512_fnmadd_ps(fft270, _mm512_set1_ps(7.0710677e-01f), fft262);
__m512 fft189 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft190 = _mm512_fmadd_ps(fft183, fft189, _mm512_shuffle_f32x4(fft183, fft183, 78));
__m512 fft277 = _mm512_fmadd_ps(fft271, fft189, _mm512_shuffle_f32x4(fft271, fft271, 78));
__m512 fft191 = _mm512_fmadd_ps(fft184, fft189, _mm512_shuffle_f32x4(fft184, fft184, 78));
__m512 fft278 = _mm512_fmadd_ps(fft272, fft189, _mm512_shuffle_f32x4(fft272, fft272, 78));
__m512 fft192 = _mm512_fmadd_ps(fft185, fft189, _mm512_shuffle_f32x4(fft185, fft185, 78));
__m512 fft279 = _mm512_fmadd_ps(fft273, fft189, _mm512_shuffle_f32x4(fft273, fft273, 78));
__m512 fft193 = _mm512_fmadd_ps(fft186, fft189, _mm512_shuffle_f32x4(fft186, fft186, 78));
__m512 fft280 = _mm512_fmadd_ps(fft274, fft189, _mm512_shuffle_f32x4(fft274, fft274, 78));
__m512 fft194 = _mm512_fmadd_ps(fft178, fft189, _mm512_shuffle_f32x4(fft178, fft178, 78));
__m512 fft281 = _mm512_fmadd_ps(fft266, fft189, _mm512_shuffle_f32x4(fft266, fft266, 78));
__m512 fft195 = _mm512_fmadd_ps(fft180, fft189, _mm512_shuffle_f32x4(fft180, fft180, 78));
__m512 fft282 = _mm512_fmadd_ps(fft268, fft189, _mm512_shuffle_f32x4(fft268, fft268, 78));
__m512 fft196 = _mm512_fmadd_ps(fft187, fft189, _mm512_shuffle_f32x4(fft187, fft187, 78));
__m512 fft283 = _mm512_fmadd_ps(fft275, fft189, _mm512_shuffle_f32x4(fft275, fft275, 78));
__m512 fft197 = _mm512_fmadd_ps(fft188, fft189, _mm512_shuffle_f32x4(fft188, fft188, 78));
__m512 fft284 = _mm512_fmadd_ps(fft276, fft189, _mm512_shuffle_f32x4(fft276, fft276, 78));
__m512 fft198 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft199 = _mm512_mul_ps(fft190, fft198);
__m512 fft285 = _mm512_mul_ps(fft277, fft198);
__m512 fft200 = _mm512_mul_ps(fft191, fft198);
__m512 fft286 = _mm512_mul_ps(fft278, fft198);
__m512 fft201 = _mm512_mul_ps(fft192, fft198);
__m512 fft287 = _mm512_mul_ps(fft279, fft198);
__m512 fft202 = _mm512_mul_ps(fft193, fft198);
__m512 fft288 = _mm512_mul_ps(fft280, fft198);
__m512 fft203 = _mm512_mul_ps(fft194, fft198);
__m512 fft289 = _mm512_mul_ps(fft281, fft198);
__m512 fft204 = _mm512_mul_ps(fft195, fft198);
__m512 fft290 = _mm512_mul_ps(fft282, fft198);
__m512 fft205 = _mm512_mul_ps(fft196, fft198);
__m512 fft291 = _mm512_mul_ps(fft283, fft198);
__m512 fft206 = _mm512_mul_ps(fft197, fft198);
__m512 fft292 = _mm512_mul_ps(fft284, fft198);
__m512 fft207 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft208 = _mm512_fmadd_ps(fft191, fft207, fft199);
__m512 fft293 = _mm512_fmadd_ps(fft278, fft207, fft285);
__m512 fft209 = _mm512_fnmadd_ps(fft190, fft207, fft200);
__m512 fft294 = _mm512_fnmadd_ps(fft277, fft207, fft286);
__m512 fft210 = _mm512_fmadd_ps(fft193, fft207, fft201);
__m512 fft295 = _mm512_fmadd_ps(fft280, fft207, fft287);
__m512 fft211 = _mm512_fnmadd_ps(fft192, fft207, fft202);
__m512 fft296 = _mm512_fnmadd_ps(fft279, fft207, fft288);
__m512 fft212 = _mm512_fmadd_ps(fft195, fft207, fft203);
__m512 fft297 = _mm512_fmadd_ps(fft282, fft207, fft289);
__m512 fft213 = _mm512_fnmadd_ps(fft194, fft207, fft204);
__m512 fft298 = _mm512_fnmadd_ps(fft281, fft207, fft290);
__m512 fft214 = _mm512_fmadd_ps(fft197, fft207, fft205);
__m512 fft299 = _mm512_fmadd_ps(fft284, fft207, fft291);
__m512 fft215 = _mm512_fnmadd_ps(fft196, fft207, fft206);
__m512 fft300 = _mm512_fnmadd_ps(fft283, fft207, fft292);
__m512 fft216 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft217 = _mm512_fmadd_ps(fft208, fft216, _mm512_shuffle_f32x4(fft208, fft208, 177));
__m512 fft301 = _mm512_fmadd_ps(fft293, fft216, _mm512_shuffle_f32x4(fft293, fft293, 177));
__m512 fft218 = _mm512_fmadd_ps(fft209, fft216, _mm512_shuffle_f32x4(fft209, fft209, 177));
__m512 fft302 = _mm512_fmadd_ps(fft294, fft216, _mm512_shuffle_f32x4(fft294, fft294, 177));
__m512 fft219 = _mm512_fmadd_ps(fft210, fft216, _mm512_shuffle_f32x4(fft210, fft210, 177));
__m512 fft303 = _mm512_fmadd_ps(fft295, fft216, _mm512_shuffle_f32x4(fft295, fft295, 177));
__m512 fft220 = _mm512_fmadd_ps(fft211, fft216, _mm512_shuffle_f32x4(fft211, fft211, 177));
__m512 fft304 = _mm512_fmadd_ps(fft296, fft216, _mm512_shuffle_f32x4(fft296, fft296, 177));
__m512 fft221 = _mm512_fmadd_ps(fft212, fft216, _mm512_shuffle_f32x4(fft212, fft212, 177));
__m512 fft305 = _mm512_fmadd_ps(fft297, fft216, _mm512_shuffle_f32x4(fft297, fft297, 177));
__m512 fft222 = _mm512_fmadd_ps(fft213, fft216, _mm512_shuffle_f32x4(fft213, fft213, 177));
__m512 fft306 = _mm512_fmadd_ps(fft298, fft216, _mm512_shuffle_f32x4(fft298, fft298, 177));
__m512 fft223 = _mm512_fmadd_ps(fft214, fft216, _mm512_shuffle_f32x4(fft214, fft214, 177));
__m512 fft307 = _mm512_fmadd_ps(fft299, fft216, _mm512_shuffle_f32x4(fft299, fft299, 177));
__m512 fft224 = _mm512_fmadd_ps(fft215, fft216, _mm512_shuffle_f32x4(fft215, fft215, 177));
__m512 fft308 = _mm512_fmadd_ps(fft300, fft216, _mm512_shuffle_f32x4(fft300, fft300, 177));
__m512 fft225 = _mm512_mask_mov_ps(fft217, 49344, fft218);
__m512 fft309 = _mm512_mask_mov_ps(fft301, 49344, fft302);
__m512 fft226 = _mm512_mask_sub_ps(fft218, 49344, _mm512_setzero_ps(), fft217);
__m512 fft310 = _mm512_mask_sub_ps(fft302, 49344, _mm512_setzero_ps(), fft301);
__m512 fft227 = _mm512_mask_mov_ps(fft219, 49344, fft220);
__m512 fft311 = _mm512_mask_mov_ps(fft303, 49344, fft304);
__m512 fft228 = _mm512_mask_sub_ps(fft220, 49344, _mm512_setzero_ps(), fft219);
__m512 fft312 = _mm512_mask_sub_ps(fft304, 49344, _mm512_setzero_ps(), fft303);
__m512 fft229 = _mm512_mask_mov_ps(fft221, 49344, fft222);
__m512 fft313 = _mm512_mask_mov_ps(fft305, 49344, fft306);
__m512 fft230 = _mm512_mask_sub_ps(fft222, 49344, _mm512_setzero_ps(), fft221);
__m512 fft314 = _mm512_mask_sub_ps(fft306, 49344, _mm512_setzero_ps(), fft305);
__m512 fft231 = _mm512_mask_mov_ps(fft223, 49344, fft224);
__m512 fft315 = _mm512_mask_mov_ps(fft307, 49344, fft308);
__m512 fft232 = _mm512_mask_sub_ps(fft224, 49344, _mm512_setzero_ps(), fft223);
__m512 fft316 = _mm512_mask_sub_ps(fft308, 49344, _mm512_setzero_ps(), fft307);
__m512 fft233 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft234 = _mm512_fmadd_ps(fft225, fft233, _mm512_shuffle_ps(fft225, fft225, 78));
__m512 fft317 = _mm512_fmadd_ps(fft309, fft233, _mm512_shuffle_ps(fft309, fft309, 78));
__m512 fft235 = _mm512_fmadd_ps(fft226, fft233, _mm512_shuffle_ps(fft226, fft226, 78));
__m512 fft318 = _mm512_fmadd_ps(fft310, fft233, _mm512_shuffle_ps(fft310, fft310, 78));
__m512 fft236 = _mm512_fmadd_ps(fft227, fft233, _mm512_shuffle_ps(fft227, fft227, 78));
__m512 fft319 = _mm512_fmadd_ps(fft311, fft233, _mm512_shuffle_ps(fft311, fft311, 78));
__m512 fft237 = _mm512_fmadd_ps(fft228, fft233, _mm512_shuffle_ps(fft228, fft228, 78));
__m512 fft320 = _mm512_fmadd_ps(fft312, fft233, _mm512_shuffle_ps(fft312, fft312, 78));
__m512 fft238 = _mm512_fmadd_ps(fft229, fft233, _mm512_shuffle_ps(fft229, fft229, 78));
__m512 fft321 = _mm512_fmadd_ps(fft313, fft233, _mm512_shuffle_ps(fft313, fft313, 78));
__m512 fft239 = _mm512_fmadd_ps(fft230, fft233, _mm512_shuffle_ps(fft230, fft230, 78));
__m512 fft322 = _mm512_fmadd_ps(fft314, fft233, _mm512_shuffle_ps(fft314, fft314, 78));
__m512 fft240 = _mm512_fmadd_ps(fft231, fft233, _mm512_shuffle_ps(fft231, fft231, 78));
__m512 fft323 = _mm512_fmadd_ps(fft315, fft233, _mm512_shuffle_ps(fft315, fft315, 78));
__m512 fft241 = _mm512_fmadd_ps(fft232, fft233, _mm512_shuffle_ps(fft232, fft232, 78));
__m512 fft324 = _mm512_fmadd_ps(fft316, fft233, _mm512_shuffle_ps(fft316, fft316, 78));
__m512i fft242 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft243 = _mm512_permutexvar_ps(fft242, fft234);
__m512 fft325 = _mm512_permutexvar_ps(fft242, fft317);
__m512i fft244 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft245 = _mm512_permutexvar_ps(fft244, fft234);
__m512 fft326 = _mm512_permutexvar_ps(fft244, fft317);
__m512 fft246 = _mm512_permutexvar_ps(fft242, fft235);
__m512 fft327 = _mm512_permutexvar_ps(fft242, fft318);
__m512 fft247 = _mm512_permutexvar_ps(fft244, fft235);
__m512 fft328 = _mm512_permutexvar_ps(fft244, fft318);
__m512 fft248 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft249 = _mm512_fmadd_ps(fft243, fft248, fft245);
__m512 fft329 = _mm512_fmadd_ps(fft325, fft248, fft326);
__m512 fft250 = _mm512_fnmadd_ps(fft247, fft248, fft246);
__m512 fft330 = _mm512_fnmadd_ps(fft328, fft248, fft327);
__m512 fft251 = _mm512_mask_mov_ps(fft247, 21845, fft249);
__m512 fft331 = _mm512_mask_mov_ps(fft328, 21845, fft329);
__m512 fft252 = _mm512_mask_mov_ps(fft243, 43176, fft249);
__m512 fft332 = _mm512_mask_mov_ps(fft325, 43176, fft329);
__m512 fft253 = _mm512_mask_mov_ps(fft251, 43176, fft250);
__m512 fft333 = _mm512_mask_mov_ps(fft331, 43176, fft330);
__m512 fft254 = _mm512_mask_mov_ps(fft252, 22102, fft250);
__m512 fft334 = _mm512_mask_mov_ps(fft332, 22102, fft330);
__m512 fft255 = _mm512_mask_mul_ps(fft253, 64764, fft253, _mm512_set1_ps(5e-01f));
__m512 fft335 = _mm512_mask_mul_ps(fft333, 64764, fft333, _mm512_set1_ps(5e-01f));
__m512 fft256 = _mm512_mask_mul_ps(fft254, 64764, fft254, _mm512_set1_ps(5e-01f));
__m512 fft336 = _mm512_mask_mul_ps(fft334, 64764, fft334, _mm512_set1_ps(5e-01f));
__m512 wf17 = fft255;
__m512 wf25 = fft335;
__m512 wf18 = fft256;
__m512 wf26 = fft336;
__m512 wf19 = fft236;
__m512 wf27 = fft319;
__m512 wf20 = fft237;
__m512 wf28 = fft320;
__m512 wf21 = fft238;
__m512 wf29 = fft321;
__m512 wf22 = fft239;
__m512 wf30 = fft322;
__m512 wf23 = fft240;
__m512 wf31 = fft323;
__m512 wf24 = fft241;
__m512 wf32 = fft324;
// Destination coordinates for entry index 2*j1+1 (same split as c1/m1/f2).
ptrdiff_t c2 = (size_t)(1+2*j1)/4;
ptrdiff_t m2 = (size_t)(1+2*j1)%4/2;
ptrdiff_t f3 = (size_t)(1+2*j1)%2;
__m512i eo2 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Half-precision conversion and split masked stores, as for the first entry.
wf19 = _mm512_permutexvar_ps(eo2, wf19);
wf20 = _mm512_permutexvar_ps(eo2, wf20);
__m512i wfs9 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf19, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs9 = _mm512_inserti64x4(wfs9, _mm512_cvtps_ph(wf20, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+6144+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs9);
_mm512_mask_storeu_epi32(wfPtr1+30704+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs9);
wf27 = _mm512_permutexvar_ps(eo2, wf27);
wf28 = _mm512_permutexvar_ps(eo2, wf28);
__m512i wfs10 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf27, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs10 = _mm512_inserti64x4(wfs10, _mm512_cvtps_ph(wf28, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+55296+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs10);
_mm512_mask_storeu_epi32(wfPtr1+79856+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs10);
wf21 = _mm512_permutexvar_ps(eo2, wf21);
wf22 = _mm512_permutexvar_ps(eo2, wf22);
__m512i wfs11 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf21, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs11 = _mm512_inserti64x4(wfs11, _mm512_cvtps_ph(wf22, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+12288+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs11);
_mm512_mask_storeu_epi32(wfPtr1+36848+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs11);
wf29 = _mm512_permutexvar_ps(eo2, wf29);
wf30 = _mm512_permutexvar_ps(eo2, wf30);
__m512i wfs12 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf29, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs12 = _mm512_inserti64x4(wfs12, _mm512_cvtps_ph(wf30, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+61440+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs12);
_mm512_mask_storeu_epi32(wfPtr1+86000+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs12);
wf23 = _mm512_permutexvar_ps(eo2, wf23);
wf24 = _mm512_permutexvar_ps(eo2, wf24);
__m512i wfs13 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf23, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs13 = _mm512_inserti64x4(wfs13, _mm512_cvtps_ph(wf24, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+18432+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs13);
_mm512_mask_storeu_epi32(wfPtr1+42992+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs13);
wf31 = _mm512_permutexvar_ps(eo2, wf31);
wf32 = _mm512_permutexvar_ps(eo2, wf32);
__m512i wfs14 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf31, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs14 = _mm512_inserti64x4(wfs14, _mm512_cvtps_ph(wf32, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+67584+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs14);
_mm512_mask_storeu_epi32(wfPtr1+92144+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs14);
__m512i wfs15 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf17, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs15 = _mm512_inserti64x4(wfs15, _mm512_cvtps_ph(wf18, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+0+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs15);
_mm512_mask_storeu_epi32(wfPtr1+24560+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs15);
__m512i wfs16 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf25, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs16 = _mm512_inserti64x4(wfs16, _mm512_cvtps_ph(wf26, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr1+49152+24576*i5+384*c2+128*k1+64*m2+16*f3, 3855, wfs16);
_mm512_mask_storeu_epi32(wfPtr1+73712+24576*i5+384*c2+128*k1+64*m2+16*f3, 61680, wfs16);
}
// Bias for the two entries of this j1. e1 is always 0 in this function, so
// the branch below is always taken.
__m512 bias1 = _mm512_setzero_ps();
if (!e1) {
// Mask 3 loads two floats (one bias per entry).
bias1 = _mm512_maskz_loadu_ps(3, biasPtr1-0+256*i5+8*j1);
// Mask 15 loads four floats from bnPtr1; pmMul1 picks the even-indexed
// ones (multipliers) and pmAdd1 the odd-indexed ones (addends) into
// lanes 0 and 1. Only those two lanes are stored below.
__m512i pmMul1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd1 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas3 = _mm512_maskz_loadu_ps(15, bnPtr1+(ptrdiff_t)8*(0+64*i5+2*j1));
__m512 postMul3 = _mm512_permutexvar_ps(pmMul1, mas3);
__m512 postAdd1 = _mm512_permutexvar_ps(pmAdd1, mas3);
// bias = bias*multiplier + addend, then scaled by 64.
// NOTE(review): the factor 64 presumably compensates for the transform's
// normalization -- confirm against the consuming kernel.
bias1 = _mm512_fmadd_ps(bias1, postMul3, postAdd1);
bias1 = _mm512_mul_ps(bias1, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr1-0+256*i5+8*j1, 3, bias1);
}
}
}

/*
 * Submits the filter-arrangement work to the threader.
 *
 * Fills in a task descriptor whose callee is
 * ResNeXt50StriderArrangeFilts1Callee1, whose opaque payload (any1) is the
 * tensors1 pointer array, and whose iteration space has 3 dimensions with
 * every extent set to 1, then passes that descriptor to ResNeXt50ThreaderDo1
 * together with team13.
 *
 * team13   - threader team forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors1 - array of tensor base pointers, forwarded to the callee via any1.
 */
static void ResNeXt50StriderArrangeFilts1(ResNeXt50ThreaderTeam1* team13, char** tensors1) {
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50StriderArrangeFilts1Callee1;
	job.any1 = tensors1;
	job.nd1 = 3;
	/* Each of the three hull extents is 1. */
	for (int axis = 0; axis < 3; ++axis) {
		job.hull1[axis] = 1;
	}
	ResNeXt50ThreaderDo1(team13, &job);
}

static void ResNeXt50StriderArrangeDats1Callee1(ResNeXt50ThreaderTask1* task6, int64_t* pt8) {
char** tensors4 = task6->any1;
ptrdiff_t s1 = 0;
ptrdiff_t c3 = pt8[1];
ptrdiff_t g3 = 0;
ptrdiff_t e2 = 0;
char*restrict datPtr1 = tensors4[0]-2700+79478784*e2;
char*restrict bnPtr2 = tensors4[1]+(ptrdiff_t)8*396*e2;
char*restrict dfPtr1 = tensors4[2]+214917120*e2;
ptrdiff_t i6 = 1*g3;
ptrdiff_t j2 = 8*c3;
ptrdiff_t last1 = j2+(c3 < 10 ? 7 : 8);
if (j2 < 4) {
ptrdiff_t rel1 = j2-0;
ptrdiff_t base1 = 0;
if (rel1 < 1) {
ptrdiff_t h1 = base1+0;
ptrdiff_t w1 = 0;
ptrdiff_t k2 = 3*s1;
ptrdiff_t kk1 = k2+2;
for (; k2 <= kk1; ++k2) {
__m512 bnMul1 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k2+3*i6))[0]);
__m512 bnAdd1 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k2+3*i6))[1]);
ptrdiff_t b3 = 0;
ptrdiff_t m3 = (size_t)b3/2;
ptrdiff_t f4 = (size_t)b3%2;
__m512 dat1 = _mm512_maskz_loadu_ps(65528, datPtr1+2688+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat1 = _mm512_mask_fmadd_ps(dat1, 65528, bnMul1, bnAdd1);
__m512 dat2 = _mm512_maskz_loadu_ps(65528, datPtr1+3584+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat2 = _mm512_mask_fmadd_ps(dat2, 65528, bnMul1, bnAdd1);
__m512 dat3 = _mm512_maskz_loadu_ps(65528, datPtr1+4480+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat3 = _mm512_mask_fmadd_ps(dat3, 65528, bnMul1, bnAdd1);
__m512 dat4 = _mm512_maskz_loadu_ps(65528, datPtr1+5376+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat4 = _mm512_mask_fmadd_ps(dat4, 65528, bnMul1, bnAdd1);
__m512 dat5 = _mm512_maskz_loadu_ps(65528, datPtr1+6272+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat5 = _mm512_mask_fmadd_ps(dat5, 65528, bnMul1, bnAdd1);
__m512 dat6 = _mm512_maskz_loadu_ps(65528, datPtr1+7168+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat6 = _mm512_mask_fmadd_ps(dat6, 65528, bnMul1, bnAdd1);
__m512 dat7 = _mm512_maskz_loadu_ps(65528, datPtr1+8064+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat7 = _mm512_mask_fmadd_ps(dat7, 65528, bnMul1, bnAdd1);
__m512 dat8 = _mm512_maskz_loadu_ps(65528, datPtr1+8960+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat8 = _mm512_mask_fmadd_ps(dat8, 65528, bnMul1, bnAdd1);
__m512 dat9 = _mm512_maskz_loadu_ps(65528, datPtr1+9856+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat9 = _mm512_mask_fmadd_ps(dat9, 65528, bnMul1, bnAdd1);
__m512 dat10 = _mm512_maskz_loadu_ps(65528, datPtr1+10752+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat10 = _mm512_mask_fmadd_ps(dat10, 65528, bnMul1, bnAdd1);
__m512 dat11 = _mm512_maskz_loadu_ps(65528, datPtr1+11648+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat11 = _mm512_mask_fmadd_ps(dat11, 65528, bnMul1, bnAdd1);
__m512 dat12 = _mm512_maskz_loadu_ps(65528, datPtr1+12544+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat12 = _mm512_mask_fmadd_ps(dat12, 65528, bnMul1, bnAdd1);
__m512 dat13 = _mm512_maskz_loadu_ps(65528, datPtr1+13440+602112*i6+200704*k2+896*h1+4*w1+0*b3);
dat13 = _mm512_mask_fmadd_ps(dat13, 65528, bnMul1, bnAdd1);
__m512 fft337 = _mm512_add_ps(_mm512_setzero_ps(), dat6);
__m512 fft425 = _mm512_add_ps(_mm512_setzero_ps(), dat7);
__m512 fft338 = _mm512_sub_ps(_mm512_setzero_ps(), dat6);
__m512 fft426 = _mm512_sub_ps(_mm512_setzero_ps(), dat7);
__m512 fft339 = _mm512_add_ps(_mm512_setzero_ps(), dat8);
__m512 fft427 = _mm512_add_ps(dat1, dat9);
__m512 fft340 = _mm512_sub_ps(_mm512_setzero_ps(), dat8);
__m512 fft428 = _mm512_sub_ps(dat1, dat9);
__m512 fft341 = _mm512_add_ps(dat2, dat10);
__m512 fft429 = _mm512_add_ps(dat3, dat11);
__m512 fft342 = _mm512_sub_ps(dat2, dat10);
__m512 fft430 = _mm512_sub_ps(dat3, dat11);
__m512 fft343 = _mm512_add_ps(dat4, dat12);
__m512 fft431 = _mm512_add_ps(dat5, dat13);
__m512 fft344 = _mm512_sub_ps(dat4, dat12);
__m512 fft432 = _mm512_sub_ps(dat5, dat13);
__m512 fft345 = _mm512_add_ps(fft337, fft341);
__m512 fft433 = _mm512_add_ps(fft425, fft429);
__m512 fft346 = _mm512_sub_ps(fft337, fft341);
__m512 fft434 = _mm512_sub_ps(fft425, fft429);
__m512 fft347 = _mm512_add_ps(fft339, fft343);
__m512 fft435 = _mm512_add_ps(fft427, fft431);
__m512 fft348 = _mm512_sub_ps(fft343, fft339);
__m512 fft436 = _mm512_sub_ps(fft431, fft427);
__m512 fft349 = _mm512_sub_ps(fft340, fft344);
__m512 fft437 = _mm512_sub_ps(fft428, fft432);
__m512 fft350 = _mm512_add_ps(fft340, fft344);
__m512 fft438 = _mm512_add_ps(fft428, fft432);
__m512 fft351 = _mm512_add_ps(fft345, fft347);
__m512 fft439 = _mm512_add_ps(fft433, fft435);
__m512 fft352 = _mm512_sub_ps(fft345, fft347);
__m512 fft440 = _mm512_sub_ps(fft433, fft435);
__m512 fft353 = _mm512_fmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft441 = _mm512_fmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft354 = _mm512_fnmsub_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft442 = _mm512_fnmsub_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft355 = _mm512_fnmadd_ps(fft349, _mm512_set1_ps(7.0710677e-01f), fft338);
__m512 fft443 = _mm512_fnmadd_ps(fft437, _mm512_set1_ps(7.0710677e-01f), fft426);
__m512 fft356 = _mm512_fnmadd_ps(fft350, _mm512_set1_ps(7.0710677e-01f), fft342);
__m512 fft444 = _mm512_fnmadd_ps(fft438, _mm512_set1_ps(7.0710677e-01f), fft430);
__m512 fft357 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft358 = _mm512_fmadd_ps(fft351, fft357, _mm512_shuffle_f32x4(fft351, fft351, 78));
__m512 fft445 = _mm512_fmadd_ps(fft439, fft357, _mm512_shuffle_f32x4(fft439, fft439, 78));
__m512 fft359 = _mm512_fmadd_ps(fft352, fft357, _mm512_shuffle_f32x4(fft352, fft352, 78));
__m512 fft446 = _mm512_fmadd_ps(fft440, fft357, _mm512_shuffle_f32x4(fft440, fft440, 78));
__m512 fft360 = _mm512_fmadd_ps(fft353, fft357, _mm512_shuffle_f32x4(fft353, fft353, 78));
__m512 fft447 = _mm512_fmadd_ps(fft441, fft357, _mm512_shuffle_f32x4(fft441, fft441, 78));
__m512 fft361 = _mm512_fmadd_ps(fft354, fft357, _mm512_shuffle_f32x4(fft354, fft354, 78));
__m512 fft448 = _mm512_fmadd_ps(fft442, fft357, _mm512_shuffle_f32x4(fft442, fft442, 78));
__m512 fft362 = _mm512_fmadd_ps(fft346, fft357, _mm512_shuffle_f32x4(fft346, fft346, 78));
__m512 fft449 = _mm512_fmadd_ps(fft434, fft357, _mm512_shuffle_f32x4(fft434, fft434, 78));
__m512 fft363 = _mm512_fmadd_ps(fft348, fft357, _mm512_shuffle_f32x4(fft348, fft348, 78));
__m512 fft450 = _mm512_fmadd_ps(fft436, fft357, _mm512_shuffle_f32x4(fft436, fft436, 78));
__m512 fft364 = _mm512_fmadd_ps(fft355, fft357, _mm512_shuffle_f32x4(fft355, fft355, 78));
__m512 fft451 = _mm512_fmadd_ps(fft443, fft357, _mm512_shuffle_f32x4(fft443, fft443, 78));
__m512 fft365 = _mm512_fmadd_ps(fft356, fft357, _mm512_shuffle_f32x4(fft356, fft356, 78));
__m512 fft452 = _mm512_fmadd_ps(fft444, fft357, _mm512_shuffle_f32x4(fft444, fft444, 78));
__m512 fft366 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft367 = _mm512_mul_ps(fft358, fft366);
__m512 fft453 = _mm512_mul_ps(fft445, fft366);
__m512 fft368 = _mm512_mul_ps(fft359, fft366);
__m512 fft454 = _mm512_mul_ps(fft446, fft366);
__m512 fft369 = _mm512_mul_ps(fft360, fft366);
__m512 fft455 = _mm512_mul_ps(fft447, fft366);
__m512 fft370 = _mm512_mul_ps(fft361, fft366);
__m512 fft456 = _mm512_mul_ps(fft448, fft366);
__m512 fft371 = _mm512_mul_ps(fft362, fft366);
__m512 fft457 = _mm512_mul_ps(fft449, fft366);
__m512 fft372 = _mm512_mul_ps(fft363, fft366);
__m512 fft458 = _mm512_mul_ps(fft450, fft366);
__m512 fft373 = _mm512_mul_ps(fft364, fft366);
__m512 fft459 = _mm512_mul_ps(fft451, fft366);
__m512 fft374 = _mm512_mul_ps(fft365, fft366);
__m512 fft460 = _mm512_mul_ps(fft452, fft366);
__m512 fft375 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft376 = _mm512_fmadd_ps(fft359, fft375, fft367);
__m512 fft461 = _mm512_fmadd_ps(fft446, fft375, fft453);
__m512 fft377 = _mm512_fnmadd_ps(fft358, fft375, fft368);
__m512 fft462 = _mm512_fnmadd_ps(fft445, fft375, fft454);
__m512 fft378 = _mm512_fmadd_ps(fft361, fft375, fft369);
__m512 fft463 = _mm512_fmadd_ps(fft448, fft375, fft455);
__m512 fft379 = _mm512_fnmadd_ps(fft360, fft375, fft370);
__m512 fft464 = _mm512_fnmadd_ps(fft447, fft375, fft456);
__m512 fft380 = _mm512_fmadd_ps(fft363, fft375, fft371);
__m512 fft465 = _mm512_fmadd_ps(fft450, fft375, fft457);
__m512 fft381 = _mm512_fnmadd_ps(fft362, fft375, fft372);
__m512 fft466 = _mm512_fnmadd_ps(fft449, fft375, fft458);
__m512 fft382 = _mm512_fmadd_ps(fft365, fft375, fft373);
__m512 fft467 = _mm512_fmadd_ps(fft452, fft375, fft459);
__m512 fft383 = _mm512_fnmadd_ps(fft364, fft375, fft374);
__m512 fft468 = _mm512_fnmadd_ps(fft451, fft375, fft460);
__m512 fft384 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft385 = _mm512_fmadd_ps(fft376, fft384, _mm512_shuffle_f32x4(fft376, fft376, 177));
__m512 fft469 = _mm512_fmadd_ps(fft461, fft384, _mm512_shuffle_f32x4(fft461, fft461, 177));
__m512 fft386 = _mm512_fmadd_ps(fft377, fft384, _mm512_shuffle_f32x4(fft377, fft377, 177));
__m512 fft470 = _mm512_fmadd_ps(fft462, fft384, _mm512_shuffle_f32x4(fft462, fft462, 177));
__m512 fft387 = _mm512_fmadd_ps(fft378, fft384, _mm512_shuffle_f32x4(fft378, fft378, 177));
__m512 fft471 = _mm512_fmadd_ps(fft463, fft384, _mm512_shuffle_f32x4(fft463, fft463, 177));
__m512 fft388 = _mm512_fmadd_ps(fft379, fft384, _mm512_shuffle_f32x4(fft379, fft379, 177));
__m512 fft472 = _mm512_fmadd_ps(fft464, fft384, _mm512_shuffle_f32x4(fft464, fft464, 177));
__m512 fft389 = _mm512_fmadd_ps(fft380, fft384, _mm512_shuffle_f32x4(fft380, fft380, 177));
__m512 fft473 = _mm512_fmadd_ps(fft465, fft384, _mm512_shuffle_f32x4(fft465, fft465, 177));
__m512 fft390 = _mm512_fmadd_ps(fft381, fft384, _mm512_shuffle_f32x4(fft381, fft381, 177));
__m512 fft474 = _mm512_fmadd_ps(fft466, fft384, _mm512_shuffle_f32x4(fft466, fft466, 177));
__m512 fft391 = _mm512_fmadd_ps(fft382, fft384, _mm512_shuffle_f32x4(fft382, fft382, 177));
__m512 fft475 = _mm512_fmadd_ps(fft467, fft384, _mm512_shuffle_f32x4(fft467, fft467, 177));
__m512 fft392 = _mm512_fmadd_ps(fft383, fft384, _mm512_shuffle_f32x4(fft383, fft383, 177));
__m512 fft476 = _mm512_fmadd_ps(fft468, fft384, _mm512_shuffle_f32x4(fft468, fft468, 177));
__m512 fft393 = _mm512_mask_mov_ps(fft385, 49344, fft386);
__m512 fft477 = _mm512_mask_mov_ps(fft469, 49344, fft470);
__m512 fft394 = _mm512_mask_sub_ps(fft386, 49344, _mm512_setzero_ps(), fft385);
__m512 fft478 = _mm512_mask_sub_ps(fft470, 49344, _mm512_setzero_ps(), fft469);
__m512 fft395 = _mm512_mask_mov_ps(fft387, 49344, fft388);
__m512 fft479 = _mm512_mask_mov_ps(fft471, 49344, fft472);
__m512 fft396 = _mm512_mask_sub_ps(fft388, 49344, _mm512_setzero_ps(), fft387);
__m512 fft480 = _mm512_mask_sub_ps(fft472, 49344, _mm512_setzero_ps(), fft471);
__m512 fft397 = _mm512_mask_mov_ps(fft389, 49344, fft390);
__m512 fft481 = _mm512_mask_mov_ps(fft473, 49344, fft474);
__m512 fft398 = _mm512_mask_sub_ps(fft390, 49344, _mm512_setzero_ps(), fft389);
__m512 fft482 = _mm512_mask_sub_ps(fft474, 49344, _mm512_setzero_ps(), fft473);
__m512 fft399 = _mm512_mask_mov_ps(fft391, 49344, fft392);
__m512 fft483 = _mm512_mask_mov_ps(fft475, 49344, fft476);
__m512 fft400 = _mm512_mask_sub_ps(fft392, 49344, _mm512_setzero_ps(), fft391);
__m512 fft484 = _mm512_mask_sub_ps(fft476, 49344, _mm512_setzero_ps(), fft475);
__m512 fft401 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft402 = _mm512_fmadd_ps(fft393, fft401, _mm512_shuffle_ps(fft393, fft393, 78));
__m512 fft485 = _mm512_fmadd_ps(fft477, fft401, _mm512_shuffle_ps(fft477, fft477, 78));
__m512 fft403 = _mm512_fmadd_ps(fft394, fft401, _mm512_shuffle_ps(fft394, fft394, 78));
__m512 fft486 = _mm512_fmadd_ps(fft478, fft401, _mm512_shuffle_ps(fft478, fft478, 78));
__m512 fft404 = _mm512_fmadd_ps(fft395, fft401, _mm512_shuffle_ps(fft395, fft395, 78));
__m512 fft487 = _mm512_fmadd_ps(fft479, fft401, _mm512_shuffle_ps(fft479, fft479, 78));
__m512 fft405 = _mm512_fmadd_ps(fft396, fft401, _mm512_shuffle_ps(fft396, fft396, 78));
__m512 fft488 = _mm512_fmadd_ps(fft480, fft401, _mm512_shuffle_ps(fft480, fft480, 78));
__m512 fft406 = _mm512_fmadd_ps(fft397, fft401, _mm512_shuffle_ps(fft397, fft397, 78));
__m512 fft489 = _mm512_fmadd_ps(fft481, fft401, _mm512_shuffle_ps(fft481, fft481, 78));
__m512 fft407 = _mm512_fmadd_ps(fft398, fft401, _mm512_shuffle_ps(fft398, fft398, 78));
__m512 fft490 = _mm512_fmadd_ps(fft482, fft401, _mm512_shuffle_ps(fft482, fft482, 78));
__m512 fft408 = _mm512_fmadd_ps(fft399, fft401, _mm512_shuffle_ps(fft399, fft399, 78));
__m512 fft491 = _mm512_fmadd_ps(fft483, fft401, _mm512_shuffle_ps(fft483, fft483, 78));
__m512 fft409 = _mm512_fmadd_ps(fft400, fft401, _mm512_shuffle_ps(fft400, fft400, 78));
__m512 fft492 = _mm512_fmadd_ps(fft484, fft401, _mm512_shuffle_ps(fft484, fft484, 78));
__m512i fft410 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft411 = _mm512_permutexvar_ps(fft410, fft402);
__m512 fft493 = _mm512_permutexvar_ps(fft410, fft485);
__m512i fft412 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft413 = _mm512_permutexvar_ps(fft412, fft402);
__m512 fft494 = _mm512_permutexvar_ps(fft412, fft485);
__m512 fft414 = _mm512_permutexvar_ps(fft410, fft403);
__m512 fft495 = _mm512_permutexvar_ps(fft410, fft486);
__m512 fft415 = _mm512_permutexvar_ps(fft412, fft403);
__m512 fft496 = _mm512_permutexvar_ps(fft412, fft486);
__m512 fft416 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft417 = _mm512_fmadd_ps(fft411, fft416, fft413);
__m512 fft497 = _mm512_fmadd_ps(fft493, fft416, fft494);
__m512 fft418 = _mm512_fnmadd_ps(fft415, fft416, fft414);
__m512 fft498 = _mm512_fnmadd_ps(fft496, fft416, fft495);
__m512 fft419 = _mm512_mask_mov_ps(fft415, 21845, fft417);
__m512 fft499 = _mm512_mask_mov_ps(fft496, 21845, fft497);
__m512 fft420 = _mm512_mask_mov_ps(fft411, 43176, fft417);
__m512 fft500 = _mm512_mask_mov_ps(fft493, 43176, fft497);
__m512 fft421 = _mm512_mask_mov_ps(fft419, 43176, fft418);
__m512 fft501 = _mm512_mask_mov_ps(fft499, 43176, fft498);
__m512 fft422 = _mm512_mask_mov_ps(fft420, 22102, fft418);
__m512 fft502 = _mm512_mask_mov_ps(fft500, 22102, fft498);
__m512 fft423 = _mm512_mask_mul_ps(fft421, 64764, fft421, _mm512_set1_ps(5e-01f));
__m512 fft503 = _mm512_mask_mul_ps(fft501, 64764, fft501, _mm512_set1_ps(5e-01f));
__m512 fft424 = _mm512_mask_mul_ps(fft422, 64764, fft422, _mm512_set1_ps(5e-01f));
__m512 fft504 = _mm512_mask_mul_ps(fft502, 64764, fft502, _mm512_set1_ps(5e-01f));
__m512 df1 = fft423;
__m512 df9 = fft503;
__m512 df2 = fft424;
__m512 df10 = fft504;
__m512 df3 = fft404;
__m512 df11 = fft487;
__m512 df4 = fft405;
__m512 df12 = fft488;
__m512 df5 = fft406;
__m512 df13 = fft489;
__m512 df6 = fft407;
__m512 df14 = fft490;
__m512 df7 = fft408;
__m512 df15 = fft491;
__m512 df8 = fft409;
__m512 df16 = fft492;
__m512i eo3 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df3 = _mm512_permutexvar_ps(eo3, df3);
df4 = _mm512_permutexvar_ps(eo3, df4);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df3);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df4);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df3);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df4);
df11 = _mm512_permutexvar_ps(eo3, df11);
df12 = _mm512_permutexvar_ps(eo3, df12);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df11);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df12);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df11);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df12);
df5 = _mm512_permutexvar_ps(eo3, df5);
df6 = _mm512_permutexvar_ps(eo3, df6);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df5);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df6);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df5);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df6);
df13 = _mm512_permutexvar_ps(eo3, df13);
df14 = _mm512_permutexvar_ps(eo3, df14);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df13);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df14);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df13);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df14);
df7 = _mm512_permutexvar_ps(eo3, df7);
df8 = _mm512_permutexvar_ps(eo3, df8);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df7);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df8);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df7);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df8);
df15 = _mm512_permutexvar_ps(eo3, df15);
df16 = _mm512_permutexvar_ps(eo3, df16);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df15);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df16);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df15);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df16);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df1);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df2);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df1);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df2);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df9);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k2+128*m3+32*f4, 255, df10);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df9);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k2+128*m3+32*f4, 65280, df10);
for (ptrdiff_t b4 = 1; b4 < 6; ++b4) {
ptrdiff_t m4 = (size_t)b4/2;
ptrdiff_t f5 = (size_t)b4%2;
__m512 dat14 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat14 = _mm512_mask_fmadd_ps(dat14, 65535, bnMul1, bnAdd1);
__m512 dat15 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat15 = _mm512_mask_fmadd_ps(dat15, 65535, bnMul1, bnAdd1);
__m512 dat16 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat16 = _mm512_mask_fmadd_ps(dat16, 65535, bnMul1, bnAdd1);
__m512 dat17 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat17 = _mm512_mask_fmadd_ps(dat17, 65535, bnMul1, bnAdd1);
__m512 dat18 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat18 = _mm512_mask_fmadd_ps(dat18, 65535, bnMul1, bnAdd1);
__m512 dat19 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat19 = _mm512_mask_fmadd_ps(dat19, 65535, bnMul1, bnAdd1);
__m512 dat20 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat20 = _mm512_mask_fmadd_ps(dat20, 65535, bnMul1, bnAdd1);
__m512 dat21 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat21 = _mm512_mask_fmadd_ps(dat21, 65535, bnMul1, bnAdd1);
__m512 dat22 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat22 = _mm512_mask_fmadd_ps(dat22, 65535, bnMul1, bnAdd1);
__m512 dat23 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat23 = _mm512_mask_fmadd_ps(dat23, 65535, bnMul1, bnAdd1);
__m512 dat24 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat24 = _mm512_mask_fmadd_ps(dat24, 65535, bnMul1, bnAdd1);
__m512 dat25 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat25 = _mm512_mask_fmadd_ps(dat25, 65535, bnMul1, bnAdd1);
__m512 dat26 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k2+896*h1+4*w1+40*b4);
dat26 = _mm512_mask_fmadd_ps(dat26, 65535, bnMul1, bnAdd1);
__m512 fft505 = _mm512_add_ps(_mm512_setzero_ps(), dat19);
__m512 fft593 = _mm512_add_ps(_mm512_setzero_ps(), dat20);
__m512 fft506 = _mm512_sub_ps(_mm512_setzero_ps(), dat19);
__m512 fft594 = _mm512_sub_ps(_mm512_setzero_ps(), dat20);
__m512 fft507 = _mm512_add_ps(_mm512_setzero_ps(), dat21);
__m512 fft595 = _mm512_add_ps(dat14, dat22);
__m512 fft508 = _mm512_sub_ps(_mm512_setzero_ps(), dat21);
__m512 fft596 = _mm512_sub_ps(dat14, dat22);
__m512 fft509 = _mm512_add_ps(dat15, dat23);
__m512 fft597 = _mm512_add_ps(dat16, dat24);
__m512 fft510 = _mm512_sub_ps(dat15, dat23);
__m512 fft598 = _mm512_sub_ps(dat16, dat24);
__m512 fft511 = _mm512_add_ps(dat17, dat25);
__m512 fft599 = _mm512_add_ps(dat18, dat26);
__m512 fft512 = _mm512_sub_ps(dat17, dat25);
__m512 fft600 = _mm512_sub_ps(dat18, dat26);
__m512 fft513 = _mm512_add_ps(fft505, fft509);
__m512 fft601 = _mm512_add_ps(fft593, fft597);
__m512 fft514 = _mm512_sub_ps(fft505, fft509);
__m512 fft602 = _mm512_sub_ps(fft593, fft597);
__m512 fft515 = _mm512_add_ps(fft507, fft511);
__m512 fft603 = _mm512_add_ps(fft595, fft599);
__m512 fft516 = _mm512_sub_ps(fft511, fft507);
__m512 fft604 = _mm512_sub_ps(fft599, fft595);
__m512 fft517 = _mm512_sub_ps(fft508, fft512);
__m512 fft605 = _mm512_sub_ps(fft596, fft600);
__m512 fft518 = _mm512_add_ps(fft508, fft512);
__m512 fft606 = _mm512_add_ps(fft596, fft600);
__m512 fft519 = _mm512_add_ps(fft513, fft515);
__m512 fft607 = _mm512_add_ps(fft601, fft603);
__m512 fft520 = _mm512_sub_ps(fft513, fft515);
__m512 fft608 = _mm512_sub_ps(fft601, fft603);
__m512 fft521 = _mm512_fmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft609 = _mm512_fmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft522 = _mm512_fnmsub_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft610 = _mm512_fnmsub_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft523 = _mm512_fnmadd_ps(fft517, _mm512_set1_ps(7.0710677e-01f), fft506);
__m512 fft611 = _mm512_fnmadd_ps(fft605, _mm512_set1_ps(7.0710677e-01f), fft594);
__m512 fft524 = _mm512_fnmadd_ps(fft518, _mm512_set1_ps(7.0710677e-01f), fft510);
__m512 fft612 = _mm512_fnmadd_ps(fft606, _mm512_set1_ps(7.0710677e-01f), fft598);
__m512 fft525 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft526 = _mm512_fmadd_ps(fft519, fft525, _mm512_shuffle_f32x4(fft519, fft519, 78));
__m512 fft613 = _mm512_fmadd_ps(fft607, fft525, _mm512_shuffle_f32x4(fft607, fft607, 78));
__m512 fft527 = _mm512_fmadd_ps(fft520, fft525, _mm512_shuffle_f32x4(fft520, fft520, 78));
__m512 fft614 = _mm512_fmadd_ps(fft608, fft525, _mm512_shuffle_f32x4(fft608, fft608, 78));
__m512 fft528 = _mm512_fmadd_ps(fft521, fft525, _mm512_shuffle_f32x4(fft521, fft521, 78));
__m512 fft615 = _mm512_fmadd_ps(fft609, fft525, _mm512_shuffle_f32x4(fft609, fft609, 78));
__m512 fft529 = _mm512_fmadd_ps(fft522, fft525, _mm512_shuffle_f32x4(fft522, fft522, 78));
__m512 fft616 = _mm512_fmadd_ps(fft610, fft525, _mm512_shuffle_f32x4(fft610, fft610, 78));
__m512 fft530 = _mm512_fmadd_ps(fft514, fft525, _mm512_shuffle_f32x4(fft514, fft514, 78));
__m512 fft617 = _mm512_fmadd_ps(fft602, fft525, _mm512_shuffle_f32x4(fft602, fft602, 78));
__m512 fft531 = _mm512_fmadd_ps(fft516, fft525, _mm512_shuffle_f32x4(fft516, fft516, 78));
__m512 fft618 = _mm512_fmadd_ps(fft604, fft525, _mm512_shuffle_f32x4(fft604, fft604, 78));
__m512 fft532 = _mm512_fmadd_ps(fft523, fft525, _mm512_shuffle_f32x4(fft523, fft523, 78));
__m512 fft619 = _mm512_fmadd_ps(fft611, fft525, _mm512_shuffle_f32x4(fft611, fft611, 78));
__m512 fft533 = _mm512_fmadd_ps(fft524, fft525, _mm512_shuffle_f32x4(fft524, fft524, 78));
__m512 fft620 = _mm512_fmadd_ps(fft612, fft525, _mm512_shuffle_f32x4(fft612, fft612, 78));
__m512 fft534 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft535 = _mm512_mul_ps(fft526, fft534);
__m512 fft621 = _mm512_mul_ps(fft613, fft534);
__m512 fft536 = _mm512_mul_ps(fft527, fft534);
__m512 fft622 = _mm512_mul_ps(fft614, fft534);
__m512 fft537 = _mm512_mul_ps(fft528, fft534);
__m512 fft623 = _mm512_mul_ps(fft615, fft534);
__m512 fft538 = _mm512_mul_ps(fft529, fft534);
__m512 fft624 = _mm512_mul_ps(fft616, fft534);
__m512 fft539 = _mm512_mul_ps(fft530, fft534);
__m512 fft625 = _mm512_mul_ps(fft617, fft534);
__m512 fft540 = _mm512_mul_ps(fft531, fft534);
__m512 fft626 = _mm512_mul_ps(fft618, fft534);
__m512 fft541 = _mm512_mul_ps(fft532, fft534);
__m512 fft627 = _mm512_mul_ps(fft619, fft534);
__m512 fft542 = _mm512_mul_ps(fft533, fft534);
__m512 fft628 = _mm512_mul_ps(fft620, fft534);
__m512 fft543 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft544 = _mm512_fmadd_ps(fft527, fft543, fft535);
__m512 fft629 = _mm512_fmadd_ps(fft614, fft543, fft621);
__m512 fft545 = _mm512_fnmadd_ps(fft526, fft543, fft536);
__m512 fft630 = _mm512_fnmadd_ps(fft613, fft543, fft622);
__m512 fft546 = _mm512_fmadd_ps(fft529, fft543, fft537);
__m512 fft631 = _mm512_fmadd_ps(fft616, fft543, fft623);
__m512 fft547 = _mm512_fnmadd_ps(fft528, fft543, fft538);
__m512 fft632 = _mm512_fnmadd_ps(fft615, fft543, fft624);
__m512 fft548 = _mm512_fmadd_ps(fft531, fft543, fft539);
__m512 fft633 = _mm512_fmadd_ps(fft618, fft543, fft625);
__m512 fft549 = _mm512_fnmadd_ps(fft530, fft543, fft540);
__m512 fft634 = _mm512_fnmadd_ps(fft617, fft543, fft626);
__m512 fft550 = _mm512_fmadd_ps(fft533, fft543, fft541);
__m512 fft635 = _mm512_fmadd_ps(fft620, fft543, fft627);
__m512 fft551 = _mm512_fnmadd_ps(fft532, fft543, fft542);
__m512 fft636 = _mm512_fnmadd_ps(fft619, fft543, fft628);
__m512 fft552 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft553 = _mm512_fmadd_ps(fft544, fft552, _mm512_shuffle_f32x4(fft544, fft544, 177));
__m512 fft637 = _mm512_fmadd_ps(fft629, fft552, _mm512_shuffle_f32x4(fft629, fft629, 177));
__m512 fft554 = _mm512_fmadd_ps(fft545, fft552, _mm512_shuffle_f32x4(fft545, fft545, 177));
__m512 fft638 = _mm512_fmadd_ps(fft630, fft552, _mm512_shuffle_f32x4(fft630, fft630, 177));
__m512 fft555 = _mm512_fmadd_ps(fft546, fft552, _mm512_shuffle_f32x4(fft546, fft546, 177));
__m512 fft639 = _mm512_fmadd_ps(fft631, fft552, _mm512_shuffle_f32x4(fft631, fft631, 177));
__m512 fft556 = _mm512_fmadd_ps(fft547, fft552, _mm512_shuffle_f32x4(fft547, fft547, 177));
__m512 fft640 = _mm512_fmadd_ps(fft632, fft552, _mm512_shuffle_f32x4(fft632, fft632, 177));
__m512 fft557 = _mm512_fmadd_ps(fft548, fft552, _mm512_shuffle_f32x4(fft548, fft548, 177));
__m512 fft641 = _mm512_fmadd_ps(fft633, fft552, _mm512_shuffle_f32x4(fft633, fft633, 177));
__m512 fft558 = _mm512_fmadd_ps(fft549, fft552, _mm512_shuffle_f32x4(fft549, fft549, 177));
__m512 fft642 = _mm512_fmadd_ps(fft634, fft552, _mm512_shuffle_f32x4(fft634, fft634, 177));
__m512 fft559 = _mm512_fmadd_ps(fft550, fft552, _mm512_shuffle_f32x4(fft550, fft550, 177));
__m512 fft643 = _mm512_fmadd_ps(fft635, fft552, _mm512_shuffle_f32x4(fft635, fft635, 177));
__m512 fft560 = _mm512_fmadd_ps(fft551, fft552, _mm512_shuffle_f32x4(fft551, fft551, 177));
__m512 fft644 = _mm512_fmadd_ps(fft636, fft552, _mm512_shuffle_f32x4(fft636, fft636, 177));
__m512 fft561 = _mm512_mask_mov_ps(fft553, 49344, fft554);
__m512 fft645 = _mm512_mask_mov_ps(fft637, 49344, fft638);
__m512 fft562 = _mm512_mask_sub_ps(fft554, 49344, _mm512_setzero_ps(), fft553);
__m512 fft646 = _mm512_mask_sub_ps(fft638, 49344, _mm512_setzero_ps(), fft637);
__m512 fft563 = _mm512_mask_mov_ps(fft555, 49344, fft556);
__m512 fft647 = _mm512_mask_mov_ps(fft639, 49344, fft640);
__m512 fft564 = _mm512_mask_sub_ps(fft556, 49344, _mm512_setzero_ps(), fft555);
__m512 fft648 = _mm512_mask_sub_ps(fft640, 49344, _mm512_setzero_ps(), fft639);
__m512 fft565 = _mm512_mask_mov_ps(fft557, 49344, fft558);
__m512 fft649 = _mm512_mask_mov_ps(fft641, 49344, fft642);
__m512 fft566 = _mm512_mask_sub_ps(fft558, 49344, _mm512_setzero_ps(), fft557);
__m512 fft650 = _mm512_mask_sub_ps(fft642, 49344, _mm512_setzero_ps(), fft641);
__m512 fft567 = _mm512_mask_mov_ps(fft559, 49344, fft560);
__m512 fft651 = _mm512_mask_mov_ps(fft643, 49344, fft644);
__m512 fft568 = _mm512_mask_sub_ps(fft560, 49344, _mm512_setzero_ps(), fft559);
__m512 fft652 = _mm512_mask_sub_ps(fft644, 49344, _mm512_setzero_ps(), fft643);
__m512 fft569 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft570 = _mm512_fmadd_ps(fft561, fft569, _mm512_shuffle_ps(fft561, fft561, 78));
__m512 fft653 = _mm512_fmadd_ps(fft645, fft569, _mm512_shuffle_ps(fft645, fft645, 78));
__m512 fft571 = _mm512_fmadd_ps(fft562, fft569, _mm512_shuffle_ps(fft562, fft562, 78));
__m512 fft654 = _mm512_fmadd_ps(fft646, fft569, _mm512_shuffle_ps(fft646, fft646, 78));
__m512 fft572 = _mm512_fmadd_ps(fft563, fft569, _mm512_shuffle_ps(fft563, fft563, 78));
__m512 fft655 = _mm512_fmadd_ps(fft647, fft569, _mm512_shuffle_ps(fft647, fft647, 78));
__m512 fft573 = _mm512_fmadd_ps(fft564, fft569, _mm512_shuffle_ps(fft564, fft564, 78));
__m512 fft656 = _mm512_fmadd_ps(fft648, fft569, _mm512_shuffle_ps(fft648, fft648, 78));
__m512 fft574 = _mm512_fmadd_ps(fft565, fft569, _mm512_shuffle_ps(fft565, fft565, 78));
__m512 fft657 = _mm512_fmadd_ps(fft649, fft569, _mm512_shuffle_ps(fft649, fft649, 78));
__m512 fft575 = _mm512_fmadd_ps(fft566, fft569, _mm512_shuffle_ps(fft566, fft566, 78));
__m512 fft658 = _mm512_fmadd_ps(fft650, fft569, _mm512_shuffle_ps(fft650, fft650, 78));
__m512 fft576 = _mm512_fmadd_ps(fft567, fft569, _mm512_shuffle_ps(fft567, fft567, 78));
__m512 fft659 = _mm512_fmadd_ps(fft651, fft569, _mm512_shuffle_ps(fft651, fft651, 78));
__m512 fft577 = _mm512_fmadd_ps(fft568, fft569, _mm512_shuffle_ps(fft568, fft568, 78));
__m512 fft660 = _mm512_fmadd_ps(fft652, fft569, _mm512_shuffle_ps(fft652, fft652, 78));
__m512i fft578 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft579 = _mm512_permutexvar_ps(fft578, fft570);
__m512 fft661 = _mm512_permutexvar_ps(fft578, fft653);
__m512i fft580 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft581 = _mm512_permutexvar_ps(fft580, fft570);
__m512 fft662 = _mm512_permutexvar_ps(fft580, fft653);
__m512 fft582 = _mm512_permutexvar_ps(fft578, fft571);
__m512 fft663 = _mm512_permutexvar_ps(fft578, fft654);
__m512 fft583 = _mm512_permutexvar_ps(fft580, fft571);
__m512 fft664 = _mm512_permutexvar_ps(fft580, fft654);
__m512 fft584 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft585 = _mm512_fmadd_ps(fft579, fft584, fft581);
__m512 fft665 = _mm512_fmadd_ps(fft661, fft584, fft662);
__m512 fft586 = _mm512_fnmadd_ps(fft583, fft584, fft582);
__m512 fft666 = _mm512_fnmadd_ps(fft664, fft584, fft663);
__m512 fft587 = _mm512_mask_mov_ps(fft583, 21845, fft585);
__m512 fft667 = _mm512_mask_mov_ps(fft664, 21845, fft665);
__m512 fft588 = _mm512_mask_mov_ps(fft579, 43176, fft585);
__m512 fft668 = _mm512_mask_mov_ps(fft661, 43176, fft665);
__m512 fft589 = _mm512_mask_mov_ps(fft587, 43176, fft586);
__m512 fft669 = _mm512_mask_mov_ps(fft667, 43176, fft666);
__m512 fft590 = _mm512_mask_mov_ps(fft588, 22102, fft586);
__m512 fft670 = _mm512_mask_mov_ps(fft668, 22102, fft666);
__m512 fft591 = _mm512_mask_mul_ps(fft589, 64764, fft589, _mm512_set1_ps(5e-01f));
__m512 fft671 = _mm512_mask_mul_ps(fft669, 64764, fft669, _mm512_set1_ps(5e-01f));
__m512 fft592 = _mm512_mask_mul_ps(fft590, 64764, fft590, _mm512_set1_ps(5e-01f));
__m512 fft672 = _mm512_mask_mul_ps(fft670, 64764, fft670, _mm512_set1_ps(5e-01f));
__m512 df17 = fft591;
__m512 df25 = fft671;
__m512 df18 = fft592;
__m512 df26 = fft672;
__m512 df19 = fft572;
__m512 df27 = fft655;
__m512 df20 = fft573;
__m512 df28 = fft656;
__m512 df21 = fft574;
__m512 df29 = fft657;
__m512 df22 = fft575;
__m512 df30 = fft658;
__m512 df23 = fft576;
__m512 df31 = fft659;
__m512 df24 = fft577;
__m512 df32 = fft660;
__m512i eo4 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df19 = _mm512_permutexvar_ps(eo4, df19);
df20 = _mm512_permutexvar_ps(eo4, df20);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df19);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df20);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df19);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df20);
df27 = _mm512_permutexvar_ps(eo4, df27);
df28 = _mm512_permutexvar_ps(eo4, df28);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df27);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df28);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df27);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df28);
df21 = _mm512_permutexvar_ps(eo4, df21);
df22 = _mm512_permutexvar_ps(eo4, df22);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df21);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df22);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df21);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df22);
df29 = _mm512_permutexvar_ps(eo4, df29);
df30 = _mm512_permutexvar_ps(eo4, df30);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df29);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df30);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df29);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df30);
df23 = _mm512_permutexvar_ps(eo4, df23);
df24 = _mm512_permutexvar_ps(eo4, df24);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df23);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df24);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df23);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df24);
df31 = _mm512_permutexvar_ps(eo4, df31);
df32 = _mm512_permutexvar_ps(eo4, df32);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df31);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df32);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df31);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df32);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df17);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df18);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df17);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df18);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df25);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k2+128*m4+32*f5, 255, df26);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df25);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k2+128*m4+32*f5, 65280, df26);
}
}
if (j2 >= last1) return;
++j2;
rel1 = 1;
}
// Handles the tiles at positions rel1..2 of the current row group: each pass of the
// outer loop processes one tile (index j2), then either returns (last tile reached)
// or advances j2. On normal exit from the loop rel1 is set to 3.
// NOTE(review): base1, s1, i6, j2, last1, rel1, datPtr1, bnPtr2 and dfPtr1 are declared
// outside this excerpt; their types and meanings are not visible here.
if (rel1 < 3) {
ptrdiff_t h2 = base1+0;
// Starting horizontal offset: 60 units per tile position already consumed.
ptrdiff_t w2 = 0+60*rel1;
// Last tile index handled by this loop, so at most (3 - rel1) iterations run.
ptrdiff_t jj1 = 2-rel1+j2;
// j2 is incremented in the loop body (after the early-return check), not in the
// for-header; the header only advances w2 by 60 per tile.
for (; j2 <= jj1; w2 += 60) {
// Three consecutive k3 values (3*s1 .. 3*s1+2) are processed per tile.
ptrdiff_t k3 = 3*s1;
ptrdiff_t kk2 = k3+2;
for (; k3 <= kk2; ++k3) {
// bnPtr2 is read as pairs of floats indexed by (k3+3*i6): element [0] is broadcast
// as the multiplier and element [1] as the addend used in the fmadd below.
__m512 bnMul2 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k3+3*i6))[0]);
__m512 bnAdd2 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k3+3*i6))[1]);
// Six sub-blocks per (tile, k3); b5 is split into m5 = b5/2 and f6 = b5%2, which
// only affect the store addresses at the bottom of the loop body.
for (ptrdiff_t b5 = 0; b5 < 6; ++b5) {
ptrdiff_t m5 = (size_t)b5/2;
ptrdiff_t f6 = (size_t)b5%2;
// Load 13 full 16-float vectors (mask 65535 = all lanes) whose constant offsets step
// by 896, and apply dat = dat*bnMul2 + bnAdd2 to every lane of each.
// NOTE(review): the offsets look like byte offsets (896 = 224*4, 4*w2, 40*b5 = 10
// floats), which would mean datPtr1 is a byte pointer and consecutive b5 loads
// overlap by 6 floats -- confirm against the declaration of datPtr1.
__m512 dat27 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat27 = _mm512_mask_fmadd_ps(dat27, 65535, bnMul2, bnAdd2);
__m512 dat28 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat28 = _mm512_mask_fmadd_ps(dat28, 65535, bnMul2, bnAdd2);
__m512 dat29 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat29 = _mm512_mask_fmadd_ps(dat29, 65535, bnMul2, bnAdd2);
__m512 dat30 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat30 = _mm512_mask_fmadd_ps(dat30, 65535, bnMul2, bnAdd2);
__m512 dat31 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat31 = _mm512_mask_fmadd_ps(dat31, 65535, bnMul2, bnAdd2);
__m512 dat32 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat32 = _mm512_mask_fmadd_ps(dat32, 65535, bnMul2, bnAdd2);
__m512 dat33 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat33 = _mm512_mask_fmadd_ps(dat33, 65535, bnMul2, bnAdd2);
__m512 dat34 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat34 = _mm512_mask_fmadd_ps(dat34, 65535, bnMul2, bnAdd2);
__m512 dat35 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat35 = _mm512_mask_fmadd_ps(dat35, 65535, bnMul2, bnAdd2);
__m512 dat36 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat36 = _mm512_mask_fmadd_ps(dat36, 65535, bnMul2, bnAdd2);
__m512 dat37 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat37 = _mm512_mask_fmadd_ps(dat37, 65535, bnMul2, bnAdd2);
__m512 dat38 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat38 = _mm512_mask_fmadd_ps(dat38, 65535, bnMul2, bnAdd2);
__m512 dat39 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k3+896*h2+4*w2+40*b5);
dat39 = _mm512_mask_fmadd_ps(dat39, 65535, bnMul2, bnAdd2);
// First add/sub level across the loaded vectors. The first operand of the first
// three pairs is a zero vector instead of loaded data (only 13 vectors are loaded
// for 16 butterfly inputs).
// NOTE(review): the zero inputs presumably stand for padding rows, and the fft*
// names suggest a 16-point transform -- neither is verifiable from this excerpt.
__m512 fft673 = _mm512_add_ps(_mm512_setzero_ps(), dat32);
__m512 fft761 = _mm512_add_ps(_mm512_setzero_ps(), dat33);
__m512 fft674 = _mm512_sub_ps(_mm512_setzero_ps(), dat32);
__m512 fft762 = _mm512_sub_ps(_mm512_setzero_ps(), dat33);
__m512 fft675 = _mm512_add_ps(_mm512_setzero_ps(), dat34);
__m512 fft763 = _mm512_add_ps(dat27, dat35);
__m512 fft676 = _mm512_sub_ps(_mm512_setzero_ps(), dat34);
__m512 fft764 = _mm512_sub_ps(dat27, dat35);
__m512 fft677 = _mm512_add_ps(dat28, dat36);
__m512 fft765 = _mm512_add_ps(dat29, dat37);
__m512 fft678 = _mm512_sub_ps(dat28, dat36);
__m512 fft766 = _mm512_sub_ps(dat29, dat37);
__m512 fft679 = _mm512_add_ps(dat30, dat38);
__m512 fft767 = _mm512_add_ps(dat31, dat39);
__m512 fft680 = _mm512_sub_ps(dat30, dat38);
__m512 fft768 = _mm512_sub_ps(dat31, dat39);
// Second and third add/sub levels combining the results above.
__m512 fft681 = _mm512_add_ps(fft673, fft677);
__m512 fft769 = _mm512_add_ps(fft761, fft765);
__m512 fft682 = _mm512_sub_ps(fft673, fft677);
__m512 fft770 = _mm512_sub_ps(fft761, fft765);
__m512 fft683 = _mm512_add_ps(fft675, fft679);
__m512 fft771 = _mm512_add_ps(fft763, fft767);
__m512 fft684 = _mm512_sub_ps(fft679, fft675);
__m512 fft772 = _mm512_sub_ps(fft767, fft763);
__m512 fft685 = _mm512_sub_ps(fft676, fft680);
__m512 fft773 = _mm512_sub_ps(fft764, fft768);
__m512 fft686 = _mm512_add_ps(fft676, fft680);
__m512 fft774 = _mm512_add_ps(fft764, fft768);
__m512 fft687 = _mm512_add_ps(fft681, fft683);
__m512 fft775 = _mm512_add_ps(fft769, fft771);
__m512 fft688 = _mm512_sub_ps(fft681, fft683);
__m512 fft776 = _mm512_sub_ps(fft769, fft771);
// Combinations scaled by 0.70710677 (approximately 1/sqrt(2)).
__m512 fft689 = _mm512_fmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft777 = _mm512_fmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft690 = _mm512_fnmsub_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft778 = _mm512_fnmsub_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
__m512 fft691 = _mm512_fnmadd_ps(fft685, _mm512_set1_ps(7.0710677e-01f), fft674);
__m512 fft779 = _mm512_fnmadd_ps(fft773, _mm512_set1_ps(7.0710677e-01f), fft762);
__m512 fft692 = _mm512_fnmadd_ps(fft686, _mm512_set1_ps(7.0710677e-01f), fft678);
__m512 fft780 = _mm512_fnmadd_ps(fft774, _mm512_set1_ps(7.0710677e-01f), fft766);
// In-register add/sub between the two 256-bit halves: shuffle_f32x4 with imm 78
// swaps the halves, and the sign vector is +1 in elements 0-7 and -1 in elements
// 8-15, so the low half becomes lo+hi and the high half becomes lo-hi.
__m512 fft693 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft694 = _mm512_fmadd_ps(fft687, fft693, _mm512_shuffle_f32x4(fft687, fft687, 78));
__m512 fft781 = _mm512_fmadd_ps(fft775, fft693, _mm512_shuffle_f32x4(fft775, fft775, 78));
__m512 fft695 = _mm512_fmadd_ps(fft688, fft693, _mm512_shuffle_f32x4(fft688, fft688, 78));
__m512 fft782 = _mm512_fmadd_ps(fft776, fft693, _mm512_shuffle_f32x4(fft776, fft776, 78));
__m512 fft696 = _mm512_fmadd_ps(fft689, fft693, _mm512_shuffle_f32x4(fft689, fft689, 78));
__m512 fft783 = _mm512_fmadd_ps(fft777, fft693, _mm512_shuffle_f32x4(fft777, fft777, 78));
__m512 fft697 = _mm512_fmadd_ps(fft690, fft693, _mm512_shuffle_f32x4(fft690, fft690, 78));
__m512 fft784 = _mm512_fmadd_ps(fft778, fft693, _mm512_shuffle_f32x4(fft778, fft778, 78));
__m512 fft698 = _mm512_fmadd_ps(fft682, fft693, _mm512_shuffle_f32x4(fft682, fft682, 78));
__m512 fft785 = _mm512_fmadd_ps(fft770, fft693, _mm512_shuffle_f32x4(fft770, fft770, 78));
__m512 fft699 = _mm512_fmadd_ps(fft684, fft693, _mm512_shuffle_f32x4(fft684, fft684, 78));
__m512 fft786 = _mm512_fmadd_ps(fft772, fft693, _mm512_shuffle_f32x4(fft772, fft772, 78));
__m512 fft700 = _mm512_fmadd_ps(fft691, fft693, _mm512_shuffle_f32x4(fft691, fft691, 78));
__m512 fft787 = _mm512_fmadd_ps(fft779, fft693, _mm512_shuffle_f32x4(fft779, fft779, 78));
__m512 fft701 = _mm512_fmadd_ps(fft692, fft693, _mm512_shuffle_f32x4(fft692, fft692, 78));
__m512 fft788 = _mm512_fmadd_ps(fft780, fft693, _mm512_shuffle_f32x4(fft780, fft780, 78));
// Per-element scaling of each vector by fft702, followed by a multiply-accumulate of
// its partner vector with fft711. Elements 0-7 are passed through unchanged (factor
// 1 and 0 respectively); only elements 8-15 mix the two partners.
// NOTE(review): this looks like a complex twiddle-factor multiply with real and
// imaginary parts held in partner vectors -- confirm.
__m512 fft702 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft703 = _mm512_mul_ps(fft694, fft702);
__m512 fft789 = _mm512_mul_ps(fft781, fft702);
__m512 fft704 = _mm512_mul_ps(fft695, fft702);
__m512 fft790 = _mm512_mul_ps(fft782, fft702);
__m512 fft705 = _mm512_mul_ps(fft696, fft702);
__m512 fft791 = _mm512_mul_ps(fft783, fft702);
__m512 fft706 = _mm512_mul_ps(fft697, fft702);
__m512 fft792 = _mm512_mul_ps(fft784, fft702);
__m512 fft707 = _mm512_mul_ps(fft698, fft702);
__m512 fft793 = _mm512_mul_ps(fft785, fft702);
__m512 fft708 = _mm512_mul_ps(fft699, fft702);
__m512 fft794 = _mm512_mul_ps(fft786, fft702);
__m512 fft709 = _mm512_mul_ps(fft700, fft702);
__m512 fft795 = _mm512_mul_ps(fft787, fft702);
__m512 fft710 = _mm512_mul_ps(fft701, fft702);
__m512 fft796 = _mm512_mul_ps(fft788, fft702);
__m512 fft711 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft712 = _mm512_fmadd_ps(fft695, fft711, fft703);
__m512 fft797 = _mm512_fmadd_ps(fft782, fft711, fft789);
__m512 fft713 = _mm512_fnmadd_ps(fft694, fft711, fft704);
__m512 fft798 = _mm512_fnmadd_ps(fft781, fft711, fft790);
__m512 fft714 = _mm512_fmadd_ps(fft697, fft711, fft705);
__m512 fft799 = _mm512_fmadd_ps(fft784, fft711, fft791);
__m512 fft715 = _mm512_fnmadd_ps(fft696, fft711, fft706);
__m512 fft800 = _mm512_fnmadd_ps(fft783, fft711, fft792);
__m512 fft716 = _mm512_fmadd_ps(fft699, fft711, fft707);
__m512 fft801 = _mm512_fmadd_ps(fft786, fft711, fft793);
__m512 fft717 = _mm512_fnmadd_ps(fft698, fft711, fft708);
__m512 fft802 = _mm512_fnmadd_ps(fft785, fft711, fft794);
__m512 fft718 = _mm512_fmadd_ps(fft701, fft711, fft709);
__m512 fft803 = _mm512_fmadd_ps(fft788, fft711, fft795);
__m512 fft719 = _mm512_fnmadd_ps(fft700, fft711, fft710);
__m512 fft804 = _mm512_fnmadd_ps(fft787, fft711, fft796);
// Add/sub between adjacent 128-bit lanes: shuffle_f32x4 with imm 177 swaps lanes
// 0<->1 and 2<->3; the sign vector is +1 on even lanes and -1 on odd lanes.
__m512 fft720 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft721 = _mm512_fmadd_ps(fft712, fft720, _mm512_shuffle_f32x4(fft712, fft712, 177));
__m512 fft805 = _mm512_fmadd_ps(fft797, fft720, _mm512_shuffle_f32x4(fft797, fft797, 177));
__m512 fft722 = _mm512_fmadd_ps(fft713, fft720, _mm512_shuffle_f32x4(fft713, fft713, 177));
__m512 fft806 = _mm512_fmadd_ps(fft798, fft720, _mm512_shuffle_f32x4(fft798, fft798, 177));
__m512 fft723 = _mm512_fmadd_ps(fft714, fft720, _mm512_shuffle_f32x4(fft714, fft714, 177));
__m512 fft807 = _mm512_fmadd_ps(fft799, fft720, _mm512_shuffle_f32x4(fft799, fft799, 177));
__m512 fft724 = _mm512_fmadd_ps(fft715, fft720, _mm512_shuffle_f32x4(fft715, fft715, 177));
__m512 fft808 = _mm512_fmadd_ps(fft800, fft720, _mm512_shuffle_f32x4(fft800, fft800, 177));
__m512 fft725 = _mm512_fmadd_ps(fft716, fft720, _mm512_shuffle_f32x4(fft716, fft716, 177));
__m512 fft809 = _mm512_fmadd_ps(fft801, fft720, _mm512_shuffle_f32x4(fft801, fft801, 177));
__m512 fft726 = _mm512_fmadd_ps(fft717, fft720, _mm512_shuffle_f32x4(fft717, fft717, 177));
__m512 fft810 = _mm512_fmadd_ps(fft802, fft720, _mm512_shuffle_f32x4(fft802, fft802, 177));
__m512 fft727 = _mm512_fmadd_ps(fft718, fft720, _mm512_shuffle_f32x4(fft718, fft718, 177));
__m512 fft811 = _mm512_fmadd_ps(fft803, fft720, _mm512_shuffle_f32x4(fft803, fft803, 177));
__m512 fft728 = _mm512_fmadd_ps(fft719, fft720, _mm512_shuffle_f32x4(fft719, fft719, 177));
__m512 fft812 = _mm512_fmadd_ps(fft804, fft720, _mm512_shuffle_f32x4(fft804, fft804, 177));
// Mask 49344 = 0xC0C0 selects elements 6, 7, 14 and 15. In those positions the first
// vector of each pair takes its partner's value, and the partner takes the negated
// value of the first; all other elements are left as they were.
__m512 fft729 = _mm512_mask_mov_ps(fft721, 49344, fft722);
__m512 fft813 = _mm512_mask_mov_ps(fft805, 49344, fft806);
__m512 fft730 = _mm512_mask_sub_ps(fft722, 49344, _mm512_setzero_ps(), fft721);
__m512 fft814 = _mm512_mask_sub_ps(fft806, 49344, _mm512_setzero_ps(), fft805);
__m512 fft731 = _mm512_mask_mov_ps(fft723, 49344, fft724);
__m512 fft815 = _mm512_mask_mov_ps(fft807, 49344, fft808);
__m512 fft732 = _mm512_mask_sub_ps(fft724, 49344, _mm512_setzero_ps(), fft723);
__m512 fft816 = _mm512_mask_sub_ps(fft808, 49344, _mm512_setzero_ps(), fft807);
__m512 fft733 = _mm512_mask_mov_ps(fft725, 49344, fft726);
__m512 fft817 = _mm512_mask_mov_ps(fft809, 49344, fft810);
__m512 fft734 = _mm512_mask_sub_ps(fft726, 49344, _mm512_setzero_ps(), fft725);
__m512 fft818 = _mm512_mask_sub_ps(fft810, 49344, _mm512_setzero_ps(), fft809);
__m512 fft735 = _mm512_mask_mov_ps(fft727, 49344, fft728);
__m512 fft819 = _mm512_mask_mov_ps(fft811, 49344, fft812);
__m512 fft736 = _mm512_mask_sub_ps(fft728, 49344, _mm512_setzero_ps(), fft727);
__m512 fft820 = _mm512_mask_sub_ps(fft812, 49344, _mm512_setzero_ps(), fft811);
// Add/sub within each 128-bit lane: shuffle_ps with imm 78 swaps the two 64-bit
// pairs of a lane; signs are +1 on elements 0-1 and -1 on elements 2-3 of each lane.
__m512 fft737 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft738 = _mm512_fmadd_ps(fft729, fft737, _mm512_shuffle_ps(fft729, fft729, 78));
__m512 fft821 = _mm512_fmadd_ps(fft813, fft737, _mm512_shuffle_ps(fft813, fft813, 78));
__m512 fft739 = _mm512_fmadd_ps(fft730, fft737, _mm512_shuffle_ps(fft730, fft730, 78));
__m512 fft822 = _mm512_fmadd_ps(fft814, fft737, _mm512_shuffle_ps(fft814, fft814, 78));
__m512 fft740 = _mm512_fmadd_ps(fft731, fft737, _mm512_shuffle_ps(fft731, fft731, 78));
__m512 fft823 = _mm512_fmadd_ps(fft815, fft737, _mm512_shuffle_ps(fft815, fft815, 78));
__m512 fft741 = _mm512_fmadd_ps(fft732, fft737, _mm512_shuffle_ps(fft732, fft732, 78));
__m512 fft824 = _mm512_fmadd_ps(fft816, fft737, _mm512_shuffle_ps(fft816, fft816, 78));
__m512 fft742 = _mm512_fmadd_ps(fft733, fft737, _mm512_shuffle_ps(fft733, fft733, 78));
__m512 fft825 = _mm512_fmadd_ps(fft817, fft737, _mm512_shuffle_ps(fft817, fft817, 78));
__m512 fft743 = _mm512_fmadd_ps(fft734, fft737, _mm512_shuffle_ps(fft734, fft734, 78));
__m512 fft826 = _mm512_fmadd_ps(fft818, fft737, _mm512_shuffle_ps(fft818, fft818, 78));
__m512 fft744 = _mm512_fmadd_ps(fft735, fft737, _mm512_shuffle_ps(fft735, fft735, 78));
__m512 fft827 = _mm512_fmadd_ps(fft819, fft737, _mm512_shuffle_ps(fft819, fft819, 78));
__m512 fft745 = _mm512_fmadd_ps(fft736, fft737, _mm512_shuffle_ps(fft736, fft736, 78));
__m512 fft828 = _mm512_fmadd_ps(fft820, fft737, _mm512_shuffle_ps(fft820, fft820, 78));
// Extra post-processing applied only to the first vector pair of each group
// (fft738/fft739 and fft821/fft822): two fixed element permutations, a signed
// combine, masked merges, and a final halving of selected elements.
// NOTE(review): presumably the real-input FFT untangling step -- confirm.
__m512i fft746 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft747 = _mm512_permutexvar_ps(fft746, fft738);
__m512 fft829 = _mm512_permutexvar_ps(fft746, fft821);
__m512i fft748 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft749 = _mm512_permutexvar_ps(fft748, fft738);
__m512 fft830 = _mm512_permutexvar_ps(fft748, fft821);
__m512 fft750 = _mm512_permutexvar_ps(fft746, fft739);
__m512 fft831 = _mm512_permutexvar_ps(fft746, fft822);
__m512 fft751 = _mm512_permutexvar_ps(fft748, fft739);
__m512 fft832 = _mm512_permutexvar_ps(fft748, fft822);
__m512 fft752 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft753 = _mm512_fmadd_ps(fft747, fft752, fft749);
__m512 fft833 = _mm512_fmadd_ps(fft829, fft752, fft830);
__m512 fft754 = _mm512_fnmadd_ps(fft751, fft752, fft750);
__m512 fft834 = _mm512_fnmadd_ps(fft832, fft752, fft831);
// Merge masks: 21845 = 0x5555 (even elements), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft755 = _mm512_mask_mov_ps(fft751, 21845, fft753);
__m512 fft835 = _mm512_mask_mov_ps(fft832, 21845, fft833);
__m512 fft756 = _mm512_mask_mov_ps(fft747, 43176, fft753);
__m512 fft836 = _mm512_mask_mov_ps(fft829, 43176, fft833);
__m512 fft757 = _mm512_mask_mov_ps(fft755, 43176, fft754);
__m512 fft837 = _mm512_mask_mov_ps(fft835, 43176, fft834);
__m512 fft758 = _mm512_mask_mov_ps(fft756, 22102, fft754);
__m512 fft838 = _mm512_mask_mov_ps(fft836, 22102, fft834);
// Mask 64764 = 0xFCFC: every element except 0, 1, 8 and 9 is multiplied by 0.5.
__m512 fft759 = _mm512_mask_mul_ps(fft757, 64764, fft757, _mm512_set1_ps(5e-01f));
__m512 fft839 = _mm512_mask_mul_ps(fft837, 64764, fft837, _mm512_set1_ps(5e-01f));
__m512 fft760 = _mm512_mask_mul_ps(fft758, 64764, fft758, _mm512_set1_ps(5e-01f));
__m512 fft840 = _mm512_mask_mul_ps(fft838, 64764, fft838, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors to df33..df48 for the store phase.
__m512 df33 = fft759;
__m512 df41 = fft839;
__m512 df34 = fft760;
__m512 df42 = fft840;
__m512 df35 = fft740;
__m512 df43 = fft823;
__m512 df36 = fft741;
__m512 df44 = fft824;
__m512 df37 = fft742;
__m512 df45 = fft825;
__m512 df38 = fft743;
__m512 df46 = fft826;
__m512 df39 = fft744;
__m512 df47 = fft827;
__m512 df40 = fft745;
__m512 df48 = fft828;
// eo5 gathers the even-indexed elements into the low 8 lanes and the odd-indexed
// elements into the high 8 lanes. Each permuted vector is then written with two
// half-width stores: mask 255 writes lanes 0-7, mask 65280 writes lanes 8-15, to two
// different regions of dfPtr1. All addresses vary with i6, j2, k3, m5 and f6.
// df33/df34/df41/df42 are stored last and without the eo5 permutation.
// NOTE(review): the layout of dfPtr1 (and whether these are byte offsets) is not
// visible in this excerpt.
__m512i eo5 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df35 = _mm512_permutexvar_ps(eo5, df35);
df36 = _mm512_permutexvar_ps(eo5, df36);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df35);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df36);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df35);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df36);
df43 = _mm512_permutexvar_ps(eo5, df43);
df44 = _mm512_permutexvar_ps(eo5, df44);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df43);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df44);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df43);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df44);
df37 = _mm512_permutexvar_ps(eo5, df37);
df38 = _mm512_permutexvar_ps(eo5, df38);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df37);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df38);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df37);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df38);
df45 = _mm512_permutexvar_ps(eo5, df45);
df46 = _mm512_permutexvar_ps(eo5, df46);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df45);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df46);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df45);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df46);
df39 = _mm512_permutexvar_ps(eo5, df39);
df40 = _mm512_permutexvar_ps(eo5, df40);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df39);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df40);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df39);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df40);
df47 = _mm512_permutexvar_ps(eo5, df47);
df48 = _mm512_permutexvar_ps(eo5, df48);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df47);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df48);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df47);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df48);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df33);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df34);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df33);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df34);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df41);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k3+128*m5+32*f6, 255, df42);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df41);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k3+128*m5+32*f6, 65280, df42);
}
}
// Stop the whole function once the tile just processed was the last one assigned
// (j2 reached last1); otherwise move on to the next tile.
if (j2 >= last1) return;
++j2;
}
// All positions below 3 are done; execution continues with the code after this block.
rel1 = 3;
}
ptrdiff_t h3 = base1+0;
ptrdiff_t w3 = 180;
ptrdiff_t k4 = 3*s1;
ptrdiff_t kk3 = k4+2;
for (; k4 <= kk3; ++k4) {
__m512 bnMul3 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k4+3*i6))[0]);
__m512 bnAdd3 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k4+3*i6))[1]);
for (ptrdiff_t b6 = 0; b6 < 4; ++b6) {
ptrdiff_t m6 = (size_t)b6/2;
ptrdiff_t f7 = (size_t)b6%2;
__m512 dat40 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat40 = _mm512_mask_fmadd_ps(dat40, 65535, bnMul3, bnAdd3);
__m512 dat41 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat41 = _mm512_mask_fmadd_ps(dat41, 65535, bnMul3, bnAdd3);
__m512 dat42 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat42 = _mm512_mask_fmadd_ps(dat42, 65535, bnMul3, bnAdd3);
__m512 dat43 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat43 = _mm512_mask_fmadd_ps(dat43, 65535, bnMul3, bnAdd3);
__m512 dat44 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat44 = _mm512_mask_fmadd_ps(dat44, 65535, bnMul3, bnAdd3);
__m512 dat45 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat45 = _mm512_mask_fmadd_ps(dat45, 65535, bnMul3, bnAdd3);
__m512 dat46 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat46 = _mm512_mask_fmadd_ps(dat46, 65535, bnMul3, bnAdd3);
__m512 dat47 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat47 = _mm512_mask_fmadd_ps(dat47, 65535, bnMul3, bnAdd3);
__m512 dat48 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat48 = _mm512_mask_fmadd_ps(dat48, 65535, bnMul3, bnAdd3);
__m512 dat49 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat49 = _mm512_mask_fmadd_ps(dat49, 65535, bnMul3, bnAdd3);
__m512 dat50 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat50 = _mm512_mask_fmadd_ps(dat50, 65535, bnMul3, bnAdd3);
__m512 dat51 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat51 = _mm512_mask_fmadd_ps(dat51, 65535, bnMul3, bnAdd3);
__m512 dat52 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k4+896*h3+4*w3+40*b6);
dat52 = _mm512_mask_fmadd_ps(dat52, 65535, bnMul3, bnAdd3);
__m512 fft841 = _mm512_add_ps(_mm512_setzero_ps(), dat45);
__m512 fft929 = _mm512_add_ps(_mm512_setzero_ps(), dat46);
__m512 fft842 = _mm512_sub_ps(_mm512_setzero_ps(), dat45);
__m512 fft930 = _mm512_sub_ps(_mm512_setzero_ps(), dat46);
__m512 fft843 = _mm512_add_ps(_mm512_setzero_ps(), dat47);
__m512 fft931 = _mm512_add_ps(dat40, dat48);
__m512 fft844 = _mm512_sub_ps(_mm512_setzero_ps(), dat47);
__m512 fft932 = _mm512_sub_ps(dat40, dat48);
__m512 fft845 = _mm512_add_ps(dat41, dat49);
__m512 fft933 = _mm512_add_ps(dat42, dat50);
__m512 fft846 = _mm512_sub_ps(dat41, dat49);
__m512 fft934 = _mm512_sub_ps(dat42, dat50);
__m512 fft847 = _mm512_add_ps(dat43, dat51);
__m512 fft935 = _mm512_add_ps(dat44, dat52);
__m512 fft848 = _mm512_sub_ps(dat43, dat51);
__m512 fft936 = _mm512_sub_ps(dat44, dat52);
__m512 fft849 = _mm512_add_ps(fft841, fft845);
__m512 fft937 = _mm512_add_ps(fft929, fft933);
__m512 fft850 = _mm512_sub_ps(fft841, fft845);
__m512 fft938 = _mm512_sub_ps(fft929, fft933);
__m512 fft851 = _mm512_add_ps(fft843, fft847);
__m512 fft939 = _mm512_add_ps(fft931, fft935);
__m512 fft852 = _mm512_sub_ps(fft847, fft843);
__m512 fft940 = _mm512_sub_ps(fft935, fft931);
__m512 fft853 = _mm512_sub_ps(fft844, fft848);
__m512 fft941 = _mm512_sub_ps(fft932, fft936);
__m512 fft854 = _mm512_add_ps(fft844, fft848);
__m512 fft942 = _mm512_add_ps(fft932, fft936);
__m512 fft855 = _mm512_add_ps(fft849, fft851);
__m512 fft943 = _mm512_add_ps(fft937, fft939);
__m512 fft856 = _mm512_sub_ps(fft849, fft851);
__m512 fft944 = _mm512_sub_ps(fft937, fft939);
__m512 fft857 = _mm512_fmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft945 = _mm512_fmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft858 = _mm512_fnmsub_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft946 = _mm512_fnmsub_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft859 = _mm512_fnmadd_ps(fft853, _mm512_set1_ps(7.0710677e-01f), fft842);
__m512 fft947 = _mm512_fnmadd_ps(fft941, _mm512_set1_ps(7.0710677e-01f), fft930);
__m512 fft860 = _mm512_fnmadd_ps(fft854, _mm512_set1_ps(7.0710677e-01f), fft846);
__m512 fft948 = _mm512_fnmadd_ps(fft942, _mm512_set1_ps(7.0710677e-01f), fft934);
__m512 fft861 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft862 = _mm512_fmadd_ps(fft855, fft861, _mm512_shuffle_f32x4(fft855, fft855, 78));
__m512 fft949 = _mm512_fmadd_ps(fft943, fft861, _mm512_shuffle_f32x4(fft943, fft943, 78));
__m512 fft863 = _mm512_fmadd_ps(fft856, fft861, _mm512_shuffle_f32x4(fft856, fft856, 78));
__m512 fft950 = _mm512_fmadd_ps(fft944, fft861, _mm512_shuffle_f32x4(fft944, fft944, 78));
__m512 fft864 = _mm512_fmadd_ps(fft857, fft861, _mm512_shuffle_f32x4(fft857, fft857, 78));
__m512 fft951 = _mm512_fmadd_ps(fft945, fft861, _mm512_shuffle_f32x4(fft945, fft945, 78));
__m512 fft865 = _mm512_fmadd_ps(fft858, fft861, _mm512_shuffle_f32x4(fft858, fft858, 78));
__m512 fft952 = _mm512_fmadd_ps(fft946, fft861, _mm512_shuffle_f32x4(fft946, fft946, 78));
__m512 fft866 = _mm512_fmadd_ps(fft850, fft861, _mm512_shuffle_f32x4(fft850, fft850, 78));
__m512 fft953 = _mm512_fmadd_ps(fft938, fft861, _mm512_shuffle_f32x4(fft938, fft938, 78));
__m512 fft867 = _mm512_fmadd_ps(fft852, fft861, _mm512_shuffle_f32x4(fft852, fft852, 78));
__m512 fft954 = _mm512_fmadd_ps(fft940, fft861, _mm512_shuffle_f32x4(fft940, fft940, 78));
__m512 fft868 = _mm512_fmadd_ps(fft859, fft861, _mm512_shuffle_f32x4(fft859, fft859, 78));
__m512 fft955 = _mm512_fmadd_ps(fft947, fft861, _mm512_shuffle_f32x4(fft947, fft947, 78));
__m512 fft869 = _mm512_fmadd_ps(fft860, fft861, _mm512_shuffle_f32x4(fft860, fft860, 78));
__m512 fft956 = _mm512_fmadd_ps(fft948, fft861, _mm512_shuffle_f32x4(fft948, fft948, 78));
__m512 fft870 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft871 = _mm512_mul_ps(fft862, fft870);
__m512 fft957 = _mm512_mul_ps(fft949, fft870);
__m512 fft872 = _mm512_mul_ps(fft863, fft870);
__m512 fft958 = _mm512_mul_ps(fft950, fft870);
__m512 fft873 = _mm512_mul_ps(fft864, fft870);
__m512 fft959 = _mm512_mul_ps(fft951, fft870);
__m512 fft874 = _mm512_mul_ps(fft865, fft870);
__m512 fft960 = _mm512_mul_ps(fft952, fft870);
__m512 fft875 = _mm512_mul_ps(fft866, fft870);
__m512 fft961 = _mm512_mul_ps(fft953, fft870);
__m512 fft876 = _mm512_mul_ps(fft867, fft870);
__m512 fft962 = _mm512_mul_ps(fft954, fft870);
__m512 fft877 = _mm512_mul_ps(fft868, fft870);
__m512 fft963 = _mm512_mul_ps(fft955, fft870);
__m512 fft878 = _mm512_mul_ps(fft869, fft870);
__m512 fft964 = _mm512_mul_ps(fft956, fft870);
__m512 fft879 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft880 = _mm512_fmadd_ps(fft863, fft879, fft871);
__m512 fft965 = _mm512_fmadd_ps(fft950, fft879, fft957);
__m512 fft881 = _mm512_fnmadd_ps(fft862, fft879, fft872);
__m512 fft966 = _mm512_fnmadd_ps(fft949, fft879, fft958);
__m512 fft882 = _mm512_fmadd_ps(fft865, fft879, fft873);
__m512 fft967 = _mm512_fmadd_ps(fft952, fft879, fft959);
__m512 fft883 = _mm512_fnmadd_ps(fft864, fft879, fft874);
__m512 fft968 = _mm512_fnmadd_ps(fft951, fft879, fft960);
__m512 fft884 = _mm512_fmadd_ps(fft867, fft879, fft875);
__m512 fft969 = _mm512_fmadd_ps(fft954, fft879, fft961);
__m512 fft885 = _mm512_fnmadd_ps(fft866, fft879, fft876);
__m512 fft970 = _mm512_fnmadd_ps(fft953, fft879, fft962);
__m512 fft886 = _mm512_fmadd_ps(fft869, fft879, fft877);
__m512 fft971 = _mm512_fmadd_ps(fft956, fft879, fft963);
__m512 fft887 = _mm512_fnmadd_ps(fft868, fft879, fft878);
__m512 fft972 = _mm512_fnmadd_ps(fft955, fft879, fft964);
__m512 fft888 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft889 = _mm512_fmadd_ps(fft880, fft888, _mm512_shuffle_f32x4(fft880, fft880, 177));
__m512 fft973 = _mm512_fmadd_ps(fft965, fft888, _mm512_shuffle_f32x4(fft965, fft965, 177));
__m512 fft890 = _mm512_fmadd_ps(fft881, fft888, _mm512_shuffle_f32x4(fft881, fft881, 177));
__m512 fft974 = _mm512_fmadd_ps(fft966, fft888, _mm512_shuffle_f32x4(fft966, fft966, 177));
__m512 fft891 = _mm512_fmadd_ps(fft882, fft888, _mm512_shuffle_f32x4(fft882, fft882, 177));
__m512 fft975 = _mm512_fmadd_ps(fft967, fft888, _mm512_shuffle_f32x4(fft967, fft967, 177));
__m512 fft892 = _mm512_fmadd_ps(fft883, fft888, _mm512_shuffle_f32x4(fft883, fft883, 177));
__m512 fft976 = _mm512_fmadd_ps(fft968, fft888, _mm512_shuffle_f32x4(fft968, fft968, 177));
__m512 fft893 = _mm512_fmadd_ps(fft884, fft888, _mm512_shuffle_f32x4(fft884, fft884, 177));
__m512 fft977 = _mm512_fmadd_ps(fft969, fft888, _mm512_shuffle_f32x4(fft969, fft969, 177));
__m512 fft894 = _mm512_fmadd_ps(fft885, fft888, _mm512_shuffle_f32x4(fft885, fft885, 177));
__m512 fft978 = _mm512_fmadd_ps(fft970, fft888, _mm512_shuffle_f32x4(fft970, fft970, 177));
__m512 fft895 = _mm512_fmadd_ps(fft886, fft888, _mm512_shuffle_f32x4(fft886, fft886, 177));
__m512 fft979 = _mm512_fmadd_ps(fft971, fft888, _mm512_shuffle_f32x4(fft971, fft971, 177));
__m512 fft896 = _mm512_fmadd_ps(fft887, fft888, _mm512_shuffle_f32x4(fft887, fft887, 177));
__m512 fft980 = _mm512_fmadd_ps(fft972, fft888, _mm512_shuffle_f32x4(fft972, fft972, 177));
__m512 fft897 = _mm512_mask_mov_ps(fft889, 49344, fft890);
__m512 fft981 = _mm512_mask_mov_ps(fft973, 49344, fft974);
__m512 fft898 = _mm512_mask_sub_ps(fft890, 49344, _mm512_setzero_ps(), fft889);
__m512 fft982 = _mm512_mask_sub_ps(fft974, 49344, _mm512_setzero_ps(), fft973);
__m512 fft899 = _mm512_mask_mov_ps(fft891, 49344, fft892);
__m512 fft983 = _mm512_mask_mov_ps(fft975, 49344, fft976);
__m512 fft900 = _mm512_mask_sub_ps(fft892, 49344, _mm512_setzero_ps(), fft891);
__m512 fft984 = _mm512_mask_sub_ps(fft976, 49344, _mm512_setzero_ps(), fft975);
__m512 fft901 = _mm512_mask_mov_ps(fft893, 49344, fft894);
__m512 fft985 = _mm512_mask_mov_ps(fft977, 49344, fft978);
__m512 fft902 = _mm512_mask_sub_ps(fft894, 49344, _mm512_setzero_ps(), fft893);
__m512 fft986 = _mm512_mask_sub_ps(fft978, 49344, _mm512_setzero_ps(), fft977);
__m512 fft903 = _mm512_mask_mov_ps(fft895, 49344, fft896);
__m512 fft987 = _mm512_mask_mov_ps(fft979, 49344, fft980);
__m512 fft904 = _mm512_mask_sub_ps(fft896, 49344, _mm512_setzero_ps(), fft895);
__m512 fft988 = _mm512_mask_sub_ps(fft980, 49344, _mm512_setzero_ps(), fft979);
__m512 fft905 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft906 = _mm512_fmadd_ps(fft897, fft905, _mm512_shuffle_ps(fft897, fft897, 78));
__m512 fft989 = _mm512_fmadd_ps(fft981, fft905, _mm512_shuffle_ps(fft981, fft981, 78));
__m512 fft907 = _mm512_fmadd_ps(fft898, fft905, _mm512_shuffle_ps(fft898, fft898, 78));
__m512 fft990 = _mm512_fmadd_ps(fft982, fft905, _mm512_shuffle_ps(fft982, fft982, 78));
__m512 fft908 = _mm512_fmadd_ps(fft899, fft905, _mm512_shuffle_ps(fft899, fft899, 78));
__m512 fft991 = _mm512_fmadd_ps(fft983, fft905, _mm512_shuffle_ps(fft983, fft983, 78));
__m512 fft909 = _mm512_fmadd_ps(fft900, fft905, _mm512_shuffle_ps(fft900, fft900, 78));
__m512 fft992 = _mm512_fmadd_ps(fft984, fft905, _mm512_shuffle_ps(fft984, fft984, 78));
__m512 fft910 = _mm512_fmadd_ps(fft901, fft905, _mm512_shuffle_ps(fft901, fft901, 78));
__m512 fft993 = _mm512_fmadd_ps(fft985, fft905, _mm512_shuffle_ps(fft985, fft985, 78));
__m512 fft911 = _mm512_fmadd_ps(fft902, fft905, _mm512_shuffle_ps(fft902, fft902, 78));
__m512 fft994 = _mm512_fmadd_ps(fft986, fft905, _mm512_shuffle_ps(fft986, fft986, 78));
__m512 fft912 = _mm512_fmadd_ps(fft903, fft905, _mm512_shuffle_ps(fft903, fft903, 78));
__m512 fft995 = _mm512_fmadd_ps(fft987, fft905, _mm512_shuffle_ps(fft987, fft987, 78));
__m512 fft913 = _mm512_fmadd_ps(fft904, fft905, _mm512_shuffle_ps(fft904, fft904, 78));
__m512 fft996 = _mm512_fmadd_ps(fft988, fft905, _mm512_shuffle_ps(fft988, fft988, 78));
__m512i fft914 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft915 = _mm512_permutexvar_ps(fft914, fft906);
__m512 fft997 = _mm512_permutexvar_ps(fft914, fft989);
__m512i fft916 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft917 = _mm512_permutexvar_ps(fft916, fft906);
__m512 fft998 = _mm512_permutexvar_ps(fft916, fft989);
__m512 fft918 = _mm512_permutexvar_ps(fft914, fft907);
__m512 fft999 = _mm512_permutexvar_ps(fft914, fft990);
__m512 fft919 = _mm512_permutexvar_ps(fft916, fft907);
__m512 fft1000 = _mm512_permutexvar_ps(fft916, fft990);
__m512 fft920 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft921 = _mm512_fmadd_ps(fft915, fft920, fft917);
__m512 fft1001 = _mm512_fmadd_ps(fft997, fft920, fft998);
__m512 fft922 = _mm512_fnmadd_ps(fft919, fft920, fft918);
__m512 fft1002 = _mm512_fnmadd_ps(fft1000, fft920, fft999);
__m512 fft923 = _mm512_mask_mov_ps(fft919, 21845, fft921);
__m512 fft1003 = _mm512_mask_mov_ps(fft1000, 21845, fft1001);
__m512 fft924 = _mm512_mask_mov_ps(fft915, 43176, fft921);
__m512 fft1004 = _mm512_mask_mov_ps(fft997, 43176, fft1001);
__m512 fft925 = _mm512_mask_mov_ps(fft923, 43176, fft922);
__m512 fft1005 = _mm512_mask_mov_ps(fft1003, 43176, fft1002);
__m512 fft926 = _mm512_mask_mov_ps(fft924, 22102, fft922);
__m512 fft1006 = _mm512_mask_mov_ps(fft1004, 22102, fft1002);
__m512 fft927 = _mm512_mask_mul_ps(fft925, 64764, fft925, _mm512_set1_ps(5e-01f));
__m512 fft1007 = _mm512_mask_mul_ps(fft1005, 64764, fft1005, _mm512_set1_ps(5e-01f));
__m512 fft928 = _mm512_mask_mul_ps(fft926, 64764, fft926, _mm512_set1_ps(5e-01f));
__m512 fft1008 = _mm512_mask_mul_ps(fft1006, 64764, fft1006, _mm512_set1_ps(5e-01f));
__m512 df49 = fft927;
__m512 df57 = fft1007;
__m512 df50 = fft928;
__m512 df58 = fft1008;
__m512 df51 = fft908;
__m512 df59 = fft991;
__m512 df52 = fft909;
__m512 df60 = fft992;
__m512 df53 = fft910;
__m512 df61 = fft993;
__m512 df54 = fft911;
__m512 df62 = fft994;
__m512 df55 = fft912;
__m512 df63 = fft995;
__m512 df56 = fft913;
__m512 df64 = fft996;
__m512i eo6 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df51 = _mm512_permutexvar_ps(eo6, df51);
df52 = _mm512_permutexvar_ps(eo6, df52);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df51);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df52);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df51);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df52);
df59 = _mm512_permutexvar_ps(eo6, df59);
df60 = _mm512_permutexvar_ps(eo6, df60);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df59);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df60);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df59);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df60);
df53 = _mm512_permutexvar_ps(eo6, df53);
df54 = _mm512_permutexvar_ps(eo6, df54);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df53);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df54);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df53);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df54);
df61 = _mm512_permutexvar_ps(eo6, df61);
df62 = _mm512_permutexvar_ps(eo6, df62);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df61);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df62);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df61);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df62);
df55 = _mm512_permutexvar_ps(eo6, df55);
df56 = _mm512_permutexvar_ps(eo6, df56);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df55);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df56);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df55);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df56);
df63 = _mm512_permutexvar_ps(eo6, df63);
df64 = _mm512_permutexvar_ps(eo6, df64);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df63);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df64);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df63);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df64);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df49);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df50);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df49);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df50);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df57);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m6+32*f7, 255, df58);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df57);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m6+32*f7, 65280, df58);
}
// ---- Tile b7 = 4: load phase ----
// With b7 fixed at 4, m7 evaluates to 2 and f8 to 0. Both are used only in
// the store addresses at the end of this tile.
ptrdiff_t b7 = 4;
ptrdiff_t m7 = (size_t)b7/2;
ptrdiff_t f8 = (size_t)b7%2;
// Thirteen masked loads (dat53..dat65). Mask 127 selects lanes 0-6; the
// maskz form zeroes lanes 7-15. The constant offsets step by 896 from one
// load to the next, and the trailing "+0*b7" term contributes nothing.
// NOTE(review): 896 = 224*4, 200704 = 224*224*4 and 602112 = 3*224*224*4,
// which is consistent with byte offsets into the 3x224x224 float32 input
// named in the graph header (i6 = image, k4 = channel, h3 = row, w3 = column)
// -- confirm against the declaration of datPtr1, which is outside this view.
// Each load is followed by a masked fused multiply-add: lanes 0-6 become
// dat*bnMul3 + bnAdd3, while lanes 7-15 keep the zero written by the load.
// bnMul3/bnAdd3 are defined outside this view; presumably the folded
// batch-norm scale and shift -- TODO confirm.
__m512 dat53 = _mm512_maskz_loadu_ps(127, datPtr1+2848+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat53 = _mm512_mask_fmadd_ps(dat53, 127, bnMul3, bnAdd3);
__m512 dat54 = _mm512_maskz_loadu_ps(127, datPtr1+3744+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat54 = _mm512_mask_fmadd_ps(dat54, 127, bnMul3, bnAdd3);
__m512 dat55 = _mm512_maskz_loadu_ps(127, datPtr1+4640+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat55 = _mm512_mask_fmadd_ps(dat55, 127, bnMul3, bnAdd3);
__m512 dat56 = _mm512_maskz_loadu_ps(127, datPtr1+5536+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat56 = _mm512_mask_fmadd_ps(dat56, 127, bnMul3, bnAdd3);
__m512 dat57 = _mm512_maskz_loadu_ps(127, datPtr1+6432+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat57 = _mm512_mask_fmadd_ps(dat57, 127, bnMul3, bnAdd3);
__m512 dat58 = _mm512_maskz_loadu_ps(127, datPtr1+7328+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat58 = _mm512_mask_fmadd_ps(dat58, 127, bnMul3, bnAdd3);
__m512 dat59 = _mm512_maskz_loadu_ps(127, datPtr1+8224+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat59 = _mm512_mask_fmadd_ps(dat59, 127, bnMul3, bnAdd3);
__m512 dat60 = _mm512_maskz_loadu_ps(127, datPtr1+9120+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat60 = _mm512_mask_fmadd_ps(dat60, 127, bnMul3, bnAdd3);
__m512 dat61 = _mm512_maskz_loadu_ps(127, datPtr1+10016+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat61 = _mm512_mask_fmadd_ps(dat61, 127, bnMul3, bnAdd3);
__m512 dat62 = _mm512_maskz_loadu_ps(127, datPtr1+10912+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat62 = _mm512_mask_fmadd_ps(dat62, 127, bnMul3, bnAdd3);
__m512 dat63 = _mm512_maskz_loadu_ps(127, datPtr1+11808+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat63 = _mm512_mask_fmadd_ps(dat63, 127, bnMul3, bnAdd3);
__m512 dat64 = _mm512_maskz_loadu_ps(127, datPtr1+12704+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat64 = _mm512_mask_fmadd_ps(dat64, 127, bnMul3, bnAdd3);
__m512 dat65 = _mm512_maskz_loadu_ps(127, datPtr1+13600+602112*i6+200704*k4+896*h3+4*w3+0*b7);
dat65 = _mm512_mask_fmadd_ps(dat65, 127, bnMul3, bnAdd3);
// ---- Tile b7 = 4: butterfly stage across the loaded rows ----
// Two independent dependency chains are interleaved statement by statement:
//   chain A (fft1009..fft1028) reads dat58, dat60, dat54, dat62, dat56, dat64;
//   chain B (fft1097..fft1116) reads dat59, dat53, dat61, dat55, dat63, dat57,
//   dat65.
// Every step is a whole-vector add/sub, so lanes never mix in this stage.
//
// First level: sum and difference of two operands. Where the first operand
// is _mm512_setzero_ps(), this tile has no loaded row for that position, so
// the results reduce to +x and -x.
// NOTE(review): the three zero operands presumably stand for the top padding
// rows of the convolution (PaddingH=3 in the graph header) -- TODO confirm.
__m512 fft1009 = _mm512_add_ps(_mm512_setzero_ps(), dat58);
__m512 fft1097 = _mm512_add_ps(_mm512_setzero_ps(), dat59);
__m512 fft1010 = _mm512_sub_ps(_mm512_setzero_ps(), dat58);
__m512 fft1098 = _mm512_sub_ps(_mm512_setzero_ps(), dat59);
__m512 fft1011 = _mm512_add_ps(_mm512_setzero_ps(), dat60);
__m512 fft1099 = _mm512_add_ps(dat53, dat61);
__m512 fft1012 = _mm512_sub_ps(_mm512_setzero_ps(), dat60);
__m512 fft1100 = _mm512_sub_ps(dat53, dat61);
__m512 fft1013 = _mm512_add_ps(dat54, dat62);
__m512 fft1101 = _mm512_add_ps(dat55, dat63);
__m512 fft1014 = _mm512_sub_ps(dat54, dat62);
__m512 fft1102 = _mm512_sub_ps(dat55, dat63);
__m512 fft1015 = _mm512_add_ps(dat56, dat64);
__m512 fft1103 = _mm512_add_ps(dat57, dat65);
__m512 fft1016 = _mm512_sub_ps(dat56, dat64);
__m512 fft1104 = _mm512_sub_ps(dat57, dat65);
// Second level: combine the first-level sums with each other, and the
// first-level differences with each other. Note the reversed operand order
// in fft1020/fft1108 (later term minus earlier term).
__m512 fft1017 = _mm512_add_ps(fft1009, fft1013);
__m512 fft1105 = _mm512_add_ps(fft1097, fft1101);
__m512 fft1018 = _mm512_sub_ps(fft1009, fft1013);
__m512 fft1106 = _mm512_sub_ps(fft1097, fft1101);
__m512 fft1019 = _mm512_add_ps(fft1011, fft1015);
__m512 fft1107 = _mm512_add_ps(fft1099, fft1103);
__m512 fft1020 = _mm512_sub_ps(fft1015, fft1011);
__m512 fft1108 = _mm512_sub_ps(fft1103, fft1099);
__m512 fft1021 = _mm512_sub_ps(fft1012, fft1016);
__m512 fft1109 = _mm512_sub_ps(fft1100, fft1104);
__m512 fft1022 = _mm512_add_ps(fft1012, fft1016);
__m512 fft1110 = _mm512_add_ps(fft1100, fft1104);
// Third level: final sums/differences, plus four terms scaled by
// 7.0710677e-01f (approximately 1/sqrt(2)):
//   fmadd (a,b,c) =  a*b + c
//   fnmsub(a,b,c) = -(a*b) - c
//   fnmadd(a,b,c) = -(a*b) + c
__m512 fft1023 = _mm512_add_ps(fft1017, fft1019);
__m512 fft1111 = _mm512_add_ps(fft1105, fft1107);
__m512 fft1024 = _mm512_sub_ps(fft1017, fft1019);
__m512 fft1112 = _mm512_sub_ps(fft1105, fft1107);
__m512 fft1025 = _mm512_fmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1113 = _mm512_fmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1026 = _mm512_fnmsub_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1114 = _mm512_fnmsub_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
__m512 fft1027 = _mm512_fnmadd_ps(fft1021, _mm512_set1_ps(7.0710677e-01f), fft1010);
__m512 fft1115 = _mm512_fnmadd_ps(fft1109, _mm512_set1_ps(7.0710677e-01f), fft1098);
__m512 fft1028 = _mm512_fnmadd_ps(fft1022, _mm512_set1_ps(7.0710677e-01f), fft1014);
__m512 fft1116 = _mm512_fnmadd_ps(fft1110, _mm512_set1_ps(7.0710677e-01f), fft1102);
// ---- Tile b7 = 4: in-register stages (lanes within each vector are mixed) ----
// The same sequence is applied to the eight vectors of chain A and the eight
// vectors of chain B, interleaved. _mm512_set_ps / _mm512_set_epi32 list
// their arguments from lane 15 down to lane 0.
//
// Stage 1: butterfly between the two 256-bit halves. shuffle_f32x4(x, x, 78)
// swaps the halves; the sign vector is +1 in lanes 0-7 and -1 in lanes 8-15,
// so the result is lo+hi in lanes 0-7 and lo-hi in lanes 8-15.
__m512 fft1029 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1030 = _mm512_fmadd_ps(fft1023, fft1029, _mm512_shuffle_f32x4(fft1023, fft1023, 78));
__m512 fft1117 = _mm512_fmadd_ps(fft1111, fft1029, _mm512_shuffle_f32x4(fft1111, fft1111, 78));
__m512 fft1031 = _mm512_fmadd_ps(fft1024, fft1029, _mm512_shuffle_f32x4(fft1024, fft1024, 78));
__m512 fft1118 = _mm512_fmadd_ps(fft1112, fft1029, _mm512_shuffle_f32x4(fft1112, fft1112, 78));
__m512 fft1032 = _mm512_fmadd_ps(fft1025, fft1029, _mm512_shuffle_f32x4(fft1025, fft1025, 78));
__m512 fft1119 = _mm512_fmadd_ps(fft1113, fft1029, _mm512_shuffle_f32x4(fft1113, fft1113, 78));
__m512 fft1033 = _mm512_fmadd_ps(fft1026, fft1029, _mm512_shuffle_f32x4(fft1026, fft1026, 78));
__m512 fft1120 = _mm512_fmadd_ps(fft1114, fft1029, _mm512_shuffle_f32x4(fft1114, fft1114, 78));
__m512 fft1034 = _mm512_fmadd_ps(fft1018, fft1029, _mm512_shuffle_f32x4(fft1018, fft1018, 78));
__m512 fft1121 = _mm512_fmadd_ps(fft1106, fft1029, _mm512_shuffle_f32x4(fft1106, fft1106, 78));
__m512 fft1035 = _mm512_fmadd_ps(fft1020, fft1029, _mm512_shuffle_f32x4(fft1020, fft1020, 78));
__m512 fft1122 = _mm512_fmadd_ps(fft1108, fft1029, _mm512_shuffle_f32x4(fft1108, fft1108, 78));
__m512 fft1036 = _mm512_fmadd_ps(fft1027, fft1029, _mm512_shuffle_f32x4(fft1027, fft1027, 78));
__m512 fft1123 = _mm512_fmadd_ps(fft1115, fft1029, _mm512_shuffle_f32x4(fft1115, fft1115, 78));
__m512 fft1037 = _mm512_fmadd_ps(fft1028, fft1029, _mm512_shuffle_f32x4(fft1028, fft1028, 78));
__m512 fft1124 = _mm512_fmadd_ps(fft1116, fft1029, _mm512_shuffle_f32x4(fft1116, fft1116, 78));
// Stage 2: per-lane coefficient multiply on consecutive vector pairs (a, b),
// e.g. (fft1030, fft1031). With p = fft1038 and q = fft1047 (next statement
// group) the outputs are a*p + b*q and b*p - a*q, which equals multiplying
// a + i*b by (p - i*q). Per lane, (p, q) is (1, 0) in lanes 0-9 (values pass
// through unchanged), (c, c) in lanes 10-11, (0, 1) in lanes 12-13 and
// (-c, c) in lanes 14-15, where c = 7.0710677e-01f.
// This group computes the a*p and b*p products.
__m512 fft1038 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1039 = _mm512_mul_ps(fft1030, fft1038);
__m512 fft1125 = _mm512_mul_ps(fft1117, fft1038);
__m512 fft1040 = _mm512_mul_ps(fft1031, fft1038);
__m512 fft1126 = _mm512_mul_ps(fft1118, fft1038);
__m512 fft1041 = _mm512_mul_ps(fft1032, fft1038);
__m512 fft1127 = _mm512_mul_ps(fft1119, fft1038);
__m512 fft1042 = _mm512_mul_ps(fft1033, fft1038);
__m512 fft1128 = _mm512_mul_ps(fft1120, fft1038);
__m512 fft1043 = _mm512_mul_ps(fft1034, fft1038);
__m512 fft1129 = _mm512_mul_ps(fft1121, fft1038);
__m512 fft1044 = _mm512_mul_ps(fft1035, fft1038);
__m512 fft1130 = _mm512_mul_ps(fft1122, fft1038);
__m512 fft1045 = _mm512_mul_ps(fft1036, fft1038);
__m512 fft1131 = _mm512_mul_ps(fft1123, fft1038);
__m512 fft1046 = _mm512_mul_ps(fft1037, fft1038);
__m512 fft1132 = _mm512_mul_ps(fft1124, fft1038);
// Cross terms of stage 2: fmadd adds b*q to the a*p product; fnmadd
// subtracts a*q from the b*p product.
__m512 fft1047 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1048 = _mm512_fmadd_ps(fft1031, fft1047, fft1039);
__m512 fft1133 = _mm512_fmadd_ps(fft1118, fft1047, fft1125);
__m512 fft1049 = _mm512_fnmadd_ps(fft1030, fft1047, fft1040);
__m512 fft1134 = _mm512_fnmadd_ps(fft1117, fft1047, fft1126);
__m512 fft1050 = _mm512_fmadd_ps(fft1033, fft1047, fft1041);
__m512 fft1135 = _mm512_fmadd_ps(fft1120, fft1047, fft1127);
__m512 fft1051 = _mm512_fnmadd_ps(fft1032, fft1047, fft1042);
__m512 fft1136 = _mm512_fnmadd_ps(fft1119, fft1047, fft1128);
__m512 fft1052 = _mm512_fmadd_ps(fft1035, fft1047, fft1043);
__m512 fft1137 = _mm512_fmadd_ps(fft1122, fft1047, fft1129);
__m512 fft1053 = _mm512_fnmadd_ps(fft1034, fft1047, fft1044);
__m512 fft1138 = _mm512_fnmadd_ps(fft1121, fft1047, fft1130);
__m512 fft1054 = _mm512_fmadd_ps(fft1037, fft1047, fft1045);
__m512 fft1139 = _mm512_fmadd_ps(fft1124, fft1047, fft1131);
__m512 fft1055 = _mm512_fnmadd_ps(fft1036, fft1047, fft1046);
__m512 fft1140 = _mm512_fnmadd_ps(fft1123, fft1047, fft1132);
// Stage 3: butterfly between adjacent 128-bit blocks. shuffle_f32x4(x, x, 177)
// swaps blocks 0<->1 and 2<->3; the sign vector is +1 in lanes 0-3 and 8-11,
// -1 in lanes 4-7 and 12-15.
__m512 fft1056 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1057 = _mm512_fmadd_ps(fft1048, fft1056, _mm512_shuffle_f32x4(fft1048, fft1048, 177));
__m512 fft1141 = _mm512_fmadd_ps(fft1133, fft1056, _mm512_shuffle_f32x4(fft1133, fft1133, 177));
__m512 fft1058 = _mm512_fmadd_ps(fft1049, fft1056, _mm512_shuffle_f32x4(fft1049, fft1049, 177));
__m512 fft1142 = _mm512_fmadd_ps(fft1134, fft1056, _mm512_shuffle_f32x4(fft1134, fft1134, 177));
__m512 fft1059 = _mm512_fmadd_ps(fft1050, fft1056, _mm512_shuffle_f32x4(fft1050, fft1050, 177));
__m512 fft1143 = _mm512_fmadd_ps(fft1135, fft1056, _mm512_shuffle_f32x4(fft1135, fft1135, 177));
__m512 fft1060 = _mm512_fmadd_ps(fft1051, fft1056, _mm512_shuffle_f32x4(fft1051, fft1051, 177));
__m512 fft1144 = _mm512_fmadd_ps(fft1136, fft1056, _mm512_shuffle_f32x4(fft1136, fft1136, 177));
__m512 fft1061 = _mm512_fmadd_ps(fft1052, fft1056, _mm512_shuffle_f32x4(fft1052, fft1052, 177));
__m512 fft1145 = _mm512_fmadd_ps(fft1137, fft1056, _mm512_shuffle_f32x4(fft1137, fft1137, 177));
__m512 fft1062 = _mm512_fmadd_ps(fft1053, fft1056, _mm512_shuffle_f32x4(fft1053, fft1053, 177));
__m512 fft1146 = _mm512_fmadd_ps(fft1138, fft1056, _mm512_shuffle_f32x4(fft1138, fft1138, 177));
__m512 fft1063 = _mm512_fmadd_ps(fft1054, fft1056, _mm512_shuffle_f32x4(fft1054, fft1054, 177));
__m512 fft1147 = _mm512_fmadd_ps(fft1139, fft1056, _mm512_shuffle_f32x4(fft1139, fft1139, 177));
__m512 fft1064 = _mm512_fmadd_ps(fft1055, fft1056, _mm512_shuffle_f32x4(fft1055, fft1055, 177));
__m512 fft1148 = _mm512_fmadd_ps(fft1140, fft1056, _mm512_shuffle_f32x4(fft1140, fft1140, 177));
// Stage 4: for each pair (a, b), lanes 6, 7, 14 and 15 only (mask 49344 =
// 0xC0C0): the first output takes b in those lanes, the second takes 0 - a.
// All other lanes pass a and b through unchanged. On the selected lanes this
// maps a + i*b to b - i*a.
__m512 fft1065 = _mm512_mask_mov_ps(fft1057, 49344, fft1058);
__m512 fft1149 = _mm512_mask_mov_ps(fft1141, 49344, fft1142);
__m512 fft1066 = _mm512_mask_sub_ps(fft1058, 49344, _mm512_setzero_ps(), fft1057);
__m512 fft1150 = _mm512_mask_sub_ps(fft1142, 49344, _mm512_setzero_ps(), fft1141);
__m512 fft1067 = _mm512_mask_mov_ps(fft1059, 49344, fft1060);
__m512 fft1151 = _mm512_mask_mov_ps(fft1143, 49344, fft1144);
__m512 fft1068 = _mm512_mask_sub_ps(fft1060, 49344, _mm512_setzero_ps(), fft1059);
__m512 fft1152 = _mm512_mask_sub_ps(fft1144, 49344, _mm512_setzero_ps(), fft1143);
__m512 fft1069 = _mm512_mask_mov_ps(fft1061, 49344, fft1062);
__m512 fft1153 = _mm512_mask_mov_ps(fft1145, 49344, fft1146);
__m512 fft1070 = _mm512_mask_sub_ps(fft1062, 49344, _mm512_setzero_ps(), fft1061);
__m512 fft1154 = _mm512_mask_sub_ps(fft1146, 49344, _mm512_setzero_ps(), fft1145);
__m512 fft1071 = _mm512_mask_mov_ps(fft1063, 49344, fft1064);
__m512 fft1155 = _mm512_mask_mov_ps(fft1147, 49344, fft1148);
__m512 fft1072 = _mm512_mask_sub_ps(fft1064, 49344, _mm512_setzero_ps(), fft1063);
__m512 fft1156 = _mm512_mask_sub_ps(fft1148, 49344, _mm512_setzero_ps(), fft1147);
// Stage 5: butterfly between the two 64-bit halves of every 128-bit block.
// shuffle_ps(x, x, 78) swaps element pairs (0,1)<->(2,3) inside each block;
// the sign vector is +1 on elements 0-1 and -1 on elements 2-3 of each block.
__m512 fft1073 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1074 = _mm512_fmadd_ps(fft1065, fft1073, _mm512_shuffle_ps(fft1065, fft1065, 78));
__m512 fft1157 = _mm512_fmadd_ps(fft1149, fft1073, _mm512_shuffle_ps(fft1149, fft1149, 78));
__m512 fft1075 = _mm512_fmadd_ps(fft1066, fft1073, _mm512_shuffle_ps(fft1066, fft1066, 78));
__m512 fft1158 = _mm512_fmadd_ps(fft1150, fft1073, _mm512_shuffle_ps(fft1150, fft1150, 78));
__m512 fft1076 = _mm512_fmadd_ps(fft1067, fft1073, _mm512_shuffle_ps(fft1067, fft1067, 78));
__m512 fft1159 = _mm512_fmadd_ps(fft1151, fft1073, _mm512_shuffle_ps(fft1151, fft1151, 78));
__m512 fft1077 = _mm512_fmadd_ps(fft1068, fft1073, _mm512_shuffle_ps(fft1068, fft1068, 78));
__m512 fft1160 = _mm512_fmadd_ps(fft1152, fft1073, _mm512_shuffle_ps(fft1152, fft1152, 78));
__m512 fft1078 = _mm512_fmadd_ps(fft1069, fft1073, _mm512_shuffle_ps(fft1069, fft1069, 78));
__m512 fft1161 = _mm512_fmadd_ps(fft1153, fft1073, _mm512_shuffle_ps(fft1153, fft1153, 78));
__m512 fft1079 = _mm512_fmadd_ps(fft1070, fft1073, _mm512_shuffle_ps(fft1070, fft1070, 78));
__m512 fft1162 = _mm512_fmadd_ps(fft1154, fft1073, _mm512_shuffle_ps(fft1154, fft1154, 78));
__m512 fft1080 = _mm512_fmadd_ps(fft1071, fft1073, _mm512_shuffle_ps(fft1071, fft1071, 78));
__m512 fft1163 = _mm512_fmadd_ps(fft1155, fft1073, _mm512_shuffle_ps(fft1155, fft1155, 78));
__m512 fft1081 = _mm512_fmadd_ps(fft1072, fft1073, _mm512_shuffle_ps(fft1072, fft1072, 78));
__m512 fft1164 = _mm512_fmadd_ps(fft1156, fft1073, _mm512_shuffle_ps(fft1156, fft1156, 78));
// Stage 6: applied ONLY to the first vector pair of each chain
// (fft1074/fft1075 and fft1157/fft1158); the other six pairs per chain go to
// the store phase as produced by stage 5.
// NOTE(review): this looks like the fix-up that untangles the packed
// spectrum of a real-input transform for the first output row -- confirm
// against the generator.
// Two index vectors gather lanes from each input (dst[i] = src[idx[i]]).
__m512i fft1082 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1083 = _mm512_permutexvar_ps(fft1082, fft1074);
__m512 fft1165 = _mm512_permutexvar_ps(fft1082, fft1157);
__m512i fft1084 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1085 = _mm512_permutexvar_ps(fft1084, fft1074);
__m512 fft1166 = _mm512_permutexvar_ps(fft1084, fft1157);
__m512 fft1086 = _mm512_permutexvar_ps(fft1082, fft1075);
__m512 fft1167 = _mm512_permutexvar_ps(fft1082, fft1158);
__m512 fft1087 = _mm512_permutexvar_ps(fft1084, fft1075);
__m512 fft1168 = _mm512_permutexvar_ps(fft1084, fft1158);
// Coefficient vector: 0 in lanes 0, 1, 8, 9; +1 in the remaining even lanes;
// -1 in the remaining odd lanes. It combines the gathered vectors into a
// sum-like term (fmadd) and a difference-like term (fnmadd).
__m512 fft1088 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1089 = _mm512_fmadd_ps(fft1083, fft1088, fft1085);
__m512 fft1169 = _mm512_fmadd_ps(fft1165, fft1088, fft1166);
__m512 fft1090 = _mm512_fnmadd_ps(fft1087, fft1088, fft1086);
__m512 fft1170 = _mm512_fnmadd_ps(fft1168, fft1088, fft1167);
// Lane blends that assemble the two output vectors from the gathered and
// combined values:
//   21845 = 0x5555 -> even lanes
//   43176 = 0xA8A8 -> lanes 3, 5, 7, 11, 13, 15
//   22102 = 0x5656 -> lanes 1, 2, 4, 6, 9, 10, 12, 14
// In each mask_mov the masked lanes come from the last argument and the
// remaining lanes from the first.
__m512 fft1091 = _mm512_mask_mov_ps(fft1087, 21845, fft1089);
__m512 fft1171 = _mm512_mask_mov_ps(fft1168, 21845, fft1169);
__m512 fft1092 = _mm512_mask_mov_ps(fft1083, 43176, fft1089);
__m512 fft1172 = _mm512_mask_mov_ps(fft1165, 43176, fft1169);
__m512 fft1093 = _mm512_mask_mov_ps(fft1091, 43176, fft1090);
__m512 fft1173 = _mm512_mask_mov_ps(fft1171, 43176, fft1170);
__m512 fft1094 = _mm512_mask_mov_ps(fft1092, 22102, fft1090);
__m512 fft1174 = _mm512_mask_mov_ps(fft1172, 22102, fft1170);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8 and 9 are
// left unscaled.
__m512 fft1095 = _mm512_mask_mul_ps(fft1093, 64764, fft1093, _mm512_set1_ps(5e-01f));
__m512 fft1175 = _mm512_mask_mul_ps(fft1173, 64764, fft1173, _mm512_set1_ps(5e-01f));
__m512 fft1096 = _mm512_mask_mul_ps(fft1094, 64764, fft1094, _mm512_set1_ps(5e-01f));
__m512 fft1176 = _mm512_mask_mul_ps(fft1174, 64764, fft1174, _mm512_set1_ps(5e-01f));
// ---- Tile b7 = 4: store phase ----
// Rename the sixteen results to df65..df80. df65/df66 (chain A) and
// df73/df74 (chain B) are the stage-6 fixed-up pairs; the remaining twelve
// come straight from the stage-5 butterflies.
__m512 df65 = fft1095;
__m512 df73 = fft1175;
__m512 df66 = fft1096;
__m512 df74 = fft1176;
__m512 df67 = fft1076;
__m512 df75 = fft1159;
__m512 df68 = fft1077;
__m512 df76 = fft1160;
__m512 df69 = fft1078;
__m512 df77 = fft1161;
__m512 df70 = fft1079;
__m512 df78 = fft1162;
__m512 df71 = fft1080;
__m512 df79 = fft1163;
__m512 df72 = fft1081;
__m512 df80 = fft1164;
// eo7 de-interleaves a vector: lanes 0-7 receive source lanes 0, 2, ..., 14
// and lanes 8-15 receive source lanes 1, 3, ..., 15. It is applied to
// df67..df72 and df75..df80, but not to df65, df66, df73 or df74.
//
// Every vector is written with two masked stores: mask 255 writes lanes 0-7
// and mask 65280 writes lanes 8-15. The constant in each 65280 store equals
// the constant of its matching 255 store plus 407008 (= 407040 - 32).
// NOTE(review): if dfPtr1 is a byte pointer, the -32 cancels the 32-byte
// displacement of lanes 8-15 inside the vector, so the upper half lands at
// the same relative position 407040 bytes further on -- confirm against the
// declaration of dfPtr1, which is outside this view.
// In this tile m7 = 2 and f8 = 0, so 128*m7 + 32*f8 contributes 256.
// The two vectors of a pair sit 64 apart; chain B's constants are chain A's
// plus 814080 (= 2*407040); successive pairs within a chain step by 101760.
__m512i eo7 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df67 = _mm512_permutexvar_ps(eo7, df67);
df68 = _mm512_permutexvar_ps(eo7, df68);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df67);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df68);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df67);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df68);
df75 = _mm512_permutexvar_ps(eo7, df75);
df76 = _mm512_permutexvar_ps(eo7, df76);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df75);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df76);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df75);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df76);
df69 = _mm512_permutexvar_ps(eo7, df69);
df70 = _mm512_permutexvar_ps(eo7, df70);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df69);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df70);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df69);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df70);
df77 = _mm512_permutexvar_ps(eo7, df77);
df78 = _mm512_permutexvar_ps(eo7, df78);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df77);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df78);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df77);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df78);
df71 = _mm512_permutexvar_ps(eo7, df71);
df72 = _mm512_permutexvar_ps(eo7, df72);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df71);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df72);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df71);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df72);
df79 = _mm512_permutexvar_ps(eo7, df79);
df80 = _mm512_permutexvar_ps(eo7, df80);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df79);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df80);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df79);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df80);
// The stage-6 fixed-up pairs are stored last, without the eo7 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df65);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df66);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df65);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df66);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df73);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m7+32*f8, 255, df74);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df73);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m7+32*f8, 65280, df74);
ptrdiff_t b8 = 5;
ptrdiff_t m8 = (size_t)b8/2;
ptrdiff_t f9 = (size_t)b8%2;
__m512 dat66 = _mm512_maskz_loadu_ps(65528, datPtr1+8240+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat66 = _mm512_mask_fmadd_ps(dat66, 65528, bnMul3, bnAdd3);
__m512 dat67 = _mm512_maskz_loadu_ps(65528, datPtr1+9136+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat67 = _mm512_mask_fmadd_ps(dat67, 65528, bnMul3, bnAdd3);
__m512 dat68 = _mm512_maskz_loadu_ps(65528, datPtr1+10032+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat68 = _mm512_mask_fmadd_ps(dat68, 65528, bnMul3, bnAdd3);
__m512 dat69 = _mm512_maskz_loadu_ps(65528, datPtr1+10928+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat69 = _mm512_mask_fmadd_ps(dat69, 65528, bnMul3, bnAdd3);
__m512 dat70 = _mm512_maskz_loadu_ps(65528, datPtr1+11824+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat70 = _mm512_mask_fmadd_ps(dat70, 65528, bnMul3, bnAdd3);
__m512 dat71 = _mm512_maskz_loadu_ps(65528, datPtr1+12720+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat71 = _mm512_mask_fmadd_ps(dat71, 65528, bnMul3, bnAdd3);
__m512 dat72 = _mm512_maskz_loadu_ps(65528, datPtr1+13616+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat72 = _mm512_mask_fmadd_ps(dat72, 65528, bnMul3, bnAdd3);
__m512 dat73 = _mm512_maskz_loadu_ps(65528, datPtr1+14512+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat73 = _mm512_mask_fmadd_ps(dat73, 65528, bnMul3, bnAdd3);
__m512 dat74 = _mm512_maskz_loadu_ps(65528, datPtr1+15408+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat74 = _mm512_mask_fmadd_ps(dat74, 65528, bnMul3, bnAdd3);
__m512 dat75 = _mm512_maskz_loadu_ps(65528, datPtr1+16304+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat75 = _mm512_mask_fmadd_ps(dat75, 65528, bnMul3, bnAdd3);
__m512 dat76 = _mm512_maskz_loadu_ps(65528, datPtr1+17200+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat76 = _mm512_mask_fmadd_ps(dat76, 65528, bnMul3, bnAdd3);
__m512 dat77 = _mm512_maskz_loadu_ps(65528, datPtr1+18096+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat77 = _mm512_mask_fmadd_ps(dat77, 65528, bnMul3, bnAdd3);
__m512 dat78 = _mm512_maskz_loadu_ps(65528, datPtr1+18992+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat78 = _mm512_mask_fmadd_ps(dat78, 65528, bnMul3, bnAdd3);
__m512 dat79 = _mm512_maskz_loadu_ps(65528, datPtr1+19888+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat79 = _mm512_mask_fmadd_ps(dat79, 65528, bnMul3, bnAdd3);
__m512 dat80 = _mm512_maskz_loadu_ps(65528, datPtr1+20784+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat80 = _mm512_mask_fmadd_ps(dat80, 65528, bnMul3, bnAdd3);
__m512 dat81 = _mm512_maskz_loadu_ps(65528, datPtr1+21680+602112*i6+200704*k4+896*h3+4*w3+0*b8);
dat81 = _mm512_mask_fmadd_ps(dat81, 65528, bnMul3, bnAdd3);
__m512 fft1177 = _mm512_add_ps(dat66, dat74);
__m512 fft1265 = _mm512_add_ps(dat67, dat75);
__m512 fft1178 = _mm512_sub_ps(dat66, dat74);
__m512 fft1266 = _mm512_sub_ps(dat67, dat75);
__m512 fft1179 = _mm512_add_ps(dat68, dat76);
__m512 fft1267 = _mm512_add_ps(dat69, dat77);
__m512 fft1180 = _mm512_sub_ps(dat68, dat76);
__m512 fft1268 = _mm512_sub_ps(dat69, dat77);
__m512 fft1181 = _mm512_add_ps(dat70, dat78);
__m512 fft1269 = _mm512_add_ps(dat71, dat79);
__m512 fft1182 = _mm512_sub_ps(dat70, dat78);
__m512 fft1270 = _mm512_sub_ps(dat71, dat79);
__m512 fft1183 = _mm512_add_ps(dat72, dat80);
__m512 fft1271 = _mm512_add_ps(dat73, dat81);
__m512 fft1184 = _mm512_sub_ps(dat72, dat80);
__m512 fft1272 = _mm512_sub_ps(dat73, dat81);
__m512 fft1185 = _mm512_add_ps(fft1177, fft1181);
__m512 fft1273 = _mm512_add_ps(fft1265, fft1269);
__m512 fft1186 = _mm512_sub_ps(fft1177, fft1181);
__m512 fft1274 = _mm512_sub_ps(fft1265, fft1269);
__m512 fft1187 = _mm512_add_ps(fft1179, fft1183);
__m512 fft1275 = _mm512_add_ps(fft1267, fft1271);
__m512 fft1188 = _mm512_sub_ps(fft1183, fft1179);
__m512 fft1276 = _mm512_sub_ps(fft1271, fft1267);
__m512 fft1189 = _mm512_sub_ps(fft1180, fft1184);
__m512 fft1277 = _mm512_sub_ps(fft1268, fft1272);
__m512 fft1190 = _mm512_add_ps(fft1180, fft1184);
__m512 fft1278 = _mm512_add_ps(fft1268, fft1272);
__m512 fft1191 = _mm512_add_ps(fft1185, fft1187);
__m512 fft1279 = _mm512_add_ps(fft1273, fft1275);
__m512 fft1192 = _mm512_sub_ps(fft1185, fft1187);
__m512 fft1280 = _mm512_sub_ps(fft1273, fft1275);
__m512 fft1193 = _mm512_fmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1281 = _mm512_fmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1194 = _mm512_fnmsub_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1282 = _mm512_fnmsub_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1195 = _mm512_fnmadd_ps(fft1189, _mm512_set1_ps(7.0710677e-01f), fft1178);
__m512 fft1283 = _mm512_fnmadd_ps(fft1277, _mm512_set1_ps(7.0710677e-01f), fft1266);
__m512 fft1196 = _mm512_fnmadd_ps(fft1190, _mm512_set1_ps(7.0710677e-01f), fft1182);
__m512 fft1284 = _mm512_fnmadd_ps(fft1278, _mm512_set1_ps(7.0710677e-01f), fft1270);
__m512 fft1197 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1198 = _mm512_fmadd_ps(fft1191, fft1197, _mm512_shuffle_f32x4(fft1191, fft1191, 78));
__m512 fft1285 = _mm512_fmadd_ps(fft1279, fft1197, _mm512_shuffle_f32x4(fft1279, fft1279, 78));
__m512 fft1199 = _mm512_fmadd_ps(fft1192, fft1197, _mm512_shuffle_f32x4(fft1192, fft1192, 78));
__m512 fft1286 = _mm512_fmadd_ps(fft1280, fft1197, _mm512_shuffle_f32x4(fft1280, fft1280, 78));
__m512 fft1200 = _mm512_fmadd_ps(fft1193, fft1197, _mm512_shuffle_f32x4(fft1193, fft1193, 78));
__m512 fft1287 = _mm512_fmadd_ps(fft1281, fft1197, _mm512_shuffle_f32x4(fft1281, fft1281, 78));
__m512 fft1201 = _mm512_fmadd_ps(fft1194, fft1197, _mm512_shuffle_f32x4(fft1194, fft1194, 78));
__m512 fft1288 = _mm512_fmadd_ps(fft1282, fft1197, _mm512_shuffle_f32x4(fft1282, fft1282, 78));
__m512 fft1202 = _mm512_fmadd_ps(fft1186, fft1197, _mm512_shuffle_f32x4(fft1186, fft1186, 78));
__m512 fft1289 = _mm512_fmadd_ps(fft1274, fft1197, _mm512_shuffle_f32x4(fft1274, fft1274, 78));
__m512 fft1203 = _mm512_fmadd_ps(fft1188, fft1197, _mm512_shuffle_f32x4(fft1188, fft1188, 78));
__m512 fft1290 = _mm512_fmadd_ps(fft1276, fft1197, _mm512_shuffle_f32x4(fft1276, fft1276, 78));
__m512 fft1204 = _mm512_fmadd_ps(fft1195, fft1197, _mm512_shuffle_f32x4(fft1195, fft1195, 78));
__m512 fft1291 = _mm512_fmadd_ps(fft1283, fft1197, _mm512_shuffle_f32x4(fft1283, fft1283, 78));
__m512 fft1205 = _mm512_fmadd_ps(fft1196, fft1197, _mm512_shuffle_f32x4(fft1196, fft1196, 78));
__m512 fft1292 = _mm512_fmadd_ps(fft1284, fft1197, _mm512_shuffle_f32x4(fft1284, fft1284, 78));
__m512 fft1206 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1207 = _mm512_mul_ps(fft1198, fft1206);
__m512 fft1293 = _mm512_mul_ps(fft1285, fft1206);
__m512 fft1208 = _mm512_mul_ps(fft1199, fft1206);
__m512 fft1294 = _mm512_mul_ps(fft1286, fft1206);
__m512 fft1209 = _mm512_mul_ps(fft1200, fft1206);
__m512 fft1295 = _mm512_mul_ps(fft1287, fft1206);
__m512 fft1210 = _mm512_mul_ps(fft1201, fft1206);
__m512 fft1296 = _mm512_mul_ps(fft1288, fft1206);
__m512 fft1211 = _mm512_mul_ps(fft1202, fft1206);
__m512 fft1297 = _mm512_mul_ps(fft1289, fft1206);
__m512 fft1212 = _mm512_mul_ps(fft1203, fft1206);
__m512 fft1298 = _mm512_mul_ps(fft1290, fft1206);
__m512 fft1213 = _mm512_mul_ps(fft1204, fft1206);
__m512 fft1299 = _mm512_mul_ps(fft1291, fft1206);
__m512 fft1214 = _mm512_mul_ps(fft1205, fft1206);
__m512 fft1300 = _mm512_mul_ps(fft1292, fft1206);
__m512 fft1215 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1216 = _mm512_fmadd_ps(fft1199, fft1215, fft1207);
__m512 fft1301 = _mm512_fmadd_ps(fft1286, fft1215, fft1293);
__m512 fft1217 = _mm512_fnmadd_ps(fft1198, fft1215, fft1208);
__m512 fft1302 = _mm512_fnmadd_ps(fft1285, fft1215, fft1294);
__m512 fft1218 = _mm512_fmadd_ps(fft1201, fft1215, fft1209);
__m512 fft1303 = _mm512_fmadd_ps(fft1288, fft1215, fft1295);
__m512 fft1219 = _mm512_fnmadd_ps(fft1200, fft1215, fft1210);
__m512 fft1304 = _mm512_fnmadd_ps(fft1287, fft1215, fft1296);
__m512 fft1220 = _mm512_fmadd_ps(fft1203, fft1215, fft1211);
__m512 fft1305 = _mm512_fmadd_ps(fft1290, fft1215, fft1297);
__m512 fft1221 = _mm512_fnmadd_ps(fft1202, fft1215, fft1212);
__m512 fft1306 = _mm512_fnmadd_ps(fft1289, fft1215, fft1298);
__m512 fft1222 = _mm512_fmadd_ps(fft1205, fft1215, fft1213);
__m512 fft1307 = _mm512_fmadd_ps(fft1292, fft1215, fft1299);
__m512 fft1223 = _mm512_fnmadd_ps(fft1204, fft1215, fft1214);
__m512 fft1308 = _mm512_fnmadd_ps(fft1291, fft1215, fft1300);
__m512 fft1224 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1225 = _mm512_fmadd_ps(fft1216, fft1224, _mm512_shuffle_f32x4(fft1216, fft1216, 177));
__m512 fft1309 = _mm512_fmadd_ps(fft1301, fft1224, _mm512_shuffle_f32x4(fft1301, fft1301, 177));
__m512 fft1226 = _mm512_fmadd_ps(fft1217, fft1224, _mm512_shuffle_f32x4(fft1217, fft1217, 177));
__m512 fft1310 = _mm512_fmadd_ps(fft1302, fft1224, _mm512_shuffle_f32x4(fft1302, fft1302, 177));
__m512 fft1227 = _mm512_fmadd_ps(fft1218, fft1224, _mm512_shuffle_f32x4(fft1218, fft1218, 177));
__m512 fft1311 = _mm512_fmadd_ps(fft1303, fft1224, _mm512_shuffle_f32x4(fft1303, fft1303, 177));
__m512 fft1228 = _mm512_fmadd_ps(fft1219, fft1224, _mm512_shuffle_f32x4(fft1219, fft1219, 177));
__m512 fft1312 = _mm512_fmadd_ps(fft1304, fft1224, _mm512_shuffle_f32x4(fft1304, fft1304, 177));
__m512 fft1229 = _mm512_fmadd_ps(fft1220, fft1224, _mm512_shuffle_f32x4(fft1220, fft1220, 177));
__m512 fft1313 = _mm512_fmadd_ps(fft1305, fft1224, _mm512_shuffle_f32x4(fft1305, fft1305, 177));
__m512 fft1230 = _mm512_fmadd_ps(fft1221, fft1224, _mm512_shuffle_f32x4(fft1221, fft1221, 177));
__m512 fft1314 = _mm512_fmadd_ps(fft1306, fft1224, _mm512_shuffle_f32x4(fft1306, fft1306, 177));
__m512 fft1231 = _mm512_fmadd_ps(fft1222, fft1224, _mm512_shuffle_f32x4(fft1222, fft1222, 177));
__m512 fft1315 = _mm512_fmadd_ps(fft1307, fft1224, _mm512_shuffle_f32x4(fft1307, fft1307, 177));
__m512 fft1232 = _mm512_fmadd_ps(fft1223, fft1224, _mm512_shuffle_f32x4(fft1223, fft1223, 177));
__m512 fft1316 = _mm512_fmadd_ps(fft1308, fft1224, _mm512_shuffle_f32x4(fft1308, fft1308, 177));
__m512 fft1233 = _mm512_mask_mov_ps(fft1225, 49344, fft1226);
__m512 fft1317 = _mm512_mask_mov_ps(fft1309, 49344, fft1310);
__m512 fft1234 = _mm512_mask_sub_ps(fft1226, 49344, _mm512_setzero_ps(), fft1225);
__m512 fft1318 = _mm512_mask_sub_ps(fft1310, 49344, _mm512_setzero_ps(), fft1309);
__m512 fft1235 = _mm512_mask_mov_ps(fft1227, 49344, fft1228);
__m512 fft1319 = _mm512_mask_mov_ps(fft1311, 49344, fft1312);
__m512 fft1236 = _mm512_mask_sub_ps(fft1228, 49344, _mm512_setzero_ps(), fft1227);
__m512 fft1320 = _mm512_mask_sub_ps(fft1312, 49344, _mm512_setzero_ps(), fft1311);
__m512 fft1237 = _mm512_mask_mov_ps(fft1229, 49344, fft1230);
__m512 fft1321 = _mm512_mask_mov_ps(fft1313, 49344, fft1314);
__m512 fft1238 = _mm512_mask_sub_ps(fft1230, 49344, _mm512_setzero_ps(), fft1229);
__m512 fft1322 = _mm512_mask_sub_ps(fft1314, 49344, _mm512_setzero_ps(), fft1313);
__m512 fft1239 = _mm512_mask_mov_ps(fft1231, 49344, fft1232);
__m512 fft1323 = _mm512_mask_mov_ps(fft1315, 49344, fft1316);
__m512 fft1240 = _mm512_mask_sub_ps(fft1232, 49344, _mm512_setzero_ps(), fft1231);
__m512 fft1324 = _mm512_mask_sub_ps(fft1316, 49344, _mm512_setzero_ps(), fft1315);
__m512 fft1241 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1242 = _mm512_fmadd_ps(fft1233, fft1241, _mm512_shuffle_ps(fft1233, fft1233, 78));
__m512 fft1325 = _mm512_fmadd_ps(fft1317, fft1241, _mm512_shuffle_ps(fft1317, fft1317, 78));
__m512 fft1243 = _mm512_fmadd_ps(fft1234, fft1241, _mm512_shuffle_ps(fft1234, fft1234, 78));
__m512 fft1326 = _mm512_fmadd_ps(fft1318, fft1241, _mm512_shuffle_ps(fft1318, fft1318, 78));
__m512 fft1244 = _mm512_fmadd_ps(fft1235, fft1241, _mm512_shuffle_ps(fft1235, fft1235, 78));
__m512 fft1327 = _mm512_fmadd_ps(fft1319, fft1241, _mm512_shuffle_ps(fft1319, fft1319, 78));
__m512 fft1245 = _mm512_fmadd_ps(fft1236, fft1241, _mm512_shuffle_ps(fft1236, fft1236, 78));
__m512 fft1328 = _mm512_fmadd_ps(fft1320, fft1241, _mm512_shuffle_ps(fft1320, fft1320, 78));
__m512 fft1246 = _mm512_fmadd_ps(fft1237, fft1241, _mm512_shuffle_ps(fft1237, fft1237, 78));
__m512 fft1329 = _mm512_fmadd_ps(fft1321, fft1241, _mm512_shuffle_ps(fft1321, fft1321, 78));
__m512 fft1247 = _mm512_fmadd_ps(fft1238, fft1241, _mm512_shuffle_ps(fft1238, fft1238, 78));
__m512 fft1330 = _mm512_fmadd_ps(fft1322, fft1241, _mm512_shuffle_ps(fft1322, fft1322, 78));
__m512 fft1248 = _mm512_fmadd_ps(fft1239, fft1241, _mm512_shuffle_ps(fft1239, fft1239, 78));
__m512 fft1331 = _mm512_fmadd_ps(fft1323, fft1241, _mm512_shuffle_ps(fft1323, fft1323, 78));
__m512 fft1249 = _mm512_fmadd_ps(fft1240, fft1241, _mm512_shuffle_ps(fft1240, fft1240, 78));
__m512 fft1332 = _mm512_fmadd_ps(fft1324, fft1241, _mm512_shuffle_ps(fft1324, fft1324, 78));
__m512i fft1250 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1251 = _mm512_permutexvar_ps(fft1250, fft1242);
__m512 fft1333 = _mm512_permutexvar_ps(fft1250, fft1325);
__m512i fft1252 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1253 = _mm512_permutexvar_ps(fft1252, fft1242);
__m512 fft1334 = _mm512_permutexvar_ps(fft1252, fft1325);
__m512 fft1254 = _mm512_permutexvar_ps(fft1250, fft1243);
__m512 fft1335 = _mm512_permutexvar_ps(fft1250, fft1326);
__m512 fft1255 = _mm512_permutexvar_ps(fft1252, fft1243);
__m512 fft1336 = _mm512_permutexvar_ps(fft1252, fft1326);
__m512 fft1256 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1257 = _mm512_fmadd_ps(fft1251, fft1256, fft1253);
__m512 fft1337 = _mm512_fmadd_ps(fft1333, fft1256, fft1334);
__m512 fft1258 = _mm512_fnmadd_ps(fft1255, fft1256, fft1254);
__m512 fft1338 = _mm512_fnmadd_ps(fft1336, fft1256, fft1335);
__m512 fft1259 = _mm512_mask_mov_ps(fft1255, 21845, fft1257);
__m512 fft1339 = _mm512_mask_mov_ps(fft1336, 21845, fft1337);
__m512 fft1260 = _mm512_mask_mov_ps(fft1251, 43176, fft1257);
__m512 fft1340 = _mm512_mask_mov_ps(fft1333, 43176, fft1337);
__m512 fft1261 = _mm512_mask_mov_ps(fft1259, 43176, fft1258);
__m512 fft1341 = _mm512_mask_mov_ps(fft1339, 43176, fft1338);
__m512 fft1262 = _mm512_mask_mov_ps(fft1260, 22102, fft1258);
__m512 fft1342 = _mm512_mask_mov_ps(fft1340, 22102, fft1338);
__m512 fft1263 = _mm512_mask_mul_ps(fft1261, 64764, fft1261, _mm512_set1_ps(5e-01f));
__m512 fft1343 = _mm512_mask_mul_ps(fft1341, 64764, fft1341, _mm512_set1_ps(5e-01f));
__m512 fft1264 = _mm512_mask_mul_ps(fft1262, 64764, fft1262, _mm512_set1_ps(5e-01f));
__m512 fft1344 = _mm512_mask_mul_ps(fft1342, 64764, fft1342, _mm512_set1_ps(5e-01f));
__m512 df81 = fft1263;
__m512 df89 = fft1343;
__m512 df82 = fft1264;
__m512 df90 = fft1344;
__m512 df83 = fft1244;
__m512 df91 = fft1327;
__m512 df84 = fft1245;
__m512 df92 = fft1328;
__m512 df85 = fft1246;
__m512 df93 = fft1329;
__m512 df86 = fft1247;
__m512 df94 = fft1330;
__m512 df87 = fft1248;
__m512 df95 = fft1331;
__m512 df88 = fft1249;
__m512 df96 = fft1332;
__m512i eo8 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df83 = _mm512_permutexvar_ps(eo8, df83);
df84 = _mm512_permutexvar_ps(eo8, df84);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df83);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df84);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df83);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df84);
df91 = _mm512_permutexvar_ps(eo8, df91);
df92 = _mm512_permutexvar_ps(eo8, df92);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df91);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df92);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df91);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df92);
df85 = _mm512_permutexvar_ps(eo8, df85);
df86 = _mm512_permutexvar_ps(eo8, df86);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df85);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df86);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df85);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df86);
df93 = _mm512_permutexvar_ps(eo8, df93);
df94 = _mm512_permutexvar_ps(eo8, df94);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df93);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df94);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df93);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df94);
df87 = _mm512_permutexvar_ps(eo8, df87);
df88 = _mm512_permutexvar_ps(eo8, df88);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df87);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df88);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df87);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df88);
df95 = _mm512_permutexvar_ps(eo8, df95);
df96 = _mm512_permutexvar_ps(eo8, df96);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df95);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df96);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df95);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df96);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df81);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df82);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df81);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df82);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df89);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k4+128*m8+32*f9, 255, df90);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df89);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k4+128*m8+32*f9, 65280, df90);
}
// Stop as soon as j2 has reached last1; otherwise move on to the next j2.
if (j2 >= last1) return;
++j2;
// The increment above is overwritten right away: j2 is set to 4, which is the
// origin used by the (j2-4) arithmetic of the section that follows.
// NOTE(review): this makes `++j2` a dead store; presumably j2 is 3 here so both
// statements agree - confirm against the start of this section.
j2 = 4;
}
// Section for j2 values below 84. Only the upper bound is tested here; the
// (j2-4) arithmetic below takes 4 as the origin of this section.
if (j2 < 84) {
// rel2: position of j2 inside a repeating cycle of 23 entries.
// base2: 10 plus 60 for every completed cycle; used below as the h term of the load offset.
ptrdiff_t rel2 = (size_t)(j2-4)%23;
ptrdiff_t base2 = 10+(size_t)(j2-4)/23*60;
// Loop over cycles: every further pass restarts at rel2 = 0 with base2 advanced by 60.
// The loop header has no exit condition; the exit visible in this region is the
// `return` taken once j2 reaches last1.
for (; ; rel2 = 0, base2 += 60) {
// Nested range tests on rel2 select which case of the cycle applies.
if (rel2 < 11) {
if (rel2 < 4) {
if (rel2 < 3) {
// Cycle positions 0..2: h is base2, w starts at 10+60*rel2 and advances by 60 per j2.
ptrdiff_t h4 = base2+0;
ptrdiff_t w4 = 10+60*rel2;
// jj2 is the j2 value that corresponds to cycle position 2, i.e. the last one this run handles.
ptrdiff_t jj2 = 2-rel2+j2;
for (; j2 <= jj2; w4 += 60) {
// k5 covers three consecutive indices, 3*s1 through 3*s1+2.
ptrdiff_t k5 = 3*s1;
ptrdiff_t kk4 = k5+2;
for (; k5 <= kk4; ++k5) {
// Broadcast the two adjacent floats stored at pair index (k5+3*i6) in bnPtr2 to all lanes;
// they are applied to every loaded vector below as dat*bnMul4+bnAdd4.
// NOTE(review): given the BatchNorm layer feeding the first Conv in the graph, this looks
// like a per-channel scale/shift pair - confirm against the code that fills bnPtr2.
__m512 bnMul4 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k5+3*i6))[0]);
__m512 bnAdd4 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k5+3*i6))[1]);
// Six sub-steps per (j2, k5). b9 is split into m9 = b9/2 and f10 = b9%2, which feed the
// 128*m9+32*f10 terms of the store addresses further down; the load base shifts by 40 per b9.
for (ptrdiff_t b9 = 0; b9 < 6; ++b9) {
ptrdiff_t m9 = (size_t)b9/2;
ptrdiff_t f10 = (size_t)b9%2;
// Load 16 full vectors (mask 65535 selects all 16 lanes) at offsets 0, 896, ..., 13440 from
// the same base, applying the multiply-add to each one right after its load.
// NOTE(review): 896 == 224*4, so these are presumably 16 consecutive rows of a 224-wide
// float image addressed in bytes - confirm that datPtr1 is a byte pointer.
__m512 dat82 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat82 = _mm512_mask_fmadd_ps(dat82, 65535, bnMul4, bnAdd4);
__m512 dat83 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat83 = _mm512_mask_fmadd_ps(dat83, 65535, bnMul4, bnAdd4);
__m512 dat84 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat84 = _mm512_mask_fmadd_ps(dat84, 65535, bnMul4, bnAdd4);
__m512 dat85 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat85 = _mm512_mask_fmadd_ps(dat85, 65535, bnMul4, bnAdd4);
__m512 dat86 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat86 = _mm512_mask_fmadd_ps(dat86, 65535, bnMul4, bnAdd4);
__m512 dat87 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat87 = _mm512_mask_fmadd_ps(dat87, 65535, bnMul4, bnAdd4);
__m512 dat88 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat88 = _mm512_mask_fmadd_ps(dat88, 65535, bnMul4, bnAdd4);
__m512 dat89 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat89 = _mm512_mask_fmadd_ps(dat89, 65535, bnMul4, bnAdd4);
__m512 dat90 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat90 = _mm512_mask_fmadd_ps(dat90, 65535, bnMul4, bnAdd4);
__m512 dat91 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat91 = _mm512_mask_fmadd_ps(dat91, 65535, bnMul4, bnAdd4);
__m512 dat92 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat92 = _mm512_mask_fmadd_ps(dat92, 65535, bnMul4, bnAdd4);
__m512 dat93 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat93 = _mm512_mask_fmadd_ps(dat93, 65535, bnMul4, bnAdd4);
__m512 dat94 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat94 = _mm512_mask_fmadd_ps(dat94, 65535, bnMul4, bnAdd4);
__m512 dat95 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat95 = _mm512_mask_fmadd_ps(dat95, 65535, bnMul4, bnAdd4);
__m512 dat96 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat96 = _mm512_mask_fmadd_ps(dat96, 65535, bnMul4, bnAdd4);
__m512 dat97 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k5+896*h4+4*w4+40*b9);
dat97 = _mm512_mask_fmadd_ps(dat97, 65535, bnMul4, bnAdd4);
__m512 fft1345 = _mm512_add_ps(dat82, dat90);
__m512 fft1433 = _mm512_add_ps(dat83, dat91);
__m512 fft1346 = _mm512_sub_ps(dat82, dat90);
__m512 fft1434 = _mm512_sub_ps(dat83, dat91);
__m512 fft1347 = _mm512_add_ps(dat84, dat92);
__m512 fft1435 = _mm512_add_ps(dat85, dat93);
__m512 fft1348 = _mm512_sub_ps(dat84, dat92);
__m512 fft1436 = _mm512_sub_ps(dat85, dat93);
__m512 fft1349 = _mm512_add_ps(dat86, dat94);
__m512 fft1437 = _mm512_add_ps(dat87, dat95);
__m512 fft1350 = _mm512_sub_ps(dat86, dat94);
__m512 fft1438 = _mm512_sub_ps(dat87, dat95);
__m512 fft1351 = _mm512_add_ps(dat88, dat96);
__m512 fft1439 = _mm512_add_ps(dat89, dat97);
__m512 fft1352 = _mm512_sub_ps(dat88, dat96);
__m512 fft1440 = _mm512_sub_ps(dat89, dat97);
__m512 fft1353 = _mm512_add_ps(fft1345, fft1349);
__m512 fft1441 = _mm512_add_ps(fft1433, fft1437);
__m512 fft1354 = _mm512_sub_ps(fft1345, fft1349);
__m512 fft1442 = _mm512_sub_ps(fft1433, fft1437);
__m512 fft1355 = _mm512_add_ps(fft1347, fft1351);
__m512 fft1443 = _mm512_add_ps(fft1435, fft1439);
__m512 fft1356 = _mm512_sub_ps(fft1351, fft1347);
__m512 fft1444 = _mm512_sub_ps(fft1439, fft1435);
__m512 fft1357 = _mm512_sub_ps(fft1348, fft1352);
__m512 fft1445 = _mm512_sub_ps(fft1436, fft1440);
__m512 fft1358 = _mm512_add_ps(fft1348, fft1352);
__m512 fft1446 = _mm512_add_ps(fft1436, fft1440);
__m512 fft1359 = _mm512_add_ps(fft1353, fft1355);
__m512 fft1447 = _mm512_add_ps(fft1441, fft1443);
__m512 fft1360 = _mm512_sub_ps(fft1353, fft1355);
__m512 fft1448 = _mm512_sub_ps(fft1441, fft1443);
__m512 fft1361 = _mm512_fmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1449 = _mm512_fmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1362 = _mm512_fnmsub_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1450 = _mm512_fnmsub_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1363 = _mm512_fnmadd_ps(fft1357, _mm512_set1_ps(7.0710677e-01f), fft1346);
__m512 fft1451 = _mm512_fnmadd_ps(fft1445, _mm512_set1_ps(7.0710677e-01f), fft1434);
__m512 fft1364 = _mm512_fnmadd_ps(fft1358, _mm512_set1_ps(7.0710677e-01f), fft1350);
__m512 fft1452 = _mm512_fnmadd_ps(fft1446, _mm512_set1_ps(7.0710677e-01f), fft1438);
__m512 fft1365 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1366 = _mm512_fmadd_ps(fft1359, fft1365, _mm512_shuffle_f32x4(fft1359, fft1359, 78));
__m512 fft1453 = _mm512_fmadd_ps(fft1447, fft1365, _mm512_shuffle_f32x4(fft1447, fft1447, 78));
__m512 fft1367 = _mm512_fmadd_ps(fft1360, fft1365, _mm512_shuffle_f32x4(fft1360, fft1360, 78));
__m512 fft1454 = _mm512_fmadd_ps(fft1448, fft1365, _mm512_shuffle_f32x4(fft1448, fft1448, 78));
__m512 fft1368 = _mm512_fmadd_ps(fft1361, fft1365, _mm512_shuffle_f32x4(fft1361, fft1361, 78));
__m512 fft1455 = _mm512_fmadd_ps(fft1449, fft1365, _mm512_shuffle_f32x4(fft1449, fft1449, 78));
__m512 fft1369 = _mm512_fmadd_ps(fft1362, fft1365, _mm512_shuffle_f32x4(fft1362, fft1362, 78));
__m512 fft1456 = _mm512_fmadd_ps(fft1450, fft1365, _mm512_shuffle_f32x4(fft1450, fft1450, 78));
__m512 fft1370 = _mm512_fmadd_ps(fft1354, fft1365, _mm512_shuffle_f32x4(fft1354, fft1354, 78));
__m512 fft1457 = _mm512_fmadd_ps(fft1442, fft1365, _mm512_shuffle_f32x4(fft1442, fft1442, 78));
__m512 fft1371 = _mm512_fmadd_ps(fft1356, fft1365, _mm512_shuffle_f32x4(fft1356, fft1356, 78));
__m512 fft1458 = _mm512_fmadd_ps(fft1444, fft1365, _mm512_shuffle_f32x4(fft1444, fft1444, 78));
__m512 fft1372 = _mm512_fmadd_ps(fft1363, fft1365, _mm512_shuffle_f32x4(fft1363, fft1363, 78));
__m512 fft1459 = _mm512_fmadd_ps(fft1451, fft1365, _mm512_shuffle_f32x4(fft1451, fft1451, 78));
__m512 fft1373 = _mm512_fmadd_ps(fft1364, fft1365, _mm512_shuffle_f32x4(fft1364, fft1364, 78));
__m512 fft1460 = _mm512_fmadd_ps(fft1452, fft1365, _mm512_shuffle_f32x4(fft1452, fft1452, 78));
__m512 fft1374 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1375 = _mm512_mul_ps(fft1366, fft1374);
__m512 fft1461 = _mm512_mul_ps(fft1453, fft1374);
__m512 fft1376 = _mm512_mul_ps(fft1367, fft1374);
__m512 fft1462 = _mm512_mul_ps(fft1454, fft1374);
__m512 fft1377 = _mm512_mul_ps(fft1368, fft1374);
__m512 fft1463 = _mm512_mul_ps(fft1455, fft1374);
__m512 fft1378 = _mm512_mul_ps(fft1369, fft1374);
__m512 fft1464 = _mm512_mul_ps(fft1456, fft1374);
__m512 fft1379 = _mm512_mul_ps(fft1370, fft1374);
__m512 fft1465 = _mm512_mul_ps(fft1457, fft1374);
__m512 fft1380 = _mm512_mul_ps(fft1371, fft1374);
__m512 fft1466 = _mm512_mul_ps(fft1458, fft1374);
__m512 fft1381 = _mm512_mul_ps(fft1372, fft1374);
__m512 fft1467 = _mm512_mul_ps(fft1459, fft1374);
__m512 fft1382 = _mm512_mul_ps(fft1373, fft1374);
__m512 fft1468 = _mm512_mul_ps(fft1460, fft1374);
__m512 fft1383 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1384 = _mm512_fmadd_ps(fft1367, fft1383, fft1375);
__m512 fft1469 = _mm512_fmadd_ps(fft1454, fft1383, fft1461);
__m512 fft1385 = _mm512_fnmadd_ps(fft1366, fft1383, fft1376);
__m512 fft1470 = _mm512_fnmadd_ps(fft1453, fft1383, fft1462);
__m512 fft1386 = _mm512_fmadd_ps(fft1369, fft1383, fft1377);
__m512 fft1471 = _mm512_fmadd_ps(fft1456, fft1383, fft1463);
__m512 fft1387 = _mm512_fnmadd_ps(fft1368, fft1383, fft1378);
__m512 fft1472 = _mm512_fnmadd_ps(fft1455, fft1383, fft1464);
__m512 fft1388 = _mm512_fmadd_ps(fft1371, fft1383, fft1379);
__m512 fft1473 = _mm512_fmadd_ps(fft1458, fft1383, fft1465);
__m512 fft1389 = _mm512_fnmadd_ps(fft1370, fft1383, fft1380);
__m512 fft1474 = _mm512_fnmadd_ps(fft1457, fft1383, fft1466);
__m512 fft1390 = _mm512_fmadd_ps(fft1373, fft1383, fft1381);
__m512 fft1475 = _mm512_fmadd_ps(fft1460, fft1383, fft1467);
__m512 fft1391 = _mm512_fnmadd_ps(fft1372, fft1383, fft1382);
__m512 fft1476 = _mm512_fnmadd_ps(fft1459, fft1383, fft1468);
__m512 fft1392 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1393 = _mm512_fmadd_ps(fft1384, fft1392, _mm512_shuffle_f32x4(fft1384, fft1384, 177));
__m512 fft1477 = _mm512_fmadd_ps(fft1469, fft1392, _mm512_shuffle_f32x4(fft1469, fft1469, 177));
__m512 fft1394 = _mm512_fmadd_ps(fft1385, fft1392, _mm512_shuffle_f32x4(fft1385, fft1385, 177));
__m512 fft1478 = _mm512_fmadd_ps(fft1470, fft1392, _mm512_shuffle_f32x4(fft1470, fft1470, 177));
__m512 fft1395 = _mm512_fmadd_ps(fft1386, fft1392, _mm512_shuffle_f32x4(fft1386, fft1386, 177));
__m512 fft1479 = _mm512_fmadd_ps(fft1471, fft1392, _mm512_shuffle_f32x4(fft1471, fft1471, 177));
__m512 fft1396 = _mm512_fmadd_ps(fft1387, fft1392, _mm512_shuffle_f32x4(fft1387, fft1387, 177));
__m512 fft1480 = _mm512_fmadd_ps(fft1472, fft1392, _mm512_shuffle_f32x4(fft1472, fft1472, 177));
__m512 fft1397 = _mm512_fmadd_ps(fft1388, fft1392, _mm512_shuffle_f32x4(fft1388, fft1388, 177));
__m512 fft1481 = _mm512_fmadd_ps(fft1473, fft1392, _mm512_shuffle_f32x4(fft1473, fft1473, 177));
__m512 fft1398 = _mm512_fmadd_ps(fft1389, fft1392, _mm512_shuffle_f32x4(fft1389, fft1389, 177));
__m512 fft1482 = _mm512_fmadd_ps(fft1474, fft1392, _mm512_shuffle_f32x4(fft1474, fft1474, 177));
__m512 fft1399 = _mm512_fmadd_ps(fft1390, fft1392, _mm512_shuffle_f32x4(fft1390, fft1390, 177));
__m512 fft1483 = _mm512_fmadd_ps(fft1475, fft1392, _mm512_shuffle_f32x4(fft1475, fft1475, 177));
__m512 fft1400 = _mm512_fmadd_ps(fft1391, fft1392, _mm512_shuffle_f32x4(fft1391, fft1391, 177));
__m512 fft1484 = _mm512_fmadd_ps(fft1476, fft1392, _mm512_shuffle_f32x4(fft1476, fft1476, 177));
__m512 fft1401 = _mm512_mask_mov_ps(fft1393, 49344, fft1394);
__m512 fft1485 = _mm512_mask_mov_ps(fft1477, 49344, fft1478);
__m512 fft1402 = _mm512_mask_sub_ps(fft1394, 49344, _mm512_setzero_ps(), fft1393);
__m512 fft1486 = _mm512_mask_sub_ps(fft1478, 49344, _mm512_setzero_ps(), fft1477);
__m512 fft1403 = _mm512_mask_mov_ps(fft1395, 49344, fft1396);
__m512 fft1487 = _mm512_mask_mov_ps(fft1479, 49344, fft1480);
__m512 fft1404 = _mm512_mask_sub_ps(fft1396, 49344, _mm512_setzero_ps(), fft1395);
__m512 fft1488 = _mm512_mask_sub_ps(fft1480, 49344, _mm512_setzero_ps(), fft1479);
__m512 fft1405 = _mm512_mask_mov_ps(fft1397, 49344, fft1398);
__m512 fft1489 = _mm512_mask_mov_ps(fft1481, 49344, fft1482);
__m512 fft1406 = _mm512_mask_sub_ps(fft1398, 49344, _mm512_setzero_ps(), fft1397);
__m512 fft1490 = _mm512_mask_sub_ps(fft1482, 49344, _mm512_setzero_ps(), fft1481);
__m512 fft1407 = _mm512_mask_mov_ps(fft1399, 49344, fft1400);
__m512 fft1491 = _mm512_mask_mov_ps(fft1483, 49344, fft1484);
__m512 fft1408 = _mm512_mask_sub_ps(fft1400, 49344, _mm512_setzero_ps(), fft1399);
__m512 fft1492 = _mm512_mask_sub_ps(fft1484, 49344, _mm512_setzero_ps(), fft1483);
__m512 fft1409 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1410 = _mm512_fmadd_ps(fft1401, fft1409, _mm512_shuffle_ps(fft1401, fft1401, 78));
__m512 fft1493 = _mm512_fmadd_ps(fft1485, fft1409, _mm512_shuffle_ps(fft1485, fft1485, 78));
__m512 fft1411 = _mm512_fmadd_ps(fft1402, fft1409, _mm512_shuffle_ps(fft1402, fft1402, 78));
__m512 fft1494 = _mm512_fmadd_ps(fft1486, fft1409, _mm512_shuffle_ps(fft1486, fft1486, 78));
__m512 fft1412 = _mm512_fmadd_ps(fft1403, fft1409, _mm512_shuffle_ps(fft1403, fft1403, 78));
__m512 fft1495 = _mm512_fmadd_ps(fft1487, fft1409, _mm512_shuffle_ps(fft1487, fft1487, 78));
__m512 fft1413 = _mm512_fmadd_ps(fft1404, fft1409, _mm512_shuffle_ps(fft1404, fft1404, 78));
__m512 fft1496 = _mm512_fmadd_ps(fft1488, fft1409, _mm512_shuffle_ps(fft1488, fft1488, 78));
__m512 fft1414 = _mm512_fmadd_ps(fft1405, fft1409, _mm512_shuffle_ps(fft1405, fft1405, 78));
__m512 fft1497 = _mm512_fmadd_ps(fft1489, fft1409, _mm512_shuffle_ps(fft1489, fft1489, 78));
__m512 fft1415 = _mm512_fmadd_ps(fft1406, fft1409, _mm512_shuffle_ps(fft1406, fft1406, 78));
__m512 fft1498 = _mm512_fmadd_ps(fft1490, fft1409, _mm512_shuffle_ps(fft1490, fft1490, 78));
__m512 fft1416 = _mm512_fmadd_ps(fft1407, fft1409, _mm512_shuffle_ps(fft1407, fft1407, 78));
__m512 fft1499 = _mm512_fmadd_ps(fft1491, fft1409, _mm512_shuffle_ps(fft1491, fft1491, 78));
__m512 fft1417 = _mm512_fmadd_ps(fft1408, fft1409, _mm512_shuffle_ps(fft1408, fft1408, 78));
__m512 fft1500 = _mm512_fmadd_ps(fft1492, fft1409, _mm512_shuffle_ps(fft1492, fft1492, 78));
__m512i fft1418 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1419 = _mm512_permutexvar_ps(fft1418, fft1410);
__m512 fft1501 = _mm512_permutexvar_ps(fft1418, fft1493);
__m512i fft1420 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1421 = _mm512_permutexvar_ps(fft1420, fft1410);
__m512 fft1502 = _mm512_permutexvar_ps(fft1420, fft1493);
__m512 fft1422 = _mm512_permutexvar_ps(fft1418, fft1411);
__m512 fft1503 = _mm512_permutexvar_ps(fft1418, fft1494);
__m512 fft1423 = _mm512_permutexvar_ps(fft1420, fft1411);
__m512 fft1504 = _mm512_permutexvar_ps(fft1420, fft1494);
__m512 fft1424 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1425 = _mm512_fmadd_ps(fft1419, fft1424, fft1421);
__m512 fft1505 = _mm512_fmadd_ps(fft1501, fft1424, fft1502);
__m512 fft1426 = _mm512_fnmadd_ps(fft1423, fft1424, fft1422);
__m512 fft1506 = _mm512_fnmadd_ps(fft1504, fft1424, fft1503);
__m512 fft1427 = _mm512_mask_mov_ps(fft1423, 21845, fft1425);
__m512 fft1507 = _mm512_mask_mov_ps(fft1504, 21845, fft1505);
__m512 fft1428 = _mm512_mask_mov_ps(fft1419, 43176, fft1425);
__m512 fft1508 = _mm512_mask_mov_ps(fft1501, 43176, fft1505);
__m512 fft1429 = _mm512_mask_mov_ps(fft1427, 43176, fft1426);
__m512 fft1509 = _mm512_mask_mov_ps(fft1507, 43176, fft1506);
__m512 fft1430 = _mm512_mask_mov_ps(fft1428, 22102, fft1426);
__m512 fft1510 = _mm512_mask_mov_ps(fft1508, 22102, fft1506);
__m512 fft1431 = _mm512_mask_mul_ps(fft1429, 64764, fft1429, _mm512_set1_ps(5e-01f));
__m512 fft1511 = _mm512_mask_mul_ps(fft1509, 64764, fft1509, _mm512_set1_ps(5e-01f));
__m512 fft1432 = _mm512_mask_mul_ps(fft1430, 64764, fft1430, _mm512_set1_ps(5e-01f));
__m512 fft1512 = _mm512_mask_mul_ps(fft1510, 64764, fft1510, _mm512_set1_ps(5e-01f));
__m512 df97 = fft1431;
__m512 df105 = fft1511;
__m512 df98 = fft1432;
__m512 df106 = fft1512;
__m512 df99 = fft1412;
__m512 df107 = fft1495;
__m512 df100 = fft1413;
__m512 df108 = fft1496;
__m512 df101 = fft1414;
__m512 df109 = fft1497;
__m512 df102 = fft1415;
__m512 df110 = fft1498;
__m512 df103 = fft1416;
__m512 df111 = fft1499;
__m512 df104 = fft1417;
__m512 df112 = fft1500;
__m512i eo9 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df99 = _mm512_permutexvar_ps(eo9, df99);
df100 = _mm512_permutexvar_ps(eo9, df100);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df99);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df100);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df99);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df100);
df107 = _mm512_permutexvar_ps(eo9, df107);
df108 = _mm512_permutexvar_ps(eo9, df108);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df107);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df108);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df107);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df108);
df101 = _mm512_permutexvar_ps(eo9, df101);
df102 = _mm512_permutexvar_ps(eo9, df102);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df101);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df102);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df101);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df102);
df109 = _mm512_permutexvar_ps(eo9, df109);
df110 = _mm512_permutexvar_ps(eo9, df110);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df109);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df110);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df109);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df110);
df103 = _mm512_permutexvar_ps(eo9, df103);
df104 = _mm512_permutexvar_ps(eo9, df104);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df103);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df104);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df103);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df104);
df111 = _mm512_permutexvar_ps(eo9, df111);
df112 = _mm512_permutexvar_ps(eo9, df112);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df111);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df112);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df111);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df112);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df97);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df98);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df97);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df98);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df105);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k5+128*m9+32*f10, 255, df106);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df105);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k5+128*m9+32*f10, 65280, df106);
}
}
// All k5/b9 work for this j2 is done: return if it was the last one, otherwise
// advance j2 (the enclosing for-header advances w4 by 60 alongside it).
if (j2 >= last1) return;
++j2;
}
// The run over cycle positions 0..2 is finished; continue with position 3.
rel2 = 3;
}
// Cycle position 3, reached by falling through from the run above or directly when
// rel2 was already 3: same h as before, fixed w of 190 (the value 10+60*3).
ptrdiff_t h5 = base2+0;
ptrdiff_t w5 = 190;
// k6 covers three consecutive indices, 3*s1 through 3*s1+2.
ptrdiff_t k6 = 3*s1;
ptrdiff_t kk5 = k6+2;
for (; k6 <= kk5; ++k6) {
// Broadcast the two adjacent floats stored at pair index (k6+3*i6) in bnPtr2 to all lanes;
// they are applied to each loaded vector below as dat*bnMul5+bnAdd5.
__m512 bnMul5 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k6+3*i6))[0]);
__m512 bnAdd5 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k6+3*i6))[1]);
// Only three sub-steps here (six in the run over positions 0..2); b10 is split the same
// way into m10 = b10/2 and f11 = b10%2.
for (ptrdiff_t b10 = 0; b10 < 3; ++b10) {
ptrdiff_t m10 = (size_t)b10/2;
ptrdiff_t f11 = (size_t)b10%2;
__m512 dat98 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat98 = _mm512_mask_fmadd_ps(dat98, 65535, bnMul5, bnAdd5);
__m512 dat99 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat99 = _mm512_mask_fmadd_ps(dat99, 65535, bnMul5, bnAdd5);
__m512 dat100 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat100 = _mm512_mask_fmadd_ps(dat100, 65535, bnMul5, bnAdd5);
__m512 dat101 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat101 = _mm512_mask_fmadd_ps(dat101, 65535, bnMul5, bnAdd5);
__m512 dat102 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat102 = _mm512_mask_fmadd_ps(dat102, 65535, bnMul5, bnAdd5);
__m512 dat103 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat103 = _mm512_mask_fmadd_ps(dat103, 65535, bnMul5, bnAdd5);
__m512 dat104 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat104 = _mm512_mask_fmadd_ps(dat104, 65535, bnMul5, bnAdd5);
__m512 dat105 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat105 = _mm512_mask_fmadd_ps(dat105, 65535, bnMul5, bnAdd5);
__m512 dat106 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat106 = _mm512_mask_fmadd_ps(dat106, 65535, bnMul5, bnAdd5);
__m512 dat107 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat107 = _mm512_mask_fmadd_ps(dat107, 65535, bnMul5, bnAdd5);
__m512 dat108 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat108 = _mm512_mask_fmadd_ps(dat108, 65535, bnMul5, bnAdd5);
__m512 dat109 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat109 = _mm512_mask_fmadd_ps(dat109, 65535, bnMul5, bnAdd5);
__m512 dat110 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat110 = _mm512_mask_fmadd_ps(dat110, 65535, bnMul5, bnAdd5);
__m512 dat111 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat111 = _mm512_mask_fmadd_ps(dat111, 65535, bnMul5, bnAdd5);
__m512 dat112 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat112 = _mm512_mask_fmadd_ps(dat112, 65535, bnMul5, bnAdd5);
__m512 dat113 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k6+896*h5+4*w5+40*b10);
dat113 = _mm512_mask_fmadd_ps(dat113, 65535, bnMul5, bnAdd5);
__m512 fft1513 = _mm512_add_ps(dat98, dat106);
__m512 fft1601 = _mm512_add_ps(dat99, dat107);
__m512 fft1514 = _mm512_sub_ps(dat98, dat106);
__m512 fft1602 = _mm512_sub_ps(dat99, dat107);
__m512 fft1515 = _mm512_add_ps(dat100, dat108);
__m512 fft1603 = _mm512_add_ps(dat101, dat109);
__m512 fft1516 = _mm512_sub_ps(dat100, dat108);
__m512 fft1604 = _mm512_sub_ps(dat101, dat109);
__m512 fft1517 = _mm512_add_ps(dat102, dat110);
__m512 fft1605 = _mm512_add_ps(dat103, dat111);
__m512 fft1518 = _mm512_sub_ps(dat102, dat110);
__m512 fft1606 = _mm512_sub_ps(dat103, dat111);
__m512 fft1519 = _mm512_add_ps(dat104, dat112);
__m512 fft1607 = _mm512_add_ps(dat105, dat113);
__m512 fft1520 = _mm512_sub_ps(dat104, dat112);
__m512 fft1608 = _mm512_sub_ps(dat105, dat113);
__m512 fft1521 = _mm512_add_ps(fft1513, fft1517);
__m512 fft1609 = _mm512_add_ps(fft1601, fft1605);
__m512 fft1522 = _mm512_sub_ps(fft1513, fft1517);
__m512 fft1610 = _mm512_sub_ps(fft1601, fft1605);
__m512 fft1523 = _mm512_add_ps(fft1515, fft1519);
__m512 fft1611 = _mm512_add_ps(fft1603, fft1607);
__m512 fft1524 = _mm512_sub_ps(fft1519, fft1515);
__m512 fft1612 = _mm512_sub_ps(fft1607, fft1603);
__m512 fft1525 = _mm512_sub_ps(fft1516, fft1520);
__m512 fft1613 = _mm512_sub_ps(fft1604, fft1608);
__m512 fft1526 = _mm512_add_ps(fft1516, fft1520);
__m512 fft1614 = _mm512_add_ps(fft1604, fft1608);
__m512 fft1527 = _mm512_add_ps(fft1521, fft1523);
__m512 fft1615 = _mm512_add_ps(fft1609, fft1611);
__m512 fft1528 = _mm512_sub_ps(fft1521, fft1523);
__m512 fft1616 = _mm512_sub_ps(fft1609, fft1611);
__m512 fft1529 = _mm512_fmadd_ps(fft1525, _mm512_set1_ps(7.0710677e-01f), fft1514);
__m512 fft1617 = _mm512_fmadd_ps(fft1613, _mm512_set1_ps(7.0710677e-01f), fft1602);
__m512 fft1530 = _mm512_fnmsub_ps(fft1526, _mm512_set1_ps(7.0710677e-01f), fft1518);
__m512 fft1618 = _mm512_fnmsub_ps(fft1614, _mm512_set1_ps(7.0710677e-01f), fft1606);
__m512 fft1531 = _mm512_fnmadd_ps(fft1525, _mm512_set1_ps(7.0710677e-01f), fft1514);
__m512 fft1619 = _mm512_fnmadd_ps(fft1613, _mm512_set1_ps(7.0710677e-01f), fft1602);
__m512 fft1532 = _mm512_fnmadd_ps(fft1526, _mm512_set1_ps(7.0710677e-01f), fft1518);
__m512 fft1620 = _mm512_fnmadd_ps(fft1614, _mm512_set1_ps(7.0710677e-01f), fft1606);
__m512 fft1533 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1534 = _mm512_fmadd_ps(fft1527, fft1533, _mm512_shuffle_f32x4(fft1527, fft1527, 78));
__m512 fft1621 = _mm512_fmadd_ps(fft1615, fft1533, _mm512_shuffle_f32x4(fft1615, fft1615, 78));
__m512 fft1535 = _mm512_fmadd_ps(fft1528, fft1533, _mm512_shuffle_f32x4(fft1528, fft1528, 78));
__m512 fft1622 = _mm512_fmadd_ps(fft1616, fft1533, _mm512_shuffle_f32x4(fft1616, fft1616, 78));
__m512 fft1536 = _mm512_fmadd_ps(fft1529, fft1533, _mm512_shuffle_f32x4(fft1529, fft1529, 78));
__m512 fft1623 = _mm512_fmadd_ps(fft1617, fft1533, _mm512_shuffle_f32x4(fft1617, fft1617, 78));
__m512 fft1537 = _mm512_fmadd_ps(fft1530, fft1533, _mm512_shuffle_f32x4(fft1530, fft1530, 78));
__m512 fft1624 = _mm512_fmadd_ps(fft1618, fft1533, _mm512_shuffle_f32x4(fft1618, fft1618, 78));
__m512 fft1538 = _mm512_fmadd_ps(fft1522, fft1533, _mm512_shuffle_f32x4(fft1522, fft1522, 78));
__m512 fft1625 = _mm512_fmadd_ps(fft1610, fft1533, _mm512_shuffle_f32x4(fft1610, fft1610, 78));
__m512 fft1539 = _mm512_fmadd_ps(fft1524, fft1533, _mm512_shuffle_f32x4(fft1524, fft1524, 78));
__m512 fft1626 = _mm512_fmadd_ps(fft1612, fft1533, _mm512_shuffle_f32x4(fft1612, fft1612, 78));
__m512 fft1540 = _mm512_fmadd_ps(fft1531, fft1533, _mm512_shuffle_f32x4(fft1531, fft1531, 78));
__m512 fft1627 = _mm512_fmadd_ps(fft1619, fft1533, _mm512_shuffle_f32x4(fft1619, fft1619, 78));
__m512 fft1541 = _mm512_fmadd_ps(fft1532, fft1533, _mm512_shuffle_f32x4(fft1532, fft1532, 78));
__m512 fft1628 = _mm512_fmadd_ps(fft1620, fft1533, _mm512_shuffle_f32x4(fft1620, fft1620, 78));
__m512 fft1542 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1543 = _mm512_mul_ps(fft1534, fft1542);
__m512 fft1629 = _mm512_mul_ps(fft1621, fft1542);
__m512 fft1544 = _mm512_mul_ps(fft1535, fft1542);
__m512 fft1630 = _mm512_mul_ps(fft1622, fft1542);
__m512 fft1545 = _mm512_mul_ps(fft1536, fft1542);
__m512 fft1631 = _mm512_mul_ps(fft1623, fft1542);
__m512 fft1546 = _mm512_mul_ps(fft1537, fft1542);
__m512 fft1632 = _mm512_mul_ps(fft1624, fft1542);
__m512 fft1547 = _mm512_mul_ps(fft1538, fft1542);
__m512 fft1633 = _mm512_mul_ps(fft1625, fft1542);
__m512 fft1548 = _mm512_mul_ps(fft1539, fft1542);
__m512 fft1634 = _mm512_mul_ps(fft1626, fft1542);
__m512 fft1549 = _mm512_mul_ps(fft1540, fft1542);
__m512 fft1635 = _mm512_mul_ps(fft1627, fft1542);
__m512 fft1550 = _mm512_mul_ps(fft1541, fft1542);
__m512 fft1636 = _mm512_mul_ps(fft1628, fft1542);
__m512 fft1551 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1552 = _mm512_fmadd_ps(fft1535, fft1551, fft1543);
__m512 fft1637 = _mm512_fmadd_ps(fft1622, fft1551, fft1629);
__m512 fft1553 = _mm512_fnmadd_ps(fft1534, fft1551, fft1544);
__m512 fft1638 = _mm512_fnmadd_ps(fft1621, fft1551, fft1630);
__m512 fft1554 = _mm512_fmadd_ps(fft1537, fft1551, fft1545);
__m512 fft1639 = _mm512_fmadd_ps(fft1624, fft1551, fft1631);
__m512 fft1555 = _mm512_fnmadd_ps(fft1536, fft1551, fft1546);
__m512 fft1640 = _mm512_fnmadd_ps(fft1623, fft1551, fft1632);
__m512 fft1556 = _mm512_fmadd_ps(fft1539, fft1551, fft1547);
__m512 fft1641 = _mm512_fmadd_ps(fft1626, fft1551, fft1633);
__m512 fft1557 = _mm512_fnmadd_ps(fft1538, fft1551, fft1548);
__m512 fft1642 = _mm512_fnmadd_ps(fft1625, fft1551, fft1634);
__m512 fft1558 = _mm512_fmadd_ps(fft1541, fft1551, fft1549);
__m512 fft1643 = _mm512_fmadd_ps(fft1628, fft1551, fft1635);
__m512 fft1559 = _mm512_fnmadd_ps(fft1540, fft1551, fft1550);
__m512 fft1644 = _mm512_fnmadd_ps(fft1627, fft1551, fft1636);
__m512 fft1560 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1561 = _mm512_fmadd_ps(fft1552, fft1560, _mm512_shuffle_f32x4(fft1552, fft1552, 177));
__m512 fft1645 = _mm512_fmadd_ps(fft1637, fft1560, _mm512_shuffle_f32x4(fft1637, fft1637, 177));
__m512 fft1562 = _mm512_fmadd_ps(fft1553, fft1560, _mm512_shuffle_f32x4(fft1553, fft1553, 177));
__m512 fft1646 = _mm512_fmadd_ps(fft1638, fft1560, _mm512_shuffle_f32x4(fft1638, fft1638, 177));
__m512 fft1563 = _mm512_fmadd_ps(fft1554, fft1560, _mm512_shuffle_f32x4(fft1554, fft1554, 177));
__m512 fft1647 = _mm512_fmadd_ps(fft1639, fft1560, _mm512_shuffle_f32x4(fft1639, fft1639, 177));
__m512 fft1564 = _mm512_fmadd_ps(fft1555, fft1560, _mm512_shuffle_f32x4(fft1555, fft1555, 177));
__m512 fft1648 = _mm512_fmadd_ps(fft1640, fft1560, _mm512_shuffle_f32x4(fft1640, fft1640, 177));
__m512 fft1565 = _mm512_fmadd_ps(fft1556, fft1560, _mm512_shuffle_f32x4(fft1556, fft1556, 177));
__m512 fft1649 = _mm512_fmadd_ps(fft1641, fft1560, _mm512_shuffle_f32x4(fft1641, fft1641, 177));
__m512 fft1566 = _mm512_fmadd_ps(fft1557, fft1560, _mm512_shuffle_f32x4(fft1557, fft1557, 177));
__m512 fft1650 = _mm512_fmadd_ps(fft1642, fft1560, _mm512_shuffle_f32x4(fft1642, fft1642, 177));
__m512 fft1567 = _mm512_fmadd_ps(fft1558, fft1560, _mm512_shuffle_f32x4(fft1558, fft1558, 177));
__m512 fft1651 = _mm512_fmadd_ps(fft1643, fft1560, _mm512_shuffle_f32x4(fft1643, fft1643, 177));
__m512 fft1568 = _mm512_fmadd_ps(fft1559, fft1560, _mm512_shuffle_f32x4(fft1559, fft1559, 177));
__m512 fft1652 = _mm512_fmadd_ps(fft1644, fft1560, _mm512_shuffle_f32x4(fft1644, fft1644, 177));
__m512 fft1569 = _mm512_mask_mov_ps(fft1561, 49344, fft1562);
__m512 fft1653 = _mm512_mask_mov_ps(fft1645, 49344, fft1646);
__m512 fft1570 = _mm512_mask_sub_ps(fft1562, 49344, _mm512_setzero_ps(), fft1561);
__m512 fft1654 = _mm512_mask_sub_ps(fft1646, 49344, _mm512_setzero_ps(), fft1645);
__m512 fft1571 = _mm512_mask_mov_ps(fft1563, 49344, fft1564);
__m512 fft1655 = _mm512_mask_mov_ps(fft1647, 49344, fft1648);
__m512 fft1572 = _mm512_mask_sub_ps(fft1564, 49344, _mm512_setzero_ps(), fft1563);
__m512 fft1656 = _mm512_mask_sub_ps(fft1648, 49344, _mm512_setzero_ps(), fft1647);
__m512 fft1573 = _mm512_mask_mov_ps(fft1565, 49344, fft1566);
__m512 fft1657 = _mm512_mask_mov_ps(fft1649, 49344, fft1650);
__m512 fft1574 = _mm512_mask_sub_ps(fft1566, 49344, _mm512_setzero_ps(), fft1565);
__m512 fft1658 = _mm512_mask_sub_ps(fft1650, 49344, _mm512_setzero_ps(), fft1649);
__m512 fft1575 = _mm512_mask_mov_ps(fft1567, 49344, fft1568);
__m512 fft1659 = _mm512_mask_mov_ps(fft1651, 49344, fft1652);
__m512 fft1576 = _mm512_mask_sub_ps(fft1568, 49344, _mm512_setzero_ps(), fft1567);
__m512 fft1660 = _mm512_mask_sub_ps(fft1652, 49344, _mm512_setzero_ps(), fft1651);
__m512 fft1577 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1578 = _mm512_fmadd_ps(fft1569, fft1577, _mm512_shuffle_ps(fft1569, fft1569, 78));
__m512 fft1661 = _mm512_fmadd_ps(fft1653, fft1577, _mm512_shuffle_ps(fft1653, fft1653, 78));
__m512 fft1579 = _mm512_fmadd_ps(fft1570, fft1577, _mm512_shuffle_ps(fft1570, fft1570, 78));
__m512 fft1662 = _mm512_fmadd_ps(fft1654, fft1577, _mm512_shuffle_ps(fft1654, fft1654, 78));
__m512 fft1580 = _mm512_fmadd_ps(fft1571, fft1577, _mm512_shuffle_ps(fft1571, fft1571, 78));
__m512 fft1663 = _mm512_fmadd_ps(fft1655, fft1577, _mm512_shuffle_ps(fft1655, fft1655, 78));
__m512 fft1581 = _mm512_fmadd_ps(fft1572, fft1577, _mm512_shuffle_ps(fft1572, fft1572, 78));
__m512 fft1664 = _mm512_fmadd_ps(fft1656, fft1577, _mm512_shuffle_ps(fft1656, fft1656, 78));
__m512 fft1582 = _mm512_fmadd_ps(fft1573, fft1577, _mm512_shuffle_ps(fft1573, fft1573, 78));
__m512 fft1665 = _mm512_fmadd_ps(fft1657, fft1577, _mm512_shuffle_ps(fft1657, fft1657, 78));
__m512 fft1583 = _mm512_fmadd_ps(fft1574, fft1577, _mm512_shuffle_ps(fft1574, fft1574, 78));
__m512 fft1666 = _mm512_fmadd_ps(fft1658, fft1577, _mm512_shuffle_ps(fft1658, fft1658, 78));
__m512 fft1584 = _mm512_fmadd_ps(fft1575, fft1577, _mm512_shuffle_ps(fft1575, fft1575, 78));
__m512 fft1667 = _mm512_fmadd_ps(fft1659, fft1577, _mm512_shuffle_ps(fft1659, fft1659, 78));
__m512 fft1585 = _mm512_fmadd_ps(fft1576, fft1577, _mm512_shuffle_ps(fft1576, fft1576, 78));
__m512 fft1668 = _mm512_fmadd_ps(fft1660, fft1577, _mm512_shuffle_ps(fft1660, fft1660, 78));
__m512i fft1586 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1587 = _mm512_permutexvar_ps(fft1586, fft1578);
__m512 fft1669 = _mm512_permutexvar_ps(fft1586, fft1661);
__m512i fft1588 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1589 = _mm512_permutexvar_ps(fft1588, fft1578);
__m512 fft1670 = _mm512_permutexvar_ps(fft1588, fft1661);
__m512 fft1590 = _mm512_permutexvar_ps(fft1586, fft1579);
__m512 fft1671 = _mm512_permutexvar_ps(fft1586, fft1662);
__m512 fft1591 = _mm512_permutexvar_ps(fft1588, fft1579);
__m512 fft1672 = _mm512_permutexvar_ps(fft1588, fft1662);
__m512 fft1592 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1593 = _mm512_fmadd_ps(fft1587, fft1592, fft1589);
__m512 fft1673 = _mm512_fmadd_ps(fft1669, fft1592, fft1670);
__m512 fft1594 = _mm512_fnmadd_ps(fft1591, fft1592, fft1590);
__m512 fft1674 = _mm512_fnmadd_ps(fft1672, fft1592, fft1671);
__m512 fft1595 = _mm512_mask_mov_ps(fft1591, 21845, fft1593);
__m512 fft1675 = _mm512_mask_mov_ps(fft1672, 21845, fft1673);
__m512 fft1596 = _mm512_mask_mov_ps(fft1587, 43176, fft1593);
__m512 fft1676 = _mm512_mask_mov_ps(fft1669, 43176, fft1673);
__m512 fft1597 = _mm512_mask_mov_ps(fft1595, 43176, fft1594);
__m512 fft1677 = _mm512_mask_mov_ps(fft1675, 43176, fft1674);
__m512 fft1598 = _mm512_mask_mov_ps(fft1596, 22102, fft1594);
__m512 fft1678 = _mm512_mask_mov_ps(fft1676, 22102, fft1674);
__m512 fft1599 = _mm512_mask_mul_ps(fft1597, 64764, fft1597, _mm512_set1_ps(5e-01f));
__m512 fft1679 = _mm512_mask_mul_ps(fft1677, 64764, fft1677, _mm512_set1_ps(5e-01f));
__m512 fft1600 = _mm512_mask_mul_ps(fft1598, 64764, fft1598, _mm512_set1_ps(5e-01f));
__m512 fft1680 = _mm512_mask_mul_ps(fft1678, 64764, fft1678, _mm512_set1_ps(5e-01f));
__m512 df113 = fft1599;
__m512 df121 = fft1679;
__m512 df114 = fft1600;
__m512 df122 = fft1680;
__m512 df115 = fft1580;
__m512 df123 = fft1663;
__m512 df116 = fft1581;
__m512 df124 = fft1664;
__m512 df117 = fft1582;
__m512 df125 = fft1665;
__m512 df118 = fft1583;
__m512 df126 = fft1666;
__m512 df119 = fft1584;
__m512 df127 = fft1667;
__m512 df120 = fft1585;
__m512 df128 = fft1668;
__m512i eo10 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df115 = _mm512_permutexvar_ps(eo10, df115);
df116 = _mm512_permutexvar_ps(eo10, df116);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df115);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df116);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df115);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df116);
df123 = _mm512_permutexvar_ps(eo10, df123);
df124 = _mm512_permutexvar_ps(eo10, df124);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df123);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df124);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df123);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df124);
df117 = _mm512_permutexvar_ps(eo10, df117);
df118 = _mm512_permutexvar_ps(eo10, df118);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df117);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df118);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df117);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df118);
df125 = _mm512_permutexvar_ps(eo10, df125);
df126 = _mm512_permutexvar_ps(eo10, df126);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df125);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df126);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df125);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df126);
df119 = _mm512_permutexvar_ps(eo10, df119);
df120 = _mm512_permutexvar_ps(eo10, df120);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df119);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df120);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df119);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df120);
df127 = _mm512_permutexvar_ps(eo10, df127);
df128 = _mm512_permutexvar_ps(eo10, df128);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df127);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df128);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df127);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df128);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df113);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df114);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df113);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df114);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df121);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m10+32*f11, 255, df122);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df121);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m10+32*f11, 65280, df122);
}
// Fourth tile of this group, handled outside the b10 loop because it is only
// partially loaded: the loads below use mask 127 (lanes 0-6) instead of the
// full 16 lanes. With b11 fixed at 3, m11 and f12 are both 1.
// The load addresses multiply b11 by 0 and fold the tile offset into the
// constant instead (+120, i.e. 40*3, on top of the same 896-spaced offsets).
ptrdiff_t b11 = 3;
ptrdiff_t m11 = (size_t)b11/2;
ptrdiff_t f12 = (size_t)b11%2;
__m512 dat114 = _mm512_maskz_loadu_ps(127, datPtr1+120+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat114 = _mm512_mask_fmadd_ps(dat114, 127, bnMul5, bnAdd5);
__m512 dat115 = _mm512_maskz_loadu_ps(127, datPtr1+1016+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat115 = _mm512_mask_fmadd_ps(dat115, 127, bnMul5, bnAdd5);
__m512 dat116 = _mm512_maskz_loadu_ps(127, datPtr1+1912+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat116 = _mm512_mask_fmadd_ps(dat116, 127, bnMul5, bnAdd5);
__m512 dat117 = _mm512_maskz_loadu_ps(127, datPtr1+2808+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat117 = _mm512_mask_fmadd_ps(dat117, 127, bnMul5, bnAdd5);
__m512 dat118 = _mm512_maskz_loadu_ps(127, datPtr1+3704+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat118 = _mm512_mask_fmadd_ps(dat118, 127, bnMul5, bnAdd5);
__m512 dat119 = _mm512_maskz_loadu_ps(127, datPtr1+4600+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat119 = _mm512_mask_fmadd_ps(dat119, 127, bnMul5, bnAdd5);
__m512 dat120 = _mm512_maskz_loadu_ps(127, datPtr1+5496+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat120 = _mm512_mask_fmadd_ps(dat120, 127, bnMul5, bnAdd5);
__m512 dat121 = _mm512_maskz_loadu_ps(127, datPtr1+6392+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat121 = _mm512_mask_fmadd_ps(dat121, 127, bnMul5, bnAdd5);
__m512 dat122 = _mm512_maskz_loadu_ps(127, datPtr1+7288+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat122 = _mm512_mask_fmadd_ps(dat122, 127, bnMul5, bnAdd5);
__m512 dat123 = _mm512_maskz_loadu_ps(127, datPtr1+8184+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat123 = _mm512_mask_fmadd_ps(dat123, 127, bnMul5, bnAdd5);
__m512 dat124 = _mm512_maskz_loadu_ps(127, datPtr1+9080+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat124 = _mm512_mask_fmadd_ps(dat124, 127, bnMul5, bnAdd5);
__m512 dat125 = _mm512_maskz_loadu_ps(127, datPtr1+9976+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat125 = _mm512_mask_fmadd_ps(dat125, 127, bnMul5, bnAdd5);
__m512 dat126 = _mm512_maskz_loadu_ps(127, datPtr1+10872+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat126 = _mm512_mask_fmadd_ps(dat126, 127, bnMul5, bnAdd5);
__m512 dat127 = _mm512_maskz_loadu_ps(127, datPtr1+11768+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat127 = _mm512_mask_fmadd_ps(dat127, 127, bnMul5, bnAdd5);
__m512 dat128 = _mm512_maskz_loadu_ps(127, datPtr1+12664+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat128 = _mm512_mask_fmadd_ps(dat128, 127, bnMul5, bnAdd5);
__m512 dat129 = _mm512_maskz_loadu_ps(127, datPtr1+13560+602112*i6+200704*k6+896*h5+4*w5+0*b11);
dat129 = _mm512_mask_fmadd_ps(dat129, 127, bnMul5, bnAdd5);
__m512 fft1681 = _mm512_add_ps(dat114, dat122);
__m512 fft1769 = _mm512_add_ps(dat115, dat123);
__m512 fft1682 = _mm512_sub_ps(dat114, dat122);
__m512 fft1770 = _mm512_sub_ps(dat115, dat123);
__m512 fft1683 = _mm512_add_ps(dat116, dat124);
__m512 fft1771 = _mm512_add_ps(dat117, dat125);
__m512 fft1684 = _mm512_sub_ps(dat116, dat124);
__m512 fft1772 = _mm512_sub_ps(dat117, dat125);
__m512 fft1685 = _mm512_add_ps(dat118, dat126);
__m512 fft1773 = _mm512_add_ps(dat119, dat127);
__m512 fft1686 = _mm512_sub_ps(dat118, dat126);
__m512 fft1774 = _mm512_sub_ps(dat119, dat127);
__m512 fft1687 = _mm512_add_ps(dat120, dat128);
__m512 fft1775 = _mm512_add_ps(dat121, dat129);
__m512 fft1688 = _mm512_sub_ps(dat120, dat128);
__m512 fft1776 = _mm512_sub_ps(dat121, dat129);
__m512 fft1689 = _mm512_add_ps(fft1681, fft1685);
__m512 fft1777 = _mm512_add_ps(fft1769, fft1773);
__m512 fft1690 = _mm512_sub_ps(fft1681, fft1685);
__m512 fft1778 = _mm512_sub_ps(fft1769, fft1773);
__m512 fft1691 = _mm512_add_ps(fft1683, fft1687);
__m512 fft1779 = _mm512_add_ps(fft1771, fft1775);
__m512 fft1692 = _mm512_sub_ps(fft1687, fft1683);
__m512 fft1780 = _mm512_sub_ps(fft1775, fft1771);
__m512 fft1693 = _mm512_sub_ps(fft1684, fft1688);
__m512 fft1781 = _mm512_sub_ps(fft1772, fft1776);
__m512 fft1694 = _mm512_add_ps(fft1684, fft1688);
__m512 fft1782 = _mm512_add_ps(fft1772, fft1776);
__m512 fft1695 = _mm512_add_ps(fft1689, fft1691);
__m512 fft1783 = _mm512_add_ps(fft1777, fft1779);
__m512 fft1696 = _mm512_sub_ps(fft1689, fft1691);
__m512 fft1784 = _mm512_sub_ps(fft1777, fft1779);
__m512 fft1697 = _mm512_fmadd_ps(fft1693, _mm512_set1_ps(7.0710677e-01f), fft1682);
__m512 fft1785 = _mm512_fmadd_ps(fft1781, _mm512_set1_ps(7.0710677e-01f), fft1770);
__m512 fft1698 = _mm512_fnmsub_ps(fft1694, _mm512_set1_ps(7.0710677e-01f), fft1686);
__m512 fft1786 = _mm512_fnmsub_ps(fft1782, _mm512_set1_ps(7.0710677e-01f), fft1774);
__m512 fft1699 = _mm512_fnmadd_ps(fft1693, _mm512_set1_ps(7.0710677e-01f), fft1682);
__m512 fft1787 = _mm512_fnmadd_ps(fft1781, _mm512_set1_ps(7.0710677e-01f), fft1770);
__m512 fft1700 = _mm512_fnmadd_ps(fft1694, _mm512_set1_ps(7.0710677e-01f), fft1686);
__m512 fft1788 = _mm512_fnmadd_ps(fft1782, _mm512_set1_ps(7.0710677e-01f), fft1774);
__m512 fft1701 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1702 = _mm512_fmadd_ps(fft1695, fft1701, _mm512_shuffle_f32x4(fft1695, fft1695, 78));
__m512 fft1789 = _mm512_fmadd_ps(fft1783, fft1701, _mm512_shuffle_f32x4(fft1783, fft1783, 78));
__m512 fft1703 = _mm512_fmadd_ps(fft1696, fft1701, _mm512_shuffle_f32x4(fft1696, fft1696, 78));
__m512 fft1790 = _mm512_fmadd_ps(fft1784, fft1701, _mm512_shuffle_f32x4(fft1784, fft1784, 78));
__m512 fft1704 = _mm512_fmadd_ps(fft1697, fft1701, _mm512_shuffle_f32x4(fft1697, fft1697, 78));
__m512 fft1791 = _mm512_fmadd_ps(fft1785, fft1701, _mm512_shuffle_f32x4(fft1785, fft1785, 78));
__m512 fft1705 = _mm512_fmadd_ps(fft1698, fft1701, _mm512_shuffle_f32x4(fft1698, fft1698, 78));
__m512 fft1792 = _mm512_fmadd_ps(fft1786, fft1701, _mm512_shuffle_f32x4(fft1786, fft1786, 78));
__m512 fft1706 = _mm512_fmadd_ps(fft1690, fft1701, _mm512_shuffle_f32x4(fft1690, fft1690, 78));
__m512 fft1793 = _mm512_fmadd_ps(fft1778, fft1701, _mm512_shuffle_f32x4(fft1778, fft1778, 78));
__m512 fft1707 = _mm512_fmadd_ps(fft1692, fft1701, _mm512_shuffle_f32x4(fft1692, fft1692, 78));
__m512 fft1794 = _mm512_fmadd_ps(fft1780, fft1701, _mm512_shuffle_f32x4(fft1780, fft1780, 78));
__m512 fft1708 = _mm512_fmadd_ps(fft1699, fft1701, _mm512_shuffle_f32x4(fft1699, fft1699, 78));
__m512 fft1795 = _mm512_fmadd_ps(fft1787, fft1701, _mm512_shuffle_f32x4(fft1787, fft1787, 78));
__m512 fft1709 = _mm512_fmadd_ps(fft1700, fft1701, _mm512_shuffle_f32x4(fft1700, fft1700, 78));
__m512 fft1796 = _mm512_fmadd_ps(fft1788, fft1701, _mm512_shuffle_f32x4(fft1788, fft1788, 78));
__m512 fft1710 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1711 = _mm512_mul_ps(fft1702, fft1710);
__m512 fft1797 = _mm512_mul_ps(fft1789, fft1710);
__m512 fft1712 = _mm512_mul_ps(fft1703, fft1710);
__m512 fft1798 = _mm512_mul_ps(fft1790, fft1710);
__m512 fft1713 = _mm512_mul_ps(fft1704, fft1710);
__m512 fft1799 = _mm512_mul_ps(fft1791, fft1710);
__m512 fft1714 = _mm512_mul_ps(fft1705, fft1710);
__m512 fft1800 = _mm512_mul_ps(fft1792, fft1710);
__m512 fft1715 = _mm512_mul_ps(fft1706, fft1710);
__m512 fft1801 = _mm512_mul_ps(fft1793, fft1710);
__m512 fft1716 = _mm512_mul_ps(fft1707, fft1710);
__m512 fft1802 = _mm512_mul_ps(fft1794, fft1710);
__m512 fft1717 = _mm512_mul_ps(fft1708, fft1710);
__m512 fft1803 = _mm512_mul_ps(fft1795, fft1710);
__m512 fft1718 = _mm512_mul_ps(fft1709, fft1710);
__m512 fft1804 = _mm512_mul_ps(fft1796, fft1710);
__m512 fft1719 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft1720 = _mm512_fmadd_ps(fft1703, fft1719, fft1711);
__m512 fft1805 = _mm512_fmadd_ps(fft1790, fft1719, fft1797);
__m512 fft1721 = _mm512_fnmadd_ps(fft1702, fft1719, fft1712);
__m512 fft1806 = _mm512_fnmadd_ps(fft1789, fft1719, fft1798);
__m512 fft1722 = _mm512_fmadd_ps(fft1705, fft1719, fft1713);
__m512 fft1807 = _mm512_fmadd_ps(fft1792, fft1719, fft1799);
__m512 fft1723 = _mm512_fnmadd_ps(fft1704, fft1719, fft1714);
__m512 fft1808 = _mm512_fnmadd_ps(fft1791, fft1719, fft1800);
__m512 fft1724 = _mm512_fmadd_ps(fft1707, fft1719, fft1715);
__m512 fft1809 = _mm512_fmadd_ps(fft1794, fft1719, fft1801);
__m512 fft1725 = _mm512_fnmadd_ps(fft1706, fft1719, fft1716);
__m512 fft1810 = _mm512_fnmadd_ps(fft1793, fft1719, fft1802);
__m512 fft1726 = _mm512_fmadd_ps(fft1709, fft1719, fft1717);
__m512 fft1811 = _mm512_fmadd_ps(fft1796, fft1719, fft1803);
__m512 fft1727 = _mm512_fnmadd_ps(fft1708, fft1719, fft1718);
__m512 fft1812 = _mm512_fnmadd_ps(fft1795, fft1719, fft1804);
__m512 fft1728 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1729 = _mm512_fmadd_ps(fft1720, fft1728, _mm512_shuffle_f32x4(fft1720, fft1720, 177));
__m512 fft1813 = _mm512_fmadd_ps(fft1805, fft1728, _mm512_shuffle_f32x4(fft1805, fft1805, 177));
__m512 fft1730 = _mm512_fmadd_ps(fft1721, fft1728, _mm512_shuffle_f32x4(fft1721, fft1721, 177));
__m512 fft1814 = _mm512_fmadd_ps(fft1806, fft1728, _mm512_shuffle_f32x4(fft1806, fft1806, 177));
__m512 fft1731 = _mm512_fmadd_ps(fft1722, fft1728, _mm512_shuffle_f32x4(fft1722, fft1722, 177));
__m512 fft1815 = _mm512_fmadd_ps(fft1807, fft1728, _mm512_shuffle_f32x4(fft1807, fft1807, 177));
__m512 fft1732 = _mm512_fmadd_ps(fft1723, fft1728, _mm512_shuffle_f32x4(fft1723, fft1723, 177));
__m512 fft1816 = _mm512_fmadd_ps(fft1808, fft1728, _mm512_shuffle_f32x4(fft1808, fft1808, 177));
__m512 fft1733 = _mm512_fmadd_ps(fft1724, fft1728, _mm512_shuffle_f32x4(fft1724, fft1724, 177));
__m512 fft1817 = _mm512_fmadd_ps(fft1809, fft1728, _mm512_shuffle_f32x4(fft1809, fft1809, 177));
__m512 fft1734 = _mm512_fmadd_ps(fft1725, fft1728, _mm512_shuffle_f32x4(fft1725, fft1725, 177));
__m512 fft1818 = _mm512_fmadd_ps(fft1810, fft1728, _mm512_shuffle_f32x4(fft1810, fft1810, 177));
__m512 fft1735 = _mm512_fmadd_ps(fft1726, fft1728, _mm512_shuffle_f32x4(fft1726, fft1726, 177));
__m512 fft1819 = _mm512_fmadd_ps(fft1811, fft1728, _mm512_shuffle_f32x4(fft1811, fft1811, 177));
__m512 fft1736 = _mm512_fmadd_ps(fft1727, fft1728, _mm512_shuffle_f32x4(fft1727, fft1727, 177));
__m512 fft1820 = _mm512_fmadd_ps(fft1812, fft1728, _mm512_shuffle_f32x4(fft1812, fft1812, 177));
__m512 fft1737 = _mm512_mask_mov_ps(fft1729, 49344, fft1730);
__m512 fft1821 = _mm512_mask_mov_ps(fft1813, 49344, fft1814);
__m512 fft1738 = _mm512_mask_sub_ps(fft1730, 49344, _mm512_setzero_ps(), fft1729);
__m512 fft1822 = _mm512_mask_sub_ps(fft1814, 49344, _mm512_setzero_ps(), fft1813);
__m512 fft1739 = _mm512_mask_mov_ps(fft1731, 49344, fft1732);
__m512 fft1823 = _mm512_mask_mov_ps(fft1815, 49344, fft1816);
__m512 fft1740 = _mm512_mask_sub_ps(fft1732, 49344, _mm512_setzero_ps(), fft1731);
__m512 fft1824 = _mm512_mask_sub_ps(fft1816, 49344, _mm512_setzero_ps(), fft1815);
__m512 fft1741 = _mm512_mask_mov_ps(fft1733, 49344, fft1734);
__m512 fft1825 = _mm512_mask_mov_ps(fft1817, 49344, fft1818);
__m512 fft1742 = _mm512_mask_sub_ps(fft1734, 49344, _mm512_setzero_ps(), fft1733);
__m512 fft1826 = _mm512_mask_sub_ps(fft1818, 49344, _mm512_setzero_ps(), fft1817);
__m512 fft1743 = _mm512_mask_mov_ps(fft1735, 49344, fft1736);
__m512 fft1827 = _mm512_mask_mov_ps(fft1819, 49344, fft1820);
__m512 fft1744 = _mm512_mask_sub_ps(fft1736, 49344, _mm512_setzero_ps(), fft1735);
__m512 fft1828 = _mm512_mask_sub_ps(fft1820, 49344, _mm512_setzero_ps(), fft1819);
__m512 fft1745 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1746 = _mm512_fmadd_ps(fft1737, fft1745, _mm512_shuffle_ps(fft1737, fft1737, 78));
__m512 fft1829 = _mm512_fmadd_ps(fft1821, fft1745, _mm512_shuffle_ps(fft1821, fft1821, 78));
__m512 fft1747 = _mm512_fmadd_ps(fft1738, fft1745, _mm512_shuffle_ps(fft1738, fft1738, 78));
__m512 fft1830 = _mm512_fmadd_ps(fft1822, fft1745, _mm512_shuffle_ps(fft1822, fft1822, 78));
__m512 fft1748 = _mm512_fmadd_ps(fft1739, fft1745, _mm512_shuffle_ps(fft1739, fft1739, 78));
__m512 fft1831 = _mm512_fmadd_ps(fft1823, fft1745, _mm512_shuffle_ps(fft1823, fft1823, 78));
__m512 fft1749 = _mm512_fmadd_ps(fft1740, fft1745, _mm512_shuffle_ps(fft1740, fft1740, 78));
__m512 fft1832 = _mm512_fmadd_ps(fft1824, fft1745, _mm512_shuffle_ps(fft1824, fft1824, 78));
__m512 fft1750 = _mm512_fmadd_ps(fft1741, fft1745, _mm512_shuffle_ps(fft1741, fft1741, 78));
__m512 fft1833 = _mm512_fmadd_ps(fft1825, fft1745, _mm512_shuffle_ps(fft1825, fft1825, 78));
__m512 fft1751 = _mm512_fmadd_ps(fft1742, fft1745, _mm512_shuffle_ps(fft1742, fft1742, 78));
__m512 fft1834 = _mm512_fmadd_ps(fft1826, fft1745, _mm512_shuffle_ps(fft1826, fft1826, 78));
__m512 fft1752 = _mm512_fmadd_ps(fft1743, fft1745, _mm512_shuffle_ps(fft1743, fft1743, 78));
__m512 fft1835 = _mm512_fmadd_ps(fft1827, fft1745, _mm512_shuffle_ps(fft1827, fft1827, 78));
__m512 fft1753 = _mm512_fmadd_ps(fft1744, fft1745, _mm512_shuffle_ps(fft1744, fft1744, 78));
__m512 fft1836 = _mm512_fmadd_ps(fft1828, fft1745, _mm512_shuffle_ps(fft1828, fft1828, 78));
__m512i fft1754 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1755 = _mm512_permutexvar_ps(fft1754, fft1746);
__m512 fft1837 = _mm512_permutexvar_ps(fft1754, fft1829);
__m512i fft1756 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1757 = _mm512_permutexvar_ps(fft1756, fft1746);
__m512 fft1838 = _mm512_permutexvar_ps(fft1756, fft1829);
__m512 fft1758 = _mm512_permutexvar_ps(fft1754, fft1747);
__m512 fft1839 = _mm512_permutexvar_ps(fft1754, fft1830);
__m512 fft1759 = _mm512_permutexvar_ps(fft1756, fft1747);
__m512 fft1840 = _mm512_permutexvar_ps(fft1756, fft1830);
__m512 fft1760 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft1761 = _mm512_fmadd_ps(fft1755, fft1760, fft1757);
__m512 fft1841 = _mm512_fmadd_ps(fft1837, fft1760, fft1838);
__m512 fft1762 = _mm512_fnmadd_ps(fft1759, fft1760, fft1758);
__m512 fft1842 = _mm512_fnmadd_ps(fft1840, fft1760, fft1839);
__m512 fft1763 = _mm512_mask_mov_ps(fft1759, 21845, fft1761);
__m512 fft1843 = _mm512_mask_mov_ps(fft1840, 21845, fft1841);
__m512 fft1764 = _mm512_mask_mov_ps(fft1755, 43176, fft1761);
__m512 fft1844 = _mm512_mask_mov_ps(fft1837, 43176, fft1841);
__m512 fft1765 = _mm512_mask_mov_ps(fft1763, 43176, fft1762);
__m512 fft1845 = _mm512_mask_mov_ps(fft1843, 43176, fft1842);
__m512 fft1766 = _mm512_mask_mov_ps(fft1764, 22102, fft1762);
__m512 fft1846 = _mm512_mask_mov_ps(fft1844, 22102, fft1842);
__m512 fft1767 = _mm512_mask_mul_ps(fft1765, 64764, fft1765, _mm512_set1_ps(5e-01f));
__m512 fft1847 = _mm512_mask_mul_ps(fft1845, 64764, fft1845, _mm512_set1_ps(5e-01f));
__m512 fft1768 = _mm512_mask_mul_ps(fft1766, 64764, fft1766, _mm512_set1_ps(5e-01f));
__m512 fft1848 = _mm512_mask_mul_ps(fft1846, 64764, fft1846, _mm512_set1_ps(5e-01f));
__m512 df129 = fft1767;
__m512 df137 = fft1847;
__m512 df130 = fft1768;
__m512 df138 = fft1848;
__m512 df131 = fft1748;
__m512 df139 = fft1831;
__m512 df132 = fft1749;
__m512 df140 = fft1832;
__m512 df133 = fft1750;
__m512 df141 = fft1833;
__m512 df134 = fft1751;
__m512 df142 = fft1834;
__m512 df135 = fft1752;
__m512 df143 = fft1835;
__m512 df136 = fft1753;
__m512 df144 = fft1836;
__m512i eo11 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df131 = _mm512_permutexvar_ps(eo11, df131);
df132 = _mm512_permutexvar_ps(eo11, df132);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df131);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df132);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df131);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df132);
df139 = _mm512_permutexvar_ps(eo11, df139);
df140 = _mm512_permutexvar_ps(eo11, df140);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df139);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df140);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df139);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df140);
df133 = _mm512_permutexvar_ps(eo11, df133);
df134 = _mm512_permutexvar_ps(eo11, df134);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df133);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df134);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df133);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df134);
df141 = _mm512_permutexvar_ps(eo11, df141);
df142 = _mm512_permutexvar_ps(eo11, df142);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df141);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df142);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df141);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df142);
df135 = _mm512_permutexvar_ps(eo11, df135);
df136 = _mm512_permutexvar_ps(eo11, df136);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df135);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df136);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df135);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df136);
df143 = _mm512_permutexvar_ps(eo11, df143);
df144 = _mm512_permutexvar_ps(eo11, df144);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df143);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df144);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df143);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df144);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df129);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df130);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df129);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df130);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df137);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m11+32*f12, 255, df138);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df137);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m11+32*f12, 65280, df138);
// Unrolled instance b12 = 4. m12 (= 4/2 = 2) and f13 (= 4%2 = 0) are only
// used further down, in the dfPtr1 store offsets (128*m12 + 32*f13).
ptrdiff_t b12 = 4;
ptrdiff_t m12 = (size_t)b12/2;
ptrdiff_t f13 = (size_t)b12%2;
// Load 16 vectors from datPtr1. Consecutive loads advance the constant part
// of the offset by 896 (8200, 9096, ...); the `0*b12` term contributes
// nothing, so the position of this instance is carried by that constant.
// NOTE(review): 896 is presumably one 224-float image row in bytes (the
// graph input has Width=224) -- confirm against datPtr1's declaration,
// which is outside this view.
//
// Mask 65528 (0xFFF8) zero-fills lanes 0-2 and reads only lanes 3-15.
// Each load is followed by a masked fmadd with the same mask, computing
// dat*bnMul5 + bnAdd5 in lanes 3-15 only; the masked-off lanes keep the
// zero from the load instead of becoming bnAdd5.
__m512 dat130 = _mm512_maskz_loadu_ps(65528, datPtr1+8200+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat130 = _mm512_mask_fmadd_ps(dat130, 65528, bnMul5, bnAdd5);
__m512 dat131 = _mm512_maskz_loadu_ps(65528, datPtr1+9096+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat131 = _mm512_mask_fmadd_ps(dat131, 65528, bnMul5, bnAdd5);
__m512 dat132 = _mm512_maskz_loadu_ps(65528, datPtr1+9992+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat132 = _mm512_mask_fmadd_ps(dat132, 65528, bnMul5, bnAdd5);
__m512 dat133 = _mm512_maskz_loadu_ps(65528, datPtr1+10888+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat133 = _mm512_mask_fmadd_ps(dat133, 65528, bnMul5, bnAdd5);
__m512 dat134 = _mm512_maskz_loadu_ps(65528, datPtr1+11784+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat134 = _mm512_mask_fmadd_ps(dat134, 65528, bnMul5, bnAdd5);
__m512 dat135 = _mm512_maskz_loadu_ps(65528, datPtr1+12680+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat135 = _mm512_mask_fmadd_ps(dat135, 65528, bnMul5, bnAdd5);
__m512 dat136 = _mm512_maskz_loadu_ps(65528, datPtr1+13576+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat136 = _mm512_mask_fmadd_ps(dat136, 65528, bnMul5, bnAdd5);
__m512 dat137 = _mm512_maskz_loadu_ps(65528, datPtr1+14472+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat137 = _mm512_mask_fmadd_ps(dat137, 65528, bnMul5, bnAdd5);
__m512 dat138 = _mm512_maskz_loadu_ps(65528, datPtr1+15368+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat138 = _mm512_mask_fmadd_ps(dat138, 65528, bnMul5, bnAdd5);
__m512 dat139 = _mm512_maskz_loadu_ps(65528, datPtr1+16264+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat139 = _mm512_mask_fmadd_ps(dat139, 65528, bnMul5, bnAdd5);
__m512 dat140 = _mm512_maskz_loadu_ps(65528, datPtr1+17160+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat140 = _mm512_mask_fmadd_ps(dat140, 65528, bnMul5, bnAdd5);
__m512 dat141 = _mm512_maskz_loadu_ps(65528, datPtr1+18056+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat141 = _mm512_mask_fmadd_ps(dat141, 65528, bnMul5, bnAdd5);
__m512 dat142 = _mm512_maskz_loadu_ps(65528, datPtr1+18952+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat142 = _mm512_mask_fmadd_ps(dat142, 65528, bnMul5, bnAdd5);
__m512 dat143 = _mm512_maskz_loadu_ps(65528, datPtr1+19848+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat143 = _mm512_mask_fmadd_ps(dat143, 65528, bnMul5, bnAdd5);
__m512 dat144 = _mm512_maskz_loadu_ps(65528, datPtr1+20744+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat144 = _mm512_mask_fmadd_ps(dat144, 65528, bnMul5, bnAdd5);
__m512 dat145 = _mm512_maskz_loadu_ps(65528, datPtr1+21640+602112*i6+200704*k6+896*h5+4*w5+0*b12);
dat145 = _mm512_mask_fmadd_ps(dat145, 65528, bnMul5, bnAdd5);
// Butterfly levels done register-against-register (no lane movement).
// Two independent streams are interleaved statement by statement with
// identical arithmetic: the even-numbered loads (dat130, dat132, ...,
// dat144) feed fft1849..fft1868, the odd-numbered loads (dat131, dat133,
// ..., dat145) feed fft1937..fft1956.
//
// Level 1: sum and difference of the two loads that sit eight loads apart
// (dat130 with dat138, dat132 with dat140, ...).
__m512 fft1849 = _mm512_add_ps(dat130, dat138);
__m512 fft1937 = _mm512_add_ps(dat131, dat139);
__m512 fft1850 = _mm512_sub_ps(dat130, dat138);
__m512 fft1938 = _mm512_sub_ps(dat131, dat139);
__m512 fft1851 = _mm512_add_ps(dat132, dat140);
__m512 fft1939 = _mm512_add_ps(dat133, dat141);
__m512 fft1852 = _mm512_sub_ps(dat132, dat140);
__m512 fft1940 = _mm512_sub_ps(dat133, dat141);
__m512 fft1853 = _mm512_add_ps(dat134, dat142);
__m512 fft1941 = _mm512_add_ps(dat135, dat143);
__m512 fft1854 = _mm512_sub_ps(dat134, dat142);
__m512 fft1942 = _mm512_sub_ps(dat135, dat143);
__m512 fft1855 = _mm512_add_ps(dat136, dat144);
__m512 fft1943 = _mm512_add_ps(dat137, dat145);
__m512 fft1856 = _mm512_sub_ps(dat136, dat144);
__m512 fft1944 = _mm512_sub_ps(dat137, dat145);
// Level 2: combine the level-1 sums with each other and the level-1
// differences with each other.
__m512 fft1857 = _mm512_add_ps(fft1849, fft1853);
__m512 fft1945 = _mm512_add_ps(fft1937, fft1941);
__m512 fft1858 = _mm512_sub_ps(fft1849, fft1853);
__m512 fft1946 = _mm512_sub_ps(fft1937, fft1941);
__m512 fft1859 = _mm512_add_ps(fft1851, fft1855);
__m512 fft1947 = _mm512_add_ps(fft1939, fft1943);
// Note the operand order here: later sum minus earlier sum
// (fft1855 - fft1851), the reverse of the subtractions above.
__m512 fft1860 = _mm512_sub_ps(fft1855, fft1851);
__m512 fft1948 = _mm512_sub_ps(fft1943, fft1939);
__m512 fft1861 = _mm512_sub_ps(fft1852, fft1856);
__m512 fft1949 = _mm512_sub_ps(fft1940, fft1944);
__m512 fft1862 = _mm512_add_ps(fft1852, fft1856);
__m512 fft1950 = _mm512_add_ps(fft1940, fft1944);
// Level 3: final sum/difference of the two level-2 sums ...
__m512 fft1863 = _mm512_add_ps(fft1857, fft1859);
__m512 fft1951 = _mm512_add_ps(fft1945, fft1947);
__m512 fft1864 = _mm512_sub_ps(fft1857, fft1859);
__m512 fft1952 = _mm512_sub_ps(fft1945, fft1947);
// ... and the terms scaled by 7.0710677e-01f (approximately 1/sqrt(2)):
//   fmadd(a, s, c)  =  a*s + c
//   fnmsub(a, s, c) = -a*s - c
//   fnmadd(a, s, c) = -a*s + c
__m512 fft1865 = _mm512_fmadd_ps(fft1861, _mm512_set1_ps(7.0710677e-01f), fft1850);
__m512 fft1953 = _mm512_fmadd_ps(fft1949, _mm512_set1_ps(7.0710677e-01f), fft1938);
__m512 fft1866 = _mm512_fnmsub_ps(fft1862, _mm512_set1_ps(7.0710677e-01f), fft1854);
__m512 fft1954 = _mm512_fnmsub_ps(fft1950, _mm512_set1_ps(7.0710677e-01f), fft1942);
__m512 fft1867 = _mm512_fnmadd_ps(fft1861, _mm512_set1_ps(7.0710677e-01f), fft1850);
__m512 fft1955 = _mm512_fnmadd_ps(fft1949, _mm512_set1_ps(7.0710677e-01f), fft1938);
__m512 fft1868 = _mm512_fnmadd_ps(fft1862, _mm512_set1_ps(7.0710677e-01f), fft1854);
__m512 fft1956 = _mm512_fnmadd_ps(fft1950, _mm512_set1_ps(7.0710677e-01f), fft1942);
// Lane-crossing butterfly between the two 256-bit halves of each register.
// _mm512_set_ps lists lane 15 first, so fft1869 is +1 in lanes 0-7 and -1 in
// lanes 8-15. _mm512_shuffle_f32x4(x, x, 78) swaps the 256-bit halves, hence
// each result holds (low + high) in lanes 0-7 and (low - high) in lanes 8-15.
__m512 fft1869 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft1870 = _mm512_fmadd_ps(fft1863, fft1869, _mm512_shuffle_f32x4(fft1863, fft1863, 78));
__m512 fft1957 = _mm512_fmadd_ps(fft1951, fft1869, _mm512_shuffle_f32x4(fft1951, fft1951, 78));
__m512 fft1871 = _mm512_fmadd_ps(fft1864, fft1869, _mm512_shuffle_f32x4(fft1864, fft1864, 78));
__m512 fft1958 = _mm512_fmadd_ps(fft1952, fft1869, _mm512_shuffle_f32x4(fft1952, fft1952, 78));
__m512 fft1872 = _mm512_fmadd_ps(fft1865, fft1869, _mm512_shuffle_f32x4(fft1865, fft1865, 78));
__m512 fft1959 = _mm512_fmadd_ps(fft1953, fft1869, _mm512_shuffle_f32x4(fft1953, fft1953, 78));
__m512 fft1873 = _mm512_fmadd_ps(fft1866, fft1869, _mm512_shuffle_f32x4(fft1866, fft1866, 78));
__m512 fft1960 = _mm512_fmadd_ps(fft1954, fft1869, _mm512_shuffle_f32x4(fft1954, fft1954, 78));
__m512 fft1874 = _mm512_fmadd_ps(fft1858, fft1869, _mm512_shuffle_f32x4(fft1858, fft1858, 78));
__m512 fft1961 = _mm512_fmadd_ps(fft1946, fft1869, _mm512_shuffle_f32x4(fft1946, fft1946, 78));
__m512 fft1875 = _mm512_fmadd_ps(fft1860, fft1869, _mm512_shuffle_f32x4(fft1860, fft1860, 78));
__m512 fft1962 = _mm512_fmadd_ps(fft1948, fft1869, _mm512_shuffle_f32x4(fft1948, fft1948, 78));
__m512 fft1876 = _mm512_fmadd_ps(fft1867, fft1869, _mm512_shuffle_f32x4(fft1867, fft1867, 78));
__m512 fft1963 = _mm512_fmadd_ps(fft1955, fft1869, _mm512_shuffle_f32x4(fft1955, fft1955, 78));
__m512 fft1877 = _mm512_fmadd_ps(fft1868, fft1869, _mm512_shuffle_f32x4(fft1868, fft1868, 78));
__m512 fft1964 = _mm512_fmadd_ps(fft1956, fft1869, _mm512_shuffle_f32x4(fft1956, fft1956, 78));
// Per-lane rotation of register pairs (a, b) = (fft1870, fft1871),
// (fft1872, fft1873), (fft1874, fft1875), (fft1876, fft1877) and the matching
// pairs of the second stream. With c = fft1878 and d = fft1887 the two
// statements groups below produce
//   first  = a*c + b*d
//   second = b*c - a*d
// Lanes 0-9 have c = 1, d = 0 and pass through unchanged; lanes 10-11 use
// (c, d) = (0.7071, 0.7071), lanes 12-13 use (0, 1), lanes 14-15 use
// (-0.7071, 0.7071).
__m512 fft1878 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
// Products with c for every register; the d terms are folded in below.
__m512 fft1879 = _mm512_mul_ps(fft1870, fft1878);
__m512 fft1965 = _mm512_mul_ps(fft1957, fft1878);
__m512 fft1880 = _mm512_mul_ps(fft1871, fft1878);
__m512 fft1966 = _mm512_mul_ps(fft1958, fft1878);
__m512 fft1881 = _mm512_mul_ps(fft1872, fft1878);
__m512 fft1967 = _mm512_mul_ps(fft1959, fft1878);
__m512 fft1882 = _mm512_mul_ps(fft1873, fft1878);
__m512 fft1968 = _mm512_mul_ps(fft1960, fft1878);
__m512 fft1883 = _mm512_mul_ps(fft1874, fft1878);
__m512 fft1969 = _mm512_mul_ps(fft1961, fft1878);
__m512 fft1884 = _mm512_mul_ps(fft1875, fft1878);
__m512 fft1970 = _mm512_mul_ps(fft1962, fft1878);
__m512 fft1885 = _mm512_mul_ps(fft1876, fft1878);
__m512 fft1971 = _mm512_mul_ps(fft1963, fft1878);
__m512 fft1886 = _mm512_mul_ps(fft1877, fft1878);
__m512 fft1972 = _mm512_mul_ps(fft1964, fft1878);
__m512 fft1887 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
// fmadd adds the partner's d product (first = a*c + b*d); fnmadd subtracts
// it (second = b*c - a*d).
__m512 fft1888 = _mm512_fmadd_ps(fft1871, fft1887, fft1879);
__m512 fft1973 = _mm512_fmadd_ps(fft1958, fft1887, fft1965);
__m512 fft1889 = _mm512_fnmadd_ps(fft1870, fft1887, fft1880);
__m512 fft1974 = _mm512_fnmadd_ps(fft1957, fft1887, fft1966);
__m512 fft1890 = _mm512_fmadd_ps(fft1873, fft1887, fft1881);
__m512 fft1975 = _mm512_fmadd_ps(fft1960, fft1887, fft1967);
__m512 fft1891 = _mm512_fnmadd_ps(fft1872, fft1887, fft1882);
__m512 fft1976 = _mm512_fnmadd_ps(fft1959, fft1887, fft1968);
__m512 fft1892 = _mm512_fmadd_ps(fft1875, fft1887, fft1883);
__m512 fft1977 = _mm512_fmadd_ps(fft1962, fft1887, fft1969);
__m512 fft1893 = _mm512_fnmadd_ps(fft1874, fft1887, fft1884);
__m512 fft1978 = _mm512_fnmadd_ps(fft1961, fft1887, fft1970);
__m512 fft1894 = _mm512_fmadd_ps(fft1877, fft1887, fft1885);
__m512 fft1979 = _mm512_fmadd_ps(fft1964, fft1887, fft1971);
__m512 fft1895 = _mm512_fnmadd_ps(fft1876, fft1887, fft1886);
__m512 fft1980 = _mm512_fnmadd_ps(fft1963, fft1887, fft1972);
// Butterfly between neighbouring 128-bit blocks. fft1896 is +1 in lanes 0-3
// and 8-11, -1 in lanes 4-7 and 12-15; _mm512_shuffle_f32x4(x, x, 177) swaps
// the two 128-bit blocks inside each 256-bit half. Each half therefore ends
// up as (block0 + block1, block0 - block1).
__m512 fft1896 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft1897 = _mm512_fmadd_ps(fft1888, fft1896, _mm512_shuffle_f32x4(fft1888, fft1888, 177));
__m512 fft1981 = _mm512_fmadd_ps(fft1973, fft1896, _mm512_shuffle_f32x4(fft1973, fft1973, 177));
__m512 fft1898 = _mm512_fmadd_ps(fft1889, fft1896, _mm512_shuffle_f32x4(fft1889, fft1889, 177));
__m512 fft1982 = _mm512_fmadd_ps(fft1974, fft1896, _mm512_shuffle_f32x4(fft1974, fft1974, 177));
__m512 fft1899 = _mm512_fmadd_ps(fft1890, fft1896, _mm512_shuffle_f32x4(fft1890, fft1890, 177));
__m512 fft1983 = _mm512_fmadd_ps(fft1975, fft1896, _mm512_shuffle_f32x4(fft1975, fft1975, 177));
__m512 fft1900 = _mm512_fmadd_ps(fft1891, fft1896, _mm512_shuffle_f32x4(fft1891, fft1891, 177));
__m512 fft1984 = _mm512_fmadd_ps(fft1976, fft1896, _mm512_shuffle_f32x4(fft1976, fft1976, 177));
__m512 fft1901 = _mm512_fmadd_ps(fft1892, fft1896, _mm512_shuffle_f32x4(fft1892, fft1892, 177));
__m512 fft1985 = _mm512_fmadd_ps(fft1977, fft1896, _mm512_shuffle_f32x4(fft1977, fft1977, 177));
__m512 fft1902 = _mm512_fmadd_ps(fft1893, fft1896, _mm512_shuffle_f32x4(fft1893, fft1893, 177));
__m512 fft1986 = _mm512_fmadd_ps(fft1978, fft1896, _mm512_shuffle_f32x4(fft1978, fft1978, 177));
__m512 fft1903 = _mm512_fmadd_ps(fft1894, fft1896, _mm512_shuffle_f32x4(fft1894, fft1894, 177));
__m512 fft1987 = _mm512_fmadd_ps(fft1979, fft1896, _mm512_shuffle_f32x4(fft1979, fft1979, 177));
__m512 fft1904 = _mm512_fmadd_ps(fft1895, fft1896, _mm512_shuffle_f32x4(fft1895, fft1895, 177));
__m512 fft1988 = _mm512_fmadd_ps(fft1980, fft1896, _mm512_shuffle_f32x4(fft1980, fft1980, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes only,
// each pair (p, q) becomes (q, 0 - p); every other lane keeps (p, q).
__m512 fft1905 = _mm512_mask_mov_ps(fft1897, 49344, fft1898);
__m512 fft1989 = _mm512_mask_mov_ps(fft1981, 49344, fft1982);
__m512 fft1906 = _mm512_mask_sub_ps(fft1898, 49344, _mm512_setzero_ps(), fft1897);
__m512 fft1990 = _mm512_mask_sub_ps(fft1982, 49344, _mm512_setzero_ps(), fft1981);
__m512 fft1907 = _mm512_mask_mov_ps(fft1899, 49344, fft1900);
__m512 fft1991 = _mm512_mask_mov_ps(fft1983, 49344, fft1984);
__m512 fft1908 = _mm512_mask_sub_ps(fft1900, 49344, _mm512_setzero_ps(), fft1899);
__m512 fft1992 = _mm512_mask_sub_ps(fft1984, 49344, _mm512_setzero_ps(), fft1983);
__m512 fft1909 = _mm512_mask_mov_ps(fft1901, 49344, fft1902);
__m512 fft1993 = _mm512_mask_mov_ps(fft1985, 49344, fft1986);
__m512 fft1910 = _mm512_mask_sub_ps(fft1902, 49344, _mm512_setzero_ps(), fft1901);
__m512 fft1994 = _mm512_mask_sub_ps(fft1986, 49344, _mm512_setzero_ps(), fft1985);
__m512 fft1911 = _mm512_mask_mov_ps(fft1903, 49344, fft1904);
__m512 fft1995 = _mm512_mask_mov_ps(fft1987, 49344, fft1988);
__m512 fft1912 = _mm512_mask_sub_ps(fft1904, 49344, _mm512_setzero_ps(), fft1903);
__m512 fft1996 = _mm512_mask_sub_ps(fft1988, 49344, _mm512_setzero_ps(), fft1987);
// Butterfly between the two 64-bit halves of every 128-bit block.
// fft1913 is +1 in lanes 0-1 and -1 in lanes 2-3 of each block;
// _mm512_shuffle_ps(x, x, 78) swaps those two lane pairs, so each block
// becomes (pair0 + pair1, pair0 - pair1).
__m512 fft1913 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft1914 = _mm512_fmadd_ps(fft1905, fft1913, _mm512_shuffle_ps(fft1905, fft1905, 78));
__m512 fft1997 = _mm512_fmadd_ps(fft1989, fft1913, _mm512_shuffle_ps(fft1989, fft1989, 78));
__m512 fft1915 = _mm512_fmadd_ps(fft1906, fft1913, _mm512_shuffle_ps(fft1906, fft1906, 78));
__m512 fft1998 = _mm512_fmadd_ps(fft1990, fft1913, _mm512_shuffle_ps(fft1990, fft1990, 78));
__m512 fft1916 = _mm512_fmadd_ps(fft1907, fft1913, _mm512_shuffle_ps(fft1907, fft1907, 78));
__m512 fft1999 = _mm512_fmadd_ps(fft1991, fft1913, _mm512_shuffle_ps(fft1991, fft1991, 78));
__m512 fft1917 = _mm512_fmadd_ps(fft1908, fft1913, _mm512_shuffle_ps(fft1908, fft1908, 78));
__m512 fft2000 = _mm512_fmadd_ps(fft1992, fft1913, _mm512_shuffle_ps(fft1992, fft1992, 78));
__m512 fft1918 = _mm512_fmadd_ps(fft1909, fft1913, _mm512_shuffle_ps(fft1909, fft1909, 78));
__m512 fft2001 = _mm512_fmadd_ps(fft1993, fft1913, _mm512_shuffle_ps(fft1993, fft1993, 78));
__m512 fft1919 = _mm512_fmadd_ps(fft1910, fft1913, _mm512_shuffle_ps(fft1910, fft1910, 78));
__m512 fft2002 = _mm512_fmadd_ps(fft1994, fft1913, _mm512_shuffle_ps(fft1994, fft1994, 78));
__m512 fft1920 = _mm512_fmadd_ps(fft1911, fft1913, _mm512_shuffle_ps(fft1911, fft1911, 78));
__m512 fft2003 = _mm512_fmadd_ps(fft1995, fft1913, _mm512_shuffle_ps(fft1995, fft1995, 78));
__m512 fft1921 = _mm512_fmadd_ps(fft1912, fft1913, _mm512_shuffle_ps(fft1912, fft1912, 78));
__m512 fft2004 = _mm512_fmadd_ps(fft1996, fft1913, _mm512_shuffle_ps(fft1996, fft1996, 78));
// Extra post-processing applied only to the first register pair of each
// stream (fft1914/fft1915 and fft1997/fft1998); the other six registers per
// stream are used as they are.
// NOTE(review): this looks like the recombination step of a real-input FFT,
// but the derivation is not visible in this file -- confirm against the
// generator before relying on that reading.
//
// Two gather patterns, each copying one source lane into an adjacent lane
// pair. Read from lane 0 upward:
//   fft1922: 2,2,4,4,8,8,12,12,3,3,5,5,9,9,13,13
//   fft1924: 0,0,6,6,14,14,10,10,1,1,7,7,15,15,11,11
__m512i fft1922 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft1923 = _mm512_permutexvar_ps(fft1922, fft1914);
__m512 fft2005 = _mm512_permutexvar_ps(fft1922, fft1997);
__m512i fft1924 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft1925 = _mm512_permutexvar_ps(fft1924, fft1914);
__m512 fft2006 = _mm512_permutexvar_ps(fft1924, fft1997);
__m512 fft1926 = _mm512_permutexvar_ps(fft1922, fft1915);
__m512 fft2007 = _mm512_permutexvar_ps(fft1922, fft1998);
__m512 fft1927 = _mm512_permutexvar_ps(fft1924, fft1915);
__m512 fft2008 = _mm512_permutexvar_ps(fft1924, fft1998);
// Sign pattern per 8-lane half, from the lowest lane: 0, 0, +1, -1, +1, -1,
// +1, -1. Where it is 0 the fmadd/fnmadd below just return their addend.
__m512 fft1928 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
// fft1929 = fft1923*sign + fft1925 ; fft1930 = fft1926 - fft1927*sign.
__m512 fft1929 = _mm512_fmadd_ps(fft1923, fft1928, fft1925);
__m512 fft2009 = _mm512_fmadd_ps(fft2005, fft1928, fft2006);
__m512 fft1930 = _mm512_fnmadd_ps(fft1927, fft1928, fft1926);
__m512 fft2010 = _mm512_fnmadd_ps(fft2008, fft1928, fft2007);
// Lane-wise blends that assemble the two output registers:
//   21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
//   22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft1931 = _mm512_mask_mov_ps(fft1927, 21845, fft1929);
__m512 fft2011 = _mm512_mask_mov_ps(fft2008, 21845, fft2009);
__m512 fft1932 = _mm512_mask_mov_ps(fft1923, 43176, fft1929);
__m512 fft2012 = _mm512_mask_mov_ps(fft2005, 43176, fft2009);
__m512 fft1933 = _mm512_mask_mov_ps(fft1931, 43176, fft1930);
__m512 fft2013 = _mm512_mask_mov_ps(fft2011, 43176, fft2010);
__m512 fft1934 = _mm512_mask_mov_ps(fft1932, 22102, fft1930);
__m512 fft2014 = _mm512_mask_mov_ps(fft2012, 22102, fft2010);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8 and 9 are
// left unscaled.
__m512 fft1935 = _mm512_mask_mul_ps(fft1933, 64764, fft1933, _mm512_set1_ps(5e-01f));
__m512 fft2015 = _mm512_mask_mul_ps(fft2013, 64764, fft2013, _mm512_set1_ps(5e-01f));
__m512 fft1936 = _mm512_mask_mul_ps(fft1934, 64764, fft1934, _mm512_set1_ps(5e-01f));
__m512 fft2016 = _mm512_mask_mul_ps(fft2014, 64764, fft2014, _mm512_set1_ps(5e-01f));
// Rename the transform results for the store phase. df145..df152 come from
// the first stream, df153..df160 from the second. df145/df146 and
// df153/df154 are the post-processed pair; the rest come straight from the
// last butterfly level.
__m512 df145 = fft1935;
__m512 df153 = fft2015;
__m512 df146 = fft1936;
__m512 df154 = fft2016;
__m512 df147 = fft1916;
__m512 df155 = fft1999;
__m512 df148 = fft1917;
__m512 df156 = fft2000;
__m512 df149 = fft1918;
__m512 df157 = fft2001;
__m512 df150 = fft1919;
__m512 df158 = fft2002;
__m512 df151 = fft1920;
__m512 df159 = fft2003;
__m512 df152 = fft1921;
__m512 df160 = fft2004;
// eo12 de-interleaves a register: even source lanes (0, 2, ..., 14) move to
// lanes 0-7 and odd source lanes (1, 3, ..., 15) move to lanes 8-15.
__m512i eo12 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every register below: lanes 0-7 are written with
// mask 255 (0x00FF) and lanes 8-15 with mask 65280 (0xFF00) to a second
// location. The constant in each 0xFF00 store is 407040 - 32 larger than in
// the matching 0x00FF store.
// NOTE(review): if dfPtr1 is a byte pointer (its declaration is outside this
// view), lane 8 then lands exactly 407040 bytes after lane 0 of the low-half
// store -- confirm.
// Constants of the second stream (df153..df160) are 814080 larger than
// those of the first stream (df145..df152).
df147 = _mm512_permutexvar_ps(eo12, df147);
df148 = _mm512_permutexvar_ps(eo12, df148);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df147);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df148);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df147);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df148);
df155 = _mm512_permutexvar_ps(eo12, df155);
df156 = _mm512_permutexvar_ps(eo12, df156);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df155);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df156);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df155);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df156);
df149 = _mm512_permutexvar_ps(eo12, df149);
df150 = _mm512_permutexvar_ps(eo12, df150);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df149);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df150);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df149);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df150);
df157 = _mm512_permutexvar_ps(eo12, df157);
df158 = _mm512_permutexvar_ps(eo12, df158);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df157);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df158);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df157);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df158);
df151 = _mm512_permutexvar_ps(eo12, df151);
df152 = _mm512_permutexvar_ps(eo12, df152);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df151);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df152);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df151);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df152);
df159 = _mm512_permutexvar_ps(eo12, df159);
df160 = _mm512_permutexvar_ps(eo12, df160);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df159);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df160);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df159);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df160);
// The post-processed pair of each stream is stored without the eo12
// permutation, using the same low-half / high-half split.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df145);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df146);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df145);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df146);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df153);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m12+32*f13, 255, df154);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df153);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m12+32*f13, 65280, df154);
ptrdiff_t b13 = 5;
ptrdiff_t m13 = (size_t)b13/2;
ptrdiff_t f14 = (size_t)b13%2;
__m512 dat146 = _mm512_maskz_loadu_ps(65535, datPtr1+8240+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat146 = _mm512_mask_fmadd_ps(dat146, 65535, bnMul5, bnAdd5);
__m512 dat147 = _mm512_maskz_loadu_ps(65535, datPtr1+9136+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat147 = _mm512_mask_fmadd_ps(dat147, 65535, bnMul5, bnAdd5);
__m512 dat148 = _mm512_maskz_loadu_ps(65535, datPtr1+10032+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat148 = _mm512_mask_fmadd_ps(dat148, 65535, bnMul5, bnAdd5);
__m512 dat149 = _mm512_maskz_loadu_ps(65535, datPtr1+10928+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat149 = _mm512_mask_fmadd_ps(dat149, 65535, bnMul5, bnAdd5);
__m512 dat150 = _mm512_maskz_loadu_ps(65535, datPtr1+11824+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat150 = _mm512_mask_fmadd_ps(dat150, 65535, bnMul5, bnAdd5);
__m512 dat151 = _mm512_maskz_loadu_ps(65535, datPtr1+12720+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat151 = _mm512_mask_fmadd_ps(dat151, 65535, bnMul5, bnAdd5);
__m512 dat152 = _mm512_maskz_loadu_ps(65535, datPtr1+13616+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat152 = _mm512_mask_fmadd_ps(dat152, 65535, bnMul5, bnAdd5);
__m512 dat153 = _mm512_maskz_loadu_ps(65535, datPtr1+14512+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat153 = _mm512_mask_fmadd_ps(dat153, 65535, bnMul5, bnAdd5);
__m512 dat154 = _mm512_maskz_loadu_ps(65535, datPtr1+15408+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat154 = _mm512_mask_fmadd_ps(dat154, 65535, bnMul5, bnAdd5);
__m512 dat155 = _mm512_maskz_loadu_ps(65535, datPtr1+16304+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat155 = _mm512_mask_fmadd_ps(dat155, 65535, bnMul5, bnAdd5);
__m512 dat156 = _mm512_maskz_loadu_ps(65535, datPtr1+17200+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat156 = _mm512_mask_fmadd_ps(dat156, 65535, bnMul5, bnAdd5);
__m512 dat157 = _mm512_maskz_loadu_ps(65535, datPtr1+18096+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat157 = _mm512_mask_fmadd_ps(dat157, 65535, bnMul5, bnAdd5);
__m512 dat158 = _mm512_maskz_loadu_ps(65535, datPtr1+18992+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat158 = _mm512_mask_fmadd_ps(dat158, 65535, bnMul5, bnAdd5);
__m512 dat159 = _mm512_maskz_loadu_ps(65535, datPtr1+19888+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat159 = _mm512_mask_fmadd_ps(dat159, 65535, bnMul5, bnAdd5);
__m512 dat160 = _mm512_maskz_loadu_ps(65535, datPtr1+20784+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat160 = _mm512_mask_fmadd_ps(dat160, 65535, bnMul5, bnAdd5);
__m512 dat161 = _mm512_maskz_loadu_ps(65535, datPtr1+21680+602112*i6+200704*k6+896*h5+4*w5+0*b13);
dat161 = _mm512_mask_fmadd_ps(dat161, 65535, bnMul5, bnAdd5);
__m512 fft2017 = _mm512_add_ps(dat146, dat154);
__m512 fft2105 = _mm512_add_ps(dat147, dat155);
__m512 fft2018 = _mm512_sub_ps(dat146, dat154);
__m512 fft2106 = _mm512_sub_ps(dat147, dat155);
__m512 fft2019 = _mm512_add_ps(dat148, dat156);
__m512 fft2107 = _mm512_add_ps(dat149, dat157);
__m512 fft2020 = _mm512_sub_ps(dat148, dat156);
__m512 fft2108 = _mm512_sub_ps(dat149, dat157);
__m512 fft2021 = _mm512_add_ps(dat150, dat158);
__m512 fft2109 = _mm512_add_ps(dat151, dat159);
__m512 fft2022 = _mm512_sub_ps(dat150, dat158);
__m512 fft2110 = _mm512_sub_ps(dat151, dat159);
__m512 fft2023 = _mm512_add_ps(dat152, dat160);
__m512 fft2111 = _mm512_add_ps(dat153, dat161);
__m512 fft2024 = _mm512_sub_ps(dat152, dat160);
__m512 fft2112 = _mm512_sub_ps(dat153, dat161);
__m512 fft2025 = _mm512_add_ps(fft2017, fft2021);
__m512 fft2113 = _mm512_add_ps(fft2105, fft2109);
__m512 fft2026 = _mm512_sub_ps(fft2017, fft2021);
__m512 fft2114 = _mm512_sub_ps(fft2105, fft2109);
__m512 fft2027 = _mm512_add_ps(fft2019, fft2023);
__m512 fft2115 = _mm512_add_ps(fft2107, fft2111);
__m512 fft2028 = _mm512_sub_ps(fft2023, fft2019);
__m512 fft2116 = _mm512_sub_ps(fft2111, fft2107);
__m512 fft2029 = _mm512_sub_ps(fft2020, fft2024);
__m512 fft2117 = _mm512_sub_ps(fft2108, fft2112);
__m512 fft2030 = _mm512_add_ps(fft2020, fft2024);
__m512 fft2118 = _mm512_add_ps(fft2108, fft2112);
__m512 fft2031 = _mm512_add_ps(fft2025, fft2027);
__m512 fft2119 = _mm512_add_ps(fft2113, fft2115);
__m512 fft2032 = _mm512_sub_ps(fft2025, fft2027);
__m512 fft2120 = _mm512_sub_ps(fft2113, fft2115);
__m512 fft2033 = _mm512_fmadd_ps(fft2029, _mm512_set1_ps(7.0710677e-01f), fft2018);
__m512 fft2121 = _mm512_fmadd_ps(fft2117, _mm512_set1_ps(7.0710677e-01f), fft2106);
__m512 fft2034 = _mm512_fnmsub_ps(fft2030, _mm512_set1_ps(7.0710677e-01f), fft2022);
__m512 fft2122 = _mm512_fnmsub_ps(fft2118, _mm512_set1_ps(7.0710677e-01f), fft2110);
__m512 fft2035 = _mm512_fnmadd_ps(fft2029, _mm512_set1_ps(7.0710677e-01f), fft2018);
__m512 fft2123 = _mm512_fnmadd_ps(fft2117, _mm512_set1_ps(7.0710677e-01f), fft2106);
__m512 fft2036 = _mm512_fnmadd_ps(fft2030, _mm512_set1_ps(7.0710677e-01f), fft2022);
__m512 fft2124 = _mm512_fnmadd_ps(fft2118, _mm512_set1_ps(7.0710677e-01f), fft2110);
__m512 fft2037 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2038 = _mm512_fmadd_ps(fft2031, fft2037, _mm512_shuffle_f32x4(fft2031, fft2031, 78));
__m512 fft2125 = _mm512_fmadd_ps(fft2119, fft2037, _mm512_shuffle_f32x4(fft2119, fft2119, 78));
__m512 fft2039 = _mm512_fmadd_ps(fft2032, fft2037, _mm512_shuffle_f32x4(fft2032, fft2032, 78));
__m512 fft2126 = _mm512_fmadd_ps(fft2120, fft2037, _mm512_shuffle_f32x4(fft2120, fft2120, 78));
__m512 fft2040 = _mm512_fmadd_ps(fft2033, fft2037, _mm512_shuffle_f32x4(fft2033, fft2033, 78));
__m512 fft2127 = _mm512_fmadd_ps(fft2121, fft2037, _mm512_shuffle_f32x4(fft2121, fft2121, 78));
__m512 fft2041 = _mm512_fmadd_ps(fft2034, fft2037, _mm512_shuffle_f32x4(fft2034, fft2034, 78));
__m512 fft2128 = _mm512_fmadd_ps(fft2122, fft2037, _mm512_shuffle_f32x4(fft2122, fft2122, 78));
__m512 fft2042 = _mm512_fmadd_ps(fft2026, fft2037, _mm512_shuffle_f32x4(fft2026, fft2026, 78));
__m512 fft2129 = _mm512_fmadd_ps(fft2114, fft2037, _mm512_shuffle_f32x4(fft2114, fft2114, 78));
__m512 fft2043 = _mm512_fmadd_ps(fft2028, fft2037, _mm512_shuffle_f32x4(fft2028, fft2028, 78));
__m512 fft2130 = _mm512_fmadd_ps(fft2116, fft2037, _mm512_shuffle_f32x4(fft2116, fft2116, 78));
__m512 fft2044 = _mm512_fmadd_ps(fft2035, fft2037, _mm512_shuffle_f32x4(fft2035, fft2035, 78));
__m512 fft2131 = _mm512_fmadd_ps(fft2123, fft2037, _mm512_shuffle_f32x4(fft2123, fft2123, 78));
__m512 fft2045 = _mm512_fmadd_ps(fft2036, fft2037, _mm512_shuffle_f32x4(fft2036, fft2036, 78));
__m512 fft2132 = _mm512_fmadd_ps(fft2124, fft2037, _mm512_shuffle_f32x4(fft2124, fft2124, 78));
__m512 fft2046 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2047 = _mm512_mul_ps(fft2038, fft2046);
__m512 fft2133 = _mm512_mul_ps(fft2125, fft2046);
__m512 fft2048 = _mm512_mul_ps(fft2039, fft2046);
__m512 fft2134 = _mm512_mul_ps(fft2126, fft2046);
__m512 fft2049 = _mm512_mul_ps(fft2040, fft2046);
__m512 fft2135 = _mm512_mul_ps(fft2127, fft2046);
__m512 fft2050 = _mm512_mul_ps(fft2041, fft2046);
__m512 fft2136 = _mm512_mul_ps(fft2128, fft2046);
__m512 fft2051 = _mm512_mul_ps(fft2042, fft2046);
__m512 fft2137 = _mm512_mul_ps(fft2129, fft2046);
__m512 fft2052 = _mm512_mul_ps(fft2043, fft2046);
__m512 fft2138 = _mm512_mul_ps(fft2130, fft2046);
__m512 fft2053 = _mm512_mul_ps(fft2044, fft2046);
__m512 fft2139 = _mm512_mul_ps(fft2131, fft2046);
__m512 fft2054 = _mm512_mul_ps(fft2045, fft2046);
__m512 fft2140 = _mm512_mul_ps(fft2132, fft2046);
__m512 fft2055 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2056 = _mm512_fmadd_ps(fft2039, fft2055, fft2047);
__m512 fft2141 = _mm512_fmadd_ps(fft2126, fft2055, fft2133);
__m512 fft2057 = _mm512_fnmadd_ps(fft2038, fft2055, fft2048);
__m512 fft2142 = _mm512_fnmadd_ps(fft2125, fft2055, fft2134);
__m512 fft2058 = _mm512_fmadd_ps(fft2041, fft2055, fft2049);
__m512 fft2143 = _mm512_fmadd_ps(fft2128, fft2055, fft2135);
__m512 fft2059 = _mm512_fnmadd_ps(fft2040, fft2055, fft2050);
__m512 fft2144 = _mm512_fnmadd_ps(fft2127, fft2055, fft2136);
__m512 fft2060 = _mm512_fmadd_ps(fft2043, fft2055, fft2051);
__m512 fft2145 = _mm512_fmadd_ps(fft2130, fft2055, fft2137);
__m512 fft2061 = _mm512_fnmadd_ps(fft2042, fft2055, fft2052);
__m512 fft2146 = _mm512_fnmadd_ps(fft2129, fft2055, fft2138);
__m512 fft2062 = _mm512_fmadd_ps(fft2045, fft2055, fft2053);
__m512 fft2147 = _mm512_fmadd_ps(fft2132, fft2055, fft2139);
__m512 fft2063 = _mm512_fnmadd_ps(fft2044, fft2055, fft2054);
__m512 fft2148 = _mm512_fnmadd_ps(fft2131, fft2055, fft2140);
__m512 fft2064 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2065 = _mm512_fmadd_ps(fft2056, fft2064, _mm512_shuffle_f32x4(fft2056, fft2056, 177));
__m512 fft2149 = _mm512_fmadd_ps(fft2141, fft2064, _mm512_shuffle_f32x4(fft2141, fft2141, 177));
__m512 fft2066 = _mm512_fmadd_ps(fft2057, fft2064, _mm512_shuffle_f32x4(fft2057, fft2057, 177));
__m512 fft2150 = _mm512_fmadd_ps(fft2142, fft2064, _mm512_shuffle_f32x4(fft2142, fft2142, 177));
__m512 fft2067 = _mm512_fmadd_ps(fft2058, fft2064, _mm512_shuffle_f32x4(fft2058, fft2058, 177));
__m512 fft2151 = _mm512_fmadd_ps(fft2143, fft2064, _mm512_shuffle_f32x4(fft2143, fft2143, 177));
__m512 fft2068 = _mm512_fmadd_ps(fft2059, fft2064, _mm512_shuffle_f32x4(fft2059, fft2059, 177));
__m512 fft2152 = _mm512_fmadd_ps(fft2144, fft2064, _mm512_shuffle_f32x4(fft2144, fft2144, 177));
__m512 fft2069 = _mm512_fmadd_ps(fft2060, fft2064, _mm512_shuffle_f32x4(fft2060, fft2060, 177));
__m512 fft2153 = _mm512_fmadd_ps(fft2145, fft2064, _mm512_shuffle_f32x4(fft2145, fft2145, 177));
__m512 fft2070 = _mm512_fmadd_ps(fft2061, fft2064, _mm512_shuffle_f32x4(fft2061, fft2061, 177));
__m512 fft2154 = _mm512_fmadd_ps(fft2146, fft2064, _mm512_shuffle_f32x4(fft2146, fft2146, 177));
__m512 fft2071 = _mm512_fmadd_ps(fft2062, fft2064, _mm512_shuffle_f32x4(fft2062, fft2062, 177));
__m512 fft2155 = _mm512_fmadd_ps(fft2147, fft2064, _mm512_shuffle_f32x4(fft2147, fft2147, 177));
__m512 fft2072 = _mm512_fmadd_ps(fft2063, fft2064, _mm512_shuffle_f32x4(fft2063, fft2063, 177));
__m512 fft2156 = _mm512_fmadd_ps(fft2148, fft2064, _mm512_shuffle_f32x4(fft2148, fft2148, 177));
__m512 fft2073 = _mm512_mask_mov_ps(fft2065, 49344, fft2066);
__m512 fft2157 = _mm512_mask_mov_ps(fft2149, 49344, fft2150);
__m512 fft2074 = _mm512_mask_sub_ps(fft2066, 49344, _mm512_setzero_ps(), fft2065);
__m512 fft2158 = _mm512_mask_sub_ps(fft2150, 49344, _mm512_setzero_ps(), fft2149);
__m512 fft2075 = _mm512_mask_mov_ps(fft2067, 49344, fft2068);
__m512 fft2159 = _mm512_mask_mov_ps(fft2151, 49344, fft2152);
__m512 fft2076 = _mm512_mask_sub_ps(fft2068, 49344, _mm512_setzero_ps(), fft2067);
__m512 fft2160 = _mm512_mask_sub_ps(fft2152, 49344, _mm512_setzero_ps(), fft2151);
__m512 fft2077 = _mm512_mask_mov_ps(fft2069, 49344, fft2070);
__m512 fft2161 = _mm512_mask_mov_ps(fft2153, 49344, fft2154);
__m512 fft2078 = _mm512_mask_sub_ps(fft2070, 49344, _mm512_setzero_ps(), fft2069);
__m512 fft2162 = _mm512_mask_sub_ps(fft2154, 49344, _mm512_setzero_ps(), fft2153);
__m512 fft2079 = _mm512_mask_mov_ps(fft2071, 49344, fft2072);
__m512 fft2163 = _mm512_mask_mov_ps(fft2155, 49344, fft2156);
__m512 fft2080 = _mm512_mask_sub_ps(fft2072, 49344, _mm512_setzero_ps(), fft2071);
__m512 fft2164 = _mm512_mask_sub_ps(fft2156, 49344, _mm512_setzero_ps(), fft2155);
__m512 fft2081 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2082 = _mm512_fmadd_ps(fft2073, fft2081, _mm512_shuffle_ps(fft2073, fft2073, 78));
__m512 fft2165 = _mm512_fmadd_ps(fft2157, fft2081, _mm512_shuffle_ps(fft2157, fft2157, 78));
__m512 fft2083 = _mm512_fmadd_ps(fft2074, fft2081, _mm512_shuffle_ps(fft2074, fft2074, 78));
__m512 fft2166 = _mm512_fmadd_ps(fft2158, fft2081, _mm512_shuffle_ps(fft2158, fft2158, 78));
__m512 fft2084 = _mm512_fmadd_ps(fft2075, fft2081, _mm512_shuffle_ps(fft2075, fft2075, 78));
__m512 fft2167 = _mm512_fmadd_ps(fft2159, fft2081, _mm512_shuffle_ps(fft2159, fft2159, 78));
__m512 fft2085 = _mm512_fmadd_ps(fft2076, fft2081, _mm512_shuffle_ps(fft2076, fft2076, 78));
__m512 fft2168 = _mm512_fmadd_ps(fft2160, fft2081, _mm512_shuffle_ps(fft2160, fft2160, 78));
__m512 fft2086 = _mm512_fmadd_ps(fft2077, fft2081, _mm512_shuffle_ps(fft2077, fft2077, 78));
__m512 fft2169 = _mm512_fmadd_ps(fft2161, fft2081, _mm512_shuffle_ps(fft2161, fft2161, 78));
__m512 fft2087 = _mm512_fmadd_ps(fft2078, fft2081, _mm512_shuffle_ps(fft2078, fft2078, 78));
__m512 fft2170 = _mm512_fmadd_ps(fft2162, fft2081, _mm512_shuffle_ps(fft2162, fft2162, 78));
__m512 fft2088 = _mm512_fmadd_ps(fft2079, fft2081, _mm512_shuffle_ps(fft2079, fft2079, 78));
__m512 fft2171 = _mm512_fmadd_ps(fft2163, fft2081, _mm512_shuffle_ps(fft2163, fft2163, 78));
__m512 fft2089 = _mm512_fmadd_ps(fft2080, fft2081, _mm512_shuffle_ps(fft2080, fft2080, 78));
__m512 fft2172 = _mm512_fmadd_ps(fft2164, fft2081, _mm512_shuffle_ps(fft2164, fft2164, 78));
__m512i fft2090 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2091 = _mm512_permutexvar_ps(fft2090, fft2082);
__m512 fft2173 = _mm512_permutexvar_ps(fft2090, fft2165);
__m512i fft2092 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2093 = _mm512_permutexvar_ps(fft2092, fft2082);
__m512 fft2174 = _mm512_permutexvar_ps(fft2092, fft2165);
__m512 fft2094 = _mm512_permutexvar_ps(fft2090, fft2083);
__m512 fft2175 = _mm512_permutexvar_ps(fft2090, fft2166);
__m512 fft2095 = _mm512_permutexvar_ps(fft2092, fft2083);
__m512 fft2176 = _mm512_permutexvar_ps(fft2092, fft2166);
__m512 fft2096 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2097 = _mm512_fmadd_ps(fft2091, fft2096, fft2093);
__m512 fft2177 = _mm512_fmadd_ps(fft2173, fft2096, fft2174);
__m512 fft2098 = _mm512_fnmadd_ps(fft2095, fft2096, fft2094);
__m512 fft2178 = _mm512_fnmadd_ps(fft2176, fft2096, fft2175);
__m512 fft2099 = _mm512_mask_mov_ps(fft2095, 21845, fft2097);
__m512 fft2179 = _mm512_mask_mov_ps(fft2176, 21845, fft2177);
__m512 fft2100 = _mm512_mask_mov_ps(fft2091, 43176, fft2097);
__m512 fft2180 = _mm512_mask_mov_ps(fft2173, 43176, fft2177);
__m512 fft2101 = _mm512_mask_mov_ps(fft2099, 43176, fft2098);
__m512 fft2181 = _mm512_mask_mov_ps(fft2179, 43176, fft2178);
__m512 fft2102 = _mm512_mask_mov_ps(fft2100, 22102, fft2098);
__m512 fft2182 = _mm512_mask_mov_ps(fft2180, 22102, fft2178);
__m512 fft2103 = _mm512_mask_mul_ps(fft2101, 64764, fft2101, _mm512_set1_ps(5e-01f));
__m512 fft2183 = _mm512_mask_mul_ps(fft2181, 64764, fft2181, _mm512_set1_ps(5e-01f));
__m512 fft2104 = _mm512_mask_mul_ps(fft2102, 64764, fft2102, _mm512_set1_ps(5e-01f));
__m512 fft2184 = _mm512_mask_mul_ps(fft2182, 64764, fft2182, _mm512_set1_ps(5e-01f));
__m512 df161 = fft2103;
__m512 df169 = fft2183;
__m512 df162 = fft2104;
__m512 df170 = fft2184;
__m512 df163 = fft2084;
__m512 df171 = fft2167;
__m512 df164 = fft2085;
__m512 df172 = fft2168;
__m512 df165 = fft2086;
__m512 df173 = fft2169;
__m512 df166 = fft2087;
__m512 df174 = fft2170;
__m512 df167 = fft2088;
__m512 df175 = fft2171;
__m512 df168 = fft2089;
__m512 df176 = fft2172;
__m512i eo13 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df163 = _mm512_permutexvar_ps(eo13, df163);
df164 = _mm512_permutexvar_ps(eo13, df164);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df163);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df164);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df163);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df164);
df171 = _mm512_permutexvar_ps(eo13, df171);
df172 = _mm512_permutexvar_ps(eo13, df172);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df171);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df172);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df171);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df172);
df165 = _mm512_permutexvar_ps(eo13, df165);
df166 = _mm512_permutexvar_ps(eo13, df166);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df165);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df166);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df165);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df166);
df173 = _mm512_permutexvar_ps(eo13, df173);
df174 = _mm512_permutexvar_ps(eo13, df174);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df173);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df174);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df173);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df174);
df167 = _mm512_permutexvar_ps(eo13, df167);
df168 = _mm512_permutexvar_ps(eo13, df168);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df167);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df168);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df167);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df168);
df175 = _mm512_permutexvar_ps(eo13, df175);
df176 = _mm512_permutexvar_ps(eo13, df176);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df175);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df176);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df175);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df176);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df161);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df162);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df161);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df162);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df169);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k6+128*m13+32*f14, 255, df170);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df169);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k6+128*m13+32*f14, 65280, df170);
}
if (j2 >= last1) return;
++j2;
rel2 = 4;
}
// Section for rel2 < 7: advances j2 across up to (7 - rel2) consecutive
// 60-column strips of the 16-row band that starts at row base2+10, then
// leaves rel2 = 7 for the section that follows.
// For every strip it handles 3 channels (k7) x 6 tiles (b14). Each tile is
// 16 rows of 16 floats: it is loaded, scaled/shifted with the per-channel
// pair read from bnPtr2, pushed through the fft* add/sub network below and
// the results are scattered into dfPtr1.
// NOTE(review): the offsets applied to datPtr1/dfPtr1 look like byte offsets
// (896 = 224*4 per row, 200704 = 224*224*4 per channel, -32 bias on the
// upper-half stores) -- confirm both pointers are char*.
// NOTE(review): variables are named fft*/df*; presumably this is the forward
// 16x16 real FFT of the input tile -- confirm against the generator.
if (rel2 < 7) {
// First row of the band (multiplied by 896 in the load address).
ptrdiff_t h6 = base2+10;
// Column of the first strip: 20, 80, 140 for rel2 = 4, 5, 6.
ptrdiff_t w6 = -220+60*rel2;
// Last j2 value this section is allowed to process.
ptrdiff_t jj3 = 6-rel2+j2;
// j2 itself is incremented at the bottom of the body; the loop header only
// moves w6 one strip (60 columns) to the right.
for (; j2 <= jj3; w6 += 60) {
// Three consecutive channels 3*s1 .. 3*s1+2.
ptrdiff_t k7 = 3*s1;
ptrdiff_t kk6 = k7+2;
for (; k7 <= kk6; ++k7) {
// bnPtr2 is read as (multiplier, addend) float pairs indexed by k7+3*i6;
// both values are broadcast to all 16 lanes.
__m512 bnMul6 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k7+3*i6))[0]);
__m512 bnAdd6 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k7+3*i6))[1]);
// Six tiles per strip; consecutive tiles start 40 address units apart
// (10 floats if the address is in bytes), so neighbouring 16-wide tiles overlap.
for (ptrdiff_t b14 = 0; b14 < 6; ++b14) {
// m14/f15 split b14 into the 128*m14 + 32*f15 part of the store address.
ptrdiff_t m14 = (size_t)b14/2;
ptrdiff_t f15 = (size_t)b14%2;
// Load tile rows 0..15 (row stride 896, full 16-lane mask) and apply
// dat*bnMul6 + bnAdd6 to every lane.
__m512 dat162 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat162 = _mm512_mask_fmadd_ps(dat162, 65535, bnMul6, bnAdd6);
__m512 dat163 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat163 = _mm512_mask_fmadd_ps(dat163, 65535, bnMul6, bnAdd6);
__m512 dat164 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat164 = _mm512_mask_fmadd_ps(dat164, 65535, bnMul6, bnAdd6);
__m512 dat165 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat165 = _mm512_mask_fmadd_ps(dat165, 65535, bnMul6, bnAdd6);
__m512 dat166 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat166 = _mm512_mask_fmadd_ps(dat166, 65535, bnMul6, bnAdd6);
__m512 dat167 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat167 = _mm512_mask_fmadd_ps(dat167, 65535, bnMul6, bnAdd6);
__m512 dat168 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat168 = _mm512_mask_fmadd_ps(dat168, 65535, bnMul6, bnAdd6);
__m512 dat169 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat169 = _mm512_mask_fmadd_ps(dat169, 65535, bnMul6, bnAdd6);
__m512 dat170 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat170 = _mm512_mask_fmadd_ps(dat170, 65535, bnMul6, bnAdd6);
__m512 dat171 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat171 = _mm512_mask_fmadd_ps(dat171, 65535, bnMul6, bnAdd6);
__m512 dat172 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat172 = _mm512_mask_fmadd_ps(dat172, 65535, bnMul6, bnAdd6);
__m512 dat173 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat173 = _mm512_mask_fmadd_ps(dat173, 65535, bnMul6, bnAdd6);
__m512 dat174 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat174 = _mm512_mask_fmadd_ps(dat174, 65535, bnMul6, bnAdd6);
__m512 dat175 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat175 = _mm512_mask_fmadd_ps(dat175, 65535, bnMul6, bnAdd6);
__m512 dat176 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat176 = _mm512_mask_fmadd_ps(dat176, 65535, bnMul6, bnAdd6);
__m512 dat177 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k7+896*h6+4*w6+40*b14);
dat177 = _mm512_mask_fmadd_ps(dat177, 65535, bnMul6, bnAdd6);
// Add/sub butterflies across rows: row r is paired with row r+8. The even
// rows (fft2185..) and the odd rows (fft2273..) form two independent,
// interleaved chains that stay separate until the stores.
__m512 fft2185 = _mm512_add_ps(dat162, dat170);
__m512 fft2273 = _mm512_add_ps(dat163, dat171);
__m512 fft2186 = _mm512_sub_ps(dat162, dat170);
__m512 fft2274 = _mm512_sub_ps(dat163, dat171);
__m512 fft2187 = _mm512_add_ps(dat164, dat172);
__m512 fft2275 = _mm512_add_ps(dat165, dat173);
__m512 fft2188 = _mm512_sub_ps(dat164, dat172);
__m512 fft2276 = _mm512_sub_ps(dat165, dat173);
__m512 fft2189 = _mm512_add_ps(dat166, dat174);
__m512 fft2277 = _mm512_add_ps(dat167, dat175);
__m512 fft2190 = _mm512_sub_ps(dat166, dat174);
__m512 fft2278 = _mm512_sub_ps(dat167, dat175);
__m512 fft2191 = _mm512_add_ps(dat168, dat176);
__m512 fft2279 = _mm512_add_ps(dat169, dat177);
__m512 fft2192 = _mm512_sub_ps(dat168, dat176);
__m512 fft2280 = _mm512_sub_ps(dat169, dat177);
// Second level of row butterflies on the sums/differences above.
__m512 fft2193 = _mm512_add_ps(fft2185, fft2189);
__m512 fft2281 = _mm512_add_ps(fft2273, fft2277);
__m512 fft2194 = _mm512_sub_ps(fft2185, fft2189);
__m512 fft2282 = _mm512_sub_ps(fft2273, fft2277);
__m512 fft2195 = _mm512_add_ps(fft2187, fft2191);
__m512 fft2283 = _mm512_add_ps(fft2275, fft2279);
__m512 fft2196 = _mm512_sub_ps(fft2191, fft2187);
__m512 fft2284 = _mm512_sub_ps(fft2279, fft2275);
__m512 fft2197 = _mm512_sub_ps(fft2188, fft2192);
__m512 fft2285 = _mm512_sub_ps(fft2276, fft2280);
__m512 fft2198 = _mm512_add_ps(fft2188, fft2192);
__m512 fft2286 = _mm512_add_ps(fft2276, fft2280);
// Last row level; the 0.70710677 (about 1/sqrt(2)) factor is folded into
// fused multiply-adds.
__m512 fft2199 = _mm512_add_ps(fft2193, fft2195);
__m512 fft2287 = _mm512_add_ps(fft2281, fft2283);
__m512 fft2200 = _mm512_sub_ps(fft2193, fft2195);
__m512 fft2288 = _mm512_sub_ps(fft2281, fft2283);
__m512 fft2201 = _mm512_fmadd_ps(fft2197, _mm512_set1_ps(7.0710677e-01f), fft2186);
__m512 fft2289 = _mm512_fmadd_ps(fft2285, _mm512_set1_ps(7.0710677e-01f), fft2274);
__m512 fft2202 = _mm512_fnmsub_ps(fft2198, _mm512_set1_ps(7.0710677e-01f), fft2190);
__m512 fft2290 = _mm512_fnmsub_ps(fft2286, _mm512_set1_ps(7.0710677e-01f), fft2278);
__m512 fft2203 = _mm512_fnmadd_ps(fft2197, _mm512_set1_ps(7.0710677e-01f), fft2186);
__m512 fft2291 = _mm512_fnmadd_ps(fft2285, _mm512_set1_ps(7.0710677e-01f), fft2274);
__m512 fft2204 = _mm512_fnmadd_ps(fft2198, _mm512_set1_ps(7.0710677e-01f), fft2190);
__m512 fft2292 = _mm512_fnmadd_ps(fft2286, _mm512_set1_ps(7.0710677e-01f), fft2278);
// From here on the stages combine lanes inside each register.
// Stage on 256-bit halves: shuffle_f32x4 with imm 78 swaps the two halves;
// with the +1/-1 vector the low half becomes lo+hi and the high half lo-hi.
__m512 fft2205 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2206 = _mm512_fmadd_ps(fft2199, fft2205, _mm512_shuffle_f32x4(fft2199, fft2199, 78));
__m512 fft2293 = _mm512_fmadd_ps(fft2287, fft2205, _mm512_shuffle_f32x4(fft2287, fft2287, 78));
__m512 fft2207 = _mm512_fmadd_ps(fft2200, fft2205, _mm512_shuffle_f32x4(fft2200, fft2200, 78));
__m512 fft2294 = _mm512_fmadd_ps(fft2288, fft2205, _mm512_shuffle_f32x4(fft2288, fft2288, 78));
__m512 fft2208 = _mm512_fmadd_ps(fft2201, fft2205, _mm512_shuffle_f32x4(fft2201, fft2201, 78));
__m512 fft2295 = _mm512_fmadd_ps(fft2289, fft2205, _mm512_shuffle_f32x4(fft2289, fft2289, 78));
__m512 fft2209 = _mm512_fmadd_ps(fft2202, fft2205, _mm512_shuffle_f32x4(fft2202, fft2202, 78));
__m512 fft2296 = _mm512_fmadd_ps(fft2290, fft2205, _mm512_shuffle_f32x4(fft2290, fft2290, 78));
__m512 fft2210 = _mm512_fmadd_ps(fft2194, fft2205, _mm512_shuffle_f32x4(fft2194, fft2194, 78));
__m512 fft2297 = _mm512_fmadd_ps(fft2282, fft2205, _mm512_shuffle_f32x4(fft2282, fft2282, 78));
__m512 fft2211 = _mm512_fmadd_ps(fft2196, fft2205, _mm512_shuffle_f32x4(fft2196, fft2196, 78));
__m512 fft2298 = _mm512_fmadd_ps(fft2284, fft2205, _mm512_shuffle_f32x4(fft2284, fft2284, 78));
__m512 fft2212 = _mm512_fmadd_ps(fft2203, fft2205, _mm512_shuffle_f32x4(fft2203, fft2203, 78));
__m512 fft2299 = _mm512_fmadd_ps(fft2291, fft2205, _mm512_shuffle_f32x4(fft2291, fft2291, 78));
__m512 fft2213 = _mm512_fmadd_ps(fft2204, fft2205, _mm512_shuffle_f32x4(fft2204, fft2204, 78));
__m512 fft2300 = _mm512_fmadd_ps(fft2292, fft2205, _mm512_shuffle_f32x4(fft2292, fft2292, 78));
// Lane-wise constant multiply. Registers are taken in (a, b) pairs and
// combined with c = fft2214 and d = fft2223 as
//   a' = a*c + b*d,   b' = b*c - a*d
// i.e. (a + i*b) * (c - i*d); the low 8 lanes have c = 1, d = 0 and pass
// through unchanged.
__m512 fft2214 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2215 = _mm512_mul_ps(fft2206, fft2214);
__m512 fft2301 = _mm512_mul_ps(fft2293, fft2214);
__m512 fft2216 = _mm512_mul_ps(fft2207, fft2214);
__m512 fft2302 = _mm512_mul_ps(fft2294, fft2214);
__m512 fft2217 = _mm512_mul_ps(fft2208, fft2214);
__m512 fft2303 = _mm512_mul_ps(fft2295, fft2214);
__m512 fft2218 = _mm512_mul_ps(fft2209, fft2214);
__m512 fft2304 = _mm512_mul_ps(fft2296, fft2214);
__m512 fft2219 = _mm512_mul_ps(fft2210, fft2214);
__m512 fft2305 = _mm512_mul_ps(fft2297, fft2214);
__m512 fft2220 = _mm512_mul_ps(fft2211, fft2214);
__m512 fft2306 = _mm512_mul_ps(fft2298, fft2214);
__m512 fft2221 = _mm512_mul_ps(fft2212, fft2214);
__m512 fft2307 = _mm512_mul_ps(fft2299, fft2214);
__m512 fft2222 = _mm512_mul_ps(fft2213, fft2214);
__m512 fft2308 = _mm512_mul_ps(fft2300, fft2214);
__m512 fft2223 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2224 = _mm512_fmadd_ps(fft2207, fft2223, fft2215);
__m512 fft2309 = _mm512_fmadd_ps(fft2294, fft2223, fft2301);
__m512 fft2225 = _mm512_fnmadd_ps(fft2206, fft2223, fft2216);
__m512 fft2310 = _mm512_fnmadd_ps(fft2293, fft2223, fft2302);
__m512 fft2226 = _mm512_fmadd_ps(fft2209, fft2223, fft2217);
__m512 fft2311 = _mm512_fmadd_ps(fft2296, fft2223, fft2303);
__m512 fft2227 = _mm512_fnmadd_ps(fft2208, fft2223, fft2218);
__m512 fft2312 = _mm512_fnmadd_ps(fft2295, fft2223, fft2304);
__m512 fft2228 = _mm512_fmadd_ps(fft2211, fft2223, fft2219);
__m512 fft2313 = _mm512_fmadd_ps(fft2298, fft2223, fft2305);
__m512 fft2229 = _mm512_fnmadd_ps(fft2210, fft2223, fft2220);
__m512 fft2314 = _mm512_fnmadd_ps(fft2297, fft2223, fft2306);
__m512 fft2230 = _mm512_fmadd_ps(fft2213, fft2223, fft2221);
__m512 fft2315 = _mm512_fmadd_ps(fft2300, fft2223, fft2307);
__m512 fft2231 = _mm512_fnmadd_ps(fft2212, fft2223, fft2222);
__m512 fft2316 = _mm512_fnmadd_ps(fft2299, fft2223, fft2308);
// Stage on 128-bit lanes: shuffle_f32x4 with imm 177 swaps neighbouring
// 128-bit lanes; the sign vector yields sum / difference per lane pair.
__m512 fft2232 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2233 = _mm512_fmadd_ps(fft2224, fft2232, _mm512_shuffle_f32x4(fft2224, fft2224, 177));
__m512 fft2317 = _mm512_fmadd_ps(fft2309, fft2232, _mm512_shuffle_f32x4(fft2309, fft2309, 177));
__m512 fft2234 = _mm512_fmadd_ps(fft2225, fft2232, _mm512_shuffle_f32x4(fft2225, fft2225, 177));
__m512 fft2318 = _mm512_fmadd_ps(fft2310, fft2232, _mm512_shuffle_f32x4(fft2310, fft2310, 177));
__m512 fft2235 = _mm512_fmadd_ps(fft2226, fft2232, _mm512_shuffle_f32x4(fft2226, fft2226, 177));
__m512 fft2319 = _mm512_fmadd_ps(fft2311, fft2232, _mm512_shuffle_f32x4(fft2311, fft2311, 177));
__m512 fft2236 = _mm512_fmadd_ps(fft2227, fft2232, _mm512_shuffle_f32x4(fft2227, fft2227, 177));
__m512 fft2320 = _mm512_fmadd_ps(fft2312, fft2232, _mm512_shuffle_f32x4(fft2312, fft2312, 177));
__m512 fft2237 = _mm512_fmadd_ps(fft2228, fft2232, _mm512_shuffle_f32x4(fft2228, fft2228, 177));
__m512 fft2321 = _mm512_fmadd_ps(fft2313, fft2232, _mm512_shuffle_f32x4(fft2313, fft2313, 177));
__m512 fft2238 = _mm512_fmadd_ps(fft2229, fft2232, _mm512_shuffle_f32x4(fft2229, fft2229, 177));
__m512 fft2322 = _mm512_fmadd_ps(fft2314, fft2232, _mm512_shuffle_f32x4(fft2314, fft2314, 177));
__m512 fft2239 = _mm512_fmadd_ps(fft2230, fft2232, _mm512_shuffle_f32x4(fft2230, fft2230, 177));
__m512 fft2323 = _mm512_fmadd_ps(fft2315, fft2232, _mm512_shuffle_f32x4(fft2315, fft2315, 177));
__m512 fft2240 = _mm512_fmadd_ps(fft2231, fft2232, _mm512_shuffle_f32x4(fft2231, fft2231, 177));
__m512 fft2324 = _mm512_fmadd_ps(fft2316, fft2232, _mm512_shuffle_f32x4(fft2316, fft2316, 177));
// In lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) each (a, b) pair is replaced by
// (b, -a); all other lanes keep their values.
__m512 fft2241 = _mm512_mask_mov_ps(fft2233, 49344, fft2234);
__m512 fft2325 = _mm512_mask_mov_ps(fft2317, 49344, fft2318);
__m512 fft2242 = _mm512_mask_sub_ps(fft2234, 49344, _mm512_setzero_ps(), fft2233);
__m512 fft2326 = _mm512_mask_sub_ps(fft2318, 49344, _mm512_setzero_ps(), fft2317);
__m512 fft2243 = _mm512_mask_mov_ps(fft2235, 49344, fft2236);
__m512 fft2327 = _mm512_mask_mov_ps(fft2319, 49344, fft2320);
__m512 fft2244 = _mm512_mask_sub_ps(fft2236, 49344, _mm512_setzero_ps(), fft2235);
__m512 fft2328 = _mm512_mask_sub_ps(fft2320, 49344, _mm512_setzero_ps(), fft2319);
__m512 fft2245 = _mm512_mask_mov_ps(fft2237, 49344, fft2238);
__m512 fft2329 = _mm512_mask_mov_ps(fft2321, 49344, fft2322);
__m512 fft2246 = _mm512_mask_sub_ps(fft2238, 49344, _mm512_setzero_ps(), fft2237);
__m512 fft2330 = _mm512_mask_sub_ps(fft2322, 49344, _mm512_setzero_ps(), fft2321);
__m512 fft2247 = _mm512_mask_mov_ps(fft2239, 49344, fft2240);
__m512 fft2331 = _mm512_mask_mov_ps(fft2323, 49344, fft2324);
__m512 fft2248 = _mm512_mask_sub_ps(fft2240, 49344, _mm512_setzero_ps(), fft2239);
__m512 fft2332 = _mm512_mask_sub_ps(fft2324, 49344, _mm512_setzero_ps(), fft2323);
// Stage on 64-bit pairs: shuffle_ps with imm 78 swaps the two float pairs
// inside every 128-bit lane; the sign vector yields sum / difference.
__m512 fft2249 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2250 = _mm512_fmadd_ps(fft2241, fft2249, _mm512_shuffle_ps(fft2241, fft2241, 78));
__m512 fft2333 = _mm512_fmadd_ps(fft2325, fft2249, _mm512_shuffle_ps(fft2325, fft2325, 78));
__m512 fft2251 = _mm512_fmadd_ps(fft2242, fft2249, _mm512_shuffle_ps(fft2242, fft2242, 78));
__m512 fft2334 = _mm512_fmadd_ps(fft2326, fft2249, _mm512_shuffle_ps(fft2326, fft2326, 78));
__m512 fft2252 = _mm512_fmadd_ps(fft2243, fft2249, _mm512_shuffle_ps(fft2243, fft2243, 78));
__m512 fft2335 = _mm512_fmadd_ps(fft2327, fft2249, _mm512_shuffle_ps(fft2327, fft2327, 78));
__m512 fft2253 = _mm512_fmadd_ps(fft2244, fft2249, _mm512_shuffle_ps(fft2244, fft2244, 78));
__m512 fft2336 = _mm512_fmadd_ps(fft2328, fft2249, _mm512_shuffle_ps(fft2328, fft2328, 78));
__m512 fft2254 = _mm512_fmadd_ps(fft2245, fft2249, _mm512_shuffle_ps(fft2245, fft2245, 78));
__m512 fft2337 = _mm512_fmadd_ps(fft2329, fft2249, _mm512_shuffle_ps(fft2329, fft2329, 78));
__m512 fft2255 = _mm512_fmadd_ps(fft2246, fft2249, _mm512_shuffle_ps(fft2246, fft2246, 78));
__m512 fft2338 = _mm512_fmadd_ps(fft2330, fft2249, _mm512_shuffle_ps(fft2330, fft2330, 78));
__m512 fft2256 = _mm512_fmadd_ps(fft2247, fft2249, _mm512_shuffle_ps(fft2247, fft2247, 78));
__m512 fft2339 = _mm512_fmadd_ps(fft2331, fft2249, _mm512_shuffle_ps(fft2331, fft2331, 78));
__m512 fft2257 = _mm512_fmadd_ps(fft2248, fft2249, _mm512_shuffle_ps(fft2248, fft2248, 78));
__m512 fft2340 = _mm512_fmadd_ps(fft2332, fft2249, _mm512_shuffle_ps(fft2332, fft2332, 78));
// Only the first register pair of each chain (fft2250/fft2251 and
// fft2333/fft2334) gets the extra step below: two lane permutations, a
// +/- recombination, mask-selected merging, and a 0.5 scale on lanes 2..7
// and 10..15 (mask 64764 = 0xFCFC). The other pairs are stored as they are.
__m512i fft2258 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2259 = _mm512_permutexvar_ps(fft2258, fft2250);
__m512 fft2341 = _mm512_permutexvar_ps(fft2258, fft2333);
__m512i fft2260 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2261 = _mm512_permutexvar_ps(fft2260, fft2250);
__m512 fft2342 = _mm512_permutexvar_ps(fft2260, fft2333);
__m512 fft2262 = _mm512_permutexvar_ps(fft2258, fft2251);
__m512 fft2343 = _mm512_permutexvar_ps(fft2258, fft2334);
__m512 fft2263 = _mm512_permutexvar_ps(fft2260, fft2251);
__m512 fft2344 = _mm512_permutexvar_ps(fft2260, fft2334);
__m512 fft2264 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2265 = _mm512_fmadd_ps(fft2259, fft2264, fft2261);
__m512 fft2345 = _mm512_fmadd_ps(fft2341, fft2264, fft2342);
__m512 fft2266 = _mm512_fnmadd_ps(fft2263, fft2264, fft2262);
__m512 fft2346 = _mm512_fnmadd_ps(fft2344, fft2264, fft2343);
__m512 fft2267 = _mm512_mask_mov_ps(fft2263, 21845, fft2265);
__m512 fft2347 = _mm512_mask_mov_ps(fft2344, 21845, fft2345);
__m512 fft2268 = _mm512_mask_mov_ps(fft2259, 43176, fft2265);
__m512 fft2348 = _mm512_mask_mov_ps(fft2341, 43176, fft2345);
__m512 fft2269 = _mm512_mask_mov_ps(fft2267, 43176, fft2266);
__m512 fft2349 = _mm512_mask_mov_ps(fft2347, 43176, fft2346);
__m512 fft2270 = _mm512_mask_mov_ps(fft2268, 22102, fft2266);
__m512 fft2350 = _mm512_mask_mov_ps(fft2348, 22102, fft2346);
__m512 fft2271 = _mm512_mask_mul_ps(fft2269, 64764, fft2269, _mm512_set1_ps(5e-01f));
__m512 fft2351 = _mm512_mask_mul_ps(fft2349, 64764, fft2349, _mm512_set1_ps(5e-01f));
__m512 fft2272 = _mm512_mask_mul_ps(fft2270, 64764, fft2270, _mm512_set1_ps(5e-01f));
__m512 fft2352 = _mm512_mask_mul_ps(fft2350, 64764, fft2350, _mm512_set1_ps(5e-01f));
// Rename the 16 result registers: df177..df184 come from the even-row chain,
// df185..df192 from the odd-row chain.
__m512 df177 = fft2271;
__m512 df185 = fft2351;
__m512 df178 = fft2272;
__m512 df186 = fft2352;
__m512 df179 = fft2252;
__m512 df187 = fft2335;
__m512 df180 = fft2253;
__m512 df188 = fft2336;
__m512 df181 = fft2254;
__m512 df189 = fft2337;
__m512 df182 = fft2255;
__m512 df190 = fft2338;
__m512 df183 = fft2256;
__m512 df191 = fft2339;
__m512 df184 = fft2257;
__m512 df192 = fft2340;
// eo14 gathers the even-indexed lanes into lanes 0..7 and the odd-indexed
// lanes into lanes 8..15. It is applied to every register except the
// specially treated first pairs (df177/df178, df185/df186).
__m512i eo14 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every register below: mask 255 writes lanes 0..7 at
// the given offset; mask 65280 writes lanes 8..15, and its offset is biased
// by -32 so those lanes land exactly 407040 units after the lower half
// (e.g. 508768 = 101760 + 407040 - 32).
df179 = _mm512_permutexvar_ps(eo14, df179);
df180 = _mm512_permutexvar_ps(eo14, df180);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df179);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df180);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df179);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df180);
df187 = _mm512_permutexvar_ps(eo14, df187);
df188 = _mm512_permutexvar_ps(eo14, df188);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df187);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df188);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df187);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df188);
df181 = _mm512_permutexvar_ps(eo14, df181);
df182 = _mm512_permutexvar_ps(eo14, df182);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df181);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df182);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df181);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df182);
df189 = _mm512_permutexvar_ps(eo14, df189);
df190 = _mm512_permutexvar_ps(eo14, df190);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df189);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df190);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df189);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df190);
df183 = _mm512_permutexvar_ps(eo14, df183);
df184 = _mm512_permutexvar_ps(eo14, df184);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df183);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df184);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df183);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df184);
df191 = _mm512_permutexvar_ps(eo14, df191);
df192 = _mm512_permutexvar_ps(eo14, df192);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df191);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df192);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df191);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df192);
// The first pairs are stored without the eo14 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df177);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df178);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df177);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df178);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df185);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k7+128*m14+32*f15, 255, df186);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df185);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k7+128*m14+32*f15, 65280, df186);
}
}
// Leave the function once the strip with index last1 has been written;
// otherwise continue with the next strip.
if (j2 >= last1) return;
++j2;
}
// All strips of this section are done; hand over to the rel2 == 7 section.
rel2 = 7;
}
if (rel2 < 8) {
ptrdiff_t h7 = base2+10;
ptrdiff_t w7 = 200;
ptrdiff_t k8 = 3*s1;
ptrdiff_t kk7 = k8+2;
for (; k8 <= kk7; ++k8) {
__m512 bnMul7 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k8+3*i6))[0]);
__m512 bnAdd7 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k8+3*i6))[1]);
for (ptrdiff_t b15 = 0; b15 < 2; ++b15) {
ptrdiff_t m15 = (size_t)b15/2;
ptrdiff_t f16 = (size_t)b15%2;
__m512 dat178 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat178 = _mm512_mask_fmadd_ps(dat178, 65535, bnMul7, bnAdd7);
__m512 dat179 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat179 = _mm512_mask_fmadd_ps(dat179, 65535, bnMul7, bnAdd7);
__m512 dat180 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat180 = _mm512_mask_fmadd_ps(dat180, 65535, bnMul7, bnAdd7);
__m512 dat181 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat181 = _mm512_mask_fmadd_ps(dat181, 65535, bnMul7, bnAdd7);
__m512 dat182 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat182 = _mm512_mask_fmadd_ps(dat182, 65535, bnMul7, bnAdd7);
__m512 dat183 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat183 = _mm512_mask_fmadd_ps(dat183, 65535, bnMul7, bnAdd7);
__m512 dat184 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat184 = _mm512_mask_fmadd_ps(dat184, 65535, bnMul7, bnAdd7);
__m512 dat185 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat185 = _mm512_mask_fmadd_ps(dat185, 65535, bnMul7, bnAdd7);
__m512 dat186 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat186 = _mm512_mask_fmadd_ps(dat186, 65535, bnMul7, bnAdd7);
__m512 dat187 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat187 = _mm512_mask_fmadd_ps(dat187, 65535, bnMul7, bnAdd7);
__m512 dat188 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat188 = _mm512_mask_fmadd_ps(dat188, 65535, bnMul7, bnAdd7);
__m512 dat189 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat189 = _mm512_mask_fmadd_ps(dat189, 65535, bnMul7, bnAdd7);
__m512 dat190 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat190 = _mm512_mask_fmadd_ps(dat190, 65535, bnMul7, bnAdd7);
__m512 dat191 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat191 = _mm512_mask_fmadd_ps(dat191, 65535, bnMul7, bnAdd7);
__m512 dat192 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat192 = _mm512_mask_fmadd_ps(dat192, 65535, bnMul7, bnAdd7);
__m512 dat193 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k8+896*h7+4*w7+40*b15);
dat193 = _mm512_mask_fmadd_ps(dat193, 65535, bnMul7, bnAdd7);
// Butterfly stage computed element-wise ACROSS registers (no lane movement).
// The even-numbered inputs (dat178, dat180, ..., dat192) and the odd-numbered
// inputs (dat179, dat181, ..., dat193) form two independent groups of eight
// registers; the statements for the two groups are interleaved line by line.
// Calling one group's registers x0..x7, the code forms x[k] +/- x[k+4],
// combines those again, and applies the constant 7.0710677e-01f (1/sqrt(2)).
// This matches the butterfly structure of an 8-point real-input DFT.
// Level 1: sums and differences of registers four apart within each group.
__m512 fft2353 = _mm512_add_ps(dat178, dat186);
__m512 fft2441 = _mm512_add_ps(dat179, dat187);
__m512 fft2354 = _mm512_sub_ps(dat178, dat186);
__m512 fft2442 = _mm512_sub_ps(dat179, dat187);
__m512 fft2355 = _mm512_add_ps(dat180, dat188);
__m512 fft2443 = _mm512_add_ps(dat181, dat189);
__m512 fft2356 = _mm512_sub_ps(dat180, dat188);
__m512 fft2444 = _mm512_sub_ps(dat181, dat189);
__m512 fft2357 = _mm512_add_ps(dat182, dat190);
__m512 fft2445 = _mm512_add_ps(dat183, dat191);
__m512 fft2358 = _mm512_sub_ps(dat182, dat190);
__m512 fft2446 = _mm512_sub_ps(dat183, dat191);
__m512 fft2359 = _mm512_add_ps(dat184, dat192);
__m512 fft2447 = _mm512_add_ps(dat185, dat193);
__m512 fft2360 = _mm512_sub_ps(dat184, dat192);
__m512 fft2448 = _mm512_sub_ps(dat185, dat193);
// Level 2: combine the level-1 results pairwise. Note the operand order in
// fft2364/fft2452 (later term minus earlier term).
__m512 fft2361 = _mm512_add_ps(fft2353, fft2357);
__m512 fft2449 = _mm512_add_ps(fft2441, fft2445);
__m512 fft2362 = _mm512_sub_ps(fft2353, fft2357);
__m512 fft2450 = _mm512_sub_ps(fft2441, fft2445);
__m512 fft2363 = _mm512_add_ps(fft2355, fft2359);
__m512 fft2451 = _mm512_add_ps(fft2443, fft2447);
__m512 fft2364 = _mm512_sub_ps(fft2359, fft2355);
__m512 fft2452 = _mm512_sub_ps(fft2447, fft2443);
__m512 fft2365 = _mm512_sub_ps(fft2356, fft2360);
__m512 fft2453 = _mm512_sub_ps(fft2444, fft2448);
__m512 fft2366 = _mm512_add_ps(fft2356, fft2360);
__m512 fft2454 = _mm512_add_ps(fft2444, fft2448);
// Level 3: fft2367/fft2455 are the sum of all eight registers of a group and
// fft2368/fft2456 the sum of x0,x2,x4,x6 minus the sum of x1,x3,x5,x7.
__m512 fft2367 = _mm512_add_ps(fft2361, fft2363);
__m512 fft2455 = _mm512_add_ps(fft2449, fft2451);
__m512 fft2368 = _mm512_sub_ps(fft2361, fft2363);
__m512 fft2456 = _mm512_sub_ps(fft2449, fft2451);
// The remaining outputs fold in the 1/sqrt(2) factor:
// fmadd = a*c + b, fnmsub = -(a*c) - b, fnmadd = -(a*c) + b.
__m512 fft2369 = _mm512_fmadd_ps(fft2365, _mm512_set1_ps(7.0710677e-01f), fft2354);
__m512 fft2457 = _mm512_fmadd_ps(fft2453, _mm512_set1_ps(7.0710677e-01f), fft2442);
__m512 fft2370 = _mm512_fnmsub_ps(fft2366, _mm512_set1_ps(7.0710677e-01f), fft2358);
__m512 fft2458 = _mm512_fnmsub_ps(fft2454, _mm512_set1_ps(7.0710677e-01f), fft2446);
__m512 fft2371 = _mm512_fnmadd_ps(fft2365, _mm512_set1_ps(7.0710677e-01f), fft2354);
__m512 fft2459 = _mm512_fnmadd_ps(fft2453, _mm512_set1_ps(7.0710677e-01f), fft2442);
__m512 fft2372 = _mm512_fnmadd_ps(fft2366, _mm512_set1_ps(7.0710677e-01f), fft2358);
__m512 fft2460 = _mm512_fnmadd_ps(fft2454, _mm512_set1_ps(7.0710677e-01f), fft2446);
// In-register stages: every register produced by the cross-register stage is
// now transformed ALONG its 16 lanes. The same sequence is applied to both
// register groups, interleaved line by line.
// Reminder: _mm512_set_ps lists lanes from 15 down to 0.
//
// Step 1: shuffle_f32x4 with immediate 78 swaps the two 256-bit halves, and
// the sign vector is +1 in lanes 0..7 and -1 in lanes 8..15. The result holds
// lane[i] + lane[i+8] in the low half and lane[i] - lane[i+8] in the high half.
__m512 fft2373 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2374 = _mm512_fmadd_ps(fft2367, fft2373, _mm512_shuffle_f32x4(fft2367, fft2367, 78));
__m512 fft2461 = _mm512_fmadd_ps(fft2455, fft2373, _mm512_shuffle_f32x4(fft2455, fft2455, 78));
__m512 fft2375 = _mm512_fmadd_ps(fft2368, fft2373, _mm512_shuffle_f32x4(fft2368, fft2368, 78));
__m512 fft2462 = _mm512_fmadd_ps(fft2456, fft2373, _mm512_shuffle_f32x4(fft2456, fft2456, 78));
__m512 fft2376 = _mm512_fmadd_ps(fft2369, fft2373, _mm512_shuffle_f32x4(fft2369, fft2369, 78));
__m512 fft2463 = _mm512_fmadd_ps(fft2457, fft2373, _mm512_shuffle_f32x4(fft2457, fft2457, 78));
__m512 fft2377 = _mm512_fmadd_ps(fft2370, fft2373, _mm512_shuffle_f32x4(fft2370, fft2370, 78));
__m512 fft2464 = _mm512_fmadd_ps(fft2458, fft2373, _mm512_shuffle_f32x4(fft2458, fft2458, 78));
__m512 fft2378 = _mm512_fmadd_ps(fft2362, fft2373, _mm512_shuffle_f32x4(fft2362, fft2362, 78));
__m512 fft2465 = _mm512_fmadd_ps(fft2450, fft2373, _mm512_shuffle_f32x4(fft2450, fft2450, 78));
__m512 fft2379 = _mm512_fmadd_ps(fft2364, fft2373, _mm512_shuffle_f32x4(fft2364, fft2364, 78));
__m512 fft2466 = _mm512_fmadd_ps(fft2452, fft2373, _mm512_shuffle_f32x4(fft2452, fft2452, 78));
__m512 fft2380 = _mm512_fmadd_ps(fft2371, fft2373, _mm512_shuffle_f32x4(fft2371, fft2371, 78));
__m512 fft2467 = _mm512_fmadd_ps(fft2459, fft2373, _mm512_shuffle_f32x4(fft2459, fft2459, 78));
__m512 fft2381 = _mm512_fmadd_ps(fft2372, fft2373, _mm512_shuffle_f32x4(fft2372, fft2372, 78));
__m512 fft2468 = _mm512_fmadd_ps(fft2460, fft2373, _mm512_shuffle_f32x4(fft2460, fft2460, 78));
// Step 2: per-lane rotation of register pairs (a, b), using t1 = fft2382 and
// t2 = fft2391: a' = a*t1 + b*t2, b' = b*t1 - a*t2. Lanes 0..9 have t1 = 1 and
// t2 = 0, so they pass through unchanged; only lanes 10..15 are rotated.
// First the products with t1 ...
__m512 fft2382 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2383 = _mm512_mul_ps(fft2374, fft2382);
__m512 fft2469 = _mm512_mul_ps(fft2461, fft2382);
__m512 fft2384 = _mm512_mul_ps(fft2375, fft2382);
__m512 fft2470 = _mm512_mul_ps(fft2462, fft2382);
__m512 fft2385 = _mm512_mul_ps(fft2376, fft2382);
__m512 fft2471 = _mm512_mul_ps(fft2463, fft2382);
__m512 fft2386 = _mm512_mul_ps(fft2377, fft2382);
__m512 fft2472 = _mm512_mul_ps(fft2464, fft2382);
__m512 fft2387 = _mm512_mul_ps(fft2378, fft2382);
__m512 fft2473 = _mm512_mul_ps(fft2465, fft2382);
__m512 fft2388 = _mm512_mul_ps(fft2379, fft2382);
__m512 fft2474 = _mm512_mul_ps(fft2466, fft2382);
__m512 fft2389 = _mm512_mul_ps(fft2380, fft2382);
__m512 fft2475 = _mm512_mul_ps(fft2467, fft2382);
__m512 fft2390 = _mm512_mul_ps(fft2381, fft2382);
__m512 fft2476 = _mm512_mul_ps(fft2468, fft2382);
// ... then the cross terms with t2 are added (fmadd) or subtracted (fnmadd).
__m512 fft2391 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2392 = _mm512_fmadd_ps(fft2375, fft2391, fft2383);
__m512 fft2477 = _mm512_fmadd_ps(fft2462, fft2391, fft2469);
__m512 fft2393 = _mm512_fnmadd_ps(fft2374, fft2391, fft2384);
__m512 fft2478 = _mm512_fnmadd_ps(fft2461, fft2391, fft2470);
__m512 fft2394 = _mm512_fmadd_ps(fft2377, fft2391, fft2385);
__m512 fft2479 = _mm512_fmadd_ps(fft2464, fft2391, fft2471);
__m512 fft2395 = _mm512_fnmadd_ps(fft2376, fft2391, fft2386);
__m512 fft2480 = _mm512_fnmadd_ps(fft2463, fft2391, fft2472);
__m512 fft2396 = _mm512_fmadd_ps(fft2379, fft2391, fft2387);
__m512 fft2481 = _mm512_fmadd_ps(fft2466, fft2391, fft2473);
__m512 fft2397 = _mm512_fnmadd_ps(fft2378, fft2391, fft2388);
__m512 fft2482 = _mm512_fnmadd_ps(fft2465, fft2391, fft2474);
__m512 fft2398 = _mm512_fmadd_ps(fft2381, fft2391, fft2389);
__m512 fft2483 = _mm512_fmadd_ps(fft2468, fft2391, fft2475);
__m512 fft2399 = _mm512_fnmadd_ps(fft2380, fft2391, fft2390);
__m512 fft2484 = _mm512_fnmadd_ps(fft2467, fft2391, fft2476);
// Step 3: shuffle_f32x4 with immediate 177 swaps adjacent 128-bit lanes; with
// the +1/-1 pattern this yields sums and differences at 4-float granularity.
__m512 fft2400 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2401 = _mm512_fmadd_ps(fft2392, fft2400, _mm512_shuffle_f32x4(fft2392, fft2392, 177));
__m512 fft2485 = _mm512_fmadd_ps(fft2477, fft2400, _mm512_shuffle_f32x4(fft2477, fft2477, 177));
__m512 fft2402 = _mm512_fmadd_ps(fft2393, fft2400, _mm512_shuffle_f32x4(fft2393, fft2393, 177));
__m512 fft2486 = _mm512_fmadd_ps(fft2478, fft2400, _mm512_shuffle_f32x4(fft2478, fft2478, 177));
__m512 fft2403 = _mm512_fmadd_ps(fft2394, fft2400, _mm512_shuffle_f32x4(fft2394, fft2394, 177));
__m512 fft2487 = _mm512_fmadd_ps(fft2479, fft2400, _mm512_shuffle_f32x4(fft2479, fft2479, 177));
__m512 fft2404 = _mm512_fmadd_ps(fft2395, fft2400, _mm512_shuffle_f32x4(fft2395, fft2395, 177));
__m512 fft2488 = _mm512_fmadd_ps(fft2480, fft2400, _mm512_shuffle_f32x4(fft2480, fft2480, 177));
__m512 fft2405 = _mm512_fmadd_ps(fft2396, fft2400, _mm512_shuffle_f32x4(fft2396, fft2396, 177));
__m512 fft2489 = _mm512_fmadd_ps(fft2481, fft2400, _mm512_shuffle_f32x4(fft2481, fft2481, 177));
__m512 fft2406 = _mm512_fmadd_ps(fft2397, fft2400, _mm512_shuffle_f32x4(fft2397, fft2397, 177));
__m512 fft2490 = _mm512_fmadd_ps(fft2482, fft2400, _mm512_shuffle_f32x4(fft2482, fft2482, 177));
__m512 fft2407 = _mm512_fmadd_ps(fft2398, fft2400, _mm512_shuffle_f32x4(fft2398, fft2398, 177));
__m512 fft2491 = _mm512_fmadd_ps(fft2483, fft2400, _mm512_shuffle_f32x4(fft2483, fft2483, 177));
__m512 fft2408 = _mm512_fmadd_ps(fft2399, fft2400, _mm512_shuffle_f32x4(fft2399, fft2399, 177));
__m512 fft2492 = _mm512_fmadd_ps(fft2484, fft2400, _mm512_shuffle_f32x4(fft2484, fft2484, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes each
// register pair (a, b) becomes (b, 0 - a); all other lanes are kept as is.
__m512 fft2409 = _mm512_mask_mov_ps(fft2401, 49344, fft2402);
__m512 fft2493 = _mm512_mask_mov_ps(fft2485, 49344, fft2486);
__m512 fft2410 = _mm512_mask_sub_ps(fft2402, 49344, _mm512_setzero_ps(), fft2401);
__m512 fft2494 = _mm512_mask_sub_ps(fft2486, 49344, _mm512_setzero_ps(), fft2485);
__m512 fft2411 = _mm512_mask_mov_ps(fft2403, 49344, fft2404);
__m512 fft2495 = _mm512_mask_mov_ps(fft2487, 49344, fft2488);
__m512 fft2412 = _mm512_mask_sub_ps(fft2404, 49344, _mm512_setzero_ps(), fft2403);
__m512 fft2496 = _mm512_mask_sub_ps(fft2488, 49344, _mm512_setzero_ps(), fft2487);
__m512 fft2413 = _mm512_mask_mov_ps(fft2405, 49344, fft2406);
__m512 fft2497 = _mm512_mask_mov_ps(fft2489, 49344, fft2490);
__m512 fft2414 = _mm512_mask_sub_ps(fft2406, 49344, _mm512_setzero_ps(), fft2405);
__m512 fft2498 = _mm512_mask_sub_ps(fft2490, 49344, _mm512_setzero_ps(), fft2489);
__m512 fft2415 = _mm512_mask_mov_ps(fft2407, 49344, fft2408);
__m512 fft2499 = _mm512_mask_mov_ps(fft2491, 49344, fft2492);
__m512 fft2416 = _mm512_mask_sub_ps(fft2408, 49344, _mm512_setzero_ps(), fft2407);
__m512 fft2500 = _mm512_mask_sub_ps(fft2492, 49344, _mm512_setzero_ps(), fft2491);
// Step 5: shuffle_ps with immediate 78 swaps the two 64-bit halves inside
// every 128-bit lane, giving sums and differences at 2-float granularity.
__m512 fft2417 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2418 = _mm512_fmadd_ps(fft2409, fft2417, _mm512_shuffle_ps(fft2409, fft2409, 78));
__m512 fft2501 = _mm512_fmadd_ps(fft2493, fft2417, _mm512_shuffle_ps(fft2493, fft2493, 78));
__m512 fft2419 = _mm512_fmadd_ps(fft2410, fft2417, _mm512_shuffle_ps(fft2410, fft2410, 78));
__m512 fft2502 = _mm512_fmadd_ps(fft2494, fft2417, _mm512_shuffle_ps(fft2494, fft2494, 78));
__m512 fft2420 = _mm512_fmadd_ps(fft2411, fft2417, _mm512_shuffle_ps(fft2411, fft2411, 78));
__m512 fft2503 = _mm512_fmadd_ps(fft2495, fft2417, _mm512_shuffle_ps(fft2495, fft2495, 78));
__m512 fft2421 = _mm512_fmadd_ps(fft2412, fft2417, _mm512_shuffle_ps(fft2412, fft2412, 78));
__m512 fft2504 = _mm512_fmadd_ps(fft2496, fft2417, _mm512_shuffle_ps(fft2496, fft2496, 78));
__m512 fft2422 = _mm512_fmadd_ps(fft2413, fft2417, _mm512_shuffle_ps(fft2413, fft2413, 78));
__m512 fft2505 = _mm512_fmadd_ps(fft2497, fft2417, _mm512_shuffle_ps(fft2497, fft2497, 78));
__m512 fft2423 = _mm512_fmadd_ps(fft2414, fft2417, _mm512_shuffle_ps(fft2414, fft2414, 78));
__m512 fft2506 = _mm512_fmadd_ps(fft2498, fft2417, _mm512_shuffle_ps(fft2498, fft2498, 78));
__m512 fft2424 = _mm512_fmadd_ps(fft2415, fft2417, _mm512_shuffle_ps(fft2415, fft2415, 78));
__m512 fft2507 = _mm512_fmadd_ps(fft2499, fft2417, _mm512_shuffle_ps(fft2499, fft2499, 78));
__m512 fft2425 = _mm512_fmadd_ps(fft2416, fft2417, _mm512_shuffle_ps(fft2416, fft2416, 78));
__m512 fft2508 = _mm512_fmadd_ps(fft2500, fft2417, _mm512_shuffle_ps(fft2500, fft2500, 78));
// Step 6: extra recombination applied ONLY to the first register pair of each
// group (fft2418/fft2419 and fft2501/fft2502). Lanes are gathered with two
// index vectors, combined with a +1/-1/0 pattern, merged with masks
// 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656), and finally the lanes
// selected by 64764 (0xFCFC: lanes 2..7 and 10..15) are multiplied by 0.5.
// NOTE(review): presumably this separates spectra that were packed together
// in the first pair -- confirm against the generator's documentation.
__m512i fft2426 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2427 = _mm512_permutexvar_ps(fft2426, fft2418);
__m512 fft2509 = _mm512_permutexvar_ps(fft2426, fft2501);
__m512i fft2428 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2429 = _mm512_permutexvar_ps(fft2428, fft2418);
__m512 fft2510 = _mm512_permutexvar_ps(fft2428, fft2501);
__m512 fft2430 = _mm512_permutexvar_ps(fft2426, fft2419);
__m512 fft2511 = _mm512_permutexvar_ps(fft2426, fft2502);
__m512 fft2431 = _mm512_permutexvar_ps(fft2428, fft2419);
__m512 fft2512 = _mm512_permutexvar_ps(fft2428, fft2502);
__m512 fft2432 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2433 = _mm512_fmadd_ps(fft2427, fft2432, fft2429);
__m512 fft2513 = _mm512_fmadd_ps(fft2509, fft2432, fft2510);
__m512 fft2434 = _mm512_fnmadd_ps(fft2431, fft2432, fft2430);
__m512 fft2514 = _mm512_fnmadd_ps(fft2512, fft2432, fft2511);
__m512 fft2435 = _mm512_mask_mov_ps(fft2431, 21845, fft2433);
__m512 fft2515 = _mm512_mask_mov_ps(fft2512, 21845, fft2513);
__m512 fft2436 = _mm512_mask_mov_ps(fft2427, 43176, fft2433);
__m512 fft2516 = _mm512_mask_mov_ps(fft2509, 43176, fft2513);
__m512 fft2437 = _mm512_mask_mov_ps(fft2435, 43176, fft2434);
__m512 fft2517 = _mm512_mask_mov_ps(fft2515, 43176, fft2514);
__m512 fft2438 = _mm512_mask_mov_ps(fft2436, 22102, fft2434);
__m512 fft2518 = _mm512_mask_mov_ps(fft2516, 22102, fft2514);
__m512 fft2439 = _mm512_mask_mul_ps(fft2437, 64764, fft2437, _mm512_set1_ps(5e-01f));
__m512 fft2519 = _mm512_mask_mul_ps(fft2517, 64764, fft2517, _mm512_set1_ps(5e-01f));
__m512 fft2440 = _mm512_mask_mul_ps(fft2438, 64764, fft2438, _mm512_set1_ps(5e-01f));
__m512 fft2520 = _mm512_mask_mul_ps(fft2518, 64764, fft2518, _mm512_set1_ps(5e-01f));
// Write the 16 transformed registers of this tile to dfPtr1.
// df193..df200 come from the first register group and df201..df208 from the
// second; df193/df194 and df201/df202 are the specially recombined first pairs.
__m512 df193 = fft2439;
__m512 df201 = fft2519;
__m512 df194 = fft2440;
__m512 df202 = fft2520;
__m512 df195 = fft2420;
__m512 df203 = fft2503;
__m512 df196 = fft2421;
__m512 df204 = fft2504;
__m512 df197 = fft2422;
__m512 df205 = fft2505;
__m512 df198 = fft2423;
__m512 df206 = fft2506;
__m512 df199 = fft2424;
__m512 df207 = fft2507;
__m512 df200 = fft2425;
__m512 df208 = fft2508;
// eo15 de-interleaves a register: even-indexed lanes move to lanes 0..7 and
// odd-indexed lanes to lanes 8..15. It is applied to every register except
// df193/df194/df201/df202, which are stored without this permutation.
// Each register is written with two masked stores: mask 255 (lanes 0..7) and
// mask 65280 (lanes 8..15). The constant offsets are multiples of 101760
// (plus 64 for the second register of a pair); the lanes-8..15 stores use an
// offset reduced by 32 and advanced by 407040 relative to the lanes-0..7 store.
// NOTE(review): if dfPtr1 is a byte pointer (declaration not visible), the -32
// makes lane 8 land exactly on a multiple of 101760 -- confirm.
__m512i eo15 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df195 = _mm512_permutexvar_ps(eo15, df195);
df196 = _mm512_permutexvar_ps(eo15, df196);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df195);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df196);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df195);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df196);
df203 = _mm512_permutexvar_ps(eo15, df203);
df204 = _mm512_permutexvar_ps(eo15, df204);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df203);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df204);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df203);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df204);
df197 = _mm512_permutexvar_ps(eo15, df197);
df198 = _mm512_permutexvar_ps(eo15, df198);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df197);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df198);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df197);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df198);
df205 = _mm512_permutexvar_ps(eo15, df205);
df206 = _mm512_permutexvar_ps(eo15, df206);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df205);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df206);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df205);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df206);
df199 = _mm512_permutexvar_ps(eo15, df199);
df200 = _mm512_permutexvar_ps(eo15, df200);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df199);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df200);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df199);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df200);
df207 = _mm512_permutexvar_ps(eo15, df207);
df208 = _mm512_permutexvar_ps(eo15, df208);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df207);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df208);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df207);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df208);
// First pairs of each group: stored directly, without the eo15 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df193);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m15+32*f16, 255, df194);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df193);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m15+32*f16, 65280, df194);
}
// Tail tile, handled outside the preceding block with a fixed index b16 = 2.
// m16 = b16/2 = 1 and f17 = b16%2 = 0 are used later in the store offsets.
ptrdiff_t b16 = 2;
ptrdiff_t m16 = (size_t)b16/2;
ptrdiff_t f17 = (size_t)b16%2;
// Load 16 registers whose constant offsets step by 896 (80, 976, ..., 13520).
// The tile position is folded into the constant: 80 = 40*2, and the b16 term
// is multiplied by 0. The preceding block uses +40*b15 for the same purpose.
// Mask 127 (0x007F) reads only lanes 0..6; maskz zeroes lanes 7..15, and the
// masked fmadd (dat*bnMul7 + bnAdd7) also touches only lanes 0..6, so lanes
// 7..15 remain exactly zero going into the transform.
// NOTE(review): presumably only 7 valid columns remain at this position and
// the zero lanes act as padding -- confirm against the tiling logic.
__m512 dat194 = _mm512_maskz_loadu_ps(127, datPtr1+80+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat194 = _mm512_mask_fmadd_ps(dat194, 127, bnMul7, bnAdd7);
__m512 dat195 = _mm512_maskz_loadu_ps(127, datPtr1+976+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat195 = _mm512_mask_fmadd_ps(dat195, 127, bnMul7, bnAdd7);
__m512 dat196 = _mm512_maskz_loadu_ps(127, datPtr1+1872+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat196 = _mm512_mask_fmadd_ps(dat196, 127, bnMul7, bnAdd7);
__m512 dat197 = _mm512_maskz_loadu_ps(127, datPtr1+2768+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat197 = _mm512_mask_fmadd_ps(dat197, 127, bnMul7, bnAdd7);
__m512 dat198 = _mm512_maskz_loadu_ps(127, datPtr1+3664+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat198 = _mm512_mask_fmadd_ps(dat198, 127, bnMul7, bnAdd7);
__m512 dat199 = _mm512_maskz_loadu_ps(127, datPtr1+4560+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat199 = _mm512_mask_fmadd_ps(dat199, 127, bnMul7, bnAdd7);
__m512 dat200 = _mm512_maskz_loadu_ps(127, datPtr1+5456+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat200 = _mm512_mask_fmadd_ps(dat200, 127, bnMul7, bnAdd7);
__m512 dat201 = _mm512_maskz_loadu_ps(127, datPtr1+6352+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat201 = _mm512_mask_fmadd_ps(dat201, 127, bnMul7, bnAdd7);
__m512 dat202 = _mm512_maskz_loadu_ps(127, datPtr1+7248+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat202 = _mm512_mask_fmadd_ps(dat202, 127, bnMul7, bnAdd7);
__m512 dat203 = _mm512_maskz_loadu_ps(127, datPtr1+8144+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat203 = _mm512_mask_fmadd_ps(dat203, 127, bnMul7, bnAdd7);
__m512 dat204 = _mm512_maskz_loadu_ps(127, datPtr1+9040+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat204 = _mm512_mask_fmadd_ps(dat204, 127, bnMul7, bnAdd7);
__m512 dat205 = _mm512_maskz_loadu_ps(127, datPtr1+9936+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat205 = _mm512_mask_fmadd_ps(dat205, 127, bnMul7, bnAdd7);
__m512 dat206 = _mm512_maskz_loadu_ps(127, datPtr1+10832+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat206 = _mm512_mask_fmadd_ps(dat206, 127, bnMul7, bnAdd7);
__m512 dat207 = _mm512_maskz_loadu_ps(127, datPtr1+11728+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat207 = _mm512_mask_fmadd_ps(dat207, 127, bnMul7, bnAdd7);
__m512 dat208 = _mm512_maskz_loadu_ps(127, datPtr1+12624+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat208 = _mm512_mask_fmadd_ps(dat208, 127, bnMul7, bnAdd7);
__m512 dat209 = _mm512_maskz_loadu_ps(127, datPtr1+13520+602112*i6+200704*k8+896*h7+4*w7+0*b16);
dat209 = _mm512_mask_fmadd_ps(dat209, 127, bnMul7, bnAdd7);
// Butterfly stage for the tail tile, computed element-wise ACROSS registers.
// The even-numbered inputs (dat194, dat196, ..., dat208) and the odd-numbered
// inputs (dat195, dat197, ..., dat209) form two independent groups of eight
// registers, processed in interleaved statements. Within a group x0..x7 the
// code forms x[k] +/- x[k+4], combines those again, and applies the constant
// 7.0710677e-01f (1/sqrt(2)), matching an 8-point real-input DFT butterfly.
// Level 1: sums and differences of registers four apart within each group.
__m512 fft2521 = _mm512_add_ps(dat194, dat202);
__m512 fft2609 = _mm512_add_ps(dat195, dat203);
__m512 fft2522 = _mm512_sub_ps(dat194, dat202);
__m512 fft2610 = _mm512_sub_ps(dat195, dat203);
__m512 fft2523 = _mm512_add_ps(dat196, dat204);
__m512 fft2611 = _mm512_add_ps(dat197, dat205);
__m512 fft2524 = _mm512_sub_ps(dat196, dat204);
__m512 fft2612 = _mm512_sub_ps(dat197, dat205);
__m512 fft2525 = _mm512_add_ps(dat198, dat206);
__m512 fft2613 = _mm512_add_ps(dat199, dat207);
__m512 fft2526 = _mm512_sub_ps(dat198, dat206);
__m512 fft2614 = _mm512_sub_ps(dat199, dat207);
__m512 fft2527 = _mm512_add_ps(dat200, dat208);
__m512 fft2615 = _mm512_add_ps(dat201, dat209);
__m512 fft2528 = _mm512_sub_ps(dat200, dat208);
__m512 fft2616 = _mm512_sub_ps(dat201, dat209);
// Level 2: combine the level-1 results pairwise. Note the operand order in
// fft2532/fft2620 (later term minus earlier term).
__m512 fft2529 = _mm512_add_ps(fft2521, fft2525);
__m512 fft2617 = _mm512_add_ps(fft2609, fft2613);
__m512 fft2530 = _mm512_sub_ps(fft2521, fft2525);
__m512 fft2618 = _mm512_sub_ps(fft2609, fft2613);
__m512 fft2531 = _mm512_add_ps(fft2523, fft2527);
__m512 fft2619 = _mm512_add_ps(fft2611, fft2615);
__m512 fft2532 = _mm512_sub_ps(fft2527, fft2523);
__m512 fft2620 = _mm512_sub_ps(fft2615, fft2611);
__m512 fft2533 = _mm512_sub_ps(fft2524, fft2528);
__m512 fft2621 = _mm512_sub_ps(fft2612, fft2616);
__m512 fft2534 = _mm512_add_ps(fft2524, fft2528);
__m512 fft2622 = _mm512_add_ps(fft2612, fft2616);
// Level 3: fft2535/fft2623 are the sum of all eight registers of a group and
// fft2536/fft2624 the sum of x0,x2,x4,x6 minus the sum of x1,x3,x5,x7.
__m512 fft2535 = _mm512_add_ps(fft2529, fft2531);
__m512 fft2623 = _mm512_add_ps(fft2617, fft2619);
__m512 fft2536 = _mm512_sub_ps(fft2529, fft2531);
__m512 fft2624 = _mm512_sub_ps(fft2617, fft2619);
// The remaining outputs fold in the 1/sqrt(2) factor:
// fmadd = a*c + b, fnmsub = -(a*c) - b, fnmadd = -(a*c) + b.
__m512 fft2537 = _mm512_fmadd_ps(fft2533, _mm512_set1_ps(7.0710677e-01f), fft2522);
__m512 fft2625 = _mm512_fmadd_ps(fft2621, _mm512_set1_ps(7.0710677e-01f), fft2610);
__m512 fft2538 = _mm512_fnmsub_ps(fft2534, _mm512_set1_ps(7.0710677e-01f), fft2526);
__m512 fft2626 = _mm512_fnmsub_ps(fft2622, _mm512_set1_ps(7.0710677e-01f), fft2614);
__m512 fft2539 = _mm512_fnmadd_ps(fft2533, _mm512_set1_ps(7.0710677e-01f), fft2522);
__m512 fft2627 = _mm512_fnmadd_ps(fft2621, _mm512_set1_ps(7.0710677e-01f), fft2610);
__m512 fft2540 = _mm512_fnmadd_ps(fft2534, _mm512_set1_ps(7.0710677e-01f), fft2526);
__m512 fft2628 = _mm512_fnmadd_ps(fft2622, _mm512_set1_ps(7.0710677e-01f), fft2614);
// In-register stages for the tail tile: every register produced by the
// cross-register stage is now transformed ALONG its 16 lanes. The same
// sequence is applied to both register groups, interleaved line by line.
// Reminder: _mm512_set_ps lists lanes from 15 down to 0.
//
// Step 1: shuffle_f32x4 with immediate 78 swaps the two 256-bit halves, and
// the sign vector is +1 in lanes 0..7 and -1 in lanes 8..15. The result holds
// lane[i] + lane[i+8] in the low half and lane[i] - lane[i+8] in the high half.
__m512 fft2541 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2542 = _mm512_fmadd_ps(fft2535, fft2541, _mm512_shuffle_f32x4(fft2535, fft2535, 78));
__m512 fft2629 = _mm512_fmadd_ps(fft2623, fft2541, _mm512_shuffle_f32x4(fft2623, fft2623, 78));
__m512 fft2543 = _mm512_fmadd_ps(fft2536, fft2541, _mm512_shuffle_f32x4(fft2536, fft2536, 78));
__m512 fft2630 = _mm512_fmadd_ps(fft2624, fft2541, _mm512_shuffle_f32x4(fft2624, fft2624, 78));
__m512 fft2544 = _mm512_fmadd_ps(fft2537, fft2541, _mm512_shuffle_f32x4(fft2537, fft2537, 78));
__m512 fft2631 = _mm512_fmadd_ps(fft2625, fft2541, _mm512_shuffle_f32x4(fft2625, fft2625, 78));
__m512 fft2545 = _mm512_fmadd_ps(fft2538, fft2541, _mm512_shuffle_f32x4(fft2538, fft2538, 78));
__m512 fft2632 = _mm512_fmadd_ps(fft2626, fft2541, _mm512_shuffle_f32x4(fft2626, fft2626, 78));
__m512 fft2546 = _mm512_fmadd_ps(fft2530, fft2541, _mm512_shuffle_f32x4(fft2530, fft2530, 78));
__m512 fft2633 = _mm512_fmadd_ps(fft2618, fft2541, _mm512_shuffle_f32x4(fft2618, fft2618, 78));
__m512 fft2547 = _mm512_fmadd_ps(fft2532, fft2541, _mm512_shuffle_f32x4(fft2532, fft2532, 78));
__m512 fft2634 = _mm512_fmadd_ps(fft2620, fft2541, _mm512_shuffle_f32x4(fft2620, fft2620, 78));
__m512 fft2548 = _mm512_fmadd_ps(fft2539, fft2541, _mm512_shuffle_f32x4(fft2539, fft2539, 78));
__m512 fft2635 = _mm512_fmadd_ps(fft2627, fft2541, _mm512_shuffle_f32x4(fft2627, fft2627, 78));
__m512 fft2549 = _mm512_fmadd_ps(fft2540, fft2541, _mm512_shuffle_f32x4(fft2540, fft2540, 78));
__m512 fft2636 = _mm512_fmadd_ps(fft2628, fft2541, _mm512_shuffle_f32x4(fft2628, fft2628, 78));
// Step 2: per-lane rotation of register pairs (a, b), using t1 = fft2550 and
// t2 = fft2559: a' = a*t1 + b*t2, b' = b*t1 - a*t2. Lanes 0..9 have t1 = 1 and
// t2 = 0, so they pass through unchanged; only lanes 10..15 are rotated.
// First the products with t1 ...
__m512 fft2550 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2551 = _mm512_mul_ps(fft2542, fft2550);
__m512 fft2637 = _mm512_mul_ps(fft2629, fft2550);
__m512 fft2552 = _mm512_mul_ps(fft2543, fft2550);
__m512 fft2638 = _mm512_mul_ps(fft2630, fft2550);
__m512 fft2553 = _mm512_mul_ps(fft2544, fft2550);
__m512 fft2639 = _mm512_mul_ps(fft2631, fft2550);
__m512 fft2554 = _mm512_mul_ps(fft2545, fft2550);
__m512 fft2640 = _mm512_mul_ps(fft2632, fft2550);
__m512 fft2555 = _mm512_mul_ps(fft2546, fft2550);
__m512 fft2641 = _mm512_mul_ps(fft2633, fft2550);
__m512 fft2556 = _mm512_mul_ps(fft2547, fft2550);
__m512 fft2642 = _mm512_mul_ps(fft2634, fft2550);
__m512 fft2557 = _mm512_mul_ps(fft2548, fft2550);
__m512 fft2643 = _mm512_mul_ps(fft2635, fft2550);
__m512 fft2558 = _mm512_mul_ps(fft2549, fft2550);
__m512 fft2644 = _mm512_mul_ps(fft2636, fft2550);
// ... then the cross terms with t2 are added (fmadd) or subtracted (fnmadd).
__m512 fft2559 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2560 = _mm512_fmadd_ps(fft2543, fft2559, fft2551);
__m512 fft2645 = _mm512_fmadd_ps(fft2630, fft2559, fft2637);
__m512 fft2561 = _mm512_fnmadd_ps(fft2542, fft2559, fft2552);
__m512 fft2646 = _mm512_fnmadd_ps(fft2629, fft2559, fft2638);
__m512 fft2562 = _mm512_fmadd_ps(fft2545, fft2559, fft2553);
__m512 fft2647 = _mm512_fmadd_ps(fft2632, fft2559, fft2639);
__m512 fft2563 = _mm512_fnmadd_ps(fft2544, fft2559, fft2554);
__m512 fft2648 = _mm512_fnmadd_ps(fft2631, fft2559, fft2640);
__m512 fft2564 = _mm512_fmadd_ps(fft2547, fft2559, fft2555);
__m512 fft2649 = _mm512_fmadd_ps(fft2634, fft2559, fft2641);
__m512 fft2565 = _mm512_fnmadd_ps(fft2546, fft2559, fft2556);
__m512 fft2650 = _mm512_fnmadd_ps(fft2633, fft2559, fft2642);
__m512 fft2566 = _mm512_fmadd_ps(fft2549, fft2559, fft2557);
__m512 fft2651 = _mm512_fmadd_ps(fft2636, fft2559, fft2643);
__m512 fft2567 = _mm512_fnmadd_ps(fft2548, fft2559, fft2558);
__m512 fft2652 = _mm512_fnmadd_ps(fft2635, fft2559, fft2644);
// Step 3: shuffle_f32x4 with immediate 177 swaps adjacent 128-bit lanes; with
// the +1/-1 pattern this yields sums and differences at 4-float granularity.
__m512 fft2568 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2569 = _mm512_fmadd_ps(fft2560, fft2568, _mm512_shuffle_f32x4(fft2560, fft2560, 177));
__m512 fft2653 = _mm512_fmadd_ps(fft2645, fft2568, _mm512_shuffle_f32x4(fft2645, fft2645, 177));
__m512 fft2570 = _mm512_fmadd_ps(fft2561, fft2568, _mm512_shuffle_f32x4(fft2561, fft2561, 177));
__m512 fft2654 = _mm512_fmadd_ps(fft2646, fft2568, _mm512_shuffle_f32x4(fft2646, fft2646, 177));
__m512 fft2571 = _mm512_fmadd_ps(fft2562, fft2568, _mm512_shuffle_f32x4(fft2562, fft2562, 177));
__m512 fft2655 = _mm512_fmadd_ps(fft2647, fft2568, _mm512_shuffle_f32x4(fft2647, fft2647, 177));
__m512 fft2572 = _mm512_fmadd_ps(fft2563, fft2568, _mm512_shuffle_f32x4(fft2563, fft2563, 177));
__m512 fft2656 = _mm512_fmadd_ps(fft2648, fft2568, _mm512_shuffle_f32x4(fft2648, fft2648, 177));
__m512 fft2573 = _mm512_fmadd_ps(fft2564, fft2568, _mm512_shuffle_f32x4(fft2564, fft2564, 177));
__m512 fft2657 = _mm512_fmadd_ps(fft2649, fft2568, _mm512_shuffle_f32x4(fft2649, fft2649, 177));
__m512 fft2574 = _mm512_fmadd_ps(fft2565, fft2568, _mm512_shuffle_f32x4(fft2565, fft2565, 177));
__m512 fft2658 = _mm512_fmadd_ps(fft2650, fft2568, _mm512_shuffle_f32x4(fft2650, fft2650, 177));
__m512 fft2575 = _mm512_fmadd_ps(fft2566, fft2568, _mm512_shuffle_f32x4(fft2566, fft2566, 177));
__m512 fft2659 = _mm512_fmadd_ps(fft2651, fft2568, _mm512_shuffle_f32x4(fft2651, fft2651, 177));
__m512 fft2576 = _mm512_fmadd_ps(fft2567, fft2568, _mm512_shuffle_f32x4(fft2567, fft2567, 177));
__m512 fft2660 = _mm512_fmadd_ps(fft2652, fft2568, _mm512_shuffle_f32x4(fft2652, fft2652, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes each
// register pair (a, b) becomes (b, 0 - a); all other lanes are kept as is.
__m512 fft2577 = _mm512_mask_mov_ps(fft2569, 49344, fft2570);
__m512 fft2661 = _mm512_mask_mov_ps(fft2653, 49344, fft2654);
__m512 fft2578 = _mm512_mask_sub_ps(fft2570, 49344, _mm512_setzero_ps(), fft2569);
__m512 fft2662 = _mm512_mask_sub_ps(fft2654, 49344, _mm512_setzero_ps(), fft2653);
__m512 fft2579 = _mm512_mask_mov_ps(fft2571, 49344, fft2572);
__m512 fft2663 = _mm512_mask_mov_ps(fft2655, 49344, fft2656);
__m512 fft2580 = _mm512_mask_sub_ps(fft2572, 49344, _mm512_setzero_ps(), fft2571);
__m512 fft2664 = _mm512_mask_sub_ps(fft2656, 49344, _mm512_setzero_ps(), fft2655);
__m512 fft2581 = _mm512_mask_mov_ps(fft2573, 49344, fft2574);
__m512 fft2665 = _mm512_mask_mov_ps(fft2657, 49344, fft2658);
__m512 fft2582 = _mm512_mask_sub_ps(fft2574, 49344, _mm512_setzero_ps(), fft2573);
__m512 fft2666 = _mm512_mask_sub_ps(fft2658, 49344, _mm512_setzero_ps(), fft2657);
__m512 fft2583 = _mm512_mask_mov_ps(fft2575, 49344, fft2576);
__m512 fft2667 = _mm512_mask_mov_ps(fft2659, 49344, fft2660);
__m512 fft2584 = _mm512_mask_sub_ps(fft2576, 49344, _mm512_setzero_ps(), fft2575);
__m512 fft2668 = _mm512_mask_sub_ps(fft2660, 49344, _mm512_setzero_ps(), fft2659);
// Step 5: shuffle_ps with immediate 78 swaps the two 64-bit halves inside
// every 128-bit lane, giving sums and differences at 2-float granularity.
__m512 fft2585 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2586 = _mm512_fmadd_ps(fft2577, fft2585, _mm512_shuffle_ps(fft2577, fft2577, 78));
__m512 fft2669 = _mm512_fmadd_ps(fft2661, fft2585, _mm512_shuffle_ps(fft2661, fft2661, 78));
__m512 fft2587 = _mm512_fmadd_ps(fft2578, fft2585, _mm512_shuffle_ps(fft2578, fft2578, 78));
__m512 fft2670 = _mm512_fmadd_ps(fft2662, fft2585, _mm512_shuffle_ps(fft2662, fft2662, 78));
__m512 fft2588 = _mm512_fmadd_ps(fft2579, fft2585, _mm512_shuffle_ps(fft2579, fft2579, 78));
__m512 fft2671 = _mm512_fmadd_ps(fft2663, fft2585, _mm512_shuffle_ps(fft2663, fft2663, 78));
__m512 fft2589 = _mm512_fmadd_ps(fft2580, fft2585, _mm512_shuffle_ps(fft2580, fft2580, 78));
__m512 fft2672 = _mm512_fmadd_ps(fft2664, fft2585, _mm512_shuffle_ps(fft2664, fft2664, 78));
__m512 fft2590 = _mm512_fmadd_ps(fft2581, fft2585, _mm512_shuffle_ps(fft2581, fft2581, 78));
__m512 fft2673 = _mm512_fmadd_ps(fft2665, fft2585, _mm512_shuffle_ps(fft2665, fft2665, 78));
__m512 fft2591 = _mm512_fmadd_ps(fft2582, fft2585, _mm512_shuffle_ps(fft2582, fft2582, 78));
__m512 fft2674 = _mm512_fmadd_ps(fft2666, fft2585, _mm512_shuffle_ps(fft2666, fft2666, 78));
__m512 fft2592 = _mm512_fmadd_ps(fft2583, fft2585, _mm512_shuffle_ps(fft2583, fft2583, 78));
__m512 fft2675 = _mm512_fmadd_ps(fft2667, fft2585, _mm512_shuffle_ps(fft2667, fft2667, 78));
__m512 fft2593 = _mm512_fmadd_ps(fft2584, fft2585, _mm512_shuffle_ps(fft2584, fft2584, 78));
__m512 fft2676 = _mm512_fmadd_ps(fft2668, fft2585, _mm512_shuffle_ps(fft2668, fft2668, 78));
// Step 6: extra recombination applied ONLY to the first register pair of each
// group (fft2586/fft2587 and fft2669/fft2670). Lanes are gathered with two
// index vectors, combined with a +1/-1/0 pattern, merged with masks
// 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656), and finally the lanes
// selected by 64764 (0xFCFC: lanes 2..7 and 10..15) are multiplied by 0.5.
// NOTE(review): presumably this separates spectra that were packed together
// in the first pair -- confirm against the generator's documentation.
__m512i fft2594 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2595 = _mm512_permutexvar_ps(fft2594, fft2586);
__m512 fft2677 = _mm512_permutexvar_ps(fft2594, fft2669);
__m512i fft2596 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2597 = _mm512_permutexvar_ps(fft2596, fft2586);
__m512 fft2678 = _mm512_permutexvar_ps(fft2596, fft2669);
__m512 fft2598 = _mm512_permutexvar_ps(fft2594, fft2587);
__m512 fft2679 = _mm512_permutexvar_ps(fft2594, fft2670);
__m512 fft2599 = _mm512_permutexvar_ps(fft2596, fft2587);
__m512 fft2680 = _mm512_permutexvar_ps(fft2596, fft2670);
__m512 fft2600 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2601 = _mm512_fmadd_ps(fft2595, fft2600, fft2597);
__m512 fft2681 = _mm512_fmadd_ps(fft2677, fft2600, fft2678);
__m512 fft2602 = _mm512_fnmadd_ps(fft2599, fft2600, fft2598);
__m512 fft2682 = _mm512_fnmadd_ps(fft2680, fft2600, fft2679);
__m512 fft2603 = _mm512_mask_mov_ps(fft2599, 21845, fft2601);
__m512 fft2683 = _mm512_mask_mov_ps(fft2680, 21845, fft2681);
__m512 fft2604 = _mm512_mask_mov_ps(fft2595, 43176, fft2601);
__m512 fft2684 = _mm512_mask_mov_ps(fft2677, 43176, fft2681);
__m512 fft2605 = _mm512_mask_mov_ps(fft2603, 43176, fft2602);
__m512 fft2685 = _mm512_mask_mov_ps(fft2683, 43176, fft2682);
__m512 fft2606 = _mm512_mask_mov_ps(fft2604, 22102, fft2602);
__m512 fft2686 = _mm512_mask_mov_ps(fft2684, 22102, fft2682);
__m512 fft2607 = _mm512_mask_mul_ps(fft2605, 64764, fft2605, _mm512_set1_ps(5e-01f));
__m512 fft2687 = _mm512_mask_mul_ps(fft2685, 64764, fft2685, _mm512_set1_ps(5e-01f));
__m512 fft2608 = _mm512_mask_mul_ps(fft2606, 64764, fft2606, _mm512_set1_ps(5e-01f));
__m512 fft2688 = _mm512_mask_mul_ps(fft2686, 64764, fft2686, _mm512_set1_ps(5e-01f));
// Write the transformed registers of the tail tile to dfPtr1.
// df209..df216 come from the first register group and df217..df224 from the
// second; df209/df210 and df217/df218 are the specially recombined first pairs.
// The tile position enters through 128*m16 + 32*f17 (m16 = 1, f17 = 0 here).
__m512 df209 = fft2607;
__m512 df217 = fft2687;
__m512 df210 = fft2608;
__m512 df218 = fft2688;
__m512 df211 = fft2588;
__m512 df219 = fft2671;
__m512 df212 = fft2589;
__m512 df220 = fft2672;
__m512 df213 = fft2590;
__m512 df221 = fft2673;
__m512 df214 = fft2591;
__m512 df222 = fft2674;
__m512 df215 = fft2592;
__m512 df223 = fft2675;
__m512 df216 = fft2593;
__m512 df224 = fft2676;
// eo16 de-interleaves a register: even-indexed lanes move to lanes 0..7 and
// odd-indexed lanes to lanes 8..15. Each permuted register is then written
// with two masked stores: mask 255 (lanes 0..7) and mask 65280 (lanes 8..15).
// The constant offsets are multiples of 101760 (plus 64 for the second
// register of a pair); the lanes-8..15 stores use an offset reduced by 32 and
// advanced by 407040 relative to the lanes-0..7 store.
// NOTE(review): if dfPtr1 is a byte pointer (declaration not visible), the -32
// makes lane 8 land exactly on a multiple of 101760 -- confirm.
__m512i eo16 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df211 = _mm512_permutexvar_ps(eo16, df211);
df212 = _mm512_permutexvar_ps(eo16, df212);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df211);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df212);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df211);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df212);
df219 = _mm512_permutexvar_ps(eo16, df219);
df220 = _mm512_permutexvar_ps(eo16, df220);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df219);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df220);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df219);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df220);
df213 = _mm512_permutexvar_ps(eo16, df213);
df214 = _mm512_permutexvar_ps(eo16, df214);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df213);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df214);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df213);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df214);
df221 = _mm512_permutexvar_ps(eo16, df221);
df222 = _mm512_permutexvar_ps(eo16, df222);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df221);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df222);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df221);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df222);
df215 = _mm512_permutexvar_ps(eo16, df215);
df216 = _mm512_permutexvar_ps(eo16, df216);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df215);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df216);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df215);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df216);
df223 = _mm512_permutexvar_ps(eo16, df223);
df224 = _mm512_permutexvar_ps(eo16, df224);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df223);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df224);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df223);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df224);
// First pair of the first group: stored directly, without the eo16 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df209);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df210);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df209);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df210);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df217);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m16+32*f17, 255, df218);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df217);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m16+32*f17, 65280, df218);
// Unrolled step for index b17 = 3. m17 = b17/2 and f18 = b17%2 are the quotient and
// remainder that the dfPtr1 store offsets of this step use (128*m17 + 32*f18).
ptrdiff_t b17 = 3;
ptrdiff_t m17 = (size_t)b17/2;
ptrdiff_t f18 = (size_t)b17%2;
// Load 16 vectors from datPtr1; successive constant offsets are 896 apart
// (8160, 9056, ..., 21600). Mask 65528 (0xFFF8) loads lanes 3..15 and zeroes lanes 0..2.
// The b17 term is multiplied by 0, so these addresses do not depend on b17.
// Each load is followed by a masked fused multiply-add, dat = dat*bnMul7 + bnAdd7, in the
// same lanes; the masked-off lanes keep the zero produced by the load.
// NOTE(review): bnMul7/bnAdd7 presumably hold a folded batch-norm scale/shift, and 896 is
// presumably one 224-float row expressed in bytes (i.e. datPtr1 is a byte pointer) --
// confirm against the declarations outside this excerpt.
__m512 dat210 = _mm512_maskz_loadu_ps(65528, datPtr1+8160+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat210 = _mm512_mask_fmadd_ps(dat210, 65528, bnMul7, bnAdd7);
__m512 dat211 = _mm512_maskz_loadu_ps(65528, datPtr1+9056+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat211 = _mm512_mask_fmadd_ps(dat211, 65528, bnMul7, bnAdd7);
__m512 dat212 = _mm512_maskz_loadu_ps(65528, datPtr1+9952+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat212 = _mm512_mask_fmadd_ps(dat212, 65528, bnMul7, bnAdd7);
__m512 dat213 = _mm512_maskz_loadu_ps(65528, datPtr1+10848+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat213 = _mm512_mask_fmadd_ps(dat213, 65528, bnMul7, bnAdd7);
__m512 dat214 = _mm512_maskz_loadu_ps(65528, datPtr1+11744+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat214 = _mm512_mask_fmadd_ps(dat214, 65528, bnMul7, bnAdd7);
__m512 dat215 = _mm512_maskz_loadu_ps(65528, datPtr1+12640+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat215 = _mm512_mask_fmadd_ps(dat215, 65528, bnMul7, bnAdd7);
__m512 dat216 = _mm512_maskz_loadu_ps(65528, datPtr1+13536+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat216 = _mm512_mask_fmadd_ps(dat216, 65528, bnMul7, bnAdd7);
__m512 dat217 = _mm512_maskz_loadu_ps(65528, datPtr1+14432+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat217 = _mm512_mask_fmadd_ps(dat217, 65528, bnMul7, bnAdd7);
__m512 dat218 = _mm512_maskz_loadu_ps(65528, datPtr1+15328+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat218 = _mm512_mask_fmadd_ps(dat218, 65528, bnMul7, bnAdd7);
__m512 dat219 = _mm512_maskz_loadu_ps(65528, datPtr1+16224+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat219 = _mm512_mask_fmadd_ps(dat219, 65528, bnMul7, bnAdd7);
__m512 dat220 = _mm512_maskz_loadu_ps(65528, datPtr1+17120+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat220 = _mm512_mask_fmadd_ps(dat220, 65528, bnMul7, bnAdd7);
__m512 dat221 = _mm512_maskz_loadu_ps(65528, datPtr1+18016+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat221 = _mm512_mask_fmadd_ps(dat221, 65528, bnMul7, bnAdd7);
__m512 dat222 = _mm512_maskz_loadu_ps(65528, datPtr1+18912+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat222 = _mm512_mask_fmadd_ps(dat222, 65528, bnMul7, bnAdd7);
__m512 dat223 = _mm512_maskz_loadu_ps(65528, datPtr1+19808+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat223 = _mm512_mask_fmadd_ps(dat223, 65528, bnMul7, bnAdd7);
__m512 dat224 = _mm512_maskz_loadu_ps(65528, datPtr1+20704+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat224 = _mm512_mask_fmadd_ps(dat224, 65528, bnMul7, bnAdd7);
__m512 dat225 = _mm512_maskz_loadu_ps(65528, datPtr1+21600+602112*i6+200704*k8+896*h7+4*w7+0*b17);
dat225 = _mm512_mask_fmadd_ps(dat225, 65528, bnMul7, bnAdd7);
// Butterfly stages performed element-wise on whole vectors. Two independent data sets
// are interleaved line by line: the even-numbered loads (dat210, dat212, ..., dat224 ->
// fft2689..fft2708) and the odd-numbered loads (dat211, dat213, ..., dat225 ->
// fft2777..fft2796). Every statement for the first set is followed by its twin for the
// second set.
// NOTE(review): the add/sub pattern and the 7.0710677e-01f (~sqrt(1/2)) constant look
// like an 8-point real-input FFT taken across the 8 vectors of each set -- confirm
// against the generator.
//
// Stage 1: sum and difference of loads that are 8 apart (e.g. dat210 with dat218).
__m512 fft2689 = _mm512_add_ps(dat210, dat218);
__m512 fft2777 = _mm512_add_ps(dat211, dat219);
__m512 fft2690 = _mm512_sub_ps(dat210, dat218);
__m512 fft2778 = _mm512_sub_ps(dat211, dat219);
__m512 fft2691 = _mm512_add_ps(dat212, dat220);
__m512 fft2779 = _mm512_add_ps(dat213, dat221);
__m512 fft2692 = _mm512_sub_ps(dat212, dat220);
__m512 fft2780 = _mm512_sub_ps(dat213, dat221);
__m512 fft2693 = _mm512_add_ps(dat214, dat222);
__m512 fft2781 = _mm512_add_ps(dat215, dat223);
__m512 fft2694 = _mm512_sub_ps(dat214, dat222);
__m512 fft2782 = _mm512_sub_ps(dat215, dat223);
__m512 fft2695 = _mm512_add_ps(dat216, dat224);
__m512 fft2783 = _mm512_add_ps(dat217, dat225);
__m512 fft2696 = _mm512_sub_ps(dat216, dat224);
__m512 fft2784 = _mm512_sub_ps(dat217, dat225);
// Stage 2: sums and differences of the stage-1 results. Note that fft2700/fft2788
// subtract in the opposite operand order from their neighbours (later minus earlier).
__m512 fft2697 = _mm512_add_ps(fft2689, fft2693);
__m512 fft2785 = _mm512_add_ps(fft2777, fft2781);
__m512 fft2698 = _mm512_sub_ps(fft2689, fft2693);
__m512 fft2786 = _mm512_sub_ps(fft2777, fft2781);
__m512 fft2699 = _mm512_add_ps(fft2691, fft2695);
__m512 fft2787 = _mm512_add_ps(fft2779, fft2783);
__m512 fft2700 = _mm512_sub_ps(fft2695, fft2691);
__m512 fft2788 = _mm512_sub_ps(fft2783, fft2779);
__m512 fft2701 = _mm512_sub_ps(fft2692, fft2696);
__m512 fft2789 = _mm512_sub_ps(fft2780, fft2784);
__m512 fft2702 = _mm512_add_ps(fft2692, fft2696);
__m512 fft2790 = _mm512_add_ps(fft2780, fft2784);
// Stage 3: final sums/differences. With c = 7.0710677e-01f the fused ops compute:
//   fmadd(a, c, b)  =  a*c + b
//   fnmsub(a, c, b) = -(a*c) - b
//   fnmadd(a, c, b) = -(a*c) + b
__m512 fft2703 = _mm512_add_ps(fft2697, fft2699);
__m512 fft2791 = _mm512_add_ps(fft2785, fft2787);
__m512 fft2704 = _mm512_sub_ps(fft2697, fft2699);
__m512 fft2792 = _mm512_sub_ps(fft2785, fft2787);
__m512 fft2705 = _mm512_fmadd_ps(fft2701, _mm512_set1_ps(7.0710677e-01f), fft2690);
__m512 fft2793 = _mm512_fmadd_ps(fft2789, _mm512_set1_ps(7.0710677e-01f), fft2778);
__m512 fft2706 = _mm512_fnmsub_ps(fft2702, _mm512_set1_ps(7.0710677e-01f), fft2694);
__m512 fft2794 = _mm512_fnmsub_ps(fft2790, _mm512_set1_ps(7.0710677e-01f), fft2782);
__m512 fft2707 = _mm512_fnmadd_ps(fft2701, _mm512_set1_ps(7.0710677e-01f), fft2690);
__m512 fft2795 = _mm512_fnmadd_ps(fft2789, _mm512_set1_ps(7.0710677e-01f), fft2778);
__m512 fft2708 = _mm512_fnmadd_ps(fft2702, _mm512_set1_ps(7.0710677e-01f), fft2694);
__m512 fft2796 = _mm512_fnmadd_ps(fft2790, _mm512_set1_ps(7.0710677e-01f), fft2782);
// In-register butterfly across the two 256-bit halves of every vector.
// _mm512_set_ps lists lanes 15 down to 0, so fft2709 is -1 in lanes 8..15 and +1 in
// lanes 0..7. _mm512_shuffle_f32x4 with immediate 78 swaps the two 256-bit halves, so
// each fmadd yields (low half + high half) in lanes 0..7 and (low half - high half)
// in lanes 8..15.
__m512 fft2709 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2710 = _mm512_fmadd_ps(fft2703, fft2709, _mm512_shuffle_f32x4(fft2703, fft2703, 78));
__m512 fft2797 = _mm512_fmadd_ps(fft2791, fft2709, _mm512_shuffle_f32x4(fft2791, fft2791, 78));
__m512 fft2711 = _mm512_fmadd_ps(fft2704, fft2709, _mm512_shuffle_f32x4(fft2704, fft2704, 78));
__m512 fft2798 = _mm512_fmadd_ps(fft2792, fft2709, _mm512_shuffle_f32x4(fft2792, fft2792, 78));
__m512 fft2712 = _mm512_fmadd_ps(fft2705, fft2709, _mm512_shuffle_f32x4(fft2705, fft2705, 78));
__m512 fft2799 = _mm512_fmadd_ps(fft2793, fft2709, _mm512_shuffle_f32x4(fft2793, fft2793, 78));
__m512 fft2713 = _mm512_fmadd_ps(fft2706, fft2709, _mm512_shuffle_f32x4(fft2706, fft2706, 78));
__m512 fft2800 = _mm512_fmadd_ps(fft2794, fft2709, _mm512_shuffle_f32x4(fft2794, fft2794, 78));
__m512 fft2714 = _mm512_fmadd_ps(fft2698, fft2709, _mm512_shuffle_f32x4(fft2698, fft2698, 78));
__m512 fft2801 = _mm512_fmadd_ps(fft2786, fft2709, _mm512_shuffle_f32x4(fft2786, fft2786, 78));
__m512 fft2715 = _mm512_fmadd_ps(fft2700, fft2709, _mm512_shuffle_f32x4(fft2700, fft2700, 78));
__m512 fft2802 = _mm512_fmadd_ps(fft2788, fft2709, _mm512_shuffle_f32x4(fft2788, fft2788, 78));
__m512 fft2716 = _mm512_fmadd_ps(fft2707, fft2709, _mm512_shuffle_f32x4(fft2707, fft2707, 78));
__m512 fft2803 = _mm512_fmadd_ps(fft2795, fft2709, _mm512_shuffle_f32x4(fft2795, fft2795, 78));
__m512 fft2717 = _mm512_fmadd_ps(fft2708, fft2709, _mm512_shuffle_f32x4(fft2708, fft2708, 78));
__m512 fft2804 = _mm512_fmadd_ps(fft2796, fft2709, _mm512_shuffle_f32x4(fft2796, fft2796, 78));
// Per-lane factor fft2718: 1 in lanes 0..9, 7.0710677e-01f in lanes 10..11, 0 in lanes
// 12..13, -7.0710677e-01f in lanes 14..15. Every vector is first scaled by it.
__m512 fft2718 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2719 = _mm512_mul_ps(fft2710, fft2718);
__m512 fft2805 = _mm512_mul_ps(fft2797, fft2718);
__m512 fft2720 = _mm512_mul_ps(fft2711, fft2718);
__m512 fft2806 = _mm512_mul_ps(fft2798, fft2718);
__m512 fft2721 = _mm512_mul_ps(fft2712, fft2718);
__m512 fft2807 = _mm512_mul_ps(fft2799, fft2718);
__m512 fft2722 = _mm512_mul_ps(fft2713, fft2718);
__m512 fft2808 = _mm512_mul_ps(fft2800, fft2718);
__m512 fft2723 = _mm512_mul_ps(fft2714, fft2718);
__m512 fft2809 = _mm512_mul_ps(fft2801, fft2718);
__m512 fft2724 = _mm512_mul_ps(fft2715, fft2718);
__m512 fft2810 = _mm512_mul_ps(fft2802, fft2718);
__m512 fft2725 = _mm512_mul_ps(fft2716, fft2718);
__m512 fft2811 = _mm512_mul_ps(fft2803, fft2718);
__m512 fft2726 = _mm512_mul_ps(fft2717, fft2718);
__m512 fft2812 = _mm512_mul_ps(fft2804, fft2718);
// Per-lane factor fft2727: 0 in lanes 0..9, 7.0710677e-01f in lanes 10..11, 1 in lanes
// 12..13, 7.0710677e-01f in lanes 14..15. Vectors are combined in consecutive pairs
// (a, b), e.g. (fft2710, fft2711):
//   new_a = a*fft2718 + b*fft2727      (fmadd)
//   new_b = b*fft2718 - a*fft2727      (fnmadd)
// Lanes 0..9 pass through unchanged because fft2718 is 1 and fft2727 is 0 there.
// NOTE(review): this has the shape of a complex multiply with a/b as the real/imaginary
// parts -- presumably the twiddle-factor step of the transform; confirm.
__m512 fft2727 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2728 = _mm512_fmadd_ps(fft2711, fft2727, fft2719);
__m512 fft2813 = _mm512_fmadd_ps(fft2798, fft2727, fft2805);
__m512 fft2729 = _mm512_fnmadd_ps(fft2710, fft2727, fft2720);
__m512 fft2814 = _mm512_fnmadd_ps(fft2797, fft2727, fft2806);
__m512 fft2730 = _mm512_fmadd_ps(fft2713, fft2727, fft2721);
__m512 fft2815 = _mm512_fmadd_ps(fft2800, fft2727, fft2807);
__m512 fft2731 = _mm512_fnmadd_ps(fft2712, fft2727, fft2722);
__m512 fft2816 = _mm512_fnmadd_ps(fft2799, fft2727, fft2808);
__m512 fft2732 = _mm512_fmadd_ps(fft2715, fft2727, fft2723);
__m512 fft2817 = _mm512_fmadd_ps(fft2802, fft2727, fft2809);
__m512 fft2733 = _mm512_fnmadd_ps(fft2714, fft2727, fft2724);
__m512 fft2818 = _mm512_fnmadd_ps(fft2801, fft2727, fft2810);
__m512 fft2734 = _mm512_fmadd_ps(fft2717, fft2727, fft2725);
__m512 fft2819 = _mm512_fmadd_ps(fft2804, fft2727, fft2811);
__m512 fft2735 = _mm512_fnmadd_ps(fft2716, fft2727, fft2726);
__m512 fft2820 = _mm512_fnmadd_ps(fft2803, fft2727, fft2812);
// In-register butterfly across adjacent 128-bit blocks.
// fft2736 is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15.
// _mm512_shuffle_f32x4 with immediate 177 swaps the two 128-bit blocks inside each
// 256-bit half, so each fmadd yields (block0 + block1) in the even block and
// (block0 - block1) in the odd block of every half.
__m512 fft2736 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2737 = _mm512_fmadd_ps(fft2728, fft2736, _mm512_shuffle_f32x4(fft2728, fft2728, 177));
__m512 fft2821 = _mm512_fmadd_ps(fft2813, fft2736, _mm512_shuffle_f32x4(fft2813, fft2813, 177));
__m512 fft2738 = _mm512_fmadd_ps(fft2729, fft2736, _mm512_shuffle_f32x4(fft2729, fft2729, 177));
__m512 fft2822 = _mm512_fmadd_ps(fft2814, fft2736, _mm512_shuffle_f32x4(fft2814, fft2814, 177));
__m512 fft2739 = _mm512_fmadd_ps(fft2730, fft2736, _mm512_shuffle_f32x4(fft2730, fft2730, 177));
__m512 fft2823 = _mm512_fmadd_ps(fft2815, fft2736, _mm512_shuffle_f32x4(fft2815, fft2815, 177));
__m512 fft2740 = _mm512_fmadd_ps(fft2731, fft2736, _mm512_shuffle_f32x4(fft2731, fft2731, 177));
__m512 fft2824 = _mm512_fmadd_ps(fft2816, fft2736, _mm512_shuffle_f32x4(fft2816, fft2816, 177));
__m512 fft2741 = _mm512_fmadd_ps(fft2732, fft2736, _mm512_shuffle_f32x4(fft2732, fft2732, 177));
__m512 fft2825 = _mm512_fmadd_ps(fft2817, fft2736, _mm512_shuffle_f32x4(fft2817, fft2817, 177));
__m512 fft2742 = _mm512_fmadd_ps(fft2733, fft2736, _mm512_shuffle_f32x4(fft2733, fft2733, 177));
__m512 fft2826 = _mm512_fmadd_ps(fft2818, fft2736, _mm512_shuffle_f32x4(fft2818, fft2818, 177));
__m512 fft2743 = _mm512_fmadd_ps(fft2734, fft2736, _mm512_shuffle_f32x4(fft2734, fft2734, 177));
__m512 fft2827 = _mm512_fmadd_ps(fft2819, fft2736, _mm512_shuffle_f32x4(fft2819, fft2819, 177));
__m512 fft2744 = _mm512_fmadd_ps(fft2735, fft2736, _mm512_shuffle_f32x4(fft2735, fft2735, 177));
__m512 fft2828 = _mm512_fmadd_ps(fft2820, fft2736, _mm512_shuffle_f32x4(fft2820, fft2820, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. For each consecutive pair (a, b),
// e.g. (fft2737, fft2738):
//   first  = a, except the selected lanes are taken from b          (mask_mov)
//   second = b, except the selected lanes become 0 - a, i.e. -a     (mask_sub)
// All other lanes pass through unchanged.
// NOTE(review): presumably a quarter-turn rotation of the (a, b) pair viewed as
// real/imaginary parts in those lanes -- confirm.
__m512 fft2745 = _mm512_mask_mov_ps(fft2737, 49344, fft2738);
__m512 fft2829 = _mm512_mask_mov_ps(fft2821, 49344, fft2822);
__m512 fft2746 = _mm512_mask_sub_ps(fft2738, 49344, _mm512_setzero_ps(), fft2737);
__m512 fft2830 = _mm512_mask_sub_ps(fft2822, 49344, _mm512_setzero_ps(), fft2821);
__m512 fft2747 = _mm512_mask_mov_ps(fft2739, 49344, fft2740);
__m512 fft2831 = _mm512_mask_mov_ps(fft2823, 49344, fft2824);
__m512 fft2748 = _mm512_mask_sub_ps(fft2740, 49344, _mm512_setzero_ps(), fft2739);
__m512 fft2832 = _mm512_mask_sub_ps(fft2824, 49344, _mm512_setzero_ps(), fft2823);
__m512 fft2749 = _mm512_mask_mov_ps(fft2741, 49344, fft2742);
__m512 fft2833 = _mm512_mask_mov_ps(fft2825, 49344, fft2826);
__m512 fft2750 = _mm512_mask_sub_ps(fft2742, 49344, _mm512_setzero_ps(), fft2741);
__m512 fft2834 = _mm512_mask_sub_ps(fft2826, 49344, _mm512_setzero_ps(), fft2825);
__m512 fft2751 = _mm512_mask_mov_ps(fft2743, 49344, fft2744);
__m512 fft2835 = _mm512_mask_mov_ps(fft2827, 49344, fft2828);
__m512 fft2752 = _mm512_mask_sub_ps(fft2744, 49344, _mm512_setzero_ps(), fft2743);
__m512 fft2836 = _mm512_mask_sub_ps(fft2828, 49344, _mm512_setzero_ps(), fft2827);
// In-register butterfly across the two 64-bit halves of every 128-bit block.
// fft2753 is +1 in lanes 0,1 and -1 in lanes 2,3 of each block. _mm512_shuffle_ps with
// immediate 78 swaps the two float pairs inside each 128-bit block, so each fmadd
// yields (pair0 + pair1) in lanes 0,1 and (pair0 - pair1) in lanes 2,3 of the block.
__m512 fft2753 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2754 = _mm512_fmadd_ps(fft2745, fft2753, _mm512_shuffle_ps(fft2745, fft2745, 78));
__m512 fft2837 = _mm512_fmadd_ps(fft2829, fft2753, _mm512_shuffle_ps(fft2829, fft2829, 78));
__m512 fft2755 = _mm512_fmadd_ps(fft2746, fft2753, _mm512_shuffle_ps(fft2746, fft2746, 78));
__m512 fft2838 = _mm512_fmadd_ps(fft2830, fft2753, _mm512_shuffle_ps(fft2830, fft2830, 78));
__m512 fft2756 = _mm512_fmadd_ps(fft2747, fft2753, _mm512_shuffle_ps(fft2747, fft2747, 78));
__m512 fft2839 = _mm512_fmadd_ps(fft2831, fft2753, _mm512_shuffle_ps(fft2831, fft2831, 78));
__m512 fft2757 = _mm512_fmadd_ps(fft2748, fft2753, _mm512_shuffle_ps(fft2748, fft2748, 78));
__m512 fft2840 = _mm512_fmadd_ps(fft2832, fft2753, _mm512_shuffle_ps(fft2832, fft2832, 78));
__m512 fft2758 = _mm512_fmadd_ps(fft2749, fft2753, _mm512_shuffle_ps(fft2749, fft2749, 78));
__m512 fft2841 = _mm512_fmadd_ps(fft2833, fft2753, _mm512_shuffle_ps(fft2833, fft2833, 78));
__m512 fft2759 = _mm512_fmadd_ps(fft2750, fft2753, _mm512_shuffle_ps(fft2750, fft2750, 78));
__m512 fft2842 = _mm512_fmadd_ps(fft2834, fft2753, _mm512_shuffle_ps(fft2834, fft2834, 78));
__m512 fft2760 = _mm512_fmadd_ps(fft2751, fft2753, _mm512_shuffle_ps(fft2751, fft2751, 78));
__m512 fft2843 = _mm512_fmadd_ps(fft2835, fft2753, _mm512_shuffle_ps(fft2835, fft2835, 78));
__m512 fft2761 = _mm512_fmadd_ps(fft2752, fft2753, _mm512_shuffle_ps(fft2752, fft2752, 78));
__m512 fft2844 = _mm512_fmadd_ps(fft2836, fft2753, _mm512_shuffle_ps(fft2836, fft2836, 78));
// Extra post-processing applied only to the first pair of each data set
// (fft2754/fft2755 and fft2837/fft2838); the remaining pairs are used as they are.
// fft2762 and fft2764 are gather index vectors (_mm512_set_epi32 lists lanes 15..0);
// _mm512_permutexvar_ps(idx, v) places v[idx[i]] in lane i.
// NOTE(review): presumably this separates the spectra of two real sequences that were
// packed into one complex transform -- confirm against the generator.
__m512i fft2762 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2763 = _mm512_permutexvar_ps(fft2762, fft2754);
__m512 fft2845 = _mm512_permutexvar_ps(fft2762, fft2837);
__m512i fft2764 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2765 = _mm512_permutexvar_ps(fft2764, fft2754);
__m512 fft2846 = _mm512_permutexvar_ps(fft2764, fft2837);
__m512 fft2766 = _mm512_permutexvar_ps(fft2762, fft2755);
__m512 fft2847 = _mm512_permutexvar_ps(fft2762, fft2838);
__m512 fft2767 = _mm512_permutexvar_ps(fft2764, fft2755);
__m512 fft2848 = _mm512_permutexvar_ps(fft2764, fft2838);
// fft2768 is 0 in lanes 0,1,8,9; elsewhere +1 in even lanes and -1 in odd lanes.
//   fft2769 = fft2763*fft2768 + fft2765      (fmadd)
//   fft2770 = fft2766 - fft2767*fft2768      (fnmadd)
__m512 fft2768 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2769 = _mm512_fmadd_ps(fft2763, fft2768, fft2765);
__m512 fft2849 = _mm512_fmadd_ps(fft2845, fft2768, fft2846);
__m512 fft2770 = _mm512_fnmadd_ps(fft2767, fft2768, fft2766);
__m512 fft2850 = _mm512_fnmadd_ps(fft2848, fft2768, fft2847);
// Lane-wise merges of the intermediate results:
//   21845 (0x5555) = even lanes;  43176 (0xA8A8) = lanes 3,5,7,11,13,15;
//   22102 (0x5656) = lanes 1,2,4,6,9,10,12,14.
// mask_mov(a, k, b) takes b in the lanes selected by k and a elsewhere.
__m512 fft2771 = _mm512_mask_mov_ps(fft2767, 21845, fft2769);
__m512 fft2851 = _mm512_mask_mov_ps(fft2848, 21845, fft2849);
__m512 fft2772 = _mm512_mask_mov_ps(fft2763, 43176, fft2769);
__m512 fft2852 = _mm512_mask_mov_ps(fft2845, 43176, fft2849);
__m512 fft2773 = _mm512_mask_mov_ps(fft2771, 43176, fft2770);
__m512 fft2853 = _mm512_mask_mov_ps(fft2851, 43176, fft2850);
__m512 fft2774 = _mm512_mask_mov_ps(fft2772, 22102, fft2770);
__m512 fft2854 = _mm512_mask_mov_ps(fft2852, 22102, fft2850);
// Mask 64764 (0xFCFC): multiply lanes 2..7 and 10..15 by 0.5; lanes 0,1,8,9 are kept.
__m512 fft2775 = _mm512_mask_mul_ps(fft2773, 64764, fft2773, _mm512_set1_ps(5e-01f));
__m512 fft2855 = _mm512_mask_mul_ps(fft2853, 64764, fft2853, _mm512_set1_ps(5e-01f));
__m512 fft2776 = _mm512_mask_mul_ps(fft2774, 64764, fft2774, _mm512_set1_ps(5e-01f));
__m512 fft2856 = _mm512_mask_mul_ps(fft2854, 64764, fft2854, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors to df225..df240 for storing. df225..df232 come from the
// first data set (fft2775, fft2776, fft2756..fft2761) and df233..df240 from the second
// (fft2855, fft2856, fft2839..fft2844).
__m512 df225 = fft2775;
__m512 df233 = fft2855;
__m512 df226 = fft2776;
__m512 df234 = fft2856;
__m512 df227 = fft2756;
__m512 df235 = fft2839;
__m512 df228 = fft2757;
__m512 df236 = fft2840;
__m512 df229 = fft2758;
__m512 df237 = fft2841;
__m512 df230 = fft2759;
__m512 df238 = fft2842;
__m512 df231 = fft2760;
__m512 df239 = fft2843;
__m512 df232 = fft2761;
__m512 df240 = fft2844;
// eo17 de-interleaves a vector: lanes 0..7 receive source elements 0,2,...,14 and lanes
// 8..15 receive source elements 1,3,...,15. It is applied to df227..df232 and
// df235..df240 only; df225, df226, df233 and df234 are stored without this permute.
//
// Every vector is written with two masked stores: mask 255 writes lanes 0..7 and mask
// 65280 writes lanes 8..15. The constant offsets of the two stores always differ by
// 407008 (e.g. 101760 vs 508768).
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8..15 begin 32 bytes past the store
// address, so the upper half lands exactly 407040 bytes after the lower half (the same
// value as the i6 stride) -- confirm dfPtr1's declared type outside this excerpt.
__m512i eo17 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df227 = _mm512_permutexvar_ps(eo17, df227);
df228 = _mm512_permutexvar_ps(eo17, df228);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df227);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df228);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df227);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df228);
df235 = _mm512_permutexvar_ps(eo17, df235);
df236 = _mm512_permutexvar_ps(eo17, df236);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df235);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df236);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df235);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df236);
df229 = _mm512_permutexvar_ps(eo17, df229);
df230 = _mm512_permutexvar_ps(eo17, df230);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df229);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df230);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df229);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df230);
df237 = _mm512_permutexvar_ps(eo17, df237);
df238 = _mm512_permutexvar_ps(eo17, df238);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df237);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df238);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df237);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df238);
df231 = _mm512_permutexvar_ps(eo17, df231);
df232 = _mm512_permutexvar_ps(eo17, df232);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df231);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df232);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df231);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df232);
df239 = _mm512_permutexvar_ps(eo17, df239);
df240 = _mm512_permutexvar_ps(eo17, df240);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df239);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df240);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df239);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df240);
// The first pair of each data set is stored directly, without the eo17 permute.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df225);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df226);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df225);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df226);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df233);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m17+32*f18, 255, df234);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df233);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m17+32*f18, 65280, df234);
for (ptrdiff_t b18 = 4; b18 < 6; ++b18) {
ptrdiff_t m18 = (size_t)b18/2;
ptrdiff_t f19 = (size_t)b18%2;
__m512 dat226 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat226 = _mm512_mask_fmadd_ps(dat226, 65535, bnMul7, bnAdd7);
__m512 dat227 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat227 = _mm512_mask_fmadd_ps(dat227, 65535, bnMul7, bnAdd7);
__m512 dat228 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat228 = _mm512_mask_fmadd_ps(dat228, 65535, bnMul7, bnAdd7);
__m512 dat229 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat229 = _mm512_mask_fmadd_ps(dat229, 65535, bnMul7, bnAdd7);
__m512 dat230 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat230 = _mm512_mask_fmadd_ps(dat230, 65535, bnMul7, bnAdd7);
__m512 dat231 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat231 = _mm512_mask_fmadd_ps(dat231, 65535, bnMul7, bnAdd7);
__m512 dat232 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat232 = _mm512_mask_fmadd_ps(dat232, 65535, bnMul7, bnAdd7);
__m512 dat233 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat233 = _mm512_mask_fmadd_ps(dat233, 65535, bnMul7, bnAdd7);
__m512 dat234 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat234 = _mm512_mask_fmadd_ps(dat234, 65535, bnMul7, bnAdd7);
__m512 dat235 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat235 = _mm512_mask_fmadd_ps(dat235, 65535, bnMul7, bnAdd7);
__m512 dat236 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat236 = _mm512_mask_fmadd_ps(dat236, 65535, bnMul7, bnAdd7);
__m512 dat237 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat237 = _mm512_mask_fmadd_ps(dat237, 65535, bnMul7, bnAdd7);
__m512 dat238 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat238 = _mm512_mask_fmadd_ps(dat238, 65535, bnMul7, bnAdd7);
__m512 dat239 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat239 = _mm512_mask_fmadd_ps(dat239, 65535, bnMul7, bnAdd7);
__m512 dat240 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat240 = _mm512_mask_fmadd_ps(dat240, 65535, bnMul7, bnAdd7);
__m512 dat241 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k8+896*h7+4*w7+40*b18);
dat241 = _mm512_mask_fmadd_ps(dat241, 65535, bnMul7, bnAdd7);
__m512 fft2857 = _mm512_add_ps(dat226, dat234);
__m512 fft2945 = _mm512_add_ps(dat227, dat235);
__m512 fft2858 = _mm512_sub_ps(dat226, dat234);
__m512 fft2946 = _mm512_sub_ps(dat227, dat235);
__m512 fft2859 = _mm512_add_ps(dat228, dat236);
__m512 fft2947 = _mm512_add_ps(dat229, dat237);
__m512 fft2860 = _mm512_sub_ps(dat228, dat236);
__m512 fft2948 = _mm512_sub_ps(dat229, dat237);
__m512 fft2861 = _mm512_add_ps(dat230, dat238);
__m512 fft2949 = _mm512_add_ps(dat231, dat239);
__m512 fft2862 = _mm512_sub_ps(dat230, dat238);
__m512 fft2950 = _mm512_sub_ps(dat231, dat239);
__m512 fft2863 = _mm512_add_ps(dat232, dat240);
__m512 fft2951 = _mm512_add_ps(dat233, dat241);
__m512 fft2864 = _mm512_sub_ps(dat232, dat240);
__m512 fft2952 = _mm512_sub_ps(dat233, dat241);
__m512 fft2865 = _mm512_add_ps(fft2857, fft2861);
__m512 fft2953 = _mm512_add_ps(fft2945, fft2949);
__m512 fft2866 = _mm512_sub_ps(fft2857, fft2861);
__m512 fft2954 = _mm512_sub_ps(fft2945, fft2949);
__m512 fft2867 = _mm512_add_ps(fft2859, fft2863);
__m512 fft2955 = _mm512_add_ps(fft2947, fft2951);
__m512 fft2868 = _mm512_sub_ps(fft2863, fft2859);
__m512 fft2956 = _mm512_sub_ps(fft2951, fft2947);
__m512 fft2869 = _mm512_sub_ps(fft2860, fft2864);
__m512 fft2957 = _mm512_sub_ps(fft2948, fft2952);
__m512 fft2870 = _mm512_add_ps(fft2860, fft2864);
__m512 fft2958 = _mm512_add_ps(fft2948, fft2952);
__m512 fft2871 = _mm512_add_ps(fft2865, fft2867);
__m512 fft2959 = _mm512_add_ps(fft2953, fft2955);
__m512 fft2872 = _mm512_sub_ps(fft2865, fft2867);
__m512 fft2960 = _mm512_sub_ps(fft2953, fft2955);
__m512 fft2873 = _mm512_fmadd_ps(fft2869, _mm512_set1_ps(7.0710677e-01f), fft2858);
__m512 fft2961 = _mm512_fmadd_ps(fft2957, _mm512_set1_ps(7.0710677e-01f), fft2946);
__m512 fft2874 = _mm512_fnmsub_ps(fft2870, _mm512_set1_ps(7.0710677e-01f), fft2862);
__m512 fft2962 = _mm512_fnmsub_ps(fft2958, _mm512_set1_ps(7.0710677e-01f), fft2950);
__m512 fft2875 = _mm512_fnmadd_ps(fft2869, _mm512_set1_ps(7.0710677e-01f), fft2858);
__m512 fft2963 = _mm512_fnmadd_ps(fft2957, _mm512_set1_ps(7.0710677e-01f), fft2946);
__m512 fft2876 = _mm512_fnmadd_ps(fft2870, _mm512_set1_ps(7.0710677e-01f), fft2862);
__m512 fft2964 = _mm512_fnmadd_ps(fft2958, _mm512_set1_ps(7.0710677e-01f), fft2950);
__m512 fft2877 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2878 = _mm512_fmadd_ps(fft2871, fft2877, _mm512_shuffle_f32x4(fft2871, fft2871, 78));
__m512 fft2965 = _mm512_fmadd_ps(fft2959, fft2877, _mm512_shuffle_f32x4(fft2959, fft2959, 78));
__m512 fft2879 = _mm512_fmadd_ps(fft2872, fft2877, _mm512_shuffle_f32x4(fft2872, fft2872, 78));
__m512 fft2966 = _mm512_fmadd_ps(fft2960, fft2877, _mm512_shuffle_f32x4(fft2960, fft2960, 78));
__m512 fft2880 = _mm512_fmadd_ps(fft2873, fft2877, _mm512_shuffle_f32x4(fft2873, fft2873, 78));
__m512 fft2967 = _mm512_fmadd_ps(fft2961, fft2877, _mm512_shuffle_f32x4(fft2961, fft2961, 78));
__m512 fft2881 = _mm512_fmadd_ps(fft2874, fft2877, _mm512_shuffle_f32x4(fft2874, fft2874, 78));
__m512 fft2968 = _mm512_fmadd_ps(fft2962, fft2877, _mm512_shuffle_f32x4(fft2962, fft2962, 78));
__m512 fft2882 = _mm512_fmadd_ps(fft2866, fft2877, _mm512_shuffle_f32x4(fft2866, fft2866, 78));
__m512 fft2969 = _mm512_fmadd_ps(fft2954, fft2877, _mm512_shuffle_f32x4(fft2954, fft2954, 78));
__m512 fft2883 = _mm512_fmadd_ps(fft2868, fft2877, _mm512_shuffle_f32x4(fft2868, fft2868, 78));
__m512 fft2970 = _mm512_fmadd_ps(fft2956, fft2877, _mm512_shuffle_f32x4(fft2956, fft2956, 78));
__m512 fft2884 = _mm512_fmadd_ps(fft2875, fft2877, _mm512_shuffle_f32x4(fft2875, fft2875, 78));
__m512 fft2971 = _mm512_fmadd_ps(fft2963, fft2877, _mm512_shuffle_f32x4(fft2963, fft2963, 78));
__m512 fft2885 = _mm512_fmadd_ps(fft2876, fft2877, _mm512_shuffle_f32x4(fft2876, fft2876, 78));
__m512 fft2972 = _mm512_fmadd_ps(fft2964, fft2877, _mm512_shuffle_f32x4(fft2964, fft2964, 78));
__m512 fft2886 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft2887 = _mm512_mul_ps(fft2878, fft2886);
__m512 fft2973 = _mm512_mul_ps(fft2965, fft2886);
__m512 fft2888 = _mm512_mul_ps(fft2879, fft2886);
__m512 fft2974 = _mm512_mul_ps(fft2966, fft2886);
__m512 fft2889 = _mm512_mul_ps(fft2880, fft2886);
__m512 fft2975 = _mm512_mul_ps(fft2967, fft2886);
__m512 fft2890 = _mm512_mul_ps(fft2881, fft2886);
__m512 fft2976 = _mm512_mul_ps(fft2968, fft2886);
__m512 fft2891 = _mm512_mul_ps(fft2882, fft2886);
__m512 fft2977 = _mm512_mul_ps(fft2969, fft2886);
__m512 fft2892 = _mm512_mul_ps(fft2883, fft2886);
__m512 fft2978 = _mm512_mul_ps(fft2970, fft2886);
__m512 fft2893 = _mm512_mul_ps(fft2884, fft2886);
__m512 fft2979 = _mm512_mul_ps(fft2971, fft2886);
__m512 fft2894 = _mm512_mul_ps(fft2885, fft2886);
__m512 fft2980 = _mm512_mul_ps(fft2972, fft2886);
__m512 fft2895 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft2896 = _mm512_fmadd_ps(fft2879, fft2895, fft2887);
__m512 fft2981 = _mm512_fmadd_ps(fft2966, fft2895, fft2973);
__m512 fft2897 = _mm512_fnmadd_ps(fft2878, fft2895, fft2888);
__m512 fft2982 = _mm512_fnmadd_ps(fft2965, fft2895, fft2974);
__m512 fft2898 = _mm512_fmadd_ps(fft2881, fft2895, fft2889);
__m512 fft2983 = _mm512_fmadd_ps(fft2968, fft2895, fft2975);
__m512 fft2899 = _mm512_fnmadd_ps(fft2880, fft2895, fft2890);
__m512 fft2984 = _mm512_fnmadd_ps(fft2967, fft2895, fft2976);
__m512 fft2900 = _mm512_fmadd_ps(fft2883, fft2895, fft2891);
__m512 fft2985 = _mm512_fmadd_ps(fft2970, fft2895, fft2977);
__m512 fft2901 = _mm512_fnmadd_ps(fft2882, fft2895, fft2892);
__m512 fft2986 = _mm512_fnmadd_ps(fft2969, fft2895, fft2978);
__m512 fft2902 = _mm512_fmadd_ps(fft2885, fft2895, fft2893);
__m512 fft2987 = _mm512_fmadd_ps(fft2972, fft2895, fft2979);
__m512 fft2903 = _mm512_fnmadd_ps(fft2884, fft2895, fft2894);
__m512 fft2988 = _mm512_fnmadd_ps(fft2971, fft2895, fft2980);
__m512 fft2904 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft2905 = _mm512_fmadd_ps(fft2896, fft2904, _mm512_shuffle_f32x4(fft2896, fft2896, 177));
__m512 fft2989 = _mm512_fmadd_ps(fft2981, fft2904, _mm512_shuffle_f32x4(fft2981, fft2981, 177));
__m512 fft2906 = _mm512_fmadd_ps(fft2897, fft2904, _mm512_shuffle_f32x4(fft2897, fft2897, 177));
__m512 fft2990 = _mm512_fmadd_ps(fft2982, fft2904, _mm512_shuffle_f32x4(fft2982, fft2982, 177));
__m512 fft2907 = _mm512_fmadd_ps(fft2898, fft2904, _mm512_shuffle_f32x4(fft2898, fft2898, 177));
__m512 fft2991 = _mm512_fmadd_ps(fft2983, fft2904, _mm512_shuffle_f32x4(fft2983, fft2983, 177));
__m512 fft2908 = _mm512_fmadd_ps(fft2899, fft2904, _mm512_shuffle_f32x4(fft2899, fft2899, 177));
__m512 fft2992 = _mm512_fmadd_ps(fft2984, fft2904, _mm512_shuffle_f32x4(fft2984, fft2984, 177));
__m512 fft2909 = _mm512_fmadd_ps(fft2900, fft2904, _mm512_shuffle_f32x4(fft2900, fft2900, 177));
__m512 fft2993 = _mm512_fmadd_ps(fft2985, fft2904, _mm512_shuffle_f32x4(fft2985, fft2985, 177));
__m512 fft2910 = _mm512_fmadd_ps(fft2901, fft2904, _mm512_shuffle_f32x4(fft2901, fft2901, 177));
__m512 fft2994 = _mm512_fmadd_ps(fft2986, fft2904, _mm512_shuffle_f32x4(fft2986, fft2986, 177));
__m512 fft2911 = _mm512_fmadd_ps(fft2902, fft2904, _mm512_shuffle_f32x4(fft2902, fft2902, 177));
__m512 fft2995 = _mm512_fmadd_ps(fft2987, fft2904, _mm512_shuffle_f32x4(fft2987, fft2987, 177));
__m512 fft2912 = _mm512_fmadd_ps(fft2903, fft2904, _mm512_shuffle_f32x4(fft2903, fft2903, 177));
__m512 fft2996 = _mm512_fmadd_ps(fft2988, fft2904, _mm512_shuffle_f32x4(fft2988, fft2988, 177));
__m512 fft2913 = _mm512_mask_mov_ps(fft2905, 49344, fft2906);
__m512 fft2997 = _mm512_mask_mov_ps(fft2989, 49344, fft2990);
__m512 fft2914 = _mm512_mask_sub_ps(fft2906, 49344, _mm512_setzero_ps(), fft2905);
__m512 fft2998 = _mm512_mask_sub_ps(fft2990, 49344, _mm512_setzero_ps(), fft2989);
__m512 fft2915 = _mm512_mask_mov_ps(fft2907, 49344, fft2908);
__m512 fft2999 = _mm512_mask_mov_ps(fft2991, 49344, fft2992);
__m512 fft2916 = _mm512_mask_sub_ps(fft2908, 49344, _mm512_setzero_ps(), fft2907);
__m512 fft3000 = _mm512_mask_sub_ps(fft2992, 49344, _mm512_setzero_ps(), fft2991);
__m512 fft2917 = _mm512_mask_mov_ps(fft2909, 49344, fft2910);
__m512 fft3001 = _mm512_mask_mov_ps(fft2993, 49344, fft2994);
__m512 fft2918 = _mm512_mask_sub_ps(fft2910, 49344, _mm512_setzero_ps(), fft2909);
__m512 fft3002 = _mm512_mask_sub_ps(fft2994, 49344, _mm512_setzero_ps(), fft2993);
__m512 fft2919 = _mm512_mask_mov_ps(fft2911, 49344, fft2912);
__m512 fft3003 = _mm512_mask_mov_ps(fft2995, 49344, fft2996);
__m512 fft2920 = _mm512_mask_sub_ps(fft2912, 49344, _mm512_setzero_ps(), fft2911);
__m512 fft3004 = _mm512_mask_sub_ps(fft2996, 49344, _mm512_setzero_ps(), fft2995);
__m512 fft2921 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft2922 = _mm512_fmadd_ps(fft2913, fft2921, _mm512_shuffle_ps(fft2913, fft2913, 78));
__m512 fft3005 = _mm512_fmadd_ps(fft2997, fft2921, _mm512_shuffle_ps(fft2997, fft2997, 78));
__m512 fft2923 = _mm512_fmadd_ps(fft2914, fft2921, _mm512_shuffle_ps(fft2914, fft2914, 78));
__m512 fft3006 = _mm512_fmadd_ps(fft2998, fft2921, _mm512_shuffle_ps(fft2998, fft2998, 78));
__m512 fft2924 = _mm512_fmadd_ps(fft2915, fft2921, _mm512_shuffle_ps(fft2915, fft2915, 78));
__m512 fft3007 = _mm512_fmadd_ps(fft2999, fft2921, _mm512_shuffle_ps(fft2999, fft2999, 78));
__m512 fft2925 = _mm512_fmadd_ps(fft2916, fft2921, _mm512_shuffle_ps(fft2916, fft2916, 78));
__m512 fft3008 = _mm512_fmadd_ps(fft3000, fft2921, _mm512_shuffle_ps(fft3000, fft3000, 78));
__m512 fft2926 = _mm512_fmadd_ps(fft2917, fft2921, _mm512_shuffle_ps(fft2917, fft2917, 78));
__m512 fft3009 = _mm512_fmadd_ps(fft3001, fft2921, _mm512_shuffle_ps(fft3001, fft3001, 78));
__m512 fft2927 = _mm512_fmadd_ps(fft2918, fft2921, _mm512_shuffle_ps(fft2918, fft2918, 78));
__m512 fft3010 = _mm512_fmadd_ps(fft3002, fft2921, _mm512_shuffle_ps(fft3002, fft3002, 78));
__m512 fft2928 = _mm512_fmadd_ps(fft2919, fft2921, _mm512_shuffle_ps(fft2919, fft2919, 78));
__m512 fft3011 = _mm512_fmadd_ps(fft3003, fft2921, _mm512_shuffle_ps(fft3003, fft3003, 78));
__m512 fft2929 = _mm512_fmadd_ps(fft2920, fft2921, _mm512_shuffle_ps(fft2920, fft2920, 78));
__m512 fft3012 = _mm512_fmadd_ps(fft3004, fft2921, _mm512_shuffle_ps(fft3004, fft3004, 78));
__m512i fft2930 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft2931 = _mm512_permutexvar_ps(fft2930, fft2922);
__m512 fft3013 = _mm512_permutexvar_ps(fft2930, fft3005);
__m512i fft2932 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft2933 = _mm512_permutexvar_ps(fft2932, fft2922);
__m512 fft3014 = _mm512_permutexvar_ps(fft2932, fft3005);
__m512 fft2934 = _mm512_permutexvar_ps(fft2930, fft2923);
__m512 fft3015 = _mm512_permutexvar_ps(fft2930, fft3006);
__m512 fft2935 = _mm512_permutexvar_ps(fft2932, fft2923);
__m512 fft3016 = _mm512_permutexvar_ps(fft2932, fft3006);
__m512 fft2936 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft2937 = _mm512_fmadd_ps(fft2931, fft2936, fft2933);
__m512 fft3017 = _mm512_fmadd_ps(fft3013, fft2936, fft3014);
__m512 fft2938 = _mm512_fnmadd_ps(fft2935, fft2936, fft2934);
__m512 fft3018 = _mm512_fnmadd_ps(fft3016, fft2936, fft3015);
__m512 fft2939 = _mm512_mask_mov_ps(fft2935, 21845, fft2937);
__m512 fft3019 = _mm512_mask_mov_ps(fft3016, 21845, fft3017);
__m512 fft2940 = _mm512_mask_mov_ps(fft2931, 43176, fft2937);
__m512 fft3020 = _mm512_mask_mov_ps(fft3013, 43176, fft3017);
__m512 fft2941 = _mm512_mask_mov_ps(fft2939, 43176, fft2938);
__m512 fft3021 = _mm512_mask_mov_ps(fft3019, 43176, fft3018);
__m512 fft2942 = _mm512_mask_mov_ps(fft2940, 22102, fft2938);
__m512 fft3022 = _mm512_mask_mov_ps(fft3020, 22102, fft3018);
__m512 fft2943 = _mm512_mask_mul_ps(fft2941, 64764, fft2941, _mm512_set1_ps(5e-01f));
__m512 fft3023 = _mm512_mask_mul_ps(fft3021, 64764, fft3021, _mm512_set1_ps(5e-01f));
__m512 fft2944 = _mm512_mask_mul_ps(fft2942, 64764, fft2942, _mm512_set1_ps(5e-01f));
__m512 fft3024 = _mm512_mask_mul_ps(fft3022, 64764, fft3022, _mm512_set1_ps(5e-01f));
__m512 df241 = fft2943;
__m512 df249 = fft3023;
__m512 df242 = fft2944;
__m512 df250 = fft3024;
__m512 df243 = fft2924;
__m512 df251 = fft3007;
__m512 df244 = fft2925;
__m512 df252 = fft3008;
__m512 df245 = fft2926;
__m512 df253 = fft3009;
__m512 df246 = fft2927;
__m512 df254 = fft3010;
__m512 df247 = fft2928;
__m512 df255 = fft3011;
__m512 df248 = fft2929;
__m512 df256 = fft3012;
__m512i eo18 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df243 = _mm512_permutexvar_ps(eo18, df243);
df244 = _mm512_permutexvar_ps(eo18, df244);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df243);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df244);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df243);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df244);
df251 = _mm512_permutexvar_ps(eo18, df251);
df252 = _mm512_permutexvar_ps(eo18, df252);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df251);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df252);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df251);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df252);
df245 = _mm512_permutexvar_ps(eo18, df245);
df246 = _mm512_permutexvar_ps(eo18, df246);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df245);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df246);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df245);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df246);
df253 = _mm512_permutexvar_ps(eo18, df253);
df254 = _mm512_permutexvar_ps(eo18, df254);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df253);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df254);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df253);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df254);
df247 = _mm512_permutexvar_ps(eo18, df247);
df248 = _mm512_permutexvar_ps(eo18, df248);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df247);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df248);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df247);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df248);
df255 = _mm512_permutexvar_ps(eo18, df255);
df256 = _mm512_permutexvar_ps(eo18, df256);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df255);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df256);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df255);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df256);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df241);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df242);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df241);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df242);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df249);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k8+128*m18+32*f19, 255, df250);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df249);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k8+128*m18+32*f19, 65280, df250);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 8;
}
// Column sweep: each pass of the outer loop below handles one j2 value.
// j2 is advanced at the bottom of the body (not in the for-header), while
// w8 moves forward by 60 per pass. The row base h8 is fixed at base2+20.
// NOTE(review): the fft*/df*/bn* names suggest "batch-norm the input, take a
// forward FFT of a tile, store to a frequency-domain buffer" -- inferred from
// naming only, confirm against the generator.
ptrdiff_t h8 = base2+20;
ptrdiff_t w8 = -450+60*rel2;
ptrdiff_t jj4 = 10-rel2+j2;
for (; j2 <= jj4; w8 += 60) {
// Three consecutive k9 values starting at 3*s1.
// NOTE(review): presumably the 3 input channels (HEAD declares Channels=3).
ptrdiff_t k9 = 3*s1;
ptrdiff_t kk8 = k9+2;
for (; k9 <= kk8; ++k9) {
// Broadcast the two adjacent floats stored at float index 2*(k9+3*i6) of
// bnPtr2; they are used below as the multiplier and the addend of an FMA.
__m512 bnMul8 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k9+3*i6))[0]);
__m512 bnAdd8 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k9+3*i6))[1]);
// Six sub-blocks per (j2, k9); b19 is split into m19 = b19/2 (0..2) and
// f20 = b19%2 (0..1), which only affect the store addresses further down.
for (ptrdiff_t b19 = 0; b19 < 6; ++b19) {
ptrdiff_t m19 = (size_t)b19/2;
ptrdiff_t f20 = (size_t)b19%2;
// Sixteen full-width loads (mask 65535 = all 16 lanes) whose addresses step
// by 896, each immediately replaced by dat*bnMul8+bnAdd8.
// NOTE(review): 896 = 224*4, i.e. likely one 224-float row in bytes, and
// 200704 = 224*224*4 -- this assumes datPtr1 is a byte pointer; confirm.
// The +40*b19 term shifts the load start per sub-block.
__m512 dat242 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat242 = _mm512_mask_fmadd_ps(dat242, 65535, bnMul8, bnAdd8);
__m512 dat243 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat243 = _mm512_mask_fmadd_ps(dat243, 65535, bnMul8, bnAdd8);
__m512 dat244 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat244 = _mm512_mask_fmadd_ps(dat244, 65535, bnMul8, bnAdd8);
__m512 dat245 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat245 = _mm512_mask_fmadd_ps(dat245, 65535, bnMul8, bnAdd8);
__m512 dat246 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat246 = _mm512_mask_fmadd_ps(dat246, 65535, bnMul8, bnAdd8);
__m512 dat247 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat247 = _mm512_mask_fmadd_ps(dat247, 65535, bnMul8, bnAdd8);
__m512 dat248 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat248 = _mm512_mask_fmadd_ps(dat248, 65535, bnMul8, bnAdd8);
__m512 dat249 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat249 = _mm512_mask_fmadd_ps(dat249, 65535, bnMul8, bnAdd8);
__m512 dat250 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat250 = _mm512_mask_fmadd_ps(dat250, 65535, bnMul8, bnAdd8);
__m512 dat251 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat251 = _mm512_mask_fmadd_ps(dat251, 65535, bnMul8, bnAdd8);
__m512 dat252 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat252 = _mm512_mask_fmadd_ps(dat252, 65535, bnMul8, bnAdd8);
__m512 dat253 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat253 = _mm512_mask_fmadd_ps(dat253, 65535, bnMul8, bnAdd8);
__m512 dat254 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat254 = _mm512_mask_fmadd_ps(dat254, 65535, bnMul8, bnAdd8);
__m512 dat255 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat255 = _mm512_mask_fmadd_ps(dat255, 65535, bnMul8, bnAdd8);
__m512 dat256 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat256 = _mm512_mask_fmadd_ps(dat256, 65535, bnMul8, bnAdd8);
__m512 dat257 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k9+896*h8+4*w8+40*b19);
dat257 = _mm512_mask_fmadd_ps(dat257, 65535, bnMul8, bnAdd8);
// Cross-register butterflies. Two independent chains are interleaved line by
// line: one over the even-numbered loads (dat242, dat244, ..., dat256) and
// one over the odd-numbered loads (dat243, dat245, ..., dat257). Within a
// chain, the first stage pairs each register with the one four loads later
// in that chain (e.g. dat242 with dat250).
__m512 fft3025 = _mm512_add_ps(dat242, dat250);
__m512 fft3113 = _mm512_add_ps(dat243, dat251);
__m512 fft3026 = _mm512_sub_ps(dat242, dat250);
__m512 fft3114 = _mm512_sub_ps(dat243, dat251);
__m512 fft3027 = _mm512_add_ps(dat244, dat252);
__m512 fft3115 = _mm512_add_ps(dat245, dat253);
__m512 fft3028 = _mm512_sub_ps(dat244, dat252);
__m512 fft3116 = _mm512_sub_ps(dat245, dat253);
__m512 fft3029 = _mm512_add_ps(dat246, dat254);
__m512 fft3117 = _mm512_add_ps(dat247, dat255);
__m512 fft3030 = _mm512_sub_ps(dat246, dat254);
__m512 fft3118 = _mm512_sub_ps(dat247, dat255);
__m512 fft3031 = _mm512_add_ps(dat248, dat256);
__m512 fft3119 = _mm512_add_ps(dat249, dat257);
__m512 fft3032 = _mm512_sub_ps(dat248, dat256);
__m512 fft3120 = _mm512_sub_ps(dat249, dat257);
// Second and third add/sub stages on the results above. Note the operand
// order of fft3036/fft3124 is reversed (later minus earlier) relative to
// the neighbouring subtractions.
__m512 fft3033 = _mm512_add_ps(fft3025, fft3029);
__m512 fft3121 = _mm512_add_ps(fft3113, fft3117);
__m512 fft3034 = _mm512_sub_ps(fft3025, fft3029);
__m512 fft3122 = _mm512_sub_ps(fft3113, fft3117);
__m512 fft3035 = _mm512_add_ps(fft3027, fft3031);
__m512 fft3123 = _mm512_add_ps(fft3115, fft3119);
__m512 fft3036 = _mm512_sub_ps(fft3031, fft3027);
__m512 fft3124 = _mm512_sub_ps(fft3119, fft3115);
__m512 fft3037 = _mm512_sub_ps(fft3028, fft3032);
__m512 fft3125 = _mm512_sub_ps(fft3116, fft3120);
__m512 fft3038 = _mm512_add_ps(fft3028, fft3032);
__m512 fft3126 = _mm512_add_ps(fft3116, fft3120);
__m512 fft3039 = _mm512_add_ps(fft3033, fft3035);
__m512 fft3127 = _mm512_add_ps(fft3121, fft3123);
__m512 fft3040 = _mm512_sub_ps(fft3033, fft3035);
__m512 fft3128 = _mm512_sub_ps(fft3121, fft3123);
// Combine with the constant 7.0710677e-01f (approximately 1/sqrt(2)) using
// fused multiply-add variants: fmadd = a*b+c, fnmadd = -(a*b)+c,
// fnmsub = -(a*b)-c.
__m512 fft3041 = _mm512_fmadd_ps(fft3037, _mm512_set1_ps(7.0710677e-01f), fft3026);
__m512 fft3129 = _mm512_fmadd_ps(fft3125, _mm512_set1_ps(7.0710677e-01f), fft3114);
__m512 fft3042 = _mm512_fnmsub_ps(fft3038, _mm512_set1_ps(7.0710677e-01f), fft3030);
__m512 fft3130 = _mm512_fnmsub_ps(fft3126, _mm512_set1_ps(7.0710677e-01f), fft3118);
__m512 fft3043 = _mm512_fnmadd_ps(fft3037, _mm512_set1_ps(7.0710677e-01f), fft3026);
__m512 fft3131 = _mm512_fnmadd_ps(fft3125, _mm512_set1_ps(7.0710677e-01f), fft3114);
__m512 fft3044 = _mm512_fnmadd_ps(fft3038, _mm512_set1_ps(7.0710677e-01f), fft3030);
__m512 fft3132 = _mm512_fnmadd_ps(fft3126, _mm512_set1_ps(7.0710677e-01f), fft3118);
// In-register stage across 256-bit halves: each value is multiplied by a
// sign vector (+1 in lanes 0-7, -1 in lanes 8-15) and added to a copy of
// itself with the two 256-bit halves swapped (shuffle_f32x4 imm 78).
__m512 fft3045 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3046 = _mm512_fmadd_ps(fft3039, fft3045, _mm512_shuffle_f32x4(fft3039, fft3039, 78));
__m512 fft3133 = _mm512_fmadd_ps(fft3127, fft3045, _mm512_shuffle_f32x4(fft3127, fft3127, 78));
__m512 fft3047 = _mm512_fmadd_ps(fft3040, fft3045, _mm512_shuffle_f32x4(fft3040, fft3040, 78));
__m512 fft3134 = _mm512_fmadd_ps(fft3128, fft3045, _mm512_shuffle_f32x4(fft3128, fft3128, 78));
__m512 fft3048 = _mm512_fmadd_ps(fft3041, fft3045, _mm512_shuffle_f32x4(fft3041, fft3041, 78));
__m512 fft3135 = _mm512_fmadd_ps(fft3129, fft3045, _mm512_shuffle_f32x4(fft3129, fft3129, 78));
__m512 fft3049 = _mm512_fmadd_ps(fft3042, fft3045, _mm512_shuffle_f32x4(fft3042, fft3042, 78));
__m512 fft3136 = _mm512_fmadd_ps(fft3130, fft3045, _mm512_shuffle_f32x4(fft3130, fft3130, 78));
__m512 fft3050 = _mm512_fmadd_ps(fft3034, fft3045, _mm512_shuffle_f32x4(fft3034, fft3034, 78));
__m512 fft3137 = _mm512_fmadd_ps(fft3122, fft3045, _mm512_shuffle_f32x4(fft3122, fft3122, 78));
__m512 fft3051 = _mm512_fmadd_ps(fft3036, fft3045, _mm512_shuffle_f32x4(fft3036, fft3036, 78));
__m512 fft3138 = _mm512_fmadd_ps(fft3124, fft3045, _mm512_shuffle_f32x4(fft3124, fft3124, 78));
__m512 fft3052 = _mm512_fmadd_ps(fft3043, fft3045, _mm512_shuffle_f32x4(fft3043, fft3043, 78));
__m512 fft3139 = _mm512_fmadd_ps(fft3131, fft3045, _mm512_shuffle_f32x4(fft3131, fft3131, 78));
__m512 fft3053 = _mm512_fmadd_ps(fft3044, fft3045, _mm512_shuffle_f32x4(fft3044, fft3044, 78));
__m512 fft3140 = _mm512_fmadd_ps(fft3132, fft3045, _mm512_shuffle_f32x4(fft3132, fft3132, 78));
// Per-lane scaling by two constant vectors (fft3054, then fft3063). Lanes
// 0-7 are multiplied by 1 and receive a zero contribution from the second
// vector, so only lanes 8-15 are actually changed by this pair of steps.
// NOTE(review): presumably twiddle-factor real/imaginary parts -- confirm.
__m512 fft3054 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3055 = _mm512_mul_ps(fft3046, fft3054);
__m512 fft3141 = _mm512_mul_ps(fft3133, fft3054);
__m512 fft3056 = _mm512_mul_ps(fft3047, fft3054);
__m512 fft3142 = _mm512_mul_ps(fft3134, fft3054);
__m512 fft3057 = _mm512_mul_ps(fft3048, fft3054);
__m512 fft3143 = _mm512_mul_ps(fft3135, fft3054);
__m512 fft3058 = _mm512_mul_ps(fft3049, fft3054);
__m512 fft3144 = _mm512_mul_ps(fft3136, fft3054);
__m512 fft3059 = _mm512_mul_ps(fft3050, fft3054);
__m512 fft3145 = _mm512_mul_ps(fft3137, fft3054);
__m512 fft3060 = _mm512_mul_ps(fft3051, fft3054);
__m512 fft3146 = _mm512_mul_ps(fft3138, fft3054);
__m512 fft3061 = _mm512_mul_ps(fft3052, fft3054);
__m512 fft3147 = _mm512_mul_ps(fft3139, fft3054);
__m512 fft3062 = _mm512_mul_ps(fft3053, fft3054);
__m512 fft3148 = _mm512_mul_ps(fft3140, fft3054);
__m512 fft3063 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3064 = _mm512_fmadd_ps(fft3047, fft3063, fft3055);
__m512 fft3149 = _mm512_fmadd_ps(fft3134, fft3063, fft3141);
__m512 fft3065 = _mm512_fnmadd_ps(fft3046, fft3063, fft3056);
__m512 fft3150 = _mm512_fnmadd_ps(fft3133, fft3063, fft3142);
__m512 fft3066 = _mm512_fmadd_ps(fft3049, fft3063, fft3057);
__m512 fft3151 = _mm512_fmadd_ps(fft3136, fft3063, fft3143);
__m512 fft3067 = _mm512_fnmadd_ps(fft3048, fft3063, fft3058);
__m512 fft3152 = _mm512_fnmadd_ps(fft3135, fft3063, fft3144);
__m512 fft3068 = _mm512_fmadd_ps(fft3051, fft3063, fft3059);
__m512 fft3153 = _mm512_fmadd_ps(fft3138, fft3063, fft3145);
__m512 fft3069 = _mm512_fnmadd_ps(fft3050, fft3063, fft3060);
__m512 fft3154 = _mm512_fnmadd_ps(fft3137, fft3063, fft3146);
__m512 fft3070 = _mm512_fmadd_ps(fft3053, fft3063, fft3061);
__m512 fft3155 = _mm512_fmadd_ps(fft3140, fft3063, fft3147);
__m512 fft3071 = _mm512_fnmadd_ps(fft3052, fft3063, fft3062);
__m512 fft3156 = _mm512_fnmadd_ps(fft3139, fft3063, fft3148);
// Same multiply-by-sign-and-add-shuffled-copy pattern, now with signs that
// alternate every four lanes and a shuffle (imm 177) that swaps adjacent
// 128-bit blocks.
__m512 fft3072 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3073 = _mm512_fmadd_ps(fft3064, fft3072, _mm512_shuffle_f32x4(fft3064, fft3064, 177));
__m512 fft3157 = _mm512_fmadd_ps(fft3149, fft3072, _mm512_shuffle_f32x4(fft3149, fft3149, 177));
__m512 fft3074 = _mm512_fmadd_ps(fft3065, fft3072, _mm512_shuffle_f32x4(fft3065, fft3065, 177));
__m512 fft3158 = _mm512_fmadd_ps(fft3150, fft3072, _mm512_shuffle_f32x4(fft3150, fft3150, 177));
__m512 fft3075 = _mm512_fmadd_ps(fft3066, fft3072, _mm512_shuffle_f32x4(fft3066, fft3066, 177));
__m512 fft3159 = _mm512_fmadd_ps(fft3151, fft3072, _mm512_shuffle_f32x4(fft3151, fft3151, 177));
__m512 fft3076 = _mm512_fmadd_ps(fft3067, fft3072, _mm512_shuffle_f32x4(fft3067, fft3067, 177));
__m512 fft3160 = _mm512_fmadd_ps(fft3152, fft3072, _mm512_shuffle_f32x4(fft3152, fft3152, 177));
__m512 fft3077 = _mm512_fmadd_ps(fft3068, fft3072, _mm512_shuffle_f32x4(fft3068, fft3068, 177));
__m512 fft3161 = _mm512_fmadd_ps(fft3153, fft3072, _mm512_shuffle_f32x4(fft3153, fft3153, 177));
__m512 fft3078 = _mm512_fmadd_ps(fft3069, fft3072, _mm512_shuffle_f32x4(fft3069, fft3069, 177));
__m512 fft3162 = _mm512_fmadd_ps(fft3154, fft3072, _mm512_shuffle_f32x4(fft3154, fft3154, 177));
__m512 fft3079 = _mm512_fmadd_ps(fft3070, fft3072, _mm512_shuffle_f32x4(fft3070, fft3070, 177));
__m512 fft3163 = _mm512_fmadd_ps(fft3155, fft3072, _mm512_shuffle_f32x4(fft3155, fft3155, 177));
__m512 fft3080 = _mm512_fmadd_ps(fft3071, fft3072, _mm512_shuffle_f32x4(fft3071, fft3071, 177));
__m512 fft3164 = _mm512_fmadd_ps(fft3156, fft3072, _mm512_shuffle_f32x4(fft3156, fft3156, 177));
// Exchange lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) between each pair of
// registers: the first of the pair takes the partner's lanes unchanged, the
// partner takes the negated lanes (0 - x) of the first.
__m512 fft3081 = _mm512_mask_mov_ps(fft3073, 49344, fft3074);
__m512 fft3165 = _mm512_mask_mov_ps(fft3157, 49344, fft3158);
__m512 fft3082 = _mm512_mask_sub_ps(fft3074, 49344, _mm512_setzero_ps(), fft3073);
__m512 fft3166 = _mm512_mask_sub_ps(fft3158, 49344, _mm512_setzero_ps(), fft3157);
__m512 fft3083 = _mm512_mask_mov_ps(fft3075, 49344, fft3076);
__m512 fft3167 = _mm512_mask_mov_ps(fft3159, 49344, fft3160);
__m512 fft3084 = _mm512_mask_sub_ps(fft3076, 49344, _mm512_setzero_ps(), fft3075);
__m512 fft3168 = _mm512_mask_sub_ps(fft3160, 49344, _mm512_setzero_ps(), fft3159);
__m512 fft3085 = _mm512_mask_mov_ps(fft3077, 49344, fft3078);
__m512 fft3169 = _mm512_mask_mov_ps(fft3161, 49344, fft3162);
__m512 fft3086 = _mm512_mask_sub_ps(fft3078, 49344, _mm512_setzero_ps(), fft3077);
__m512 fft3170 = _mm512_mask_sub_ps(fft3162, 49344, _mm512_setzero_ps(), fft3161);
__m512 fft3087 = _mm512_mask_mov_ps(fft3079, 49344, fft3080);
__m512 fft3171 = _mm512_mask_mov_ps(fft3163, 49344, fft3164);
__m512 fft3088 = _mm512_mask_sub_ps(fft3080, 49344, _mm512_setzero_ps(), fft3079);
__m512 fft3172 = _mm512_mask_sub_ps(fft3164, 49344, _mm512_setzero_ps(), fft3163);
// Final sign/shuffle stage inside each 128-bit block: signs alternate every
// two lanes and shuffle_ps imm 78 swaps the two 64-bit halves of each block.
__m512 fft3089 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3090 = _mm512_fmadd_ps(fft3081, fft3089, _mm512_shuffle_ps(fft3081, fft3081, 78));
__m512 fft3173 = _mm512_fmadd_ps(fft3165, fft3089, _mm512_shuffle_ps(fft3165, fft3165, 78));
__m512 fft3091 = _mm512_fmadd_ps(fft3082, fft3089, _mm512_shuffle_ps(fft3082, fft3082, 78));
__m512 fft3174 = _mm512_fmadd_ps(fft3166, fft3089, _mm512_shuffle_ps(fft3166, fft3166, 78));
__m512 fft3092 = _mm512_fmadd_ps(fft3083, fft3089, _mm512_shuffle_ps(fft3083, fft3083, 78));
__m512 fft3175 = _mm512_fmadd_ps(fft3167, fft3089, _mm512_shuffle_ps(fft3167, fft3167, 78));
__m512 fft3093 = _mm512_fmadd_ps(fft3084, fft3089, _mm512_shuffle_ps(fft3084, fft3084, 78));
__m512 fft3176 = _mm512_fmadd_ps(fft3168, fft3089, _mm512_shuffle_ps(fft3168, fft3168, 78));
__m512 fft3094 = _mm512_fmadd_ps(fft3085, fft3089, _mm512_shuffle_ps(fft3085, fft3085, 78));
__m512 fft3177 = _mm512_fmadd_ps(fft3169, fft3089, _mm512_shuffle_ps(fft3169, fft3169, 78));
__m512 fft3095 = _mm512_fmadd_ps(fft3086, fft3089, _mm512_shuffle_ps(fft3086, fft3086, 78));
__m512 fft3178 = _mm512_fmadd_ps(fft3170, fft3089, _mm512_shuffle_ps(fft3170, fft3170, 78));
__m512 fft3096 = _mm512_fmadd_ps(fft3087, fft3089, _mm512_shuffle_ps(fft3087, fft3087, 78));
__m512 fft3179 = _mm512_fmadd_ps(fft3171, fft3089, _mm512_shuffle_ps(fft3171, fft3171, 78));
__m512 fft3097 = _mm512_fmadd_ps(fft3088, fft3089, _mm512_shuffle_ps(fft3088, fft3088, 78));
__m512 fft3180 = _mm512_fmadd_ps(fft3172, fft3089, _mm512_shuffle_ps(fft3172, fft3172, 78));
// Extra post-processing applied only to the first register pair of each
// chain (fft3090/fft3091 and fft3173/fft3174): two lane permutations of each
// input, an fmadd/fnmadd against a sign vector, a series of masked merges,
// and finally a multiply by 0.5 on lanes 2-7 and 10-15 (mask 64764 = 0xFCFC).
// NOTE(review): presumably the step that untangles the packed real-input
// transform for the lowest-frequency outputs -- confirm against generator.
__m512i fft3098 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3099 = _mm512_permutexvar_ps(fft3098, fft3090);
__m512 fft3181 = _mm512_permutexvar_ps(fft3098, fft3173);
__m512i fft3100 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3101 = _mm512_permutexvar_ps(fft3100, fft3090);
__m512 fft3182 = _mm512_permutexvar_ps(fft3100, fft3173);
__m512 fft3102 = _mm512_permutexvar_ps(fft3098, fft3091);
__m512 fft3183 = _mm512_permutexvar_ps(fft3098, fft3174);
__m512 fft3103 = _mm512_permutexvar_ps(fft3100, fft3091);
__m512 fft3184 = _mm512_permutexvar_ps(fft3100, fft3174);
__m512 fft3104 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3105 = _mm512_fmadd_ps(fft3099, fft3104, fft3101);
__m512 fft3185 = _mm512_fmadd_ps(fft3181, fft3104, fft3182);
__m512 fft3106 = _mm512_fnmadd_ps(fft3103, fft3104, fft3102);
__m512 fft3186 = _mm512_fnmadd_ps(fft3184, fft3104, fft3183);
__m512 fft3107 = _mm512_mask_mov_ps(fft3103, 21845, fft3105);
__m512 fft3187 = _mm512_mask_mov_ps(fft3184, 21845, fft3185);
__m512 fft3108 = _mm512_mask_mov_ps(fft3099, 43176, fft3105);
__m512 fft3188 = _mm512_mask_mov_ps(fft3181, 43176, fft3185);
__m512 fft3109 = _mm512_mask_mov_ps(fft3107, 43176, fft3106);
__m512 fft3189 = _mm512_mask_mov_ps(fft3187, 43176, fft3186);
__m512 fft3110 = _mm512_mask_mov_ps(fft3108, 22102, fft3106);
__m512 fft3190 = _mm512_mask_mov_ps(fft3188, 22102, fft3186);
__m512 fft3111 = _mm512_mask_mul_ps(fft3109, 64764, fft3109, _mm512_set1_ps(5e-01f));
__m512 fft3191 = _mm512_mask_mul_ps(fft3189, 64764, fft3189, _mm512_set1_ps(5e-01f));
__m512 fft3112 = _mm512_mask_mul_ps(fft3110, 64764, fft3110, _mm512_set1_ps(5e-01f));
__m512 fft3192 = _mm512_mask_mul_ps(fft3190, 64764, fft3190, _mm512_set1_ps(5e-01f));
// Rename the sixteen results to df257..df272 for the store phase. The first
// chain feeds df257..df264 and the second chain feeds df265..df272.
__m512 df257 = fft3111;
__m512 df265 = fft3191;
__m512 df258 = fft3112;
__m512 df266 = fft3192;
__m512 df259 = fft3092;
__m512 df267 = fft3175;
__m512 df260 = fft3093;
__m512 df268 = fft3176;
__m512 df261 = fft3094;
__m512 df269 = fft3177;
__m512 df262 = fft3095;
__m512 df270 = fft3178;
__m512 df263 = fft3096;
__m512 df271 = fft3179;
__m512 df264 = fft3097;
__m512 df272 = fft3180;
// eo19 gathers the even-indexed lanes into lanes 0-7 and the odd-indexed
// lanes into lanes 8-15. It is applied to every df register except
// df257/df258/df265/df266, which are stored as they are.
__m512i eo19 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store phase: every register is written with two masked stores, lanes 0-7
// (mask 255) to one destination and lanes 8-15 (mask 65280) to another.
// NOTE(review): if dfPtr1 is a byte pointer, the upper-half base offsets are
// the lower-half ones plus 407040-32, so the upper eight lanes (which sit 32
// bytes past the store base) land exactly 407040 beyond the lower-half
// destination -- confirm the pointer type of dfPtr1.
df259 = _mm512_permutexvar_ps(eo19, df259);
df260 = _mm512_permutexvar_ps(eo19, df260);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df259);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df260);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df259);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df260);
df267 = _mm512_permutexvar_ps(eo19, df267);
df268 = _mm512_permutexvar_ps(eo19, df268);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df267);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df268);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df267);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df268);
df261 = _mm512_permutexvar_ps(eo19, df261);
df262 = _mm512_permutexvar_ps(eo19, df262);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df261);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df262);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df261);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df262);
df269 = _mm512_permutexvar_ps(eo19, df269);
df270 = _mm512_permutexvar_ps(eo19, df270);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df269);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df270);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df269);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df270);
df263 = _mm512_permutexvar_ps(eo19, df263);
df264 = _mm512_permutexvar_ps(eo19, df264);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df263);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df264);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df263);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df264);
df271 = _mm512_permutexvar_ps(eo19, df271);
df272 = _mm512_permutexvar_ps(eo19, df272);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df271);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df272);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df271);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df272);
// The four specially post-processed registers are stored without the eo19
// permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df257);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df258);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df257);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df258);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df265);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k9+128*m19+32*f20, 255, df266);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df265);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k9+128*m19+32*f20, 65280, df266);
}
}
// Leave the enclosing function once the last assigned j2 has been processed;
// otherwise advance to the next j2 (the for-header only advances w8).
if (j2 >= last1) return;
++j2;
}
if (j2 >= 84) break;
rel2 = 11;
}
if (rel2 < 16) {
if (rel2 < 12) {
ptrdiff_t h9 = base2+20;
ptrdiff_t w9 = 210;
ptrdiff_t k10 = 3*s1;
ptrdiff_t kk9 = k10+2;
for (; k10 <= kk9; ++k10) {
__m512 bnMul9 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k10+3*i6))[0]);
__m512 bnAdd9 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k10+3*i6))[1]);
ptrdiff_t b20 = 0;
ptrdiff_t m20 = (size_t)b20/2;
ptrdiff_t f21 = (size_t)b20%2;
__m512 dat258 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat258 = _mm512_mask_fmadd_ps(dat258, 65535, bnMul9, bnAdd9);
__m512 dat259 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat259 = _mm512_mask_fmadd_ps(dat259, 65535, bnMul9, bnAdd9);
__m512 dat260 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat260 = _mm512_mask_fmadd_ps(dat260, 65535, bnMul9, bnAdd9);
__m512 dat261 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat261 = _mm512_mask_fmadd_ps(dat261, 65535, bnMul9, bnAdd9);
__m512 dat262 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat262 = _mm512_mask_fmadd_ps(dat262, 65535, bnMul9, bnAdd9);
__m512 dat263 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat263 = _mm512_mask_fmadd_ps(dat263, 65535, bnMul9, bnAdd9);
__m512 dat264 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat264 = _mm512_mask_fmadd_ps(dat264, 65535, bnMul9, bnAdd9);
__m512 dat265 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat265 = _mm512_mask_fmadd_ps(dat265, 65535, bnMul9, bnAdd9);
__m512 dat266 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat266 = _mm512_mask_fmadd_ps(dat266, 65535, bnMul9, bnAdd9);
__m512 dat267 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat267 = _mm512_mask_fmadd_ps(dat267, 65535, bnMul9, bnAdd9);
__m512 dat268 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat268 = _mm512_mask_fmadd_ps(dat268, 65535, bnMul9, bnAdd9);
__m512 dat269 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat269 = _mm512_mask_fmadd_ps(dat269, 65535, bnMul9, bnAdd9);
__m512 dat270 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat270 = _mm512_mask_fmadd_ps(dat270, 65535, bnMul9, bnAdd9);
__m512 dat271 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat271 = _mm512_mask_fmadd_ps(dat271, 65535, bnMul9, bnAdd9);
__m512 dat272 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat272 = _mm512_mask_fmadd_ps(dat272, 65535, bnMul9, bnAdd9);
__m512 dat273 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k10+896*h9+4*w9+0*b20);
dat273 = _mm512_mask_fmadd_ps(dat273, 65535, bnMul9, bnAdd9);
__m512 fft3193 = _mm512_add_ps(dat258, dat266);
__m512 fft3281 = _mm512_add_ps(dat259, dat267);
__m512 fft3194 = _mm512_sub_ps(dat258, dat266);
__m512 fft3282 = _mm512_sub_ps(dat259, dat267);
__m512 fft3195 = _mm512_add_ps(dat260, dat268);
__m512 fft3283 = _mm512_add_ps(dat261, dat269);
__m512 fft3196 = _mm512_sub_ps(dat260, dat268);
__m512 fft3284 = _mm512_sub_ps(dat261, dat269);
__m512 fft3197 = _mm512_add_ps(dat262, dat270);
__m512 fft3285 = _mm512_add_ps(dat263, dat271);
__m512 fft3198 = _mm512_sub_ps(dat262, dat270);
__m512 fft3286 = _mm512_sub_ps(dat263, dat271);
__m512 fft3199 = _mm512_add_ps(dat264, dat272);
__m512 fft3287 = _mm512_add_ps(dat265, dat273);
__m512 fft3200 = _mm512_sub_ps(dat264, dat272);
__m512 fft3288 = _mm512_sub_ps(dat265, dat273);
__m512 fft3201 = _mm512_add_ps(fft3193, fft3197);
__m512 fft3289 = _mm512_add_ps(fft3281, fft3285);
__m512 fft3202 = _mm512_sub_ps(fft3193, fft3197);
__m512 fft3290 = _mm512_sub_ps(fft3281, fft3285);
__m512 fft3203 = _mm512_add_ps(fft3195, fft3199);
__m512 fft3291 = _mm512_add_ps(fft3283, fft3287);
__m512 fft3204 = _mm512_sub_ps(fft3199, fft3195);
__m512 fft3292 = _mm512_sub_ps(fft3287, fft3283);
__m512 fft3205 = _mm512_sub_ps(fft3196, fft3200);
__m512 fft3293 = _mm512_sub_ps(fft3284, fft3288);
__m512 fft3206 = _mm512_add_ps(fft3196, fft3200);
__m512 fft3294 = _mm512_add_ps(fft3284, fft3288);
__m512 fft3207 = _mm512_add_ps(fft3201, fft3203);
__m512 fft3295 = _mm512_add_ps(fft3289, fft3291);
__m512 fft3208 = _mm512_sub_ps(fft3201, fft3203);
__m512 fft3296 = _mm512_sub_ps(fft3289, fft3291);
__m512 fft3209 = _mm512_fmadd_ps(fft3205, _mm512_set1_ps(7.0710677e-01f), fft3194);
__m512 fft3297 = _mm512_fmadd_ps(fft3293, _mm512_set1_ps(7.0710677e-01f), fft3282);
__m512 fft3210 = _mm512_fnmsub_ps(fft3206, _mm512_set1_ps(7.0710677e-01f), fft3198);
__m512 fft3298 = _mm512_fnmsub_ps(fft3294, _mm512_set1_ps(7.0710677e-01f), fft3286);
__m512 fft3211 = _mm512_fnmadd_ps(fft3205, _mm512_set1_ps(7.0710677e-01f), fft3194);
__m512 fft3299 = _mm512_fnmadd_ps(fft3293, _mm512_set1_ps(7.0710677e-01f), fft3282);
__m512 fft3212 = _mm512_fnmadd_ps(fft3206, _mm512_set1_ps(7.0710677e-01f), fft3198);
__m512 fft3300 = _mm512_fnmadd_ps(fft3294, _mm512_set1_ps(7.0710677e-01f), fft3286);
__m512 fft3213 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3214 = _mm512_fmadd_ps(fft3207, fft3213, _mm512_shuffle_f32x4(fft3207, fft3207, 78));
__m512 fft3301 = _mm512_fmadd_ps(fft3295, fft3213, _mm512_shuffle_f32x4(fft3295, fft3295, 78));
__m512 fft3215 = _mm512_fmadd_ps(fft3208, fft3213, _mm512_shuffle_f32x4(fft3208, fft3208, 78));
__m512 fft3302 = _mm512_fmadd_ps(fft3296, fft3213, _mm512_shuffle_f32x4(fft3296, fft3296, 78));
__m512 fft3216 = _mm512_fmadd_ps(fft3209, fft3213, _mm512_shuffle_f32x4(fft3209, fft3209, 78));
__m512 fft3303 = _mm512_fmadd_ps(fft3297, fft3213, _mm512_shuffle_f32x4(fft3297, fft3297, 78));
__m512 fft3217 = _mm512_fmadd_ps(fft3210, fft3213, _mm512_shuffle_f32x4(fft3210, fft3210, 78));
__m512 fft3304 = _mm512_fmadd_ps(fft3298, fft3213, _mm512_shuffle_f32x4(fft3298, fft3298, 78));
__m512 fft3218 = _mm512_fmadd_ps(fft3202, fft3213, _mm512_shuffle_f32x4(fft3202, fft3202, 78));
__m512 fft3305 = _mm512_fmadd_ps(fft3290, fft3213, _mm512_shuffle_f32x4(fft3290, fft3290, 78));
__m512 fft3219 = _mm512_fmadd_ps(fft3204, fft3213, _mm512_shuffle_f32x4(fft3204, fft3204, 78));
__m512 fft3306 = _mm512_fmadd_ps(fft3292, fft3213, _mm512_shuffle_f32x4(fft3292, fft3292, 78));
__m512 fft3220 = _mm512_fmadd_ps(fft3211, fft3213, _mm512_shuffle_f32x4(fft3211, fft3211, 78));
__m512 fft3307 = _mm512_fmadd_ps(fft3299, fft3213, _mm512_shuffle_f32x4(fft3299, fft3299, 78));
__m512 fft3221 = _mm512_fmadd_ps(fft3212, fft3213, _mm512_shuffle_f32x4(fft3212, fft3212, 78));
__m512 fft3308 = _mm512_fmadd_ps(fft3300, fft3213, _mm512_shuffle_f32x4(fft3300, fft3300, 78));
__m512 fft3222 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3223 = _mm512_mul_ps(fft3214, fft3222);
__m512 fft3309 = _mm512_mul_ps(fft3301, fft3222);
__m512 fft3224 = _mm512_mul_ps(fft3215, fft3222);
__m512 fft3310 = _mm512_mul_ps(fft3302, fft3222);
__m512 fft3225 = _mm512_mul_ps(fft3216, fft3222);
__m512 fft3311 = _mm512_mul_ps(fft3303, fft3222);
__m512 fft3226 = _mm512_mul_ps(fft3217, fft3222);
__m512 fft3312 = _mm512_mul_ps(fft3304, fft3222);
__m512 fft3227 = _mm512_mul_ps(fft3218, fft3222);
__m512 fft3313 = _mm512_mul_ps(fft3305, fft3222);
__m512 fft3228 = _mm512_mul_ps(fft3219, fft3222);
__m512 fft3314 = _mm512_mul_ps(fft3306, fft3222);
__m512 fft3229 = _mm512_mul_ps(fft3220, fft3222);
__m512 fft3315 = _mm512_mul_ps(fft3307, fft3222);
__m512 fft3230 = _mm512_mul_ps(fft3221, fft3222);
__m512 fft3316 = _mm512_mul_ps(fft3308, fft3222);
__m512 fft3231 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3232 = _mm512_fmadd_ps(fft3215, fft3231, fft3223);
__m512 fft3317 = _mm512_fmadd_ps(fft3302, fft3231, fft3309);
__m512 fft3233 = _mm512_fnmadd_ps(fft3214, fft3231, fft3224);
__m512 fft3318 = _mm512_fnmadd_ps(fft3301, fft3231, fft3310);
__m512 fft3234 = _mm512_fmadd_ps(fft3217, fft3231, fft3225);
__m512 fft3319 = _mm512_fmadd_ps(fft3304, fft3231, fft3311);
__m512 fft3235 = _mm512_fnmadd_ps(fft3216, fft3231, fft3226);
__m512 fft3320 = _mm512_fnmadd_ps(fft3303, fft3231, fft3312);
__m512 fft3236 = _mm512_fmadd_ps(fft3219, fft3231, fft3227);
__m512 fft3321 = _mm512_fmadd_ps(fft3306, fft3231, fft3313);
__m512 fft3237 = _mm512_fnmadd_ps(fft3218, fft3231, fft3228);
__m512 fft3322 = _mm512_fnmadd_ps(fft3305, fft3231, fft3314);
__m512 fft3238 = _mm512_fmadd_ps(fft3221, fft3231, fft3229);
__m512 fft3323 = _mm512_fmadd_ps(fft3308, fft3231, fft3315);
__m512 fft3239 = _mm512_fnmadd_ps(fft3220, fft3231, fft3230);
__m512 fft3324 = _mm512_fnmadd_ps(fft3307, fft3231, fft3316);
__m512 fft3240 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3241 = _mm512_fmadd_ps(fft3232, fft3240, _mm512_shuffle_f32x4(fft3232, fft3232, 177));
__m512 fft3325 = _mm512_fmadd_ps(fft3317, fft3240, _mm512_shuffle_f32x4(fft3317, fft3317, 177));
__m512 fft3242 = _mm512_fmadd_ps(fft3233, fft3240, _mm512_shuffle_f32x4(fft3233, fft3233, 177));
__m512 fft3326 = _mm512_fmadd_ps(fft3318, fft3240, _mm512_shuffle_f32x4(fft3318, fft3318, 177));
__m512 fft3243 = _mm512_fmadd_ps(fft3234, fft3240, _mm512_shuffle_f32x4(fft3234, fft3234, 177));
__m512 fft3327 = _mm512_fmadd_ps(fft3319, fft3240, _mm512_shuffle_f32x4(fft3319, fft3319, 177));
__m512 fft3244 = _mm512_fmadd_ps(fft3235, fft3240, _mm512_shuffle_f32x4(fft3235, fft3235, 177));
__m512 fft3328 = _mm512_fmadd_ps(fft3320, fft3240, _mm512_shuffle_f32x4(fft3320, fft3320, 177));
__m512 fft3245 = _mm512_fmadd_ps(fft3236, fft3240, _mm512_shuffle_f32x4(fft3236, fft3236, 177));
__m512 fft3329 = _mm512_fmadd_ps(fft3321, fft3240, _mm512_shuffle_f32x4(fft3321, fft3321, 177));
__m512 fft3246 = _mm512_fmadd_ps(fft3237, fft3240, _mm512_shuffle_f32x4(fft3237, fft3237, 177));
__m512 fft3330 = _mm512_fmadd_ps(fft3322, fft3240, _mm512_shuffle_f32x4(fft3322, fft3322, 177));
__m512 fft3247 = _mm512_fmadd_ps(fft3238, fft3240, _mm512_shuffle_f32x4(fft3238, fft3238, 177));
__m512 fft3331 = _mm512_fmadd_ps(fft3323, fft3240, _mm512_shuffle_f32x4(fft3323, fft3323, 177));
__m512 fft3248 = _mm512_fmadd_ps(fft3239, fft3240, _mm512_shuffle_f32x4(fft3239, fft3239, 177));
__m512 fft3332 = _mm512_fmadd_ps(fft3324, fft3240, _mm512_shuffle_f32x4(fft3324, fft3324, 177));
__m512 fft3249 = _mm512_mask_mov_ps(fft3241, 49344, fft3242);
__m512 fft3333 = _mm512_mask_mov_ps(fft3325, 49344, fft3326);
__m512 fft3250 = _mm512_mask_sub_ps(fft3242, 49344, _mm512_setzero_ps(), fft3241);
__m512 fft3334 = _mm512_mask_sub_ps(fft3326, 49344, _mm512_setzero_ps(), fft3325);
__m512 fft3251 = _mm512_mask_mov_ps(fft3243, 49344, fft3244);
__m512 fft3335 = _mm512_mask_mov_ps(fft3327, 49344, fft3328);
__m512 fft3252 = _mm512_mask_sub_ps(fft3244, 49344, _mm512_setzero_ps(), fft3243);
__m512 fft3336 = _mm512_mask_sub_ps(fft3328, 49344, _mm512_setzero_ps(), fft3327);
__m512 fft3253 = _mm512_mask_mov_ps(fft3245, 49344, fft3246);
__m512 fft3337 = _mm512_mask_mov_ps(fft3329, 49344, fft3330);
__m512 fft3254 = _mm512_mask_sub_ps(fft3246, 49344, _mm512_setzero_ps(), fft3245);
__m512 fft3338 = _mm512_mask_sub_ps(fft3330, 49344, _mm512_setzero_ps(), fft3329);
__m512 fft3255 = _mm512_mask_mov_ps(fft3247, 49344, fft3248);
__m512 fft3339 = _mm512_mask_mov_ps(fft3331, 49344, fft3332);
__m512 fft3256 = _mm512_mask_sub_ps(fft3248, 49344, _mm512_setzero_ps(), fft3247);
__m512 fft3340 = _mm512_mask_sub_ps(fft3332, 49344, _mm512_setzero_ps(), fft3331);
__m512 fft3257 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3258 = _mm512_fmadd_ps(fft3249, fft3257, _mm512_shuffle_ps(fft3249, fft3249, 78));
__m512 fft3341 = _mm512_fmadd_ps(fft3333, fft3257, _mm512_shuffle_ps(fft3333, fft3333, 78));
__m512 fft3259 = _mm512_fmadd_ps(fft3250, fft3257, _mm512_shuffle_ps(fft3250, fft3250, 78));
__m512 fft3342 = _mm512_fmadd_ps(fft3334, fft3257, _mm512_shuffle_ps(fft3334, fft3334, 78));
__m512 fft3260 = _mm512_fmadd_ps(fft3251, fft3257, _mm512_shuffle_ps(fft3251, fft3251, 78));
__m512 fft3343 = _mm512_fmadd_ps(fft3335, fft3257, _mm512_shuffle_ps(fft3335, fft3335, 78));
__m512 fft3261 = _mm512_fmadd_ps(fft3252, fft3257, _mm512_shuffle_ps(fft3252, fft3252, 78));
__m512 fft3344 = _mm512_fmadd_ps(fft3336, fft3257, _mm512_shuffle_ps(fft3336, fft3336, 78));
__m512 fft3262 = _mm512_fmadd_ps(fft3253, fft3257, _mm512_shuffle_ps(fft3253, fft3253, 78));
__m512 fft3345 = _mm512_fmadd_ps(fft3337, fft3257, _mm512_shuffle_ps(fft3337, fft3337, 78));
__m512 fft3263 = _mm512_fmadd_ps(fft3254, fft3257, _mm512_shuffle_ps(fft3254, fft3254, 78));
__m512 fft3346 = _mm512_fmadd_ps(fft3338, fft3257, _mm512_shuffle_ps(fft3338, fft3338, 78));
__m512 fft3264 = _mm512_fmadd_ps(fft3255, fft3257, _mm512_shuffle_ps(fft3255, fft3255, 78));
__m512 fft3347 = _mm512_fmadd_ps(fft3339, fft3257, _mm512_shuffle_ps(fft3339, fft3339, 78));
__m512 fft3265 = _mm512_fmadd_ps(fft3256, fft3257, _mm512_shuffle_ps(fft3256, fft3256, 78));
__m512 fft3348 = _mm512_fmadd_ps(fft3340, fft3257, _mm512_shuffle_ps(fft3340, fft3340, 78));
__m512i fft3266 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3267 = _mm512_permutexvar_ps(fft3266, fft3258);
__m512 fft3349 = _mm512_permutexvar_ps(fft3266, fft3341);
__m512i fft3268 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3269 = _mm512_permutexvar_ps(fft3268, fft3258);
__m512 fft3350 = _mm512_permutexvar_ps(fft3268, fft3341);
__m512 fft3270 = _mm512_permutexvar_ps(fft3266, fft3259);
__m512 fft3351 = _mm512_permutexvar_ps(fft3266, fft3342);
__m512 fft3271 = _mm512_permutexvar_ps(fft3268, fft3259);
__m512 fft3352 = _mm512_permutexvar_ps(fft3268, fft3342);
__m512 fft3272 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3273 = _mm512_fmadd_ps(fft3267, fft3272, fft3269);
__m512 fft3353 = _mm512_fmadd_ps(fft3349, fft3272, fft3350);
__m512 fft3274 = _mm512_fnmadd_ps(fft3271, fft3272, fft3270);
__m512 fft3354 = _mm512_fnmadd_ps(fft3352, fft3272, fft3351);
__m512 fft3275 = _mm512_mask_mov_ps(fft3271, 21845, fft3273);
__m512 fft3355 = _mm512_mask_mov_ps(fft3352, 21845, fft3353);
__m512 fft3276 = _mm512_mask_mov_ps(fft3267, 43176, fft3273);
__m512 fft3356 = _mm512_mask_mov_ps(fft3349, 43176, fft3353);
__m512 fft3277 = _mm512_mask_mov_ps(fft3275, 43176, fft3274);
__m512 fft3357 = _mm512_mask_mov_ps(fft3355, 43176, fft3354);
__m512 fft3278 = _mm512_mask_mov_ps(fft3276, 22102, fft3274);
__m512 fft3358 = _mm512_mask_mov_ps(fft3356, 22102, fft3354);
__m512 fft3279 = _mm512_mask_mul_ps(fft3277, 64764, fft3277, _mm512_set1_ps(5e-01f));
__m512 fft3359 = _mm512_mask_mul_ps(fft3357, 64764, fft3357, _mm512_set1_ps(5e-01f));
__m512 fft3280 = _mm512_mask_mul_ps(fft3278, 64764, fft3278, _mm512_set1_ps(5e-01f));
__m512 fft3360 = _mm512_mask_mul_ps(fft3358, 64764, fft3358, _mm512_set1_ps(5e-01f));
__m512 df273 = fft3279;
__m512 df281 = fft3359;
__m512 df274 = fft3280;
__m512 df282 = fft3360;
__m512 df275 = fft3260;
__m512 df283 = fft3343;
__m512 df276 = fft3261;
__m512 df284 = fft3344;
__m512 df277 = fft3262;
__m512 df285 = fft3345;
__m512 df278 = fft3263;
__m512 df286 = fft3346;
__m512 df279 = fft3264;
__m512 df287 = fft3347;
__m512 df280 = fft3265;
__m512 df288 = fft3348;
__m512i eo20 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df275 = _mm512_permutexvar_ps(eo20, df275);
df276 = _mm512_permutexvar_ps(eo20, df276);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df275);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df276);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df275);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df276);
df283 = _mm512_permutexvar_ps(eo20, df283);
df284 = _mm512_permutexvar_ps(eo20, df284);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df283);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df284);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df283);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df284);
df277 = _mm512_permutexvar_ps(eo20, df277);
df278 = _mm512_permutexvar_ps(eo20, df278);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df277);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df278);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df277);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df278);
df285 = _mm512_permutexvar_ps(eo20, df285);
df286 = _mm512_permutexvar_ps(eo20, df286);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df285);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df286);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df285);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df286);
df279 = _mm512_permutexvar_ps(eo20, df279);
df280 = _mm512_permutexvar_ps(eo20, df280);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df279);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df280);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df279);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df280);
df287 = _mm512_permutexvar_ps(eo20, df287);
df288 = _mm512_permutexvar_ps(eo20, df288);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df287);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df288);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df287);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df288);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df273);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df274);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df273);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df274);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df281);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m20+32*f21, 255, df282);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df281);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m20+32*f21, 65280, df282);
// Unrolled step b21 = 1. m21 (= b21/2 = 0) and f22 (= b21%2 = 1) feed the
// 128*m21 + 32*f22 term of the dfPtr1 store addresses used later in this step.
ptrdiff_t b21 = 1;
ptrdiff_t m21 = (size_t)b21/2;
ptrdiff_t f22 = (size_t)b21%2;
// Load 16 vectors (dat274..dat289) from datPtr1. Consecutive loads are 896
// apart (constant offsets 40, 936, ..., 13480); the `0*b21` term adds nothing.
// Mask 127 (0x7F) loads only lanes 0..6; the maskz form zeroes lanes 7..15.
// Each load is followed by a masked fmadd with the same mask: lanes 0..6 become
// dat*bnMul9 + bnAdd9, while lanes 7..15 keep their zero (bnAdd9 is NOT added
// to the zero-filled lanes).
// NOTE(review): offsets look like bytes -- 896 = 224*4, 200704 = 224*224*4 and
// 602112 = 3*224*224*4 match the 3x224x224 float input in the graph -- so
// datPtr1 is presumably a byte pointer, with i6/k10 selecting image/channel and
// h9/w9 the tile position. Confirm against the declarations above this excerpt.
// NOTE(review): bnMul9/bnAdd9 are defined outside this excerpt; presumably the
// folded BatchNorm scale and shift -- confirm.
__m512 dat274 = _mm512_maskz_loadu_ps(127, datPtr1+40+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat274 = _mm512_mask_fmadd_ps(dat274, 127, bnMul9, bnAdd9);
__m512 dat275 = _mm512_maskz_loadu_ps(127, datPtr1+936+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat275 = _mm512_mask_fmadd_ps(dat275, 127, bnMul9, bnAdd9);
__m512 dat276 = _mm512_maskz_loadu_ps(127, datPtr1+1832+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat276 = _mm512_mask_fmadd_ps(dat276, 127, bnMul9, bnAdd9);
__m512 dat277 = _mm512_maskz_loadu_ps(127, datPtr1+2728+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat277 = _mm512_mask_fmadd_ps(dat277, 127, bnMul9, bnAdd9);
__m512 dat278 = _mm512_maskz_loadu_ps(127, datPtr1+3624+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat278 = _mm512_mask_fmadd_ps(dat278, 127, bnMul9, bnAdd9);
__m512 dat279 = _mm512_maskz_loadu_ps(127, datPtr1+4520+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat279 = _mm512_mask_fmadd_ps(dat279, 127, bnMul9, bnAdd9);
__m512 dat280 = _mm512_maskz_loadu_ps(127, datPtr1+5416+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat280 = _mm512_mask_fmadd_ps(dat280, 127, bnMul9, bnAdd9);
__m512 dat281 = _mm512_maskz_loadu_ps(127, datPtr1+6312+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat281 = _mm512_mask_fmadd_ps(dat281, 127, bnMul9, bnAdd9);
__m512 dat282 = _mm512_maskz_loadu_ps(127, datPtr1+7208+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat282 = _mm512_mask_fmadd_ps(dat282, 127, bnMul9, bnAdd9);
__m512 dat283 = _mm512_maskz_loadu_ps(127, datPtr1+8104+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat283 = _mm512_mask_fmadd_ps(dat283, 127, bnMul9, bnAdd9);
__m512 dat284 = _mm512_maskz_loadu_ps(127, datPtr1+9000+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat284 = _mm512_mask_fmadd_ps(dat284, 127, bnMul9, bnAdd9);
__m512 dat285 = _mm512_maskz_loadu_ps(127, datPtr1+9896+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat285 = _mm512_mask_fmadd_ps(dat285, 127, bnMul9, bnAdd9);
__m512 dat286 = _mm512_maskz_loadu_ps(127, datPtr1+10792+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat286 = _mm512_mask_fmadd_ps(dat286, 127, bnMul9, bnAdd9);
__m512 dat287 = _mm512_maskz_loadu_ps(127, datPtr1+11688+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat287 = _mm512_mask_fmadd_ps(dat287, 127, bnMul9, bnAdd9);
__m512 dat288 = _mm512_maskz_loadu_ps(127, datPtr1+12584+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat288 = _mm512_mask_fmadd_ps(dat288, 127, bnMul9, bnAdd9);
__m512 dat289 = _mm512_maskz_loadu_ps(127, datPtr1+13480+602112*i6+200704*k10+896*h9+4*w9+0*b21);
dat289 = _mm512_mask_fmadd_ps(dat289, 127, bnMul9, bnAdd9);
// Lane-wise butterfly network across the 16 loaded vectors. Two independent
// chains are interleaved statement by statement:
//   chain A (fft3361..fft3380): dat274, dat276, ..., dat288 (every other load)
//   chain B (fft3449..fft3468): dat275, dat277, ..., dat289 (the loads between)
// Within a chain call the eight inputs x0..x7 in load order.
// NOTE(review): the add/sub pattern and the 0.70710677 (~1/sqrt(2)) constant
// match the shape of an 8-point real-input DFT per chain -- confirm.
//
// Level 1: sums and differences of inputs four apart (x0+-x4, x1+-x5,
// x2+-x6, x3+-x7).
__m512 fft3361 = _mm512_add_ps(dat274, dat282);
__m512 fft3449 = _mm512_add_ps(dat275, dat283);
__m512 fft3362 = _mm512_sub_ps(dat274, dat282);
__m512 fft3450 = _mm512_sub_ps(dat275, dat283);
__m512 fft3363 = _mm512_add_ps(dat276, dat284);
__m512 fft3451 = _mm512_add_ps(dat277, dat285);
__m512 fft3364 = _mm512_sub_ps(dat276, dat284);
__m512 fft3452 = _mm512_sub_ps(dat277, dat285);
__m512 fft3365 = _mm512_add_ps(dat278, dat286);
__m512 fft3453 = _mm512_add_ps(dat279, dat287);
__m512 fft3366 = _mm512_sub_ps(dat278, dat286);
__m512 fft3454 = _mm512_sub_ps(dat279, dat287);
__m512 fft3367 = _mm512_add_ps(dat280, dat288);
__m512 fft3455 = _mm512_add_ps(dat281, dat289);
__m512 fft3368 = _mm512_sub_ps(dat280, dat288);
__m512 fft3456 = _mm512_sub_ps(dat281, dat289);
// Level 2: combine the level-1 sums two apart, and the level-1 differences
// of the odd-position inputs. Note the reversed operand order in
// fft3372/fft3460 (later sum minus earlier sum).
__m512 fft3369 = _mm512_add_ps(fft3361, fft3365);
__m512 fft3457 = _mm512_add_ps(fft3449, fft3453);
__m512 fft3370 = _mm512_sub_ps(fft3361, fft3365);
__m512 fft3458 = _mm512_sub_ps(fft3449, fft3453);
__m512 fft3371 = _mm512_add_ps(fft3363, fft3367);
__m512 fft3459 = _mm512_add_ps(fft3451, fft3455);
__m512 fft3372 = _mm512_sub_ps(fft3367, fft3363);
__m512 fft3460 = _mm512_sub_ps(fft3455, fft3451);
__m512 fft3373 = _mm512_sub_ps(fft3364, fft3368);
__m512 fft3461 = _mm512_sub_ps(fft3452, fft3456);
__m512 fft3374 = _mm512_add_ps(fft3364, fft3368);
__m512 fft3462 = _mm512_add_ps(fft3452, fft3456);
// Level 3: final sum/difference of the level-2 sums, plus four fused
// multiply-adds with c = 0.70710677 on the difference terms:
//   fmadd (a,c,b) =  a*c + b      fnmadd(a,c,b) = -(a*c) + b
//   fnmsub(a,c,b) = -(a*c) - b
__m512 fft3375 = _mm512_add_ps(fft3369, fft3371);
__m512 fft3463 = _mm512_add_ps(fft3457, fft3459);
__m512 fft3376 = _mm512_sub_ps(fft3369, fft3371);
__m512 fft3464 = _mm512_sub_ps(fft3457, fft3459);
__m512 fft3377 = _mm512_fmadd_ps(fft3373, _mm512_set1_ps(7.0710677e-01f), fft3362);
__m512 fft3465 = _mm512_fmadd_ps(fft3461, _mm512_set1_ps(7.0710677e-01f), fft3450);
__m512 fft3378 = _mm512_fnmsub_ps(fft3374, _mm512_set1_ps(7.0710677e-01f), fft3366);
__m512 fft3466 = _mm512_fnmsub_ps(fft3462, _mm512_set1_ps(7.0710677e-01f), fft3454);
__m512 fft3379 = _mm512_fnmadd_ps(fft3373, _mm512_set1_ps(7.0710677e-01f), fft3362);
__m512 fft3467 = _mm512_fnmadd_ps(fft3461, _mm512_set1_ps(7.0710677e-01f), fft3450);
__m512 fft3380 = _mm512_fnmadd_ps(fft3374, _mm512_set1_ps(7.0710677e-01f), fft3366);
__m512 fft3468 = _mm512_fnmadd_ps(fft3462, _mm512_set1_ps(7.0710677e-01f), fft3454);
// In-register butterfly across the two 256-bit halves of each vector.
// fft3381 is +1 in lanes 0..7 and -1 in lanes 8..15 (_mm512_set_ps lists lane
// 15 first). _mm512_shuffle_f32x4(x, x, 78) swaps the 256-bit halves of x, so
// each fmadd below yields  lane i   (i < 8): x[i] + x[i+8]
//                          lane i+8        : x[i] - x[i+8]
// Applied to all eight vectors of each chain (chain A -> fft3382..fft3389,
// chain B -> fft3469..fft3476).
__m512 fft3381 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3382 = _mm512_fmadd_ps(fft3375, fft3381, _mm512_shuffle_f32x4(fft3375, fft3375, 78));
__m512 fft3469 = _mm512_fmadd_ps(fft3463, fft3381, _mm512_shuffle_f32x4(fft3463, fft3463, 78));
__m512 fft3383 = _mm512_fmadd_ps(fft3376, fft3381, _mm512_shuffle_f32x4(fft3376, fft3376, 78));
__m512 fft3470 = _mm512_fmadd_ps(fft3464, fft3381, _mm512_shuffle_f32x4(fft3464, fft3464, 78));
__m512 fft3384 = _mm512_fmadd_ps(fft3377, fft3381, _mm512_shuffle_f32x4(fft3377, fft3377, 78));
__m512 fft3471 = _mm512_fmadd_ps(fft3465, fft3381, _mm512_shuffle_f32x4(fft3465, fft3465, 78));
__m512 fft3385 = _mm512_fmadd_ps(fft3378, fft3381, _mm512_shuffle_f32x4(fft3378, fft3378, 78));
__m512 fft3472 = _mm512_fmadd_ps(fft3466, fft3381, _mm512_shuffle_f32x4(fft3466, fft3466, 78));
__m512 fft3386 = _mm512_fmadd_ps(fft3370, fft3381, _mm512_shuffle_f32x4(fft3370, fft3370, 78));
__m512 fft3473 = _mm512_fmadd_ps(fft3458, fft3381, _mm512_shuffle_f32x4(fft3458, fft3458, 78));
__m512 fft3387 = _mm512_fmadd_ps(fft3372, fft3381, _mm512_shuffle_f32x4(fft3372, fft3372, 78));
__m512 fft3474 = _mm512_fmadd_ps(fft3460, fft3381, _mm512_shuffle_f32x4(fft3460, fft3460, 78));
__m512 fft3388 = _mm512_fmadd_ps(fft3379, fft3381, _mm512_shuffle_f32x4(fft3379, fft3379, 78));
__m512 fft3475 = _mm512_fmadd_ps(fft3467, fft3381, _mm512_shuffle_f32x4(fft3467, fft3467, 78));
__m512 fft3389 = _mm512_fmadd_ps(fft3380, fft3381, _mm512_shuffle_f32x4(fft3380, fft3380, 78));
__m512 fft3476 = _mm512_fmadd_ps(fft3468, fft3381, _mm512_shuffle_f32x4(fft3468, fft3468, 78));
// Per-lane rotation of consecutive vector pairs (p, q) = (fft3382, fft3383),
// (fft3384, fft3385), ... using two coefficient vectors:
//   A = fft3390: lanes 0..9 = 1, lanes 10,11 = +0.7071, lanes 12,13 = 0,
//                lanes 14,15 = -0.7071
//   B = fft3399: lanes 0..9 = 0, lanes 10,11 = +0.7071, lanes 12,13 = 1,
//                lanes 14,15 = +0.7071
// First the products p*A and q*A are formed ...
// NOTE(review): this is the arithmetic of a complex multiply of (p + i*q) by
// (A - i*B), i.e. presumably FFT twiddle factors on lanes 10..15 -- confirm.
__m512 fft3390 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3391 = _mm512_mul_ps(fft3382, fft3390);
__m512 fft3477 = _mm512_mul_ps(fft3469, fft3390);
__m512 fft3392 = _mm512_mul_ps(fft3383, fft3390);
__m512 fft3478 = _mm512_mul_ps(fft3470, fft3390);
__m512 fft3393 = _mm512_mul_ps(fft3384, fft3390);
__m512 fft3479 = _mm512_mul_ps(fft3471, fft3390);
__m512 fft3394 = _mm512_mul_ps(fft3385, fft3390);
__m512 fft3480 = _mm512_mul_ps(fft3472, fft3390);
__m512 fft3395 = _mm512_mul_ps(fft3386, fft3390);
__m512 fft3481 = _mm512_mul_ps(fft3473, fft3390);
__m512 fft3396 = _mm512_mul_ps(fft3387, fft3390);
__m512 fft3482 = _mm512_mul_ps(fft3474, fft3390);
__m512 fft3397 = _mm512_mul_ps(fft3388, fft3390);
__m512 fft3483 = _mm512_mul_ps(fft3475, fft3390);
__m512 fft3398 = _mm512_mul_ps(fft3389, fft3390);
__m512 fft3484 = _mm512_mul_ps(fft3476, fft3390);
// ... then combined with the B terms:
//   p' = q*B + p*A   (fmadd)
//   q' = q*A - p*B   (fnmadd)
// Lanes 0..9 have A = 1, B = 0 and therefore pass through unchanged.
__m512 fft3399 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3400 = _mm512_fmadd_ps(fft3383, fft3399, fft3391);
__m512 fft3485 = _mm512_fmadd_ps(fft3470, fft3399, fft3477);
__m512 fft3401 = _mm512_fnmadd_ps(fft3382, fft3399, fft3392);
__m512 fft3486 = _mm512_fnmadd_ps(fft3469, fft3399, fft3478);
__m512 fft3402 = _mm512_fmadd_ps(fft3385, fft3399, fft3393);
__m512 fft3487 = _mm512_fmadd_ps(fft3472, fft3399, fft3479);
__m512 fft3403 = _mm512_fnmadd_ps(fft3384, fft3399, fft3394);
__m512 fft3488 = _mm512_fnmadd_ps(fft3471, fft3399, fft3480);
__m512 fft3404 = _mm512_fmadd_ps(fft3387, fft3399, fft3395);
__m512 fft3489 = _mm512_fmadd_ps(fft3474, fft3399, fft3481);
__m512 fft3405 = _mm512_fnmadd_ps(fft3386, fft3399, fft3396);
__m512 fft3490 = _mm512_fnmadd_ps(fft3473, fft3399, fft3482);
__m512 fft3406 = _mm512_fmadd_ps(fft3389, fft3399, fft3397);
__m512 fft3491 = _mm512_fmadd_ps(fft3476, fft3399, fft3483);
__m512 fft3407 = _mm512_fnmadd_ps(fft3388, fft3399, fft3398);
__m512 fft3492 = _mm512_fnmadd_ps(fft3475, fft3399, fft3484);
// In-register butterfly across adjacent 128-bit lanes. fft3408 is +1 in lanes
// 0..3 and 8..11, -1 in lanes 4..7 and 12..15. _mm512_shuffle_f32x4(x, x, 177)
// swaps the two 128-bit lanes inside each 256-bit half, so each fmadd yields,
// per 256-bit half with 128-bit lanes (u, v):  (u + v, u - v).
__m512 fft3408 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3409 = _mm512_fmadd_ps(fft3400, fft3408, _mm512_shuffle_f32x4(fft3400, fft3400, 177));
__m512 fft3493 = _mm512_fmadd_ps(fft3485, fft3408, _mm512_shuffle_f32x4(fft3485, fft3485, 177));
__m512 fft3410 = _mm512_fmadd_ps(fft3401, fft3408, _mm512_shuffle_f32x4(fft3401, fft3401, 177));
__m512 fft3494 = _mm512_fmadd_ps(fft3486, fft3408, _mm512_shuffle_f32x4(fft3486, fft3486, 177));
__m512 fft3411 = _mm512_fmadd_ps(fft3402, fft3408, _mm512_shuffle_f32x4(fft3402, fft3402, 177));
__m512 fft3495 = _mm512_fmadd_ps(fft3487, fft3408, _mm512_shuffle_f32x4(fft3487, fft3487, 177));
__m512 fft3412 = _mm512_fmadd_ps(fft3403, fft3408, _mm512_shuffle_f32x4(fft3403, fft3403, 177));
__m512 fft3496 = _mm512_fmadd_ps(fft3488, fft3408, _mm512_shuffle_f32x4(fft3488, fft3488, 177));
__m512 fft3413 = _mm512_fmadd_ps(fft3404, fft3408, _mm512_shuffle_f32x4(fft3404, fft3404, 177));
__m512 fft3497 = _mm512_fmadd_ps(fft3489, fft3408, _mm512_shuffle_f32x4(fft3489, fft3489, 177));
__m512 fft3414 = _mm512_fmadd_ps(fft3405, fft3408, _mm512_shuffle_f32x4(fft3405, fft3405, 177));
__m512 fft3498 = _mm512_fmadd_ps(fft3490, fft3408, _mm512_shuffle_f32x4(fft3490, fft3490, 177));
__m512 fft3415 = _mm512_fmadd_ps(fft3406, fft3408, _mm512_shuffle_f32x4(fft3406, fft3406, 177));
__m512 fft3499 = _mm512_fmadd_ps(fft3491, fft3408, _mm512_shuffle_f32x4(fft3491, fft3491, 177));
__m512 fft3416 = _mm512_fmadd_ps(fft3407, fft3408, _mm512_shuffle_f32x4(fft3407, fft3407, 177));
__m512 fft3500 = _mm512_fmadd_ps(fft3492, fft3408, _mm512_shuffle_f32x4(fft3492, fft3492, 177));
// Masked pair swap with negation. Mask 49344 = 0xC0C0 selects lanes 6, 7, 14
// and 15. For each consecutive pair (p, q):
//   mask_mov -> p' = q  in the masked lanes, p elsewhere
//   mask_sub -> q' = 0 - p in the masked lanes, q elsewhere
// i.e. (p, q) becomes (q, -p) in those four lanes only.
// NOTE(review): equivalent to multiplying (p + i*q) by -i in those lanes;
// presumably the only non-trivial twiddle of this level -- confirm.
__m512 fft3417 = _mm512_mask_mov_ps(fft3409, 49344, fft3410);
__m512 fft3501 = _mm512_mask_mov_ps(fft3493, 49344, fft3494);
__m512 fft3418 = _mm512_mask_sub_ps(fft3410, 49344, _mm512_setzero_ps(), fft3409);
__m512 fft3502 = _mm512_mask_sub_ps(fft3494, 49344, _mm512_setzero_ps(), fft3493);
__m512 fft3419 = _mm512_mask_mov_ps(fft3411, 49344, fft3412);
__m512 fft3503 = _mm512_mask_mov_ps(fft3495, 49344, fft3496);
__m512 fft3420 = _mm512_mask_sub_ps(fft3412, 49344, _mm512_setzero_ps(), fft3411);
__m512 fft3504 = _mm512_mask_sub_ps(fft3496, 49344, _mm512_setzero_ps(), fft3495);
__m512 fft3421 = _mm512_mask_mov_ps(fft3413, 49344, fft3414);
__m512 fft3505 = _mm512_mask_mov_ps(fft3497, 49344, fft3498);
__m512 fft3422 = _mm512_mask_sub_ps(fft3414, 49344, _mm512_setzero_ps(), fft3413);
__m512 fft3506 = _mm512_mask_sub_ps(fft3498, 49344, _mm512_setzero_ps(), fft3497);
__m512 fft3423 = _mm512_mask_mov_ps(fft3415, 49344, fft3416);
__m512 fft3507 = _mm512_mask_mov_ps(fft3499, 49344, fft3500);
__m512 fft3424 = _mm512_mask_sub_ps(fft3416, 49344, _mm512_setzero_ps(), fft3415);
__m512 fft3508 = _mm512_mask_sub_ps(fft3500, 49344, _mm512_setzero_ps(), fft3499);
// In-register butterfly across 64-bit pairs inside every 128-bit lane.
// fft3425 repeats (+1, +1, -1, -1) from lane 0 upward. _mm512_shuffle_ps(x, x,
// 78) swaps the two 64-bit pairs of each 128-bit lane, so for a lane holding
// (a0, a1, a2, a3) each fmadd yields (a0 + a2, a1 + a3, a0 - a2, a1 - a3).
__m512 fft3425 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3426 = _mm512_fmadd_ps(fft3417, fft3425, _mm512_shuffle_ps(fft3417, fft3417, 78));
__m512 fft3509 = _mm512_fmadd_ps(fft3501, fft3425, _mm512_shuffle_ps(fft3501, fft3501, 78));
__m512 fft3427 = _mm512_fmadd_ps(fft3418, fft3425, _mm512_shuffle_ps(fft3418, fft3418, 78));
__m512 fft3510 = _mm512_fmadd_ps(fft3502, fft3425, _mm512_shuffle_ps(fft3502, fft3502, 78));
__m512 fft3428 = _mm512_fmadd_ps(fft3419, fft3425, _mm512_shuffle_ps(fft3419, fft3419, 78));
__m512 fft3511 = _mm512_fmadd_ps(fft3503, fft3425, _mm512_shuffle_ps(fft3503, fft3503, 78));
__m512 fft3429 = _mm512_fmadd_ps(fft3420, fft3425, _mm512_shuffle_ps(fft3420, fft3420, 78));
__m512 fft3512 = _mm512_fmadd_ps(fft3504, fft3425, _mm512_shuffle_ps(fft3504, fft3504, 78));
__m512 fft3430 = _mm512_fmadd_ps(fft3421, fft3425, _mm512_shuffle_ps(fft3421, fft3421, 78));
__m512 fft3513 = _mm512_fmadd_ps(fft3505, fft3425, _mm512_shuffle_ps(fft3505, fft3505, 78));
__m512 fft3431 = _mm512_fmadd_ps(fft3422, fft3425, _mm512_shuffle_ps(fft3422, fft3422, 78));
__m512 fft3514 = _mm512_fmadd_ps(fft3506, fft3425, _mm512_shuffle_ps(fft3506, fft3506, 78));
__m512 fft3432 = _mm512_fmadd_ps(fft3423, fft3425, _mm512_shuffle_ps(fft3423, fft3423, 78));
__m512 fft3515 = _mm512_fmadd_ps(fft3507, fft3425, _mm512_shuffle_ps(fft3507, fft3507, 78));
__m512 fft3433 = _mm512_fmadd_ps(fft3424, fft3425, _mm512_shuffle_ps(fft3424, fft3424, 78));
__m512 fft3516 = _mm512_fmadd_ps(fft3508, fft3425, _mm512_shuffle_ps(fft3508, fft3508, 78));
// Final recombination, applied ONLY to the first pair of each chain
// (fft3426/fft3427 for chain A, fft3509/fft3510 for chain B); the remaining
// six vectors of each chain are used as-is further down.
// Two gather patterns (lane -> source lane, lane 0 first):
//   fft3434: 2,2,4,4,8,8,12,12, 3,3,5,5,9,9,13,13
//   fft3436: 0,0,6,6,14,14,10,10, 1,1,7,7,15,15,11,11
// NOTE(review): presumably the real-FFT "untangling" step that separates
// packed spectra for this first pair -- confirm against the generator.
__m512i fft3434 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3435 = _mm512_permutexvar_ps(fft3434, fft3426);
__m512 fft3517 = _mm512_permutexvar_ps(fft3434, fft3509);
__m512i fft3436 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3437 = _mm512_permutexvar_ps(fft3436, fft3426);
__m512 fft3518 = _mm512_permutexvar_ps(fft3436, fft3509);
__m512 fft3438 = _mm512_permutexvar_ps(fft3434, fft3427);
__m512 fft3519 = _mm512_permutexvar_ps(fft3434, fft3510);
__m512 fft3439 = _mm512_permutexvar_ps(fft3436, fft3427);
__m512 fft3520 = _mm512_permutexvar_ps(fft3436, fft3510);
// fft3440 is 0 in lanes 0,1,8,9 and alternates +1 (even lane) / -1 (odd lane)
// elsewhere. fmadd gives g1*s + g2; fnmadd gives g1' - g2'*s, where g1/g2 are
// the fft3434/fft3436 gathers of the first vector and g1'/g2' of the second.
__m512 fft3440 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3441 = _mm512_fmadd_ps(fft3435, fft3440, fft3437);
__m512 fft3521 = _mm512_fmadd_ps(fft3517, fft3440, fft3518);
__m512 fft3442 = _mm512_fnmadd_ps(fft3439, fft3440, fft3438);
__m512 fft3522 = _mm512_fnmadd_ps(fft3520, fft3440, fft3519);
// Lane-wise merges of the gathered and combined vectors. Masks:
//   21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
//   22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
// mask_mov(src, k, a) takes a in the lanes set in k and src elsewhere.
__m512 fft3443 = _mm512_mask_mov_ps(fft3439, 21845, fft3441);
__m512 fft3523 = _mm512_mask_mov_ps(fft3520, 21845, fft3521);
__m512 fft3444 = _mm512_mask_mov_ps(fft3435, 43176, fft3441);
__m512 fft3524 = _mm512_mask_mov_ps(fft3517, 43176, fft3521);
__m512 fft3445 = _mm512_mask_mov_ps(fft3443, 43176, fft3442);
__m512 fft3525 = _mm512_mask_mov_ps(fft3523, 43176, fft3522);
__m512 fft3446 = _mm512_mask_mov_ps(fft3444, 22102, fft3442);
__m512 fft3526 = _mm512_mask_mov_ps(fft3524, 22102, fft3522);
// Halve lanes 2..7 and 10..15 (mask 64764 = 0xFCFC); lanes 0,1,8,9 are left
// unscaled.
__m512 fft3447 = _mm512_mask_mul_ps(fft3445, 64764, fft3445, _mm512_set1_ps(5e-01f));
__m512 fft3527 = _mm512_mask_mul_ps(fft3525, 64764, fft3525, _mm512_set1_ps(5e-01f));
__m512 fft3448 = _mm512_mask_mul_ps(fft3446, 64764, fft3446, _mm512_set1_ps(5e-01f));
__m512 fft3528 = _mm512_mask_mul_ps(fft3526, 64764, fft3526, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store phase. df289..df296 come from the
// first butterfly chain, df297..df304 from the second. df289/df290 and
// df297/df298 are the recombined, partially halved first pair of each chain;
// the others are the direct outputs of the last in-register butterfly.
__m512 df289 = fft3447;
__m512 df297 = fft3527;
__m512 df290 = fft3448;
__m512 df298 = fft3528;
__m512 df291 = fft3428;
__m512 df299 = fft3511;
__m512 df292 = fft3429;
__m512 df300 = fft3512;
__m512 df293 = fft3430;
__m512 df301 = fft3513;
__m512 df294 = fft3431;
__m512 df302 = fft3514;
__m512 df295 = fft3432;
__m512 df303 = fft3515;
__m512 df296 = fft3433;
__m512 df304 = fft3516;
// eo21 gathers the even-indexed source lanes (0,2,...,14) into lanes 0..7 and
// the odd-indexed ones (1,3,...,15) into lanes 8..15. It is applied to every
// vector except df289/df290/df297/df298, which are stored unpermuted.
// NOTE(review): presumably this de-interleaves real and imaginary parts
// before the split store -- confirm.
//
// Store pattern (same for every vector): mask 255 writes lanes 0..7 at
// constant offset X; mask 65280 writes lanes 8..15 through a store whose
// constant offset is X + 407008 (= 407040 - 32).
// NOTE(review): assuming dfPtr1 is a byte pointer, lanes 8..15 start 32 bytes
// past the store base, so the high half lands exactly 407040 bytes after the
// low half (407040 is also the i6 stride) -- confirm the pointer type.
// The two vectors of a pair are 64 apart, and 128*m21 + 32*f22 selects this
// unrolled step's slot.
__m512i eo21 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df291 = _mm512_permutexvar_ps(eo21, df291);
df292 = _mm512_permutexvar_ps(eo21, df292);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df291);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df292);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df291);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df292);
df299 = _mm512_permutexvar_ps(eo21, df299);
df300 = _mm512_permutexvar_ps(eo21, df300);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df299);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df300);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df299);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df300);
df293 = _mm512_permutexvar_ps(eo21, df293);
df294 = _mm512_permutexvar_ps(eo21, df294);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df293);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df294);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df293);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df294);
df301 = _mm512_permutexvar_ps(eo21, df301);
df302 = _mm512_permutexvar_ps(eo21, df302);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df301);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df302);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df301);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df302);
df295 = _mm512_permutexvar_ps(eo21, df295);
df296 = _mm512_permutexvar_ps(eo21, df296);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df295);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df296);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df295);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df296);
df303 = _mm512_permutexvar_ps(eo21, df303);
df304 = _mm512_permutexvar_ps(eo21, df304);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df303);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df304);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df303);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df304);
// First pair of each chain: stored without the eo21 permutation, at constant
// offsets 0/64 (first chain) and 814080/814144 (second chain) for the low
// halves.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df289);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df290);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df289);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df290);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df297);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m21+32*f22, 255, df298);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df297);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m21+32*f22, 65280, df298);
// Index 2 is handled here as straight-line code, ahead of the `b23 = 3..5` loop that follows.
// m22 (= 1) and f23 (= 0) are only used in the dfPtr1 store offsets further down.
ptrdiff_t b22 = 2;
ptrdiff_t m22 = (size_t)b22/2;
ptrdiff_t f23 = (size_t)b22%2;
// Load 16 vectors whose constant offsets step by 896 (8120, 9016, ..., 21560).
// Mask 65528 (0xFFF8) loads lanes 3..15 only; maskz zeroes lanes 0..2.
// The `0*b22` term contributes nothing: the base 8120 already equals 8040+40*2,
// i.e. the following loop's `8040 + 40*b` address formula evaluated at index 2.
// Each loaded vector is then replaced by dat*bnMul9 + bnAdd9 in the masked lanes;
// lanes 0..2 keep the zero written by the masked load.
// NOTE(review): the names suggest a fused batch-norm scale/shift, and the zeroed lanes
// presumably stand in for left-edge padding -- confirm against the generator.
// NOTE(review): offsets look like byte offsets (896 = 224 floats * 4 bytes, matching the
// 224-wide input in the graph) -- confirm against the declaration of datPtr1.
__m512 dat290 = _mm512_maskz_loadu_ps(65528, datPtr1+8120+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat290 = _mm512_mask_fmadd_ps(dat290, 65528, bnMul9, bnAdd9);
__m512 dat291 = _mm512_maskz_loadu_ps(65528, datPtr1+9016+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat291 = _mm512_mask_fmadd_ps(dat291, 65528, bnMul9, bnAdd9);
__m512 dat292 = _mm512_maskz_loadu_ps(65528, datPtr1+9912+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat292 = _mm512_mask_fmadd_ps(dat292, 65528, bnMul9, bnAdd9);
__m512 dat293 = _mm512_maskz_loadu_ps(65528, datPtr1+10808+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat293 = _mm512_mask_fmadd_ps(dat293, 65528, bnMul9, bnAdd9);
__m512 dat294 = _mm512_maskz_loadu_ps(65528, datPtr1+11704+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat294 = _mm512_mask_fmadd_ps(dat294, 65528, bnMul9, bnAdd9);
__m512 dat295 = _mm512_maskz_loadu_ps(65528, datPtr1+12600+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat295 = _mm512_mask_fmadd_ps(dat295, 65528, bnMul9, bnAdd9);
__m512 dat296 = _mm512_maskz_loadu_ps(65528, datPtr1+13496+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat296 = _mm512_mask_fmadd_ps(dat296, 65528, bnMul9, bnAdd9);
__m512 dat297 = _mm512_maskz_loadu_ps(65528, datPtr1+14392+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat297 = _mm512_mask_fmadd_ps(dat297, 65528, bnMul9, bnAdd9);
__m512 dat298 = _mm512_maskz_loadu_ps(65528, datPtr1+15288+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat298 = _mm512_mask_fmadd_ps(dat298, 65528, bnMul9, bnAdd9);
__m512 dat299 = _mm512_maskz_loadu_ps(65528, datPtr1+16184+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat299 = _mm512_mask_fmadd_ps(dat299, 65528, bnMul9, bnAdd9);
__m512 dat300 = _mm512_maskz_loadu_ps(65528, datPtr1+17080+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat300 = _mm512_mask_fmadd_ps(dat300, 65528, bnMul9, bnAdd9);
__m512 dat301 = _mm512_maskz_loadu_ps(65528, datPtr1+17976+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat301 = _mm512_mask_fmadd_ps(dat301, 65528, bnMul9, bnAdd9);
__m512 dat302 = _mm512_maskz_loadu_ps(65528, datPtr1+18872+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat302 = _mm512_mask_fmadd_ps(dat302, 65528, bnMul9, bnAdd9);
__m512 dat303 = _mm512_maskz_loadu_ps(65528, datPtr1+19768+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat303 = _mm512_mask_fmadd_ps(dat303, 65528, bnMul9, bnAdd9);
__m512 dat304 = _mm512_maskz_loadu_ps(65528, datPtr1+20664+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat304 = _mm512_mask_fmadd_ps(dat304, 65528, bnMul9, bnAdd9);
__m512 dat305 = _mm512_maskz_loadu_ps(65528, datPtr1+21560+602112*i6+200704*k10+896*h9+4*w9+0*b22);
dat305 = _mm512_mask_fmadd_ps(dat305, 65528, bnMul9, bnAdd9);
// Two identical instruction chains run interleaved below: fft3529..fft3616 consume the
// even-numbered inputs (dat290, dat292, ..., dat304) and fft3617..fft3696 consume the
// odd-numbered inputs (dat291, dat293, ..., dat305).
// NOTE(review): presumably each chain is a real-input FFT over its 8 vectors followed by a
// 16-point transform across the lanes of each vector -- confirm against the generator.
//
// Level 1: sums and differences of inputs that are 8 positions apart.
__m512 fft3529 = _mm512_add_ps(dat290, dat298);
__m512 fft3617 = _mm512_add_ps(dat291, dat299);
__m512 fft3530 = _mm512_sub_ps(dat290, dat298);
__m512 fft3618 = _mm512_sub_ps(dat291, dat299);
__m512 fft3531 = _mm512_add_ps(dat292, dat300);
__m512 fft3619 = _mm512_add_ps(dat293, dat301);
__m512 fft3532 = _mm512_sub_ps(dat292, dat300);
__m512 fft3620 = _mm512_sub_ps(dat293, dat301);
__m512 fft3533 = _mm512_add_ps(dat294, dat302);
__m512 fft3621 = _mm512_add_ps(dat295, dat303);
__m512 fft3534 = _mm512_sub_ps(dat294, dat302);
__m512 fft3622 = _mm512_sub_ps(dat295, dat303);
__m512 fft3535 = _mm512_add_ps(dat296, dat304);
__m512 fft3623 = _mm512_add_ps(dat297, dat305);
__m512 fft3536 = _mm512_sub_ps(dat296, dat304);
__m512 fft3624 = _mm512_sub_ps(dat297, dat305);
// Level 2: sums and differences of the level-1 results.
// Note the swapped operand order in fft3540/fft3628 (second minus first).
__m512 fft3537 = _mm512_add_ps(fft3529, fft3533);
__m512 fft3625 = _mm512_add_ps(fft3617, fft3621);
__m512 fft3538 = _mm512_sub_ps(fft3529, fft3533);
__m512 fft3626 = _mm512_sub_ps(fft3617, fft3621);
__m512 fft3539 = _mm512_add_ps(fft3531, fft3535);
__m512 fft3627 = _mm512_add_ps(fft3619, fft3623);
__m512 fft3540 = _mm512_sub_ps(fft3535, fft3531);
__m512 fft3628 = _mm512_sub_ps(fft3623, fft3619);
__m512 fft3541 = _mm512_sub_ps(fft3532, fft3536);
__m512 fft3629 = _mm512_sub_ps(fft3620, fft3624);
__m512 fft3542 = _mm512_add_ps(fft3532, fft3536);
__m512 fft3630 = _mm512_add_ps(fft3620, fft3624);
// Level 3: final sums/differences; the 7.0710677e-01f factor (about 1/sqrt(2)) is folded in
// via fused multiply-adds: fmadd = a*b+c, fnmadd = -(a*b)+c, fnmsub = -(a*b)-c.
__m512 fft3543 = _mm512_add_ps(fft3537, fft3539);
__m512 fft3631 = _mm512_add_ps(fft3625, fft3627);
__m512 fft3544 = _mm512_sub_ps(fft3537, fft3539);
__m512 fft3632 = _mm512_sub_ps(fft3625, fft3627);
__m512 fft3545 = _mm512_fmadd_ps(fft3541, _mm512_set1_ps(7.0710677e-01f), fft3530);
__m512 fft3633 = _mm512_fmadd_ps(fft3629, _mm512_set1_ps(7.0710677e-01f), fft3618);
__m512 fft3546 = _mm512_fnmsub_ps(fft3542, _mm512_set1_ps(7.0710677e-01f), fft3534);
__m512 fft3634 = _mm512_fnmsub_ps(fft3630, _mm512_set1_ps(7.0710677e-01f), fft3622);
__m512 fft3547 = _mm512_fnmadd_ps(fft3541, _mm512_set1_ps(7.0710677e-01f), fft3530);
__m512 fft3635 = _mm512_fnmadd_ps(fft3629, _mm512_set1_ps(7.0710677e-01f), fft3618);
__m512 fft3548 = _mm512_fnmadd_ps(fft3542, _mm512_set1_ps(7.0710677e-01f), fft3534);
__m512 fft3636 = _mm512_fnmadd_ps(fft3630, _mm512_set1_ps(7.0710677e-01f), fft3622);
// Within-vector step A: shuffle_f32x4 with imm 78 swaps the two 256-bit halves. With the
// sign vector (+1 in lanes 0..7, -1 in lanes 8..15) each result holds low+high in
// lanes 0..7 and low-high in lanes 8..15.
__m512 fft3549 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3550 = _mm512_fmadd_ps(fft3543, fft3549, _mm512_shuffle_f32x4(fft3543, fft3543, 78));
__m512 fft3637 = _mm512_fmadd_ps(fft3631, fft3549, _mm512_shuffle_f32x4(fft3631, fft3631, 78));
__m512 fft3551 = _mm512_fmadd_ps(fft3544, fft3549, _mm512_shuffle_f32x4(fft3544, fft3544, 78));
__m512 fft3638 = _mm512_fmadd_ps(fft3632, fft3549, _mm512_shuffle_f32x4(fft3632, fft3632, 78));
__m512 fft3552 = _mm512_fmadd_ps(fft3545, fft3549, _mm512_shuffle_f32x4(fft3545, fft3545, 78));
__m512 fft3639 = _mm512_fmadd_ps(fft3633, fft3549, _mm512_shuffle_f32x4(fft3633, fft3633, 78));
__m512 fft3553 = _mm512_fmadd_ps(fft3546, fft3549, _mm512_shuffle_f32x4(fft3546, fft3546, 78));
__m512 fft3640 = _mm512_fmadd_ps(fft3634, fft3549, _mm512_shuffle_f32x4(fft3634, fft3634, 78));
__m512 fft3554 = _mm512_fmadd_ps(fft3538, fft3549, _mm512_shuffle_f32x4(fft3538, fft3538, 78));
__m512 fft3641 = _mm512_fmadd_ps(fft3626, fft3549, _mm512_shuffle_f32x4(fft3626, fft3626, 78));
__m512 fft3555 = _mm512_fmadd_ps(fft3540, fft3549, _mm512_shuffle_f32x4(fft3540, fft3540, 78));
__m512 fft3642 = _mm512_fmadd_ps(fft3628, fft3549, _mm512_shuffle_f32x4(fft3628, fft3628, 78));
__m512 fft3556 = _mm512_fmadd_ps(fft3547, fft3549, _mm512_shuffle_f32x4(fft3547, fft3547, 78));
__m512 fft3643 = _mm512_fmadd_ps(fft3635, fft3549, _mm512_shuffle_f32x4(fft3635, fft3635, 78));
__m512 fft3557 = _mm512_fmadd_ps(fft3548, fft3549, _mm512_shuffle_f32x4(fft3548, fft3548, 78));
__m512 fft3644 = _mm512_fmadd_ps(fft3636, fft3549, _mm512_shuffle_f32x4(fft3636, fft3636, 78));
// Within-vector step B: per-lane coefficients applied across vector pairs (p, q), e.g.
// (fft3550, fft3551). With c1 = fft3558 and c2 = fft3567:
//   p' = p*c1 + q*c2,   q' = q*c1 - p*c2.
// c1 is 1 and c2 is 0 in lanes 0..9, so those lanes pass through unchanged; only
// lanes 10..15 are mixed.
// NOTE(review): this has the shape of a complex twiddle multiply with p/q as real/imaginary
// parts -- confirm against the generator.
__m512 fft3558 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3559 = _mm512_mul_ps(fft3550, fft3558);
__m512 fft3645 = _mm512_mul_ps(fft3637, fft3558);
__m512 fft3560 = _mm512_mul_ps(fft3551, fft3558);
__m512 fft3646 = _mm512_mul_ps(fft3638, fft3558);
__m512 fft3561 = _mm512_mul_ps(fft3552, fft3558);
__m512 fft3647 = _mm512_mul_ps(fft3639, fft3558);
__m512 fft3562 = _mm512_mul_ps(fft3553, fft3558);
__m512 fft3648 = _mm512_mul_ps(fft3640, fft3558);
__m512 fft3563 = _mm512_mul_ps(fft3554, fft3558);
__m512 fft3649 = _mm512_mul_ps(fft3641, fft3558);
__m512 fft3564 = _mm512_mul_ps(fft3555, fft3558);
__m512 fft3650 = _mm512_mul_ps(fft3642, fft3558);
__m512 fft3565 = _mm512_mul_ps(fft3556, fft3558);
__m512 fft3651 = _mm512_mul_ps(fft3643, fft3558);
__m512 fft3566 = _mm512_mul_ps(fft3557, fft3558);
__m512 fft3652 = _mm512_mul_ps(fft3644, fft3558);
__m512 fft3567 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3568 = _mm512_fmadd_ps(fft3551, fft3567, fft3559);
__m512 fft3653 = _mm512_fmadd_ps(fft3638, fft3567, fft3645);
__m512 fft3569 = _mm512_fnmadd_ps(fft3550, fft3567, fft3560);
__m512 fft3654 = _mm512_fnmadd_ps(fft3637, fft3567, fft3646);
__m512 fft3570 = _mm512_fmadd_ps(fft3553, fft3567, fft3561);
__m512 fft3655 = _mm512_fmadd_ps(fft3640, fft3567, fft3647);
__m512 fft3571 = _mm512_fnmadd_ps(fft3552, fft3567, fft3562);
__m512 fft3656 = _mm512_fnmadd_ps(fft3639, fft3567, fft3648);
__m512 fft3572 = _mm512_fmadd_ps(fft3555, fft3567, fft3563);
__m512 fft3657 = _mm512_fmadd_ps(fft3642, fft3567, fft3649);
__m512 fft3573 = _mm512_fnmadd_ps(fft3554, fft3567, fft3564);
__m512 fft3658 = _mm512_fnmadd_ps(fft3641, fft3567, fft3650);
__m512 fft3574 = _mm512_fmadd_ps(fft3557, fft3567, fft3565);
__m512 fft3659 = _mm512_fmadd_ps(fft3644, fft3567, fft3651);
__m512 fft3575 = _mm512_fnmadd_ps(fft3556, fft3567, fft3566);
__m512 fft3660 = _mm512_fnmadd_ps(fft3643, fft3567, fft3652);
// Within-vector step C: shuffle_f32x4 with imm 177 swaps adjacent 128-bit lanes. With the
// sign vector, inside each 256-bit half the lower 128 bits receive the sum of the two
// 128-bit lanes and the upper 128 bits receive lower minus upper.
__m512 fft3576 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3577 = _mm512_fmadd_ps(fft3568, fft3576, _mm512_shuffle_f32x4(fft3568, fft3568, 177));
__m512 fft3661 = _mm512_fmadd_ps(fft3653, fft3576, _mm512_shuffle_f32x4(fft3653, fft3653, 177));
__m512 fft3578 = _mm512_fmadd_ps(fft3569, fft3576, _mm512_shuffle_f32x4(fft3569, fft3569, 177));
__m512 fft3662 = _mm512_fmadd_ps(fft3654, fft3576, _mm512_shuffle_f32x4(fft3654, fft3654, 177));
__m512 fft3579 = _mm512_fmadd_ps(fft3570, fft3576, _mm512_shuffle_f32x4(fft3570, fft3570, 177));
__m512 fft3663 = _mm512_fmadd_ps(fft3655, fft3576, _mm512_shuffle_f32x4(fft3655, fft3655, 177));
__m512 fft3580 = _mm512_fmadd_ps(fft3571, fft3576, _mm512_shuffle_f32x4(fft3571, fft3571, 177));
__m512 fft3664 = _mm512_fmadd_ps(fft3656, fft3576, _mm512_shuffle_f32x4(fft3656, fft3656, 177));
__m512 fft3581 = _mm512_fmadd_ps(fft3572, fft3576, _mm512_shuffle_f32x4(fft3572, fft3572, 177));
__m512 fft3665 = _mm512_fmadd_ps(fft3657, fft3576, _mm512_shuffle_f32x4(fft3657, fft3657, 177));
__m512 fft3582 = _mm512_fmadd_ps(fft3573, fft3576, _mm512_shuffle_f32x4(fft3573, fft3573, 177));
__m512 fft3666 = _mm512_fmadd_ps(fft3658, fft3576, _mm512_shuffle_f32x4(fft3658, fft3658, 177));
__m512 fft3583 = _mm512_fmadd_ps(fft3574, fft3576, _mm512_shuffle_f32x4(fft3574, fft3574, 177));
__m512 fft3667 = _mm512_fmadd_ps(fft3659, fft3576, _mm512_shuffle_f32x4(fft3659, fft3659, 177));
__m512 fft3584 = _mm512_fmadd_ps(fft3575, fft3576, _mm512_shuffle_f32x4(fft3575, fft3575, 177));
__m512 fft3668 = _mm512_fmadd_ps(fft3660, fft3576, _mm512_shuffle_f32x4(fft3660, fft3660, 177));
// Within-vector step D: mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. For each pair
// (p, q) those lanes become p' = q and q' = -p; every other lane is left unchanged.
__m512 fft3585 = _mm512_mask_mov_ps(fft3577, 49344, fft3578);
__m512 fft3669 = _mm512_mask_mov_ps(fft3661, 49344, fft3662);
__m512 fft3586 = _mm512_mask_sub_ps(fft3578, 49344, _mm512_setzero_ps(), fft3577);
__m512 fft3670 = _mm512_mask_sub_ps(fft3662, 49344, _mm512_setzero_ps(), fft3661);
__m512 fft3587 = _mm512_mask_mov_ps(fft3579, 49344, fft3580);
__m512 fft3671 = _mm512_mask_mov_ps(fft3663, 49344, fft3664);
__m512 fft3588 = _mm512_mask_sub_ps(fft3580, 49344, _mm512_setzero_ps(), fft3579);
__m512 fft3672 = _mm512_mask_sub_ps(fft3664, 49344, _mm512_setzero_ps(), fft3663);
__m512 fft3589 = _mm512_mask_mov_ps(fft3581, 49344, fft3582);
__m512 fft3673 = _mm512_mask_mov_ps(fft3665, 49344, fft3666);
__m512 fft3590 = _mm512_mask_sub_ps(fft3582, 49344, _mm512_setzero_ps(), fft3581);
__m512 fft3674 = _mm512_mask_sub_ps(fft3666, 49344, _mm512_setzero_ps(), fft3665);
__m512 fft3591 = _mm512_mask_mov_ps(fft3583, 49344, fft3584);
__m512 fft3675 = _mm512_mask_mov_ps(fft3667, 49344, fft3668);
__m512 fft3592 = _mm512_mask_sub_ps(fft3584, 49344, _mm512_setzero_ps(), fft3583);
__m512 fft3676 = _mm512_mask_sub_ps(fft3668, 49344, _mm512_setzero_ps(), fft3667);
// Within-vector step E: shuffle_ps with imm 78 swaps the two 64-bit halves inside every
// 128-bit lane. With the sign vector, elements 0..1 of each 128-bit lane receive sums and
// elements 2..3 receive (elements 0..1 minus elements 2..3).
__m512 fft3593 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3594 = _mm512_fmadd_ps(fft3585, fft3593, _mm512_shuffle_ps(fft3585, fft3585, 78));
__m512 fft3677 = _mm512_fmadd_ps(fft3669, fft3593, _mm512_shuffle_ps(fft3669, fft3669, 78));
__m512 fft3595 = _mm512_fmadd_ps(fft3586, fft3593, _mm512_shuffle_ps(fft3586, fft3586, 78));
__m512 fft3678 = _mm512_fmadd_ps(fft3670, fft3593, _mm512_shuffle_ps(fft3670, fft3670, 78));
__m512 fft3596 = _mm512_fmadd_ps(fft3587, fft3593, _mm512_shuffle_ps(fft3587, fft3587, 78));
__m512 fft3679 = _mm512_fmadd_ps(fft3671, fft3593, _mm512_shuffle_ps(fft3671, fft3671, 78));
__m512 fft3597 = _mm512_fmadd_ps(fft3588, fft3593, _mm512_shuffle_ps(fft3588, fft3588, 78));
__m512 fft3680 = _mm512_fmadd_ps(fft3672, fft3593, _mm512_shuffle_ps(fft3672, fft3672, 78));
__m512 fft3598 = _mm512_fmadd_ps(fft3589, fft3593, _mm512_shuffle_ps(fft3589, fft3589, 78));
__m512 fft3681 = _mm512_fmadd_ps(fft3673, fft3593, _mm512_shuffle_ps(fft3673, fft3673, 78));
__m512 fft3599 = _mm512_fmadd_ps(fft3590, fft3593, _mm512_shuffle_ps(fft3590, fft3590, 78));
__m512 fft3682 = _mm512_fmadd_ps(fft3674, fft3593, _mm512_shuffle_ps(fft3674, fft3674, 78));
__m512 fft3600 = _mm512_fmadd_ps(fft3591, fft3593, _mm512_shuffle_ps(fft3591, fft3591, 78));
__m512 fft3683 = _mm512_fmadd_ps(fft3675, fft3593, _mm512_shuffle_ps(fft3675, fft3675, 78));
__m512 fft3601 = _mm512_fmadd_ps(fft3592, fft3593, _mm512_shuffle_ps(fft3592, fft3592, 78));
__m512 fft3684 = _mm512_fmadd_ps(fft3676, fft3593, _mm512_shuffle_ps(fft3676, fft3676, 78));
// Extra recombination applied only to the first vector pair of each chain
// (fft3594/fft3595 and fft3677/fft3678); the remaining pairs are used as they are.
// The two index vectors gather lanes from the pair, fft3608 supplies per-lane signs
// (0 in lanes 0, 1, 8, 9), and the masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656)
// choose per lane between the gathered inputs and the recombined values.
// NOTE(review): presumably this separates spectra that were packed together in one
// complex vector pair -- confirm against the generator.
__m512i fft3602 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3603 = _mm512_permutexvar_ps(fft3602, fft3594);
__m512 fft3685 = _mm512_permutexvar_ps(fft3602, fft3677);
__m512i fft3604 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3605 = _mm512_permutexvar_ps(fft3604, fft3594);
__m512 fft3686 = _mm512_permutexvar_ps(fft3604, fft3677);
__m512 fft3606 = _mm512_permutexvar_ps(fft3602, fft3595);
__m512 fft3687 = _mm512_permutexvar_ps(fft3602, fft3678);
__m512 fft3607 = _mm512_permutexvar_ps(fft3604, fft3595);
__m512 fft3688 = _mm512_permutexvar_ps(fft3604, fft3678);
__m512 fft3608 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3609 = _mm512_fmadd_ps(fft3603, fft3608, fft3605);
__m512 fft3689 = _mm512_fmadd_ps(fft3685, fft3608, fft3686);
__m512 fft3610 = _mm512_fnmadd_ps(fft3607, fft3608, fft3606);
__m512 fft3690 = _mm512_fnmadd_ps(fft3688, fft3608, fft3687);
__m512 fft3611 = _mm512_mask_mov_ps(fft3607, 21845, fft3609);
__m512 fft3691 = _mm512_mask_mov_ps(fft3688, 21845, fft3689);
__m512 fft3612 = _mm512_mask_mov_ps(fft3603, 43176, fft3609);
__m512 fft3692 = _mm512_mask_mov_ps(fft3685, 43176, fft3689);
__m512 fft3613 = _mm512_mask_mov_ps(fft3611, 43176, fft3610);
__m512 fft3693 = _mm512_mask_mov_ps(fft3691, 43176, fft3690);
__m512 fft3614 = _mm512_mask_mov_ps(fft3612, 22102, fft3610);
__m512 fft3694 = _mm512_mask_mov_ps(fft3692, 22102, fft3690);
// Mask 64764 (0xFCFC): halve every lane except lanes 0, 1, 8 and 9.
__m512 fft3615 = _mm512_mask_mul_ps(fft3613, 64764, fft3613, _mm512_set1_ps(5e-01f));
__m512 fft3695 = _mm512_mask_mul_ps(fft3693, 64764, fft3693, _mm512_set1_ps(5e-01f));
__m512 fft3616 = _mm512_mask_mul_ps(fft3614, 64764, fft3614, _mm512_set1_ps(5e-01f));
__m512 fft3696 = _mm512_mask_mul_ps(fft3694, 64764, fft3694, _mm512_set1_ps(5e-01f));
// Rename the transform results to the df* names used by the stores below.
// df305..df312 come from the fft35xx/fft36xx values of the first chain (fft3615, fft3616,
// fft3596..fft3601); df313..df320 come from the second chain (fft3695, fft3696,
// fft3679..fft3684).
__m512 df305 = fft3615;
__m512 df313 = fft3695;
__m512 df306 = fft3616;
__m512 df314 = fft3696;
__m512 df307 = fft3596;
__m512 df315 = fft3679;
__m512 df308 = fft3597;
__m512 df316 = fft3680;
__m512 df309 = fft3598;
__m512 df317 = fft3681;
__m512 df310 = fft3599;
__m512 df318 = fft3682;
__m512 df311 = fft3600;
__m512 df319 = fft3683;
__m512 df312 = fft3601;
__m512 df320 = fft3684;
// eo22 gathers the even-indexed source lanes (0, 2, ..., 14) into lanes 0..7 and the
// odd-indexed source lanes (1, 3, ..., 15) into lanes 8..15.
__m512i eo22 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each permuted vector is written with two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 writes lanes 8..15. In every pair the second constant offset equals the first
// plus 407040 - 32 (e.g. 508768 = 101760 + 407040 - 32).
// NOTE(review): assuming dfPtr1 is a byte pointer, the -32 cancels the 32-byte position of
// lane 8 inside the vector, so the upper half lands exactly 407040 bytes after the lower
// half -- confirm against the declaration of dfPtr1.
df307 = _mm512_permutexvar_ps(eo22, df307);
df308 = _mm512_permutexvar_ps(eo22, df308);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df307);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df308);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df307);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df308);
df315 = _mm512_permutexvar_ps(eo22, df315);
df316 = _mm512_permutexvar_ps(eo22, df316);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df315);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df316);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df315);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df316);
df309 = _mm512_permutexvar_ps(eo22, df309);
df310 = _mm512_permutexvar_ps(eo22, df310);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df309);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df310);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df309);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df310);
df317 = _mm512_permutexvar_ps(eo22, df317);
df318 = _mm512_permutexvar_ps(eo22, df318);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df317);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df318);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df317);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df318);
df311 = _mm512_permutexvar_ps(eo22, df311);
df312 = _mm512_permutexvar_ps(eo22, df312);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df311);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df312);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df311);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df312);
df319 = _mm512_permutexvar_ps(eo22, df319);
df320 = _mm512_permutexvar_ps(eo22, df320);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df319);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df320);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df319);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df320);
// df305/df306 and df313/df314 are stored without the eo22 permutation; they use the same
// lower-half / upper-half split as above.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df305);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df306);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df305);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df306);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df313);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m22+32*f23, 255, df314);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df313);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m22+32*f23, 65280, df314);
for (ptrdiff_t b23 = 3; b23 < 6; ++b23) {
ptrdiff_t m23 = (size_t)b23/2;
ptrdiff_t f24 = (size_t)b23%2;
__m512 dat306 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat306 = _mm512_mask_fmadd_ps(dat306, 65535, bnMul9, bnAdd9);
__m512 dat307 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat307 = _mm512_mask_fmadd_ps(dat307, 65535, bnMul9, bnAdd9);
__m512 dat308 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat308 = _mm512_mask_fmadd_ps(dat308, 65535, bnMul9, bnAdd9);
__m512 dat309 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat309 = _mm512_mask_fmadd_ps(dat309, 65535, bnMul9, bnAdd9);
__m512 dat310 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat310 = _mm512_mask_fmadd_ps(dat310, 65535, bnMul9, bnAdd9);
__m512 dat311 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat311 = _mm512_mask_fmadd_ps(dat311, 65535, bnMul9, bnAdd9);
__m512 dat312 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat312 = _mm512_mask_fmadd_ps(dat312, 65535, bnMul9, bnAdd9);
__m512 dat313 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat313 = _mm512_mask_fmadd_ps(dat313, 65535, bnMul9, bnAdd9);
__m512 dat314 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat314 = _mm512_mask_fmadd_ps(dat314, 65535, bnMul9, bnAdd9);
__m512 dat315 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat315 = _mm512_mask_fmadd_ps(dat315, 65535, bnMul9, bnAdd9);
__m512 dat316 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat316 = _mm512_mask_fmadd_ps(dat316, 65535, bnMul9, bnAdd9);
__m512 dat317 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat317 = _mm512_mask_fmadd_ps(dat317, 65535, bnMul9, bnAdd9);
__m512 dat318 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat318 = _mm512_mask_fmadd_ps(dat318, 65535, bnMul9, bnAdd9);
__m512 dat319 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat319 = _mm512_mask_fmadd_ps(dat319, 65535, bnMul9, bnAdd9);
__m512 dat320 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat320 = _mm512_mask_fmadd_ps(dat320, 65535, bnMul9, bnAdd9);
__m512 dat321 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k10+896*h9+4*w9+40*b23);
dat321 = _mm512_mask_fmadd_ps(dat321, 65535, bnMul9, bnAdd9);
__m512 fft3697 = _mm512_add_ps(dat306, dat314);
__m512 fft3785 = _mm512_add_ps(dat307, dat315);
__m512 fft3698 = _mm512_sub_ps(dat306, dat314);
__m512 fft3786 = _mm512_sub_ps(dat307, dat315);
__m512 fft3699 = _mm512_add_ps(dat308, dat316);
__m512 fft3787 = _mm512_add_ps(dat309, dat317);
__m512 fft3700 = _mm512_sub_ps(dat308, dat316);
__m512 fft3788 = _mm512_sub_ps(dat309, dat317);
__m512 fft3701 = _mm512_add_ps(dat310, dat318);
__m512 fft3789 = _mm512_add_ps(dat311, dat319);
__m512 fft3702 = _mm512_sub_ps(dat310, dat318);
__m512 fft3790 = _mm512_sub_ps(dat311, dat319);
__m512 fft3703 = _mm512_add_ps(dat312, dat320);
__m512 fft3791 = _mm512_add_ps(dat313, dat321);
__m512 fft3704 = _mm512_sub_ps(dat312, dat320);
__m512 fft3792 = _mm512_sub_ps(dat313, dat321);
__m512 fft3705 = _mm512_add_ps(fft3697, fft3701);
__m512 fft3793 = _mm512_add_ps(fft3785, fft3789);
__m512 fft3706 = _mm512_sub_ps(fft3697, fft3701);
__m512 fft3794 = _mm512_sub_ps(fft3785, fft3789);
__m512 fft3707 = _mm512_add_ps(fft3699, fft3703);
__m512 fft3795 = _mm512_add_ps(fft3787, fft3791);
__m512 fft3708 = _mm512_sub_ps(fft3703, fft3699);
__m512 fft3796 = _mm512_sub_ps(fft3791, fft3787);
__m512 fft3709 = _mm512_sub_ps(fft3700, fft3704);
__m512 fft3797 = _mm512_sub_ps(fft3788, fft3792);
__m512 fft3710 = _mm512_add_ps(fft3700, fft3704);
__m512 fft3798 = _mm512_add_ps(fft3788, fft3792);
__m512 fft3711 = _mm512_add_ps(fft3705, fft3707);
__m512 fft3799 = _mm512_add_ps(fft3793, fft3795);
__m512 fft3712 = _mm512_sub_ps(fft3705, fft3707);
__m512 fft3800 = _mm512_sub_ps(fft3793, fft3795);
__m512 fft3713 = _mm512_fmadd_ps(fft3709, _mm512_set1_ps(7.0710677e-01f), fft3698);
__m512 fft3801 = _mm512_fmadd_ps(fft3797, _mm512_set1_ps(7.0710677e-01f), fft3786);
__m512 fft3714 = _mm512_fnmsub_ps(fft3710, _mm512_set1_ps(7.0710677e-01f), fft3702);
__m512 fft3802 = _mm512_fnmsub_ps(fft3798, _mm512_set1_ps(7.0710677e-01f), fft3790);
__m512 fft3715 = _mm512_fnmadd_ps(fft3709, _mm512_set1_ps(7.0710677e-01f), fft3698);
__m512 fft3803 = _mm512_fnmadd_ps(fft3797, _mm512_set1_ps(7.0710677e-01f), fft3786);
__m512 fft3716 = _mm512_fnmadd_ps(fft3710, _mm512_set1_ps(7.0710677e-01f), fft3702);
__m512 fft3804 = _mm512_fnmadd_ps(fft3798, _mm512_set1_ps(7.0710677e-01f), fft3790);
__m512 fft3717 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3718 = _mm512_fmadd_ps(fft3711, fft3717, _mm512_shuffle_f32x4(fft3711, fft3711, 78));
__m512 fft3805 = _mm512_fmadd_ps(fft3799, fft3717, _mm512_shuffle_f32x4(fft3799, fft3799, 78));
__m512 fft3719 = _mm512_fmadd_ps(fft3712, fft3717, _mm512_shuffle_f32x4(fft3712, fft3712, 78));
__m512 fft3806 = _mm512_fmadd_ps(fft3800, fft3717, _mm512_shuffle_f32x4(fft3800, fft3800, 78));
__m512 fft3720 = _mm512_fmadd_ps(fft3713, fft3717, _mm512_shuffle_f32x4(fft3713, fft3713, 78));
__m512 fft3807 = _mm512_fmadd_ps(fft3801, fft3717, _mm512_shuffle_f32x4(fft3801, fft3801, 78));
__m512 fft3721 = _mm512_fmadd_ps(fft3714, fft3717, _mm512_shuffle_f32x4(fft3714, fft3714, 78));
__m512 fft3808 = _mm512_fmadd_ps(fft3802, fft3717, _mm512_shuffle_f32x4(fft3802, fft3802, 78));
__m512 fft3722 = _mm512_fmadd_ps(fft3706, fft3717, _mm512_shuffle_f32x4(fft3706, fft3706, 78));
__m512 fft3809 = _mm512_fmadd_ps(fft3794, fft3717, _mm512_shuffle_f32x4(fft3794, fft3794, 78));
__m512 fft3723 = _mm512_fmadd_ps(fft3708, fft3717, _mm512_shuffle_f32x4(fft3708, fft3708, 78));
__m512 fft3810 = _mm512_fmadd_ps(fft3796, fft3717, _mm512_shuffle_f32x4(fft3796, fft3796, 78));
__m512 fft3724 = _mm512_fmadd_ps(fft3715, fft3717, _mm512_shuffle_f32x4(fft3715, fft3715, 78));
__m512 fft3811 = _mm512_fmadd_ps(fft3803, fft3717, _mm512_shuffle_f32x4(fft3803, fft3803, 78));
__m512 fft3725 = _mm512_fmadd_ps(fft3716, fft3717, _mm512_shuffle_f32x4(fft3716, fft3716, 78));
__m512 fft3812 = _mm512_fmadd_ps(fft3804, fft3717, _mm512_shuffle_f32x4(fft3804, fft3804, 78));
__m512 fft3726 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3727 = _mm512_mul_ps(fft3718, fft3726);
__m512 fft3813 = _mm512_mul_ps(fft3805, fft3726);
__m512 fft3728 = _mm512_mul_ps(fft3719, fft3726);
__m512 fft3814 = _mm512_mul_ps(fft3806, fft3726);
__m512 fft3729 = _mm512_mul_ps(fft3720, fft3726);
__m512 fft3815 = _mm512_mul_ps(fft3807, fft3726);
__m512 fft3730 = _mm512_mul_ps(fft3721, fft3726);
__m512 fft3816 = _mm512_mul_ps(fft3808, fft3726);
__m512 fft3731 = _mm512_mul_ps(fft3722, fft3726);
__m512 fft3817 = _mm512_mul_ps(fft3809, fft3726);
__m512 fft3732 = _mm512_mul_ps(fft3723, fft3726);
__m512 fft3818 = _mm512_mul_ps(fft3810, fft3726);
__m512 fft3733 = _mm512_mul_ps(fft3724, fft3726);
__m512 fft3819 = _mm512_mul_ps(fft3811, fft3726);
__m512 fft3734 = _mm512_mul_ps(fft3725, fft3726);
__m512 fft3820 = _mm512_mul_ps(fft3812, fft3726);
__m512 fft3735 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3736 = _mm512_fmadd_ps(fft3719, fft3735, fft3727);
__m512 fft3821 = _mm512_fmadd_ps(fft3806, fft3735, fft3813);
__m512 fft3737 = _mm512_fnmadd_ps(fft3718, fft3735, fft3728);
__m512 fft3822 = _mm512_fnmadd_ps(fft3805, fft3735, fft3814);
__m512 fft3738 = _mm512_fmadd_ps(fft3721, fft3735, fft3729);
__m512 fft3823 = _mm512_fmadd_ps(fft3808, fft3735, fft3815);
__m512 fft3739 = _mm512_fnmadd_ps(fft3720, fft3735, fft3730);
__m512 fft3824 = _mm512_fnmadd_ps(fft3807, fft3735, fft3816);
__m512 fft3740 = _mm512_fmadd_ps(fft3723, fft3735, fft3731);
__m512 fft3825 = _mm512_fmadd_ps(fft3810, fft3735, fft3817);
__m512 fft3741 = _mm512_fnmadd_ps(fft3722, fft3735, fft3732);
__m512 fft3826 = _mm512_fnmadd_ps(fft3809, fft3735, fft3818);
__m512 fft3742 = _mm512_fmadd_ps(fft3725, fft3735, fft3733);
__m512 fft3827 = _mm512_fmadd_ps(fft3812, fft3735, fft3819);
__m512 fft3743 = _mm512_fnmadd_ps(fft3724, fft3735, fft3734);
__m512 fft3828 = _mm512_fnmadd_ps(fft3811, fft3735, fft3820);
__m512 fft3744 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3745 = _mm512_fmadd_ps(fft3736, fft3744, _mm512_shuffle_f32x4(fft3736, fft3736, 177));
__m512 fft3829 = _mm512_fmadd_ps(fft3821, fft3744, _mm512_shuffle_f32x4(fft3821, fft3821, 177));
__m512 fft3746 = _mm512_fmadd_ps(fft3737, fft3744, _mm512_shuffle_f32x4(fft3737, fft3737, 177));
__m512 fft3830 = _mm512_fmadd_ps(fft3822, fft3744, _mm512_shuffle_f32x4(fft3822, fft3822, 177));
__m512 fft3747 = _mm512_fmadd_ps(fft3738, fft3744, _mm512_shuffle_f32x4(fft3738, fft3738, 177));
__m512 fft3831 = _mm512_fmadd_ps(fft3823, fft3744, _mm512_shuffle_f32x4(fft3823, fft3823, 177));
__m512 fft3748 = _mm512_fmadd_ps(fft3739, fft3744, _mm512_shuffle_f32x4(fft3739, fft3739, 177));
__m512 fft3832 = _mm512_fmadd_ps(fft3824, fft3744, _mm512_shuffle_f32x4(fft3824, fft3824, 177));
__m512 fft3749 = _mm512_fmadd_ps(fft3740, fft3744, _mm512_shuffle_f32x4(fft3740, fft3740, 177));
__m512 fft3833 = _mm512_fmadd_ps(fft3825, fft3744, _mm512_shuffle_f32x4(fft3825, fft3825, 177));
__m512 fft3750 = _mm512_fmadd_ps(fft3741, fft3744, _mm512_shuffle_f32x4(fft3741, fft3741, 177));
__m512 fft3834 = _mm512_fmadd_ps(fft3826, fft3744, _mm512_shuffle_f32x4(fft3826, fft3826, 177));
__m512 fft3751 = _mm512_fmadd_ps(fft3742, fft3744, _mm512_shuffle_f32x4(fft3742, fft3742, 177));
__m512 fft3835 = _mm512_fmadd_ps(fft3827, fft3744, _mm512_shuffle_f32x4(fft3827, fft3827, 177));
__m512 fft3752 = _mm512_fmadd_ps(fft3743, fft3744, _mm512_shuffle_f32x4(fft3743, fft3743, 177));
__m512 fft3836 = _mm512_fmadd_ps(fft3828, fft3744, _mm512_shuffle_f32x4(fft3828, fft3828, 177));
__m512 fft3753 = _mm512_mask_mov_ps(fft3745, 49344, fft3746);
__m512 fft3837 = _mm512_mask_mov_ps(fft3829, 49344, fft3830);
__m512 fft3754 = _mm512_mask_sub_ps(fft3746, 49344, _mm512_setzero_ps(), fft3745);
__m512 fft3838 = _mm512_mask_sub_ps(fft3830, 49344, _mm512_setzero_ps(), fft3829);
__m512 fft3755 = _mm512_mask_mov_ps(fft3747, 49344, fft3748);
__m512 fft3839 = _mm512_mask_mov_ps(fft3831, 49344, fft3832);
__m512 fft3756 = _mm512_mask_sub_ps(fft3748, 49344, _mm512_setzero_ps(), fft3747);
__m512 fft3840 = _mm512_mask_sub_ps(fft3832, 49344, _mm512_setzero_ps(), fft3831);
__m512 fft3757 = _mm512_mask_mov_ps(fft3749, 49344, fft3750);
__m512 fft3841 = _mm512_mask_mov_ps(fft3833, 49344, fft3834);
__m512 fft3758 = _mm512_mask_sub_ps(fft3750, 49344, _mm512_setzero_ps(), fft3749);
__m512 fft3842 = _mm512_mask_sub_ps(fft3834, 49344, _mm512_setzero_ps(), fft3833);
__m512 fft3759 = _mm512_mask_mov_ps(fft3751, 49344, fft3752);
__m512 fft3843 = _mm512_mask_mov_ps(fft3835, 49344, fft3836);
__m512 fft3760 = _mm512_mask_sub_ps(fft3752, 49344, _mm512_setzero_ps(), fft3751);
__m512 fft3844 = _mm512_mask_sub_ps(fft3836, 49344, _mm512_setzero_ps(), fft3835);
__m512 fft3761 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3762 = _mm512_fmadd_ps(fft3753, fft3761, _mm512_shuffle_ps(fft3753, fft3753, 78));
__m512 fft3845 = _mm512_fmadd_ps(fft3837, fft3761, _mm512_shuffle_ps(fft3837, fft3837, 78));
__m512 fft3763 = _mm512_fmadd_ps(fft3754, fft3761, _mm512_shuffle_ps(fft3754, fft3754, 78));
__m512 fft3846 = _mm512_fmadd_ps(fft3838, fft3761, _mm512_shuffle_ps(fft3838, fft3838, 78));
__m512 fft3764 = _mm512_fmadd_ps(fft3755, fft3761, _mm512_shuffle_ps(fft3755, fft3755, 78));
__m512 fft3847 = _mm512_fmadd_ps(fft3839, fft3761, _mm512_shuffle_ps(fft3839, fft3839, 78));
__m512 fft3765 = _mm512_fmadd_ps(fft3756, fft3761, _mm512_shuffle_ps(fft3756, fft3756, 78));
__m512 fft3848 = _mm512_fmadd_ps(fft3840, fft3761, _mm512_shuffle_ps(fft3840, fft3840, 78));
__m512 fft3766 = _mm512_fmadd_ps(fft3757, fft3761, _mm512_shuffle_ps(fft3757, fft3757, 78));
__m512 fft3849 = _mm512_fmadd_ps(fft3841, fft3761, _mm512_shuffle_ps(fft3841, fft3841, 78));
__m512 fft3767 = _mm512_fmadd_ps(fft3758, fft3761, _mm512_shuffle_ps(fft3758, fft3758, 78));
__m512 fft3850 = _mm512_fmadd_ps(fft3842, fft3761, _mm512_shuffle_ps(fft3842, fft3842, 78));
__m512 fft3768 = _mm512_fmadd_ps(fft3759, fft3761, _mm512_shuffle_ps(fft3759, fft3759, 78));
__m512 fft3851 = _mm512_fmadd_ps(fft3843, fft3761, _mm512_shuffle_ps(fft3843, fft3843, 78));
__m512 fft3769 = _mm512_fmadd_ps(fft3760, fft3761, _mm512_shuffle_ps(fft3760, fft3760, 78));
__m512 fft3852 = _mm512_fmadd_ps(fft3844, fft3761, _mm512_shuffle_ps(fft3844, fft3844, 78));
__m512i fft3770 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3771 = _mm512_permutexvar_ps(fft3770, fft3762);
__m512 fft3853 = _mm512_permutexvar_ps(fft3770, fft3845);
__m512i fft3772 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3773 = _mm512_permutexvar_ps(fft3772, fft3762);
__m512 fft3854 = _mm512_permutexvar_ps(fft3772, fft3845);
__m512 fft3774 = _mm512_permutexvar_ps(fft3770, fft3763);
__m512 fft3855 = _mm512_permutexvar_ps(fft3770, fft3846);
__m512 fft3775 = _mm512_permutexvar_ps(fft3772, fft3763);
__m512 fft3856 = _mm512_permutexvar_ps(fft3772, fft3846);
__m512 fft3776 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3777 = _mm512_fmadd_ps(fft3771, fft3776, fft3773);
__m512 fft3857 = _mm512_fmadd_ps(fft3853, fft3776, fft3854);
__m512 fft3778 = _mm512_fnmadd_ps(fft3775, fft3776, fft3774);
__m512 fft3858 = _mm512_fnmadd_ps(fft3856, fft3776, fft3855);
__m512 fft3779 = _mm512_mask_mov_ps(fft3775, 21845, fft3777);
__m512 fft3859 = _mm512_mask_mov_ps(fft3856, 21845, fft3857);
__m512 fft3780 = _mm512_mask_mov_ps(fft3771, 43176, fft3777);
__m512 fft3860 = _mm512_mask_mov_ps(fft3853, 43176, fft3857);
__m512 fft3781 = _mm512_mask_mov_ps(fft3779, 43176, fft3778);
__m512 fft3861 = _mm512_mask_mov_ps(fft3859, 43176, fft3858);
__m512 fft3782 = _mm512_mask_mov_ps(fft3780, 22102, fft3778);
__m512 fft3862 = _mm512_mask_mov_ps(fft3860, 22102, fft3858);
__m512 fft3783 = _mm512_mask_mul_ps(fft3781, 64764, fft3781, _mm512_set1_ps(5e-01f));
__m512 fft3863 = _mm512_mask_mul_ps(fft3861, 64764, fft3861, _mm512_set1_ps(5e-01f));
__m512 fft3784 = _mm512_mask_mul_ps(fft3782, 64764, fft3782, _mm512_set1_ps(5e-01f));
__m512 fft3864 = _mm512_mask_mul_ps(fft3862, 64764, fft3862, _mm512_set1_ps(5e-01f));
__m512 df321 = fft3783;
__m512 df329 = fft3863;
__m512 df322 = fft3784;
__m512 df330 = fft3864;
__m512 df323 = fft3764;
__m512 df331 = fft3847;
__m512 df324 = fft3765;
__m512 df332 = fft3848;
__m512 df325 = fft3766;
__m512 df333 = fft3849;
__m512 df326 = fft3767;
__m512 df334 = fft3850;
__m512 df327 = fft3768;
__m512 df335 = fft3851;
__m512 df328 = fft3769;
__m512 df336 = fft3852;
__m512i eo23 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df323 = _mm512_permutexvar_ps(eo23, df323);
df324 = _mm512_permutexvar_ps(eo23, df324);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df323);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df324);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df323);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df324);
df331 = _mm512_permutexvar_ps(eo23, df331);
df332 = _mm512_permutexvar_ps(eo23, df332);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df331);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df332);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df331);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df332);
df325 = _mm512_permutexvar_ps(eo23, df325);
df326 = _mm512_permutexvar_ps(eo23, df326);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df325);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df326);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df325);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df326);
df333 = _mm512_permutexvar_ps(eo23, df333);
df334 = _mm512_permutexvar_ps(eo23, df334);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df333);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df334);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df333);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df334);
df327 = _mm512_permutexvar_ps(eo23, df327);
df328 = _mm512_permutexvar_ps(eo23, df328);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df327);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df328);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df327);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df328);
df335 = _mm512_permutexvar_ps(eo23, df335);
df336 = _mm512_permutexvar_ps(eo23, df336);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df335);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df336);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df335);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df336);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df321);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df322);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df321);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df322);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df329);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k10+128*m23+32*f24, 255, df330);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df329);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k10+128*m23+32*f24, 65280, df330);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 12;
}
// Handles the positions with rel2 < 15; the branch ends by setting rel2 = 15,
// after which control falls through to the code that follows it.
if (rel2 < 15) {
// h10 and w10 feed the load addresses below as 896*h10 and 4*w10.
// NOTE(review): presumably a row index and a column index of the input
// (896 bytes = 224 floats per row) -- confirm against the datPtr1 definition.
ptrdiff_t h10 = base2+30;
ptrdiff_t w10 = -680+60*rel2;
// Last j2 value processed by this branch, i.e. at most 15-rel2 iterations.
ptrdiff_t jj5 = 14-rel2+j2;
// The loop header only advances w10 (by 60); j2 itself is incremented at the
// bottom of the body, after the early-return check against last1.
for (; j2 <= jj5; w10 += 60) {
// k11 runs over three consecutive values: 3*s1, 3*s1+1, 3*s1+2.
ptrdiff_t k11 = 3*s1;
ptrdiff_t kk10 = k11+2;
for (; k11 <= kk10; ++k11) {
// Two adjacent floats at index 2*(k11+3*i6) of bnPtr2, each broadcast to all
// 16 lanes: element [0] is used as the multiplier and [1] as the addend below.
__m512 bnMul10 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k11+3*i6))[0]);
__m512 bnAdd10 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k11+3*i6))[1]);
// Six sub-steps per (j2, k11); each shifts the load address by 40*b24.
for (ptrdiff_t b24 = 0; b24 < 6; ++b24) {
// m24 (0..2) and f25 (0..1) are only used in the store addresses
// (128*m24 + 32*f25).
ptrdiff_t m24 = (size_t)b24/2;
ptrdiff_t f25 = (size_t)b24%2;
// Full-mask (65535) load of 16 floats, then dat = dat*bnMul10 + bnAdd10 on all
// lanes. The following loads repeat this with the constant offset stepped by 896.
__m512 dat322 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat322 = _mm512_mask_fmadd_ps(dat322, 65535, bnMul10, bnAdd10);
__m512 dat323 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat323 = _mm512_mask_fmadd_ps(dat323, 65535, bnMul10, bnAdd10);
__m512 dat324 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat324 = _mm512_mask_fmadd_ps(dat324, 65535, bnMul10, bnAdd10);
__m512 dat325 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat325 = _mm512_mask_fmadd_ps(dat325, 65535, bnMul10, bnAdd10);
__m512 dat326 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat326 = _mm512_mask_fmadd_ps(dat326, 65535, bnMul10, bnAdd10);
__m512 dat327 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat327 = _mm512_mask_fmadd_ps(dat327, 65535, bnMul10, bnAdd10);
__m512 dat328 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat328 = _mm512_mask_fmadd_ps(dat328, 65535, bnMul10, bnAdd10);
__m512 dat329 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat329 = _mm512_mask_fmadd_ps(dat329, 65535, bnMul10, bnAdd10);
__m512 dat330 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat330 = _mm512_mask_fmadd_ps(dat330, 65535, bnMul10, bnAdd10);
__m512 dat331 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat331 = _mm512_mask_fmadd_ps(dat331, 65535, bnMul10, bnAdd10);
__m512 dat332 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat332 = _mm512_mask_fmadd_ps(dat332, 65535, bnMul10, bnAdd10);
__m512 dat333 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat333 = _mm512_mask_fmadd_ps(dat333, 65535, bnMul10, bnAdd10);
__m512 dat334 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat334 = _mm512_mask_fmadd_ps(dat334, 65535, bnMul10, bnAdd10);
__m512 dat335 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat335 = _mm512_mask_fmadd_ps(dat335, 65535, bnMul10, bnAdd10);
__m512 dat336 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat336 = _mm512_mask_fmadd_ps(dat336, 65535, bnMul10, bnAdd10);
__m512 dat337 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k11+896*h10+4*w10+40*b24);
dat337 = _mm512_mask_fmadd_ps(dat337, 65535, bnMul10, bnAdd10);
// First add/sub stage over the 16 loaded vectors: each of dat322..dat329 is
// paired with the vector loaded 8*896 bytes further on (dat330..dat337), and
// both the sum and the difference of every pair are kept.
// NOTE(review): judging by the variable names this is the opening butterfly of
// a 16-point FFT along the row direction -- confirm against the generator.
__m512 fft3865 = _mm512_add_ps(dat322, dat330);
__m512 fft3953 = _mm512_add_ps(dat323, dat331);
__m512 fft3866 = _mm512_sub_ps(dat322, dat330);
__m512 fft3954 = _mm512_sub_ps(dat323, dat331);
__m512 fft3867 = _mm512_add_ps(dat324, dat332);
__m512 fft3955 = _mm512_add_ps(dat325, dat333);
__m512 fft3868 = _mm512_sub_ps(dat324, dat332);
__m512 fft3956 = _mm512_sub_ps(dat325, dat333);
__m512 fft3869 = _mm512_add_ps(dat326, dat334);
__m512 fft3957 = _mm512_add_ps(dat327, dat335);
__m512 fft3870 = _mm512_sub_ps(dat326, dat334);
__m512 fft3958 = _mm512_sub_ps(dat327, dat335);
__m512 fft3871 = _mm512_add_ps(dat328, dat336);
__m512 fft3959 = _mm512_add_ps(dat329, dat337);
__m512 fft3872 = _mm512_sub_ps(dat328, dat336);
__m512 fft3960 = _mm512_sub_ps(dat329, dat337);
__m512 fft3873 = _mm512_add_ps(fft3865, fft3869);
__m512 fft3961 = _mm512_add_ps(fft3953, fft3957);
__m512 fft3874 = _mm512_sub_ps(fft3865, fft3869);
__m512 fft3962 = _mm512_sub_ps(fft3953, fft3957);
__m512 fft3875 = _mm512_add_ps(fft3867, fft3871);
__m512 fft3963 = _mm512_add_ps(fft3955, fft3959);
__m512 fft3876 = _mm512_sub_ps(fft3871, fft3867);
__m512 fft3964 = _mm512_sub_ps(fft3959, fft3955);
__m512 fft3877 = _mm512_sub_ps(fft3868, fft3872);
__m512 fft3965 = _mm512_sub_ps(fft3956, fft3960);
__m512 fft3878 = _mm512_add_ps(fft3868, fft3872);
__m512 fft3966 = _mm512_add_ps(fft3956, fft3960);
__m512 fft3879 = _mm512_add_ps(fft3873, fft3875);
__m512 fft3967 = _mm512_add_ps(fft3961, fft3963);
__m512 fft3880 = _mm512_sub_ps(fft3873, fft3875);
__m512 fft3968 = _mm512_sub_ps(fft3961, fft3963);
__m512 fft3881 = _mm512_fmadd_ps(fft3877, _mm512_set1_ps(7.0710677e-01f), fft3866);
__m512 fft3969 = _mm512_fmadd_ps(fft3965, _mm512_set1_ps(7.0710677e-01f), fft3954);
__m512 fft3882 = _mm512_fnmsub_ps(fft3878, _mm512_set1_ps(7.0710677e-01f), fft3870);
__m512 fft3970 = _mm512_fnmsub_ps(fft3966, _mm512_set1_ps(7.0710677e-01f), fft3958);
__m512 fft3883 = _mm512_fnmadd_ps(fft3877, _mm512_set1_ps(7.0710677e-01f), fft3866);
__m512 fft3971 = _mm512_fnmadd_ps(fft3965, _mm512_set1_ps(7.0710677e-01f), fft3954);
__m512 fft3884 = _mm512_fnmadd_ps(fft3878, _mm512_set1_ps(7.0710677e-01f), fft3870);
__m512 fft3972 = _mm512_fnmadd_ps(fft3966, _mm512_set1_ps(7.0710677e-01f), fft3958);
__m512 fft3885 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3886 = _mm512_fmadd_ps(fft3879, fft3885, _mm512_shuffle_f32x4(fft3879, fft3879, 78));
__m512 fft3973 = _mm512_fmadd_ps(fft3967, fft3885, _mm512_shuffle_f32x4(fft3967, fft3967, 78));
__m512 fft3887 = _mm512_fmadd_ps(fft3880, fft3885, _mm512_shuffle_f32x4(fft3880, fft3880, 78));
__m512 fft3974 = _mm512_fmadd_ps(fft3968, fft3885, _mm512_shuffle_f32x4(fft3968, fft3968, 78));
__m512 fft3888 = _mm512_fmadd_ps(fft3881, fft3885, _mm512_shuffle_f32x4(fft3881, fft3881, 78));
__m512 fft3975 = _mm512_fmadd_ps(fft3969, fft3885, _mm512_shuffle_f32x4(fft3969, fft3969, 78));
__m512 fft3889 = _mm512_fmadd_ps(fft3882, fft3885, _mm512_shuffle_f32x4(fft3882, fft3882, 78));
__m512 fft3976 = _mm512_fmadd_ps(fft3970, fft3885, _mm512_shuffle_f32x4(fft3970, fft3970, 78));
__m512 fft3890 = _mm512_fmadd_ps(fft3874, fft3885, _mm512_shuffle_f32x4(fft3874, fft3874, 78));
__m512 fft3977 = _mm512_fmadd_ps(fft3962, fft3885, _mm512_shuffle_f32x4(fft3962, fft3962, 78));
__m512 fft3891 = _mm512_fmadd_ps(fft3876, fft3885, _mm512_shuffle_f32x4(fft3876, fft3876, 78));
__m512 fft3978 = _mm512_fmadd_ps(fft3964, fft3885, _mm512_shuffle_f32x4(fft3964, fft3964, 78));
__m512 fft3892 = _mm512_fmadd_ps(fft3883, fft3885, _mm512_shuffle_f32x4(fft3883, fft3883, 78));
__m512 fft3979 = _mm512_fmadd_ps(fft3971, fft3885, _mm512_shuffle_f32x4(fft3971, fft3971, 78));
__m512 fft3893 = _mm512_fmadd_ps(fft3884, fft3885, _mm512_shuffle_f32x4(fft3884, fft3884, 78));
__m512 fft3980 = _mm512_fmadd_ps(fft3972, fft3885, _mm512_shuffle_f32x4(fft3972, fft3972, 78));
__m512 fft3894 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft3895 = _mm512_mul_ps(fft3886, fft3894);
__m512 fft3981 = _mm512_mul_ps(fft3973, fft3894);
__m512 fft3896 = _mm512_mul_ps(fft3887, fft3894);
__m512 fft3982 = _mm512_mul_ps(fft3974, fft3894);
__m512 fft3897 = _mm512_mul_ps(fft3888, fft3894);
__m512 fft3983 = _mm512_mul_ps(fft3975, fft3894);
__m512 fft3898 = _mm512_mul_ps(fft3889, fft3894);
__m512 fft3984 = _mm512_mul_ps(fft3976, fft3894);
__m512 fft3899 = _mm512_mul_ps(fft3890, fft3894);
__m512 fft3985 = _mm512_mul_ps(fft3977, fft3894);
__m512 fft3900 = _mm512_mul_ps(fft3891, fft3894);
__m512 fft3986 = _mm512_mul_ps(fft3978, fft3894);
__m512 fft3901 = _mm512_mul_ps(fft3892, fft3894);
__m512 fft3987 = _mm512_mul_ps(fft3979, fft3894);
__m512 fft3902 = _mm512_mul_ps(fft3893, fft3894);
__m512 fft3988 = _mm512_mul_ps(fft3980, fft3894);
__m512 fft3903 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft3904 = _mm512_fmadd_ps(fft3887, fft3903, fft3895);
__m512 fft3989 = _mm512_fmadd_ps(fft3974, fft3903, fft3981);
__m512 fft3905 = _mm512_fnmadd_ps(fft3886, fft3903, fft3896);
__m512 fft3990 = _mm512_fnmadd_ps(fft3973, fft3903, fft3982);
__m512 fft3906 = _mm512_fmadd_ps(fft3889, fft3903, fft3897);
__m512 fft3991 = _mm512_fmadd_ps(fft3976, fft3903, fft3983);
__m512 fft3907 = _mm512_fnmadd_ps(fft3888, fft3903, fft3898);
__m512 fft3992 = _mm512_fnmadd_ps(fft3975, fft3903, fft3984);
__m512 fft3908 = _mm512_fmadd_ps(fft3891, fft3903, fft3899);
__m512 fft3993 = _mm512_fmadd_ps(fft3978, fft3903, fft3985);
__m512 fft3909 = _mm512_fnmadd_ps(fft3890, fft3903, fft3900);
__m512 fft3994 = _mm512_fnmadd_ps(fft3977, fft3903, fft3986);
__m512 fft3910 = _mm512_fmadd_ps(fft3893, fft3903, fft3901);
__m512 fft3995 = _mm512_fmadd_ps(fft3980, fft3903, fft3987);
__m512 fft3911 = _mm512_fnmadd_ps(fft3892, fft3903, fft3902);
__m512 fft3996 = _mm512_fnmadd_ps(fft3979, fft3903, fft3988);
__m512 fft3912 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft3913 = _mm512_fmadd_ps(fft3904, fft3912, _mm512_shuffle_f32x4(fft3904, fft3904, 177));
__m512 fft3997 = _mm512_fmadd_ps(fft3989, fft3912, _mm512_shuffle_f32x4(fft3989, fft3989, 177));
__m512 fft3914 = _mm512_fmadd_ps(fft3905, fft3912, _mm512_shuffle_f32x4(fft3905, fft3905, 177));
__m512 fft3998 = _mm512_fmadd_ps(fft3990, fft3912, _mm512_shuffle_f32x4(fft3990, fft3990, 177));
__m512 fft3915 = _mm512_fmadd_ps(fft3906, fft3912, _mm512_shuffle_f32x4(fft3906, fft3906, 177));
__m512 fft3999 = _mm512_fmadd_ps(fft3991, fft3912, _mm512_shuffle_f32x4(fft3991, fft3991, 177));
__m512 fft3916 = _mm512_fmadd_ps(fft3907, fft3912, _mm512_shuffle_f32x4(fft3907, fft3907, 177));
__m512 fft4000 = _mm512_fmadd_ps(fft3992, fft3912, _mm512_shuffle_f32x4(fft3992, fft3992, 177));
__m512 fft3917 = _mm512_fmadd_ps(fft3908, fft3912, _mm512_shuffle_f32x4(fft3908, fft3908, 177));
__m512 fft4001 = _mm512_fmadd_ps(fft3993, fft3912, _mm512_shuffle_f32x4(fft3993, fft3993, 177));
__m512 fft3918 = _mm512_fmadd_ps(fft3909, fft3912, _mm512_shuffle_f32x4(fft3909, fft3909, 177));
__m512 fft4002 = _mm512_fmadd_ps(fft3994, fft3912, _mm512_shuffle_f32x4(fft3994, fft3994, 177));
__m512 fft3919 = _mm512_fmadd_ps(fft3910, fft3912, _mm512_shuffle_f32x4(fft3910, fft3910, 177));
__m512 fft4003 = _mm512_fmadd_ps(fft3995, fft3912, _mm512_shuffle_f32x4(fft3995, fft3995, 177));
__m512 fft3920 = _mm512_fmadd_ps(fft3911, fft3912, _mm512_shuffle_f32x4(fft3911, fft3911, 177));
__m512 fft4004 = _mm512_fmadd_ps(fft3996, fft3912, _mm512_shuffle_f32x4(fft3996, fft3996, 177));
__m512 fft3921 = _mm512_mask_mov_ps(fft3913, 49344, fft3914);
__m512 fft4005 = _mm512_mask_mov_ps(fft3997, 49344, fft3998);
__m512 fft3922 = _mm512_mask_sub_ps(fft3914, 49344, _mm512_setzero_ps(), fft3913);
__m512 fft4006 = _mm512_mask_sub_ps(fft3998, 49344, _mm512_setzero_ps(), fft3997);
__m512 fft3923 = _mm512_mask_mov_ps(fft3915, 49344, fft3916);
__m512 fft4007 = _mm512_mask_mov_ps(fft3999, 49344, fft4000);
__m512 fft3924 = _mm512_mask_sub_ps(fft3916, 49344, _mm512_setzero_ps(), fft3915);
__m512 fft4008 = _mm512_mask_sub_ps(fft4000, 49344, _mm512_setzero_ps(), fft3999);
__m512 fft3925 = _mm512_mask_mov_ps(fft3917, 49344, fft3918);
__m512 fft4009 = _mm512_mask_mov_ps(fft4001, 49344, fft4002);
__m512 fft3926 = _mm512_mask_sub_ps(fft3918, 49344, _mm512_setzero_ps(), fft3917);
__m512 fft4010 = _mm512_mask_sub_ps(fft4002, 49344, _mm512_setzero_ps(), fft4001);
__m512 fft3927 = _mm512_mask_mov_ps(fft3919, 49344, fft3920);
__m512 fft4011 = _mm512_mask_mov_ps(fft4003, 49344, fft4004);
__m512 fft3928 = _mm512_mask_sub_ps(fft3920, 49344, _mm512_setzero_ps(), fft3919);
__m512 fft4012 = _mm512_mask_sub_ps(fft4004, 49344, _mm512_setzero_ps(), fft4003);
__m512 fft3929 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft3930 = _mm512_fmadd_ps(fft3921, fft3929, _mm512_shuffle_ps(fft3921, fft3921, 78));
__m512 fft4013 = _mm512_fmadd_ps(fft4005, fft3929, _mm512_shuffle_ps(fft4005, fft4005, 78));
__m512 fft3931 = _mm512_fmadd_ps(fft3922, fft3929, _mm512_shuffle_ps(fft3922, fft3922, 78));
__m512 fft4014 = _mm512_fmadd_ps(fft4006, fft3929, _mm512_shuffle_ps(fft4006, fft4006, 78));
__m512 fft3932 = _mm512_fmadd_ps(fft3923, fft3929, _mm512_shuffle_ps(fft3923, fft3923, 78));
__m512 fft4015 = _mm512_fmadd_ps(fft4007, fft3929, _mm512_shuffle_ps(fft4007, fft4007, 78));
__m512 fft3933 = _mm512_fmadd_ps(fft3924, fft3929, _mm512_shuffle_ps(fft3924, fft3924, 78));
__m512 fft4016 = _mm512_fmadd_ps(fft4008, fft3929, _mm512_shuffle_ps(fft4008, fft4008, 78));
__m512 fft3934 = _mm512_fmadd_ps(fft3925, fft3929, _mm512_shuffle_ps(fft3925, fft3925, 78));
__m512 fft4017 = _mm512_fmadd_ps(fft4009, fft3929, _mm512_shuffle_ps(fft4009, fft4009, 78));
__m512 fft3935 = _mm512_fmadd_ps(fft3926, fft3929, _mm512_shuffle_ps(fft3926, fft3926, 78));
__m512 fft4018 = _mm512_fmadd_ps(fft4010, fft3929, _mm512_shuffle_ps(fft4010, fft4010, 78));
__m512 fft3936 = _mm512_fmadd_ps(fft3927, fft3929, _mm512_shuffle_ps(fft3927, fft3927, 78));
__m512 fft4019 = _mm512_fmadd_ps(fft4011, fft3929, _mm512_shuffle_ps(fft4011, fft4011, 78));
__m512 fft3937 = _mm512_fmadd_ps(fft3928, fft3929, _mm512_shuffle_ps(fft3928, fft3928, 78));
__m512 fft4020 = _mm512_fmadd_ps(fft4012, fft3929, _mm512_shuffle_ps(fft4012, fft4012, 78));
__m512i fft3938 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft3939 = _mm512_permutexvar_ps(fft3938, fft3930);
__m512 fft4021 = _mm512_permutexvar_ps(fft3938, fft4013);
__m512i fft3940 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft3941 = _mm512_permutexvar_ps(fft3940, fft3930);
__m512 fft4022 = _mm512_permutexvar_ps(fft3940, fft4013);
__m512 fft3942 = _mm512_permutexvar_ps(fft3938, fft3931);
__m512 fft4023 = _mm512_permutexvar_ps(fft3938, fft4014);
__m512 fft3943 = _mm512_permutexvar_ps(fft3940, fft3931);
__m512 fft4024 = _mm512_permutexvar_ps(fft3940, fft4014);
__m512 fft3944 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft3945 = _mm512_fmadd_ps(fft3939, fft3944, fft3941);
__m512 fft4025 = _mm512_fmadd_ps(fft4021, fft3944, fft4022);
__m512 fft3946 = _mm512_fnmadd_ps(fft3943, fft3944, fft3942);
__m512 fft4026 = _mm512_fnmadd_ps(fft4024, fft3944, fft4023);
__m512 fft3947 = _mm512_mask_mov_ps(fft3943, 21845, fft3945);
__m512 fft4027 = _mm512_mask_mov_ps(fft4024, 21845, fft4025);
__m512 fft3948 = _mm512_mask_mov_ps(fft3939, 43176, fft3945);
__m512 fft4028 = _mm512_mask_mov_ps(fft4021, 43176, fft4025);
__m512 fft3949 = _mm512_mask_mov_ps(fft3947, 43176, fft3946);
__m512 fft4029 = _mm512_mask_mov_ps(fft4027, 43176, fft4026);
__m512 fft3950 = _mm512_mask_mov_ps(fft3948, 22102, fft3946);
__m512 fft4030 = _mm512_mask_mov_ps(fft4028, 22102, fft4026);
__m512 fft3951 = _mm512_mask_mul_ps(fft3949, 64764, fft3949, _mm512_set1_ps(5e-01f));
__m512 fft4031 = _mm512_mask_mul_ps(fft4029, 64764, fft4029, _mm512_set1_ps(5e-01f));
__m512 fft3952 = _mm512_mask_mul_ps(fft3950, 64764, fft3950, _mm512_set1_ps(5e-01f));
__m512 fft4032 = _mm512_mask_mul_ps(fft4030, 64764, fft4030, _mm512_set1_ps(5e-01f));
__m512 df337 = fft3951;
__m512 df345 = fft4031;
__m512 df338 = fft3952;
__m512 df346 = fft4032;
__m512 df339 = fft3932;
__m512 df347 = fft4015;
__m512 df340 = fft3933;
__m512 df348 = fft4016;
__m512 df341 = fft3934;
__m512 df349 = fft4017;
__m512 df342 = fft3935;
__m512 df350 = fft4018;
__m512 df343 = fft3936;
__m512 df351 = fft4019;
__m512 df344 = fft3937;
__m512 df352 = fft4020;
// Lane index vector (arguments are listed from lane 15 down to lane 0): after
// _mm512_permutexvar_ps, lanes 0..7 hold source elements 0,2,...,14 and lanes
// 8..15 hold source elements 1,3,...,15.
__m512i eo24 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df339 = _mm512_permutexvar_ps(eo24, df339);
df340 = _mm512_permutexvar_ps(eo24, df340);
// Each permuted vector is written with two masked stores: mask 255 writes
// lanes 0..7 at the given address, mask 65280 writes lanes 8..15.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8..15 land 32 bytes past the
// store address, i.e. at 508768+32 = 101760+407040, exactly 407040 bytes after
// the low half -- confirm the pointer type where dfPtr1 is declared.
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df339);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df340);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df339);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df340);
df347 = _mm512_permutexvar_ps(eo24, df347);
df348 = _mm512_permutexvar_ps(eo24, df348);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df347);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df348);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df347);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df348);
df341 = _mm512_permutexvar_ps(eo24, df341);
df342 = _mm512_permutexvar_ps(eo24, df342);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df341);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df342);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df341);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df342);
df349 = _mm512_permutexvar_ps(eo24, df349);
df350 = _mm512_permutexvar_ps(eo24, df350);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df349);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df350);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df349);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df350);
df343 = _mm512_permutexvar_ps(eo24, df343);
df344 = _mm512_permutexvar_ps(eo24, df344);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df343);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df344);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df343);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df344);
df351 = _mm512_permutexvar_ps(eo24, df351);
df352 = _mm512_permutexvar_ps(eo24, df352);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df351);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df352);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df351);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df352);
// df337/df338 and df345/df346 are stored directly, without the eo24 lane
// permutation applied to the other df vectors above. The same low-half (255) /
// high-half (65280) masked store pair is used.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df337);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df338);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df337);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df338);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df345);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k11+128*m24+32*f25, 255, df346);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df345);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k11+128*m24+32*f25, 65280, df346);
// End of the b24 loop.
}
// End of the k11 loop.
}
// Stop once the last assigned j2 has been processed; otherwise advance j2
// (the enclosing for-header only steps w10).
if (j2 >= last1) return;
++j2;
// End of the j2 loop.
}
// All positions below 15 are done; continue with the code after this branch.
rel2 = 15;
}
// Reached either after the rel2 < 15 branch (which leaves rel2 = 15) or
// directly when rel2 was not below 15. Same h value as the branch above, but a
// fixed w of 220 instead of one derived from rel2.
ptrdiff_t h11 = base2+30;
ptrdiff_t w11 = 220;
// k12 runs over three consecutive values: 3*s1, 3*s1+1, 3*s1+2.
ptrdiff_t k12 = 3*s1;
ptrdiff_t kk11 = k12+2;
for (; k12 <= kk11; ++k12) {
// Multiplier/addend pair for this k12, broadcast to all lanes.
__m512 bnMul11 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k12+3*i6))[0]);
__m512 bnAdd11 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k12+3*i6))[1]);
// Single sub-step here: b25 is fixed at 0, so m25 and f26 are 0 and the
// 0*b25 address term vanishes.
ptrdiff_t b25 = 0;
ptrdiff_t m25 = (size_t)b25/2;
ptrdiff_t f26 = (size_t)b25%2;
// Mask 127 loads only the lowest 7 lanes and zeroes lanes 7..15; the masked
// fmadd applies dat*bnMul11 + bnAdd11 to those 7 lanes only, so the zeroed
// lanes stay zero.
// NOTE(review): presumably the 7 columns left at the right edge of the row --
// confirm against how datPtr1 is offset.
__m512 dat338 = _mm512_maskz_loadu_ps(127, datPtr1+0+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat338 = _mm512_mask_fmadd_ps(dat338, 127, bnMul11, bnAdd11);
__m512 dat339 = _mm512_maskz_loadu_ps(127, datPtr1+896+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat339 = _mm512_mask_fmadd_ps(dat339, 127, bnMul11, bnAdd11);
__m512 dat340 = _mm512_maskz_loadu_ps(127, datPtr1+1792+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat340 = _mm512_mask_fmadd_ps(dat340, 127, bnMul11, bnAdd11);
__m512 dat341 = _mm512_maskz_loadu_ps(127, datPtr1+2688+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat341 = _mm512_mask_fmadd_ps(dat341, 127, bnMul11, bnAdd11);
__m512 dat342 = _mm512_maskz_loadu_ps(127, datPtr1+3584+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat342 = _mm512_mask_fmadd_ps(dat342, 127, bnMul11, bnAdd11);
__m512 dat343 = _mm512_maskz_loadu_ps(127, datPtr1+4480+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat343 = _mm512_mask_fmadd_ps(dat343, 127, bnMul11, bnAdd11);
__m512 dat344 = _mm512_maskz_loadu_ps(127, datPtr1+5376+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat344 = _mm512_mask_fmadd_ps(dat344, 127, bnMul11, bnAdd11);
__m512 dat345 = _mm512_maskz_loadu_ps(127, datPtr1+6272+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat345 = _mm512_mask_fmadd_ps(dat345, 127, bnMul11, bnAdd11);
__m512 dat346 = _mm512_maskz_loadu_ps(127, datPtr1+7168+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat346 = _mm512_mask_fmadd_ps(dat346, 127, bnMul11, bnAdd11);
__m512 dat347 = _mm512_maskz_loadu_ps(127, datPtr1+8064+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat347 = _mm512_mask_fmadd_ps(dat347, 127, bnMul11, bnAdd11);
__m512 dat348 = _mm512_maskz_loadu_ps(127, datPtr1+8960+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat348 = _mm512_mask_fmadd_ps(dat348, 127, bnMul11, bnAdd11);
__m512 dat349 = _mm512_maskz_loadu_ps(127, datPtr1+9856+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat349 = _mm512_mask_fmadd_ps(dat349, 127, bnMul11, bnAdd11);
__m512 dat350 = _mm512_maskz_loadu_ps(127, datPtr1+10752+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat350 = _mm512_mask_fmadd_ps(dat350, 127, bnMul11, bnAdd11);
__m512 dat351 = _mm512_maskz_loadu_ps(127, datPtr1+11648+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat351 = _mm512_mask_fmadd_ps(dat351, 127, bnMul11, bnAdd11);
__m512 dat352 = _mm512_maskz_loadu_ps(127, datPtr1+12544+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat352 = _mm512_mask_fmadd_ps(dat352, 127, bnMul11, bnAdd11);
__m512 dat353 = _mm512_maskz_loadu_ps(127, datPtr1+13440+602112*i6+200704*k12+896*h11+4*w11+0*b25);
dat353 = _mm512_mask_fmadd_ps(dat353, 127, bnMul11, bnAdd11);
__m512 fft4033 = _mm512_add_ps(dat338, dat346);
__m512 fft4121 = _mm512_add_ps(dat339, dat347);
__m512 fft4034 = _mm512_sub_ps(dat338, dat346);
__m512 fft4122 = _mm512_sub_ps(dat339, dat347);
__m512 fft4035 = _mm512_add_ps(dat340, dat348);
__m512 fft4123 = _mm512_add_ps(dat341, dat349);
__m512 fft4036 = _mm512_sub_ps(dat340, dat348);
__m512 fft4124 = _mm512_sub_ps(dat341, dat349);
__m512 fft4037 = _mm512_add_ps(dat342, dat350);
__m512 fft4125 = _mm512_add_ps(dat343, dat351);
__m512 fft4038 = _mm512_sub_ps(dat342, dat350);
__m512 fft4126 = _mm512_sub_ps(dat343, dat351);
__m512 fft4039 = _mm512_add_ps(dat344, dat352);
__m512 fft4127 = _mm512_add_ps(dat345, dat353);
__m512 fft4040 = _mm512_sub_ps(dat344, dat352);
__m512 fft4128 = _mm512_sub_ps(dat345, dat353);
__m512 fft4041 = _mm512_add_ps(fft4033, fft4037);
__m512 fft4129 = _mm512_add_ps(fft4121, fft4125);
__m512 fft4042 = _mm512_sub_ps(fft4033, fft4037);
__m512 fft4130 = _mm512_sub_ps(fft4121, fft4125);
__m512 fft4043 = _mm512_add_ps(fft4035, fft4039);
__m512 fft4131 = _mm512_add_ps(fft4123, fft4127);
__m512 fft4044 = _mm512_sub_ps(fft4039, fft4035);
__m512 fft4132 = _mm512_sub_ps(fft4127, fft4123);
__m512 fft4045 = _mm512_sub_ps(fft4036, fft4040);
__m512 fft4133 = _mm512_sub_ps(fft4124, fft4128);
__m512 fft4046 = _mm512_add_ps(fft4036, fft4040);
__m512 fft4134 = _mm512_add_ps(fft4124, fft4128);
__m512 fft4047 = _mm512_add_ps(fft4041, fft4043);
__m512 fft4135 = _mm512_add_ps(fft4129, fft4131);
__m512 fft4048 = _mm512_sub_ps(fft4041, fft4043);
__m512 fft4136 = _mm512_sub_ps(fft4129, fft4131);
__m512 fft4049 = _mm512_fmadd_ps(fft4045, _mm512_set1_ps(7.0710677e-01f), fft4034);
__m512 fft4137 = _mm512_fmadd_ps(fft4133, _mm512_set1_ps(7.0710677e-01f), fft4122);
__m512 fft4050 = _mm512_fnmsub_ps(fft4046, _mm512_set1_ps(7.0710677e-01f), fft4038);
__m512 fft4138 = _mm512_fnmsub_ps(fft4134, _mm512_set1_ps(7.0710677e-01f), fft4126);
__m512 fft4051 = _mm512_fnmadd_ps(fft4045, _mm512_set1_ps(7.0710677e-01f), fft4034);
__m512 fft4139 = _mm512_fnmadd_ps(fft4133, _mm512_set1_ps(7.0710677e-01f), fft4122);
__m512 fft4052 = _mm512_fnmadd_ps(fft4046, _mm512_set1_ps(7.0710677e-01f), fft4038);
__m512 fft4140 = _mm512_fnmadd_ps(fft4134, _mm512_set1_ps(7.0710677e-01f), fft4126);
__m512 fft4053 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4054 = _mm512_fmadd_ps(fft4047, fft4053, _mm512_shuffle_f32x4(fft4047, fft4047, 78));
__m512 fft4141 = _mm512_fmadd_ps(fft4135, fft4053, _mm512_shuffle_f32x4(fft4135, fft4135, 78));
__m512 fft4055 = _mm512_fmadd_ps(fft4048, fft4053, _mm512_shuffle_f32x4(fft4048, fft4048, 78));
__m512 fft4142 = _mm512_fmadd_ps(fft4136, fft4053, _mm512_shuffle_f32x4(fft4136, fft4136, 78));
__m512 fft4056 = _mm512_fmadd_ps(fft4049, fft4053, _mm512_shuffle_f32x4(fft4049, fft4049, 78));
__m512 fft4143 = _mm512_fmadd_ps(fft4137, fft4053, _mm512_shuffle_f32x4(fft4137, fft4137, 78));
__m512 fft4057 = _mm512_fmadd_ps(fft4050, fft4053, _mm512_shuffle_f32x4(fft4050, fft4050, 78));
__m512 fft4144 = _mm512_fmadd_ps(fft4138, fft4053, _mm512_shuffle_f32x4(fft4138, fft4138, 78));
__m512 fft4058 = _mm512_fmadd_ps(fft4042, fft4053, _mm512_shuffle_f32x4(fft4042, fft4042, 78));
__m512 fft4145 = _mm512_fmadd_ps(fft4130, fft4053, _mm512_shuffle_f32x4(fft4130, fft4130, 78));
__m512 fft4059 = _mm512_fmadd_ps(fft4044, fft4053, _mm512_shuffle_f32x4(fft4044, fft4044, 78));
__m512 fft4146 = _mm512_fmadd_ps(fft4132, fft4053, _mm512_shuffle_f32x4(fft4132, fft4132, 78));
__m512 fft4060 = _mm512_fmadd_ps(fft4051, fft4053, _mm512_shuffle_f32x4(fft4051, fft4051, 78));
__m512 fft4147 = _mm512_fmadd_ps(fft4139, fft4053, _mm512_shuffle_f32x4(fft4139, fft4139, 78));
__m512 fft4061 = _mm512_fmadd_ps(fft4052, fft4053, _mm512_shuffle_f32x4(fft4052, fft4052, 78));
__m512 fft4148 = _mm512_fmadd_ps(fft4140, fft4053, _mm512_shuffle_f32x4(fft4140, fft4140, 78));
__m512 fft4062 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4063 = _mm512_mul_ps(fft4054, fft4062);
__m512 fft4149 = _mm512_mul_ps(fft4141, fft4062);
__m512 fft4064 = _mm512_mul_ps(fft4055, fft4062);
__m512 fft4150 = _mm512_mul_ps(fft4142, fft4062);
__m512 fft4065 = _mm512_mul_ps(fft4056, fft4062);
__m512 fft4151 = _mm512_mul_ps(fft4143, fft4062);
__m512 fft4066 = _mm512_mul_ps(fft4057, fft4062);
__m512 fft4152 = _mm512_mul_ps(fft4144, fft4062);
__m512 fft4067 = _mm512_mul_ps(fft4058, fft4062);
__m512 fft4153 = _mm512_mul_ps(fft4145, fft4062);
__m512 fft4068 = _mm512_mul_ps(fft4059, fft4062);
__m512 fft4154 = _mm512_mul_ps(fft4146, fft4062);
__m512 fft4069 = _mm512_mul_ps(fft4060, fft4062);
__m512 fft4155 = _mm512_mul_ps(fft4147, fft4062);
__m512 fft4070 = _mm512_mul_ps(fft4061, fft4062);
__m512 fft4156 = _mm512_mul_ps(fft4148, fft4062);
__m512 fft4071 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4072 = _mm512_fmadd_ps(fft4055, fft4071, fft4063);
__m512 fft4157 = _mm512_fmadd_ps(fft4142, fft4071, fft4149);
__m512 fft4073 = _mm512_fnmadd_ps(fft4054, fft4071, fft4064);
__m512 fft4158 = _mm512_fnmadd_ps(fft4141, fft4071, fft4150);
__m512 fft4074 = _mm512_fmadd_ps(fft4057, fft4071, fft4065);
__m512 fft4159 = _mm512_fmadd_ps(fft4144, fft4071, fft4151);
__m512 fft4075 = _mm512_fnmadd_ps(fft4056, fft4071, fft4066);
__m512 fft4160 = _mm512_fnmadd_ps(fft4143, fft4071, fft4152);
__m512 fft4076 = _mm512_fmadd_ps(fft4059, fft4071, fft4067);
__m512 fft4161 = _mm512_fmadd_ps(fft4146, fft4071, fft4153);
__m512 fft4077 = _mm512_fnmadd_ps(fft4058, fft4071, fft4068);
__m512 fft4162 = _mm512_fnmadd_ps(fft4145, fft4071, fft4154);
__m512 fft4078 = _mm512_fmadd_ps(fft4061, fft4071, fft4069);
__m512 fft4163 = _mm512_fmadd_ps(fft4148, fft4071, fft4155);
__m512 fft4079 = _mm512_fnmadd_ps(fft4060, fft4071, fft4070);
__m512 fft4164 = _mm512_fnmadd_ps(fft4147, fft4071, fft4156);
__m512 fft4080 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4081 = _mm512_fmadd_ps(fft4072, fft4080, _mm512_shuffle_f32x4(fft4072, fft4072, 177));
__m512 fft4165 = _mm512_fmadd_ps(fft4157, fft4080, _mm512_shuffle_f32x4(fft4157, fft4157, 177));
__m512 fft4082 = _mm512_fmadd_ps(fft4073, fft4080, _mm512_shuffle_f32x4(fft4073, fft4073, 177));
__m512 fft4166 = _mm512_fmadd_ps(fft4158, fft4080, _mm512_shuffle_f32x4(fft4158, fft4158, 177));
__m512 fft4083 = _mm512_fmadd_ps(fft4074, fft4080, _mm512_shuffle_f32x4(fft4074, fft4074, 177));
__m512 fft4167 = _mm512_fmadd_ps(fft4159, fft4080, _mm512_shuffle_f32x4(fft4159, fft4159, 177));
__m512 fft4084 = _mm512_fmadd_ps(fft4075, fft4080, _mm512_shuffle_f32x4(fft4075, fft4075, 177));
__m512 fft4168 = _mm512_fmadd_ps(fft4160, fft4080, _mm512_shuffle_f32x4(fft4160, fft4160, 177));
__m512 fft4085 = _mm512_fmadd_ps(fft4076, fft4080, _mm512_shuffle_f32x4(fft4076, fft4076, 177));
__m512 fft4169 = _mm512_fmadd_ps(fft4161, fft4080, _mm512_shuffle_f32x4(fft4161, fft4161, 177));
__m512 fft4086 = _mm512_fmadd_ps(fft4077, fft4080, _mm512_shuffle_f32x4(fft4077, fft4077, 177));
__m512 fft4170 = _mm512_fmadd_ps(fft4162, fft4080, _mm512_shuffle_f32x4(fft4162, fft4162, 177));
__m512 fft4087 = _mm512_fmadd_ps(fft4078, fft4080, _mm512_shuffle_f32x4(fft4078, fft4078, 177));
__m512 fft4171 = _mm512_fmadd_ps(fft4163, fft4080, _mm512_shuffle_f32x4(fft4163, fft4163, 177));
__m512 fft4088 = _mm512_fmadd_ps(fft4079, fft4080, _mm512_shuffle_f32x4(fft4079, fft4079, 177));
__m512 fft4172 = _mm512_fmadd_ps(fft4164, fft4080, _mm512_shuffle_f32x4(fft4164, fft4164, 177));
__m512 fft4089 = _mm512_mask_mov_ps(fft4081, 49344, fft4082);
__m512 fft4173 = _mm512_mask_mov_ps(fft4165, 49344, fft4166);
__m512 fft4090 = _mm512_mask_sub_ps(fft4082, 49344, _mm512_setzero_ps(), fft4081);
__m512 fft4174 = _mm512_mask_sub_ps(fft4166, 49344, _mm512_setzero_ps(), fft4165);
__m512 fft4091 = _mm512_mask_mov_ps(fft4083, 49344, fft4084);
__m512 fft4175 = _mm512_mask_mov_ps(fft4167, 49344, fft4168);
__m512 fft4092 = _mm512_mask_sub_ps(fft4084, 49344, _mm512_setzero_ps(), fft4083);
__m512 fft4176 = _mm512_mask_sub_ps(fft4168, 49344, _mm512_setzero_ps(), fft4167);
__m512 fft4093 = _mm512_mask_mov_ps(fft4085, 49344, fft4086);
__m512 fft4177 = _mm512_mask_mov_ps(fft4169, 49344, fft4170);
__m512 fft4094 = _mm512_mask_sub_ps(fft4086, 49344, _mm512_setzero_ps(), fft4085);
__m512 fft4178 = _mm512_mask_sub_ps(fft4170, 49344, _mm512_setzero_ps(), fft4169);
__m512 fft4095 = _mm512_mask_mov_ps(fft4087, 49344, fft4088);
__m512 fft4179 = _mm512_mask_mov_ps(fft4171, 49344, fft4172);
__m512 fft4096 = _mm512_mask_sub_ps(fft4088, 49344, _mm512_setzero_ps(), fft4087);
__m512 fft4180 = _mm512_mask_sub_ps(fft4172, 49344, _mm512_setzero_ps(), fft4171);
__m512 fft4097 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4098 = _mm512_fmadd_ps(fft4089, fft4097, _mm512_shuffle_ps(fft4089, fft4089, 78));
__m512 fft4181 = _mm512_fmadd_ps(fft4173, fft4097, _mm512_shuffle_ps(fft4173, fft4173, 78));
__m512 fft4099 = _mm512_fmadd_ps(fft4090, fft4097, _mm512_shuffle_ps(fft4090, fft4090, 78));
__m512 fft4182 = _mm512_fmadd_ps(fft4174, fft4097, _mm512_shuffle_ps(fft4174, fft4174, 78));
__m512 fft4100 = _mm512_fmadd_ps(fft4091, fft4097, _mm512_shuffle_ps(fft4091, fft4091, 78));
__m512 fft4183 = _mm512_fmadd_ps(fft4175, fft4097, _mm512_shuffle_ps(fft4175, fft4175, 78));
__m512 fft4101 = _mm512_fmadd_ps(fft4092, fft4097, _mm512_shuffle_ps(fft4092, fft4092, 78));
__m512 fft4184 = _mm512_fmadd_ps(fft4176, fft4097, _mm512_shuffle_ps(fft4176, fft4176, 78));
__m512 fft4102 = _mm512_fmadd_ps(fft4093, fft4097, _mm512_shuffle_ps(fft4093, fft4093, 78));
__m512 fft4185 = _mm512_fmadd_ps(fft4177, fft4097, _mm512_shuffle_ps(fft4177, fft4177, 78));
__m512 fft4103 = _mm512_fmadd_ps(fft4094, fft4097, _mm512_shuffle_ps(fft4094, fft4094, 78));
__m512 fft4186 = _mm512_fmadd_ps(fft4178, fft4097, _mm512_shuffle_ps(fft4178, fft4178, 78));
__m512 fft4104 = _mm512_fmadd_ps(fft4095, fft4097, _mm512_shuffle_ps(fft4095, fft4095, 78));
__m512 fft4187 = _mm512_fmadd_ps(fft4179, fft4097, _mm512_shuffle_ps(fft4179, fft4179, 78));
__m512 fft4105 = _mm512_fmadd_ps(fft4096, fft4097, _mm512_shuffle_ps(fft4096, fft4096, 78));
__m512 fft4188 = _mm512_fmadd_ps(fft4180, fft4097, _mm512_shuffle_ps(fft4180, fft4180, 78));
__m512i fft4106 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4107 = _mm512_permutexvar_ps(fft4106, fft4098);
__m512 fft4189 = _mm512_permutexvar_ps(fft4106, fft4181);
__m512i fft4108 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4109 = _mm512_permutexvar_ps(fft4108, fft4098);
__m512 fft4190 = _mm512_permutexvar_ps(fft4108, fft4181);
__m512 fft4110 = _mm512_permutexvar_ps(fft4106, fft4099);
__m512 fft4191 = _mm512_permutexvar_ps(fft4106, fft4182);
__m512 fft4111 = _mm512_permutexvar_ps(fft4108, fft4099);
__m512 fft4192 = _mm512_permutexvar_ps(fft4108, fft4182);
__m512 fft4112 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4113 = _mm512_fmadd_ps(fft4107, fft4112, fft4109);
__m512 fft4193 = _mm512_fmadd_ps(fft4189, fft4112, fft4190);
__m512 fft4114 = _mm512_fnmadd_ps(fft4111, fft4112, fft4110);
__m512 fft4194 = _mm512_fnmadd_ps(fft4192, fft4112, fft4191);
__m512 fft4115 = _mm512_mask_mov_ps(fft4111, 21845, fft4113);
__m512 fft4195 = _mm512_mask_mov_ps(fft4192, 21845, fft4193);
__m512 fft4116 = _mm512_mask_mov_ps(fft4107, 43176, fft4113);
__m512 fft4196 = _mm512_mask_mov_ps(fft4189, 43176, fft4193);
__m512 fft4117 = _mm512_mask_mov_ps(fft4115, 43176, fft4114);
__m512 fft4197 = _mm512_mask_mov_ps(fft4195, 43176, fft4194);
__m512 fft4118 = _mm512_mask_mov_ps(fft4116, 22102, fft4114);
__m512 fft4198 = _mm512_mask_mov_ps(fft4196, 22102, fft4194);
__m512 fft4119 = _mm512_mask_mul_ps(fft4117, 64764, fft4117, _mm512_set1_ps(5e-01f));
__m512 fft4199 = _mm512_mask_mul_ps(fft4197, 64764, fft4197, _mm512_set1_ps(5e-01f));
__m512 fft4120 = _mm512_mask_mul_ps(fft4118, 64764, fft4118, _mm512_set1_ps(5e-01f));
__m512 fft4200 = _mm512_mask_mul_ps(fft4198, 64764, fft4198, _mm512_set1_ps(5e-01f));
__m512 df353 = fft4119;
__m512 df361 = fft4199;
__m512 df354 = fft4120;
__m512 df362 = fft4200;
__m512 df355 = fft4100;
__m512 df363 = fft4183;
__m512 df356 = fft4101;
__m512 df364 = fft4184;
__m512 df357 = fft4102;
__m512 df365 = fft4185;
__m512 df358 = fft4103;
__m512 df366 = fft4186;
__m512 df359 = fft4104;
__m512 df367 = fft4187;
__m512 df360 = fft4105;
__m512 df368 = fft4188;
__m512i eo25 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df355 = _mm512_permutexvar_ps(eo25, df355);
df356 = _mm512_permutexvar_ps(eo25, df356);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df355);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df356);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df355);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df356);
df363 = _mm512_permutexvar_ps(eo25, df363);
df364 = _mm512_permutexvar_ps(eo25, df364);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df363);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df364);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df363);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df364);
df357 = _mm512_permutexvar_ps(eo25, df357);
df358 = _mm512_permutexvar_ps(eo25, df358);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df357);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df358);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df357);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df358);
df365 = _mm512_permutexvar_ps(eo25, df365);
df366 = _mm512_permutexvar_ps(eo25, df366);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df365);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df366);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df365);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df366);
df359 = _mm512_permutexvar_ps(eo25, df359);
df360 = _mm512_permutexvar_ps(eo25, df360);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df359);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df360);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df359);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df360);
df367 = _mm512_permutexvar_ps(eo25, df367);
df368 = _mm512_permutexvar_ps(eo25, df368);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df367);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df368);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df367);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df368);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df353);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df354);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df353);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df354);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df361);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m25+32*f26, 255, df362);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df361);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m25+32*f26, 65280, df362);
// Unrolled step b26 = 1. The step index is split into m26 = b26/2 (= 0) and
// f27 = b26%2 (= 1); both are only used in the dfPtr1 store offsets of this step.
ptrdiff_t b26 = 1;
ptrdiff_t m26 = (size_t)b26/2;
ptrdiff_t f27 = (size_t)b26%2;
// Load 16 vectors (dat354..dat369) from datPtr1; consecutive loads differ by 896
// in offset (8080, 8976, ..., 21520). The "0*b26" term contributes nothing.
// Mask 65528 (0xFFF8) selects lanes 3..15: the zero-masking load clears lanes 0..2,
// and the masked fmadd then computes dat*bnMul11+bnAdd11 on lanes 3..15 only, so
// lanes 0..2 remain exactly zero (they are not given the bnAdd11 offset).
// NOTE(review): 896 = 224*4, so this presumably walks 16 consecutive 224-float rows
// with byte offsets, and the three zero lanes presumably act as left-edge padding --
// confirm against the declaration of datPtr1 and the loop bounds, which are outside this view.
__m512 dat354 = _mm512_maskz_loadu_ps(65528, datPtr1+8080+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat354 = _mm512_mask_fmadd_ps(dat354, 65528, bnMul11, bnAdd11);
__m512 dat355 = _mm512_maskz_loadu_ps(65528, datPtr1+8976+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat355 = _mm512_mask_fmadd_ps(dat355, 65528, bnMul11, bnAdd11);
__m512 dat356 = _mm512_maskz_loadu_ps(65528, datPtr1+9872+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat356 = _mm512_mask_fmadd_ps(dat356, 65528, bnMul11, bnAdd11);
__m512 dat357 = _mm512_maskz_loadu_ps(65528, datPtr1+10768+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat357 = _mm512_mask_fmadd_ps(dat357, 65528, bnMul11, bnAdd11);
__m512 dat358 = _mm512_maskz_loadu_ps(65528, datPtr1+11664+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat358 = _mm512_mask_fmadd_ps(dat358, 65528, bnMul11, bnAdd11);
__m512 dat359 = _mm512_maskz_loadu_ps(65528, datPtr1+12560+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat359 = _mm512_mask_fmadd_ps(dat359, 65528, bnMul11, bnAdd11);
__m512 dat360 = _mm512_maskz_loadu_ps(65528, datPtr1+13456+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat360 = _mm512_mask_fmadd_ps(dat360, 65528, bnMul11, bnAdd11);
__m512 dat361 = _mm512_maskz_loadu_ps(65528, datPtr1+14352+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat361 = _mm512_mask_fmadd_ps(dat361, 65528, bnMul11, bnAdd11);
__m512 dat362 = _mm512_maskz_loadu_ps(65528, datPtr1+15248+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat362 = _mm512_mask_fmadd_ps(dat362, 65528, bnMul11, bnAdd11);
__m512 dat363 = _mm512_maskz_loadu_ps(65528, datPtr1+16144+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat363 = _mm512_mask_fmadd_ps(dat363, 65528, bnMul11, bnAdd11);
__m512 dat364 = _mm512_maskz_loadu_ps(65528, datPtr1+17040+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat364 = _mm512_mask_fmadd_ps(dat364, 65528, bnMul11, bnAdd11);
__m512 dat365 = _mm512_maskz_loadu_ps(65528, datPtr1+17936+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat365 = _mm512_mask_fmadd_ps(dat365, 65528, bnMul11, bnAdd11);
__m512 dat366 = _mm512_maskz_loadu_ps(65528, datPtr1+18832+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat366 = _mm512_mask_fmadd_ps(dat366, 65528, bnMul11, bnAdd11);
__m512 dat367 = _mm512_maskz_loadu_ps(65528, datPtr1+19728+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat367 = _mm512_mask_fmadd_ps(dat367, 65528, bnMul11, bnAdd11);
__m512 dat368 = _mm512_maskz_loadu_ps(65528, datPtr1+20624+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat368 = _mm512_mask_fmadd_ps(dat368, 65528, bnMul11, bnAdd11);
__m512 dat369 = _mm512_maskz_loadu_ps(65528, datPtr1+21520+602112*i6+200704*k12+896*h11+4*w11+0*b26);
dat369 = _mm512_mask_fmadd_ps(dat369, 65528, bnMul11, bnAdd11);
// Register-wise (whole-vector) add/sub butterflies over the 16 vectors dat354..dat369.
// Two independent chains are interleaved line by line:
//   fft4201..fft4220 consume dat354, dat356, ..., dat368 (every other vector, starting with the first);
//   fft4289..fft4308 consume dat355, dat357, ..., dat369 (every other vector, starting with the second).
// NOTE(review): going by the "fft" names this is presumably the first part of a
// real-input FFT across the 16 vectors -- the visible code only establishes the arithmetic below.
//
// Level 1: sum and difference of vectors that are 8 apart in load order.
__m512 fft4201 = _mm512_add_ps(dat354, dat362);
__m512 fft4289 = _mm512_add_ps(dat355, dat363);
__m512 fft4202 = _mm512_sub_ps(dat354, dat362);
__m512 fft4290 = _mm512_sub_ps(dat355, dat363);
__m512 fft4203 = _mm512_add_ps(dat356, dat364);
__m512 fft4291 = _mm512_add_ps(dat357, dat365);
__m512 fft4204 = _mm512_sub_ps(dat356, dat364);
__m512 fft4292 = _mm512_sub_ps(dat357, dat365);
__m512 fft4205 = _mm512_add_ps(dat358, dat366);
__m512 fft4293 = _mm512_add_ps(dat359, dat367);
__m512 fft4206 = _mm512_sub_ps(dat358, dat366);
__m512 fft4294 = _mm512_sub_ps(dat359, dat367);
__m512 fft4207 = _mm512_add_ps(dat360, dat368);
__m512 fft4295 = _mm512_add_ps(dat361, dat369);
__m512 fft4208 = _mm512_sub_ps(dat360, dat368);
__m512 fft4296 = _mm512_sub_ps(dat361, dat369);
// Level 2: combine the level-1 results pairwise.
__m512 fft4209 = _mm512_add_ps(fft4201, fft4205);
__m512 fft4297 = _mm512_add_ps(fft4289, fft4293);
__m512 fft4210 = _mm512_sub_ps(fft4201, fft4205);
__m512 fft4298 = _mm512_sub_ps(fft4289, fft4293);
__m512 fft4211 = _mm512_add_ps(fft4203, fft4207);
__m512 fft4299 = _mm512_add_ps(fft4291, fft4295);
// Operand order is reversed here on purpose of the generator: result is (fft4207 - fft4203), not (fft4203 - fft4207).
__m512 fft4212 = _mm512_sub_ps(fft4207, fft4203);
__m512 fft4300 = _mm512_sub_ps(fft4295, fft4291);
__m512 fft4213 = _mm512_sub_ps(fft4204, fft4208);
__m512 fft4301 = _mm512_sub_ps(fft4292, fft4296);
__m512 fft4214 = _mm512_add_ps(fft4204, fft4208);
__m512 fft4302 = _mm512_add_ps(fft4292, fft4296);
// Level 3: final sums/differences, plus terms scaled by 7.0710677e-01f (approximately sqrt(1/2)).
// Intrinsic semantics used below: fmadd(a,b,c) = a*b+c, fnmadd(a,b,c) = -(a*b)+c, fnmsub(a,b,c) = -(a*b)-c.
__m512 fft4215 = _mm512_add_ps(fft4209, fft4211);
__m512 fft4303 = _mm512_add_ps(fft4297, fft4299);
__m512 fft4216 = _mm512_sub_ps(fft4209, fft4211);
__m512 fft4304 = _mm512_sub_ps(fft4297, fft4299);
__m512 fft4217 = _mm512_fmadd_ps(fft4213, _mm512_set1_ps(7.0710677e-01f), fft4202);
__m512 fft4305 = _mm512_fmadd_ps(fft4301, _mm512_set1_ps(7.0710677e-01f), fft4290);
__m512 fft4218 = _mm512_fnmsub_ps(fft4214, _mm512_set1_ps(7.0710677e-01f), fft4206);
__m512 fft4306 = _mm512_fnmsub_ps(fft4302, _mm512_set1_ps(7.0710677e-01f), fft4294);
__m512 fft4219 = _mm512_fnmadd_ps(fft4213, _mm512_set1_ps(7.0710677e-01f), fft4202);
__m512 fft4307 = _mm512_fnmadd_ps(fft4301, _mm512_set1_ps(7.0710677e-01f), fft4290);
__m512 fft4220 = _mm512_fnmadd_ps(fft4214, _mm512_set1_ps(7.0710677e-01f), fft4206);
__m512 fft4308 = _mm512_fnmadd_ps(fft4302, _mm512_set1_ps(7.0710677e-01f), fft4294);
// In-register butterfly across the two 256-bit halves of each vector.
// _mm512_set_ps lists lanes 15 down to 0, so fft4221 is +1 in lanes 0..7 and -1 in lanes 8..15.
// _mm512_shuffle_f32x4(x, x, 78) swaps the two 256-bit halves of x, hence
//   x*fft4221 + swap(x)  ->  lanes 0..7 = lo + hi,  lanes 8..15 = lo - hi.
__m512 fft4221 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4222 = _mm512_fmadd_ps(fft4215, fft4221, _mm512_shuffle_f32x4(fft4215, fft4215, 78));
__m512 fft4309 = _mm512_fmadd_ps(fft4303, fft4221, _mm512_shuffle_f32x4(fft4303, fft4303, 78));
__m512 fft4223 = _mm512_fmadd_ps(fft4216, fft4221, _mm512_shuffle_f32x4(fft4216, fft4216, 78));
__m512 fft4310 = _mm512_fmadd_ps(fft4304, fft4221, _mm512_shuffle_f32x4(fft4304, fft4304, 78));
__m512 fft4224 = _mm512_fmadd_ps(fft4217, fft4221, _mm512_shuffle_f32x4(fft4217, fft4217, 78));
__m512 fft4311 = _mm512_fmadd_ps(fft4305, fft4221, _mm512_shuffle_f32x4(fft4305, fft4305, 78));
__m512 fft4225 = _mm512_fmadd_ps(fft4218, fft4221, _mm512_shuffle_f32x4(fft4218, fft4218, 78));
__m512 fft4312 = _mm512_fmadd_ps(fft4306, fft4221, _mm512_shuffle_f32x4(fft4306, fft4306, 78));
__m512 fft4226 = _mm512_fmadd_ps(fft4210, fft4221, _mm512_shuffle_f32x4(fft4210, fft4210, 78));
__m512 fft4313 = _mm512_fmadd_ps(fft4298, fft4221, _mm512_shuffle_f32x4(fft4298, fft4298, 78));
__m512 fft4227 = _mm512_fmadd_ps(fft4212, fft4221, _mm512_shuffle_f32x4(fft4212, fft4212, 78));
__m512 fft4314 = _mm512_fmadd_ps(fft4300, fft4221, _mm512_shuffle_f32x4(fft4300, fft4300, 78));
__m512 fft4228 = _mm512_fmadd_ps(fft4219, fft4221, _mm512_shuffle_f32x4(fft4219, fft4219, 78));
__m512 fft4315 = _mm512_fmadd_ps(fft4307, fft4221, _mm512_shuffle_f32x4(fft4307, fft4307, 78));
__m512 fft4229 = _mm512_fmadd_ps(fft4220, fft4221, _mm512_shuffle_f32x4(fft4220, fft4220, 78));
__m512 fft4316 = _mm512_fmadd_ps(fft4308, fft4221, _mm512_shuffle_f32x4(fft4308, fft4308, 78));
// Per-lane coefficient step applied to consecutive register pairs (a, b), e.g. (fft4222, fft4223):
//   a' = a*c + b*d        b' = b*c - a*d
// with c = fft4230 and d = fft4239 (below). The products a*c and b*c are formed first.
// Lane values of c (lanes 0..15): 1 in lanes 0..9, 0.7071 in 10..11, 0 in 12..13, -0.7071 in 14..15.
// NOTE(review): this has the shape of a complex multiply with real and imaginary parts
// held in separate registers (presumably FFT twiddle factors) -- confirm against the generator.
__m512 fft4230 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4231 = _mm512_mul_ps(fft4222, fft4230);
__m512 fft4317 = _mm512_mul_ps(fft4309, fft4230);
__m512 fft4232 = _mm512_mul_ps(fft4223, fft4230);
__m512 fft4318 = _mm512_mul_ps(fft4310, fft4230);
__m512 fft4233 = _mm512_mul_ps(fft4224, fft4230);
__m512 fft4319 = _mm512_mul_ps(fft4311, fft4230);
__m512 fft4234 = _mm512_mul_ps(fft4225, fft4230);
__m512 fft4320 = _mm512_mul_ps(fft4312, fft4230);
__m512 fft4235 = _mm512_mul_ps(fft4226, fft4230);
__m512 fft4321 = _mm512_mul_ps(fft4313, fft4230);
__m512 fft4236 = _mm512_mul_ps(fft4227, fft4230);
__m512 fft4322 = _mm512_mul_ps(fft4314, fft4230);
__m512 fft4237 = _mm512_mul_ps(fft4228, fft4230);
__m512 fft4323 = _mm512_mul_ps(fft4315, fft4230);
__m512 fft4238 = _mm512_mul_ps(fft4229, fft4230);
__m512 fft4324 = _mm512_mul_ps(fft4316, fft4230);
// Lane values of d (lanes 0..15): 0 in lanes 0..9, 0.7071 in 10..11, 1 in 12..13, 0.7071 in 14..15.
// In lanes 0..9 (c = 1, d = 0) the pair therefore passes through unchanged.
__m512 fft4239 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4240 = _mm512_fmadd_ps(fft4223, fft4239, fft4231);
__m512 fft4325 = _mm512_fmadd_ps(fft4310, fft4239, fft4317);
__m512 fft4241 = _mm512_fnmadd_ps(fft4222, fft4239, fft4232);
__m512 fft4326 = _mm512_fnmadd_ps(fft4309, fft4239, fft4318);
__m512 fft4242 = _mm512_fmadd_ps(fft4225, fft4239, fft4233);
__m512 fft4327 = _mm512_fmadd_ps(fft4312, fft4239, fft4319);
__m512 fft4243 = _mm512_fnmadd_ps(fft4224, fft4239, fft4234);
__m512 fft4328 = _mm512_fnmadd_ps(fft4311, fft4239, fft4320);
__m512 fft4244 = _mm512_fmadd_ps(fft4227, fft4239, fft4235);
__m512 fft4329 = _mm512_fmadd_ps(fft4314, fft4239, fft4321);
__m512 fft4245 = _mm512_fnmadd_ps(fft4226, fft4239, fft4236);
__m512 fft4330 = _mm512_fnmadd_ps(fft4313, fft4239, fft4322);
__m512 fft4246 = _mm512_fmadd_ps(fft4229, fft4239, fft4237);
__m512 fft4331 = _mm512_fmadd_ps(fft4316, fft4239, fft4323);
__m512 fft4247 = _mm512_fnmadd_ps(fft4228, fft4239, fft4238);
__m512 fft4332 = _mm512_fnmadd_ps(fft4315, fft4239, fft4324);
// In-register butterfly across adjacent 128-bit blocks.
// fft4248 is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15 (set_ps lists lanes 15 down to 0).
// _mm512_shuffle_f32x4(x, x, 177) swaps the two 128-bit blocks inside each 256-bit half, hence
//   x*fft4248 + swap(x)  ->  even blocks = blk0 + blk1,  odd blocks = blk0 - blk1.
__m512 fft4248 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4249 = _mm512_fmadd_ps(fft4240, fft4248, _mm512_shuffle_f32x4(fft4240, fft4240, 177));
__m512 fft4333 = _mm512_fmadd_ps(fft4325, fft4248, _mm512_shuffle_f32x4(fft4325, fft4325, 177));
__m512 fft4250 = _mm512_fmadd_ps(fft4241, fft4248, _mm512_shuffle_f32x4(fft4241, fft4241, 177));
__m512 fft4334 = _mm512_fmadd_ps(fft4326, fft4248, _mm512_shuffle_f32x4(fft4326, fft4326, 177));
__m512 fft4251 = _mm512_fmadd_ps(fft4242, fft4248, _mm512_shuffle_f32x4(fft4242, fft4242, 177));
__m512 fft4335 = _mm512_fmadd_ps(fft4327, fft4248, _mm512_shuffle_f32x4(fft4327, fft4327, 177));
__m512 fft4252 = _mm512_fmadd_ps(fft4243, fft4248, _mm512_shuffle_f32x4(fft4243, fft4243, 177));
__m512 fft4336 = _mm512_fmadd_ps(fft4328, fft4248, _mm512_shuffle_f32x4(fft4328, fft4328, 177));
__m512 fft4253 = _mm512_fmadd_ps(fft4244, fft4248, _mm512_shuffle_f32x4(fft4244, fft4244, 177));
__m512 fft4337 = _mm512_fmadd_ps(fft4329, fft4248, _mm512_shuffle_f32x4(fft4329, fft4329, 177));
__m512 fft4254 = _mm512_fmadd_ps(fft4245, fft4248, _mm512_shuffle_f32x4(fft4245, fft4245, 177));
__m512 fft4338 = _mm512_fmadd_ps(fft4330, fft4248, _mm512_shuffle_f32x4(fft4330, fft4330, 177));
__m512 fft4255 = _mm512_fmadd_ps(fft4246, fft4248, _mm512_shuffle_f32x4(fft4246, fft4246, 177));
__m512 fft4339 = _mm512_fmadd_ps(fft4331, fft4248, _mm512_shuffle_f32x4(fft4331, fft4331, 177));
__m512 fft4256 = _mm512_fmadd_ps(fft4247, fft4248, _mm512_shuffle_f32x4(fft4247, fft4247, 177));
__m512 fft4340 = _mm512_fmadd_ps(fft4332, fft4248, _mm512_shuffle_f32x4(fft4332, fft4332, 177));
// Cross-blend of each register pair (a, b) in lanes 6, 7, 14, 15 only (mask 49344 = 0xC0C0):
//   first result  = a with those lanes replaced by b
//   second result = b with those lanes replaced by (0 - a), i.e. -a
// All other lanes are passed through unchanged.
// NOTE(review): in those lanes this is (a, b) -> (b, -a), which looks like a multiply
// by -i on a split real/imaginary pair -- interpretation not established by the visible code.
__m512 fft4257 = _mm512_mask_mov_ps(fft4249, 49344, fft4250);
__m512 fft4341 = _mm512_mask_mov_ps(fft4333, 49344, fft4334);
__m512 fft4258 = _mm512_mask_sub_ps(fft4250, 49344, _mm512_setzero_ps(), fft4249);
__m512 fft4342 = _mm512_mask_sub_ps(fft4334, 49344, _mm512_setzero_ps(), fft4333);
__m512 fft4259 = _mm512_mask_mov_ps(fft4251, 49344, fft4252);
__m512 fft4343 = _mm512_mask_mov_ps(fft4335, 49344, fft4336);
__m512 fft4260 = _mm512_mask_sub_ps(fft4252, 49344, _mm512_setzero_ps(), fft4251);
__m512 fft4344 = _mm512_mask_sub_ps(fft4336, 49344, _mm512_setzero_ps(), fft4335);
__m512 fft4261 = _mm512_mask_mov_ps(fft4253, 49344, fft4254);
__m512 fft4345 = _mm512_mask_mov_ps(fft4337, 49344, fft4338);
__m512 fft4262 = _mm512_mask_sub_ps(fft4254, 49344, _mm512_setzero_ps(), fft4253);
__m512 fft4346 = _mm512_mask_sub_ps(fft4338, 49344, _mm512_setzero_ps(), fft4337);
__m512 fft4263 = _mm512_mask_mov_ps(fft4255, 49344, fft4256);
__m512 fft4347 = _mm512_mask_mov_ps(fft4339, 49344, fft4340);
__m512 fft4264 = _mm512_mask_sub_ps(fft4256, 49344, _mm512_setzero_ps(), fft4255);
__m512 fft4348 = _mm512_mask_sub_ps(fft4340, 49344, _mm512_setzero_ps(), fft4339);
// In-register butterfly across 64-bit lane pairs inside every 128-bit block.
// fft4265 is +1 in lanes 0,1 and -1 in lanes 2,3 of each block (set_ps lists lanes 15 down to 0).
// _mm512_shuffle_ps(x, x, 78) swaps the low and high lane pair of each 128-bit block, hence
//   x*fft4265 + swap(x)  ->  lanes 0,1 = pair0 + pair1,  lanes 2,3 = pair0 - pair1  (per block).
__m512 fft4265 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4266 = _mm512_fmadd_ps(fft4257, fft4265, _mm512_shuffle_ps(fft4257, fft4257, 78));
__m512 fft4349 = _mm512_fmadd_ps(fft4341, fft4265, _mm512_shuffle_ps(fft4341, fft4341, 78));
__m512 fft4267 = _mm512_fmadd_ps(fft4258, fft4265, _mm512_shuffle_ps(fft4258, fft4258, 78));
__m512 fft4350 = _mm512_fmadd_ps(fft4342, fft4265, _mm512_shuffle_ps(fft4342, fft4342, 78));
__m512 fft4268 = _mm512_fmadd_ps(fft4259, fft4265, _mm512_shuffle_ps(fft4259, fft4259, 78));
__m512 fft4351 = _mm512_fmadd_ps(fft4343, fft4265, _mm512_shuffle_ps(fft4343, fft4343, 78));
__m512 fft4269 = _mm512_fmadd_ps(fft4260, fft4265, _mm512_shuffle_ps(fft4260, fft4260, 78));
__m512 fft4352 = _mm512_fmadd_ps(fft4344, fft4265, _mm512_shuffle_ps(fft4344, fft4344, 78));
__m512 fft4270 = _mm512_fmadd_ps(fft4261, fft4265, _mm512_shuffle_ps(fft4261, fft4261, 78));
__m512 fft4353 = _mm512_fmadd_ps(fft4345, fft4265, _mm512_shuffle_ps(fft4345, fft4345, 78));
__m512 fft4271 = _mm512_fmadd_ps(fft4262, fft4265, _mm512_shuffle_ps(fft4262, fft4262, 78));
__m512 fft4354 = _mm512_fmadd_ps(fft4346, fft4265, _mm512_shuffle_ps(fft4346, fft4346, 78));
__m512 fft4272 = _mm512_fmadd_ps(fft4263, fft4265, _mm512_shuffle_ps(fft4263, fft4263, 78));
__m512 fft4355 = _mm512_fmadd_ps(fft4347, fft4265, _mm512_shuffle_ps(fft4347, fft4347, 78));
__m512 fft4273 = _mm512_fmadd_ps(fft4264, fft4265, _mm512_shuffle_ps(fft4264, fft4264, 78));
__m512 fft4356 = _mm512_fmadd_ps(fft4348, fft4265, _mm512_shuffle_ps(fft4348, fft4348, 78));
// Extra post-processing applied ONLY to the first register pair of each chain
// (fft4266/fft4267 and fft4349/fft4350); the other pairs are used as-is later on.
// _mm512_permutexvar_ps(idx, x) gathers dst[i] = x[idx[i]]; both index vectors duplicate
// every selected source lane into two neighbouring destination lanes.
// NOTE(review): presumably this is the step that separates/recombines conjugate-symmetric
// bins of a real-input transform -- not established by the visible code.
__m512i fft4274 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4275 = _mm512_permutexvar_ps(fft4274, fft4266);
__m512 fft4357 = _mm512_permutexvar_ps(fft4274, fft4349);
__m512i fft4276 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4277 = _mm512_permutexvar_ps(fft4276, fft4266);
__m512 fft4358 = _mm512_permutexvar_ps(fft4276, fft4349);
__m512 fft4278 = _mm512_permutexvar_ps(fft4274, fft4267);
__m512 fft4359 = _mm512_permutexvar_ps(fft4274, fft4350);
__m512 fft4279 = _mm512_permutexvar_ps(fft4276, fft4267);
__m512 fft4360 = _mm512_permutexvar_ps(fft4276, fft4350);
// fft4280 per 8-lane half: 0 in lanes 0,1, then alternating +1 (even lane) / -1 (odd lane) in lanes 2..7.
//   fft4281 = fft4275*fft4280 + fft4277      fft4282 = fft4278 - fft4279*fft4280
__m512 fft4280 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4281 = _mm512_fmadd_ps(fft4275, fft4280, fft4277);
__m512 fft4361 = _mm512_fmadd_ps(fft4357, fft4280, fft4358);
__m512 fft4282 = _mm512_fnmadd_ps(fft4279, fft4280, fft4278);
__m512 fft4362 = _mm512_fnmadd_ps(fft4360, fft4280, fft4359);
// Lane-wise blends (mask_mov(src, k, a): lane from a where the bit of k is set, else from src):
//   21845 = 0x5555 -> even lanes
//   43176 = 0xA8A8 -> lanes 3, 5, 7, 11, 13, 15
//   22102 = 0x5656 -> lanes 1, 2, 4, 6, 9, 10, 12, 14
__m512 fft4283 = _mm512_mask_mov_ps(fft4279, 21845, fft4281);
__m512 fft4363 = _mm512_mask_mov_ps(fft4360, 21845, fft4361);
__m512 fft4284 = _mm512_mask_mov_ps(fft4275, 43176, fft4281);
__m512 fft4364 = _mm512_mask_mov_ps(fft4357, 43176, fft4361);
__m512 fft4285 = _mm512_mask_mov_ps(fft4283, 43176, fft4282);
__m512 fft4365 = _mm512_mask_mov_ps(fft4363, 43176, fft4362);
__m512 fft4286 = _mm512_mask_mov_ps(fft4284, 22102, fft4282);
__m512 fft4366 = _mm512_mask_mov_ps(fft4364, 22102, fft4362);
// Scale by 0.5 in lanes 2..7 and 10..15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 are left unscaled.
__m512 fft4287 = _mm512_mask_mul_ps(fft4285, 64764, fft4285, _mm512_set1_ps(5e-01f));
__m512 fft4367 = _mm512_mask_mul_ps(fft4365, 64764, fft4365, _mm512_set1_ps(5e-01f));
__m512 fft4288 = _mm512_mask_mul_ps(fft4286, 64764, fft4286, _mm512_set1_ps(5e-01f));
__m512 fft4368 = _mm512_mask_mul_ps(fft4366, 64764, fft4366, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors of this step to df369..df384 before storing.
// df369/df370 and df377/df378 are the already permuted and scaled first pairs
// (fft4287/fft4288, fft4367/fft4368); the remaining twelve come straight from the last butterfly.
__m512 df369 = fft4287;
__m512 df377 = fft4367;
__m512 df370 = fft4288;
__m512 df378 = fft4368;
__m512 df371 = fft4268;
__m512 df379 = fft4351;
__m512 df372 = fft4269;
__m512 df380 = fft4352;
__m512 df373 = fft4270;
__m512 df381 = fft4353;
__m512 df374 = fft4271;
__m512 df382 = fft4354;
__m512 df375 = fft4272;
__m512 df383 = fft4355;
__m512 df376 = fft4273;
__m512 df384 = fft4356;
// eo26 de-interleaves a vector: source lanes 0,2,...,14 go to lanes 0..7 and source lanes
// 1,3,...,15 go to lanes 8..15. It is applied to every vector except df369, df370, df377, df378.
__m512i eo26 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every register below: mask 255 writes only lanes 0..7, mask 65280
// writes only lanes 8..15; the second register of each pair goes 64 past the first.
// With m26 = 0 and f27 = 1 the "128*m26+32*f27" term of this step equals 32.
// NOTE(review): assuming dfPtr1 is a byte pointer (declaration not visible here), lanes 8..15
// of a 65280-masked store start 32 bytes past the given address, so each upper half lands
// exactly 407040 bytes after its lower half (e.g. 508768+32 = 101760+407040) -- confirm.
df371 = _mm512_permutexvar_ps(eo26, df371);
df372 = _mm512_permutexvar_ps(eo26, df372);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df371);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df372);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df371);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df372);
df379 = _mm512_permutexvar_ps(eo26, df379);
df380 = _mm512_permutexvar_ps(eo26, df380);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df379);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df380);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df379);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df380);
df373 = _mm512_permutexvar_ps(eo26, df373);
df374 = _mm512_permutexvar_ps(eo26, df374);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df373);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df374);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df373);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df374);
df381 = _mm512_permutexvar_ps(eo26, df381);
df382 = _mm512_permutexvar_ps(eo26, df382);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df381);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df382);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df381);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df382);
df375 = _mm512_permutexvar_ps(eo26, df375);
df376 = _mm512_permutexvar_ps(eo26, df376);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df375);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df376);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df375);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df376);
df383 = _mm512_permutexvar_ps(eo26, df383);
df384 = _mm512_permutexvar_ps(eo26, df384);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df383);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df384);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df383);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df384);
// The first pair of each chain is stored without the eo26 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df369);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df370);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df369);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df370);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df377);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m26+32*f27, 255, df378);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df377);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m26+32*f27, 65280, df378);
// Passes b27 = 2..5. Each pass loads sixteen 16-float vectors from datPtr1,
// applies the lane-wise affine map x*bnMul11 + bnAdd11, pushes them through two
// interleaved add/sub + shuffle pipelines (the "fft" temporaries), and writes the
// sixteen resulting vectors to dfPtr1 using half-width masked stores.
// NOTE(review): datPtr1, dfPtr1, bnMul11, bnAdd11, i6, j2, k12, h11 and w11 are
// defined outside this excerpt. The offsets look like byte offsets (4*w11 per
// float), which would make the pointers byte pointers -- confirm at the declarations.
for (ptrdiff_t b27 = 2; b27 < 6; ++b27) {
// Split the pass index into a pair index (m27) and a parity (f28); both are only
// used in the dfPtr1 store offsets below (128*m27 + 32*f28). The size_t casts
// make the division and modulo unsigned.
ptrdiff_t m27 = (size_t)b27/2;
ptrdiff_t f28 = (size_t)b27%2;
// Sixteen unaligned loads with all 16 lanes enabled (mask 65535 = 0xFFFF).
// Consecutive loads are 896 apart, the same factor that multiplies h11, so they
// step through 16 consecutive h positions (presumably image rows -- confirm).
// Each load is followed by a full-mask fmadd: dat = dat*bnMul11 + bnAdd11.
__m512 dat370 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat370 = _mm512_mask_fmadd_ps(dat370, 65535, bnMul11, bnAdd11);
__m512 dat371 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat371 = _mm512_mask_fmadd_ps(dat371, 65535, bnMul11, bnAdd11);
__m512 dat372 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat372 = _mm512_mask_fmadd_ps(dat372, 65535, bnMul11, bnAdd11);
__m512 dat373 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat373 = _mm512_mask_fmadd_ps(dat373, 65535, bnMul11, bnAdd11);
__m512 dat374 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat374 = _mm512_mask_fmadd_ps(dat374, 65535, bnMul11, bnAdd11);
__m512 dat375 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat375 = _mm512_mask_fmadd_ps(dat375, 65535, bnMul11, bnAdd11);
__m512 dat376 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat376 = _mm512_mask_fmadd_ps(dat376, 65535, bnMul11, bnAdd11);
__m512 dat377 = _mm512_maskz_loadu_ps(65535, datPtr1+14312+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat377 = _mm512_mask_fmadd_ps(dat377, 65535, bnMul11, bnAdd11);
__m512 dat378 = _mm512_maskz_loadu_ps(65535, datPtr1+15208+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat378 = _mm512_mask_fmadd_ps(dat378, 65535, bnMul11, bnAdd11);
__m512 dat379 = _mm512_maskz_loadu_ps(65535, datPtr1+16104+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat379 = _mm512_mask_fmadd_ps(dat379, 65535, bnMul11, bnAdd11);
__m512 dat380 = _mm512_maskz_loadu_ps(65535, datPtr1+17000+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat380 = _mm512_mask_fmadd_ps(dat380, 65535, bnMul11, bnAdd11);
__m512 dat381 = _mm512_maskz_loadu_ps(65535, datPtr1+17896+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat381 = _mm512_mask_fmadd_ps(dat381, 65535, bnMul11, bnAdd11);
__m512 dat382 = _mm512_maskz_loadu_ps(65535, datPtr1+18792+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat382 = _mm512_mask_fmadd_ps(dat382, 65535, bnMul11, bnAdd11);
__m512 dat383 = _mm512_maskz_loadu_ps(65535, datPtr1+19688+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat383 = _mm512_mask_fmadd_ps(dat383, 65535, bnMul11, bnAdd11);
__m512 dat384 = _mm512_maskz_loadu_ps(65535, datPtr1+20584+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat384 = _mm512_mask_fmadd_ps(dat384, 65535, bnMul11, bnAdd11);
__m512 dat385 = _mm512_maskz_loadu_ps(65535, datPtr1+21480+602112*i6+200704*k12+896*h11+4*w11+40*b27);
dat385 = _mm512_mask_fmadd_ps(dat385, 65535, bnMul11, bnAdd11);
// Stage 1: sum and difference of each vector with the one eight loads later.
// Even-numbered loads (dat370, dat372, ...) feed the first chain (fft4369...),
// odd-numbered loads (dat371, dat373, ...) feed the second chain (fft4457...).
// From here on the two chains are emitted line-by-line interleaved and perform
// the same operations on their own data.
__m512 fft4369 = _mm512_add_ps(dat370, dat378);
__m512 fft4457 = _mm512_add_ps(dat371, dat379);
__m512 fft4370 = _mm512_sub_ps(dat370, dat378);
__m512 fft4458 = _mm512_sub_ps(dat371, dat379);
__m512 fft4371 = _mm512_add_ps(dat372, dat380);
__m512 fft4459 = _mm512_add_ps(dat373, dat381);
__m512 fft4372 = _mm512_sub_ps(dat372, dat380);
__m512 fft4460 = _mm512_sub_ps(dat373, dat381);
__m512 fft4373 = _mm512_add_ps(dat374, dat382);
__m512 fft4461 = _mm512_add_ps(dat375, dat383);
__m512 fft4374 = _mm512_sub_ps(dat374, dat382);
__m512 fft4462 = _mm512_sub_ps(dat375, dat383);
__m512 fft4375 = _mm512_add_ps(dat376, dat384);
__m512 fft4463 = _mm512_add_ps(dat377, dat385);
__m512 fft4376 = _mm512_sub_ps(dat376, dat384);
__m512 fft4464 = _mm512_sub_ps(dat377, dat385);
// Stage 2: second level of sums and differences on the stage-1 results.
// Note the reversed operand order for fft4380 / fft4468 (x75 - x71, x63 - x59).
__m512 fft4377 = _mm512_add_ps(fft4369, fft4373);
__m512 fft4465 = _mm512_add_ps(fft4457, fft4461);
__m512 fft4378 = _mm512_sub_ps(fft4369, fft4373);
__m512 fft4466 = _mm512_sub_ps(fft4457, fft4461);
__m512 fft4379 = _mm512_add_ps(fft4371, fft4375);
__m512 fft4467 = _mm512_add_ps(fft4459, fft4463);
__m512 fft4380 = _mm512_sub_ps(fft4375, fft4371);
__m512 fft4468 = _mm512_sub_ps(fft4463, fft4459);
__m512 fft4381 = _mm512_sub_ps(fft4372, fft4376);
__m512 fft4469 = _mm512_sub_ps(fft4460, fft4464);
__m512 fft4382 = _mm512_add_ps(fft4372, fft4376);
__m512 fft4470 = _mm512_add_ps(fft4460, fft4464);
// Stage 3: final sums/differences plus terms scaled by 0.70710677 (~ 1/sqrt(2)).
// fmadd(a,b,c) = a*b + c, fnmadd(a,b,c) = -(a*b) + c, fnmsub(a,b,c) = -(a*b) - c.
__m512 fft4383 = _mm512_add_ps(fft4377, fft4379);
__m512 fft4471 = _mm512_add_ps(fft4465, fft4467);
__m512 fft4384 = _mm512_sub_ps(fft4377, fft4379);
__m512 fft4472 = _mm512_sub_ps(fft4465, fft4467);
__m512 fft4385 = _mm512_fmadd_ps(fft4381, _mm512_set1_ps(7.0710677e-01f), fft4370);
__m512 fft4473 = _mm512_fmadd_ps(fft4469, _mm512_set1_ps(7.0710677e-01f), fft4458);
__m512 fft4386 = _mm512_fnmsub_ps(fft4382, _mm512_set1_ps(7.0710677e-01f), fft4374);
__m512 fft4474 = _mm512_fnmsub_ps(fft4470, _mm512_set1_ps(7.0710677e-01f), fft4462);
__m512 fft4387 = _mm512_fnmadd_ps(fft4381, _mm512_set1_ps(7.0710677e-01f), fft4370);
__m512 fft4475 = _mm512_fnmadd_ps(fft4469, _mm512_set1_ps(7.0710677e-01f), fft4458);
__m512 fft4388 = _mm512_fnmadd_ps(fft4382, _mm512_set1_ps(7.0710677e-01f), fft4374);
__m512 fft4476 = _mm512_fnmadd_ps(fft4470, _mm512_set1_ps(7.0710677e-01f), fft4462);
// Within-vector step across the 256-bit halves. _mm512_set_ps lists lane 15
// first, so fft4389 is +1 in lanes 0-7 and -1 in lanes 8-15. The f32x4 shuffle
// with immediate 78 swaps the two 256-bit halves, so each result holds
// (low + high) in lanes 0-7 and (low - high) in lanes 8-15.
__m512 fft4389 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4390 = _mm512_fmadd_ps(fft4383, fft4389, _mm512_shuffle_f32x4(fft4383, fft4383, 78));
__m512 fft4477 = _mm512_fmadd_ps(fft4471, fft4389, _mm512_shuffle_f32x4(fft4471, fft4471, 78));
__m512 fft4391 = _mm512_fmadd_ps(fft4384, fft4389, _mm512_shuffle_f32x4(fft4384, fft4384, 78));
__m512 fft4478 = _mm512_fmadd_ps(fft4472, fft4389, _mm512_shuffle_f32x4(fft4472, fft4472, 78));
__m512 fft4392 = _mm512_fmadd_ps(fft4385, fft4389, _mm512_shuffle_f32x4(fft4385, fft4385, 78));
__m512 fft4479 = _mm512_fmadd_ps(fft4473, fft4389, _mm512_shuffle_f32x4(fft4473, fft4473, 78));
__m512 fft4393 = _mm512_fmadd_ps(fft4386, fft4389, _mm512_shuffle_f32x4(fft4386, fft4386, 78));
__m512 fft4480 = _mm512_fmadd_ps(fft4474, fft4389, _mm512_shuffle_f32x4(fft4474, fft4474, 78));
__m512 fft4394 = _mm512_fmadd_ps(fft4378, fft4389, _mm512_shuffle_f32x4(fft4378, fft4378, 78));
__m512 fft4481 = _mm512_fmadd_ps(fft4466, fft4389, _mm512_shuffle_f32x4(fft4466, fft4466, 78));
__m512 fft4395 = _mm512_fmadd_ps(fft4380, fft4389, _mm512_shuffle_f32x4(fft4380, fft4380, 78));
__m512 fft4482 = _mm512_fmadd_ps(fft4468, fft4389, _mm512_shuffle_f32x4(fft4468, fft4468, 78));
__m512 fft4396 = _mm512_fmadd_ps(fft4387, fft4389, _mm512_shuffle_f32x4(fft4387, fft4387, 78));
__m512 fft4483 = _mm512_fmadd_ps(fft4475, fft4389, _mm512_shuffle_f32x4(fft4475, fft4475, 78));
__m512 fft4397 = _mm512_fmadd_ps(fft4388, fft4389, _mm512_shuffle_f32x4(fft4388, fft4388, 78));
__m512 fft4484 = _mm512_fmadd_ps(fft4476, fft4389, _mm512_shuffle_f32x4(fft4476, fft4476, 78));
// Per-lane weighting, part 1. fft4398 is 1 in lanes 0-9, 0.70710677 in lanes
// 10-11, 0 in lanes 12-13 and -0.70710677 in lanes 14-15.
__m512 fft4398 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4399 = _mm512_mul_ps(fft4390, fft4398);
__m512 fft4485 = _mm512_mul_ps(fft4477, fft4398);
__m512 fft4400 = _mm512_mul_ps(fft4391, fft4398);
__m512 fft4486 = _mm512_mul_ps(fft4478, fft4398);
__m512 fft4401 = _mm512_mul_ps(fft4392, fft4398);
__m512 fft4487 = _mm512_mul_ps(fft4479, fft4398);
__m512 fft4402 = _mm512_mul_ps(fft4393, fft4398);
__m512 fft4488 = _mm512_mul_ps(fft4480, fft4398);
__m512 fft4403 = _mm512_mul_ps(fft4394, fft4398);
__m512 fft4489 = _mm512_mul_ps(fft4481, fft4398);
__m512 fft4404 = _mm512_mul_ps(fft4395, fft4398);
__m512 fft4490 = _mm512_mul_ps(fft4482, fft4398);
__m512 fft4405 = _mm512_mul_ps(fft4396, fft4398);
__m512 fft4491 = _mm512_mul_ps(fft4483, fft4398);
__m512 fft4406 = _mm512_mul_ps(fft4397, fft4398);
__m512 fft4492 = _mm512_mul_ps(fft4484, fft4398);
// Per-lane weighting, part 2: vectors are combined in pairs (a, b) as
//   a*fft4398 + b*fft4407   and   b*fft4398 - a*fft4407.
// fft4407 is 0 in lanes 0-9, so those lanes pass through unchanged and only
// lanes 10-15 mix the two vectors of a pair.
__m512 fft4407 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4408 = _mm512_fmadd_ps(fft4391, fft4407, fft4399);
__m512 fft4493 = _mm512_fmadd_ps(fft4478, fft4407, fft4485);
__m512 fft4409 = _mm512_fnmadd_ps(fft4390, fft4407, fft4400);
__m512 fft4494 = _mm512_fnmadd_ps(fft4477, fft4407, fft4486);
__m512 fft4410 = _mm512_fmadd_ps(fft4393, fft4407, fft4401);
__m512 fft4495 = _mm512_fmadd_ps(fft4480, fft4407, fft4487);
__m512 fft4411 = _mm512_fnmadd_ps(fft4392, fft4407, fft4402);
__m512 fft4496 = _mm512_fnmadd_ps(fft4479, fft4407, fft4488);
__m512 fft4412 = _mm512_fmadd_ps(fft4395, fft4407, fft4403);
__m512 fft4497 = _mm512_fmadd_ps(fft4482, fft4407, fft4489);
__m512 fft4413 = _mm512_fnmadd_ps(fft4394, fft4407, fft4404);
__m512 fft4498 = _mm512_fnmadd_ps(fft4481, fft4407, fft4490);
__m512 fft4414 = _mm512_fmadd_ps(fft4397, fft4407, fft4405);
__m512 fft4499 = _mm512_fmadd_ps(fft4484, fft4407, fft4491);
__m512 fft4415 = _mm512_fnmadd_ps(fft4396, fft4407, fft4406);
__m512 fft4500 = _mm512_fnmadd_ps(fft4483, fft4407, fft4492);
// Same sum/difference pattern one level down: the f32x4 shuffle with immediate
// 177 swaps adjacent 128-bit lanes inside each 256-bit half, and fft4416 is +1
// in lanes 0-3 / 8-11 and -1 in lanes 4-7 / 12-15.
__m512 fft4416 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4417 = _mm512_fmadd_ps(fft4408, fft4416, _mm512_shuffle_f32x4(fft4408, fft4408, 177));
__m512 fft4501 = _mm512_fmadd_ps(fft4493, fft4416, _mm512_shuffle_f32x4(fft4493, fft4493, 177));
__m512 fft4418 = _mm512_fmadd_ps(fft4409, fft4416, _mm512_shuffle_f32x4(fft4409, fft4409, 177));
__m512 fft4502 = _mm512_fmadd_ps(fft4494, fft4416, _mm512_shuffle_f32x4(fft4494, fft4494, 177));
__m512 fft4419 = _mm512_fmadd_ps(fft4410, fft4416, _mm512_shuffle_f32x4(fft4410, fft4410, 177));
__m512 fft4503 = _mm512_fmadd_ps(fft4495, fft4416, _mm512_shuffle_f32x4(fft4495, fft4495, 177));
__m512 fft4420 = _mm512_fmadd_ps(fft4411, fft4416, _mm512_shuffle_f32x4(fft4411, fft4411, 177));
__m512 fft4504 = _mm512_fmadd_ps(fft4496, fft4416, _mm512_shuffle_f32x4(fft4496, fft4496, 177));
__m512 fft4421 = _mm512_fmadd_ps(fft4412, fft4416, _mm512_shuffle_f32x4(fft4412, fft4412, 177));
__m512 fft4505 = _mm512_fmadd_ps(fft4497, fft4416, _mm512_shuffle_f32x4(fft4497, fft4497, 177));
__m512 fft4422 = _mm512_fmadd_ps(fft4413, fft4416, _mm512_shuffle_f32x4(fft4413, fft4413, 177));
__m512 fft4506 = _mm512_fmadd_ps(fft4498, fft4416, _mm512_shuffle_f32x4(fft4498, fft4498, 177));
__m512 fft4423 = _mm512_fmadd_ps(fft4414, fft4416, _mm512_shuffle_f32x4(fft4414, fft4414, 177));
__m512 fft4507 = _mm512_fmadd_ps(fft4499, fft4416, _mm512_shuffle_f32x4(fft4499, fft4499, 177));
__m512 fft4424 = _mm512_fmadd_ps(fft4415, fft4416, _mm512_shuffle_f32x4(fft4415, fft4415, 177));
__m512 fft4508 = _mm512_fmadd_ps(fft4500, fft4416, _mm512_shuffle_f32x4(fft4500, fft4500, 177));
// Cross-over in lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0): for each pair (a, b)
// the first result is a with those lanes taken from b, and the second is b with
// those lanes replaced by 0 - a (i.e. the negated lanes of a).
__m512 fft4425 = _mm512_mask_mov_ps(fft4417, 49344, fft4418);
__m512 fft4509 = _mm512_mask_mov_ps(fft4501, 49344, fft4502);
__m512 fft4426 = _mm512_mask_sub_ps(fft4418, 49344, _mm512_setzero_ps(), fft4417);
__m512 fft4510 = _mm512_mask_sub_ps(fft4502, 49344, _mm512_setzero_ps(), fft4501);
__m512 fft4427 = _mm512_mask_mov_ps(fft4419, 49344, fft4420);
__m512 fft4511 = _mm512_mask_mov_ps(fft4503, 49344, fft4504);
__m512 fft4428 = _mm512_mask_sub_ps(fft4420, 49344, _mm512_setzero_ps(), fft4419);
__m512 fft4512 = _mm512_mask_sub_ps(fft4504, 49344, _mm512_setzero_ps(), fft4503);
__m512 fft4429 = _mm512_mask_mov_ps(fft4421, 49344, fft4422);
__m512 fft4513 = _mm512_mask_mov_ps(fft4505, 49344, fft4506);
__m512 fft4430 = _mm512_mask_sub_ps(fft4422, 49344, _mm512_setzero_ps(), fft4421);
__m512 fft4514 = _mm512_mask_sub_ps(fft4506, 49344, _mm512_setzero_ps(), fft4505);
__m512 fft4431 = _mm512_mask_mov_ps(fft4423, 49344, fft4424);
__m512 fft4515 = _mm512_mask_mov_ps(fft4507, 49344, fft4508);
__m512 fft4432 = _mm512_mask_sub_ps(fft4424, 49344, _mm512_setzero_ps(), fft4423);
__m512 fft4516 = _mm512_mask_sub_ps(fft4508, 49344, _mm512_setzero_ps(), fft4507);
// Last sum/difference level: _mm512_shuffle_ps with immediate 78 swaps the two
// 64-bit halves of every 128-bit lane; fft4433 is +1 on the lower two floats of
// each 128-bit lane and -1 on the upper two.
__m512 fft4433 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4434 = _mm512_fmadd_ps(fft4425, fft4433, _mm512_shuffle_ps(fft4425, fft4425, 78));
__m512 fft4517 = _mm512_fmadd_ps(fft4509, fft4433, _mm512_shuffle_ps(fft4509, fft4509, 78));
__m512 fft4435 = _mm512_fmadd_ps(fft4426, fft4433, _mm512_shuffle_ps(fft4426, fft4426, 78));
__m512 fft4518 = _mm512_fmadd_ps(fft4510, fft4433, _mm512_shuffle_ps(fft4510, fft4510, 78));
__m512 fft4436 = _mm512_fmadd_ps(fft4427, fft4433, _mm512_shuffle_ps(fft4427, fft4427, 78));
__m512 fft4519 = _mm512_fmadd_ps(fft4511, fft4433, _mm512_shuffle_ps(fft4511, fft4511, 78));
__m512 fft4437 = _mm512_fmadd_ps(fft4428, fft4433, _mm512_shuffle_ps(fft4428, fft4428, 78));
__m512 fft4520 = _mm512_fmadd_ps(fft4512, fft4433, _mm512_shuffle_ps(fft4512, fft4512, 78));
__m512 fft4438 = _mm512_fmadd_ps(fft4429, fft4433, _mm512_shuffle_ps(fft4429, fft4429, 78));
__m512 fft4521 = _mm512_fmadd_ps(fft4513, fft4433, _mm512_shuffle_ps(fft4513, fft4513, 78));
__m512 fft4439 = _mm512_fmadd_ps(fft4430, fft4433, _mm512_shuffle_ps(fft4430, fft4430, 78));
__m512 fft4522 = _mm512_fmadd_ps(fft4514, fft4433, _mm512_shuffle_ps(fft4514, fft4514, 78));
__m512 fft4440 = _mm512_fmadd_ps(fft4431, fft4433, _mm512_shuffle_ps(fft4431, fft4431, 78));
__m512 fft4523 = _mm512_fmadd_ps(fft4515, fft4433, _mm512_shuffle_ps(fft4515, fft4515, 78));
__m512 fft4441 = _mm512_fmadd_ps(fft4432, fft4433, _mm512_shuffle_ps(fft4432, fft4432, 78));
__m512 fft4524 = _mm512_fmadd_ps(fft4516, fft4433, _mm512_shuffle_ps(fft4516, fft4516, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft4434/fft4435 and fft4517/fft4518): two fixed lane permutations of each
// vector, which are then recombined below.
__m512i fft4442 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4443 = _mm512_permutexvar_ps(fft4442, fft4434);
__m512 fft4525 = _mm512_permutexvar_ps(fft4442, fft4517);
__m512i fft4444 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4445 = _mm512_permutexvar_ps(fft4444, fft4434);
__m512 fft4526 = _mm512_permutexvar_ps(fft4444, fft4517);
__m512 fft4446 = _mm512_permutexvar_ps(fft4442, fft4435);
__m512 fft4527 = _mm512_permutexvar_ps(fft4442, fft4518);
__m512 fft4447 = _mm512_permutexvar_ps(fft4444, fft4435);
__m512 fft4528 = _mm512_permutexvar_ps(fft4444, fft4518);
// fft4448 is 0 in lanes 0, 1, 8, 9 and alternates +1 (even lane) / -1 (odd
// lane) elsewhere, so the fmadd/fnmadd add or subtract the permuted copies
// lane by lane.
__m512 fft4448 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4449 = _mm512_fmadd_ps(fft4443, fft4448, fft4445);
__m512 fft4529 = _mm512_fmadd_ps(fft4525, fft4448, fft4526);
__m512 fft4450 = _mm512_fnmadd_ps(fft4447, fft4448, fft4446);
__m512 fft4530 = _mm512_fnmadd_ps(fft4528, fft4448, fft4527);
// Lane blends that assemble the two output vectors per chain. Masks:
// 21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
// 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft4451 = _mm512_mask_mov_ps(fft4447, 21845, fft4449);
__m512 fft4531 = _mm512_mask_mov_ps(fft4528, 21845, fft4529);
__m512 fft4452 = _mm512_mask_mov_ps(fft4443, 43176, fft4449);
__m512 fft4532 = _mm512_mask_mov_ps(fft4525, 43176, fft4529);
__m512 fft4453 = _mm512_mask_mov_ps(fft4451, 43176, fft4450);
__m512 fft4533 = _mm512_mask_mov_ps(fft4531, 43176, fft4530);
__m512 fft4454 = _mm512_mask_mov_ps(fft4452, 22102, fft4450);
__m512 fft4534 = _mm512_mask_mov_ps(fft4532, 22102, fft4530);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC); unselected
// lanes keep their value.
__m512 fft4455 = _mm512_mask_mul_ps(fft4453, 64764, fft4453, _mm512_set1_ps(5e-01f));
__m512 fft4535 = _mm512_mask_mul_ps(fft4533, 64764, fft4533, _mm512_set1_ps(5e-01f));
__m512 fft4456 = _mm512_mask_mul_ps(fft4454, 64764, fft4454, _mm512_set1_ps(5e-01f));
__m512 fft4536 = _mm512_mask_mul_ps(fft4534, 64764, fft4534, _mm512_set1_ps(5e-01f));
// Rename the results for the store phase: df385-df392 come from the first
// chain, df393-df400 from the second.
__m512 df385 = fft4455;
__m512 df393 = fft4535;
__m512 df386 = fft4456;
__m512 df394 = fft4536;
__m512 df387 = fft4436;
__m512 df395 = fft4519;
__m512 df388 = fft4437;
__m512 df396 = fft4520;
__m512 df389 = fft4438;
__m512 df397 = fft4521;
__m512 df390 = fft4439;
__m512 df398 = fft4522;
__m512 df391 = fft4440;
__m512 df399 = fft4523;
__m512 df392 = fft4441;
__m512 df400 = fft4524;
// eo27 gathers the even-indexed lanes into lanes 0-7 and the odd-indexed lanes
// into lanes 8-15. Each permuted vector is then written with two masked stores:
// mask 255 writes lanes 0-7, mask 65280 (0xFF00) writes lanes 8-15. Every
// 65280 store uses the matching 255 store's offset plus 407008 (= 407040 - 32).
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 then land exactly
// 407040 bytes after lanes 0-7 -- confirm against the dfPtr1 layout.
__m512i eo27 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df387 = _mm512_permutexvar_ps(eo27, df387);
df388 = _mm512_permutexvar_ps(eo27, df388);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df387);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df388);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df387);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df388);
df395 = _mm512_permutexvar_ps(eo27, df395);
df396 = _mm512_permutexvar_ps(eo27, df396);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df395);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df396);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df395);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df396);
df389 = _mm512_permutexvar_ps(eo27, df389);
df390 = _mm512_permutexvar_ps(eo27, df390);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df389);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df390);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df389);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df390);
df397 = _mm512_permutexvar_ps(eo27, df397);
df398 = _mm512_permutexvar_ps(eo27, df398);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df397);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df398);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df397);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df398);
df391 = _mm512_permutexvar_ps(eo27, df391);
df392 = _mm512_permutexvar_ps(eo27, df392);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df391);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df392);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df391);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df392);
df399 = _mm512_permutexvar_ps(eo27, df399);
df400 = _mm512_permutexvar_ps(eo27, df400);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df399);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df400);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df399);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df400);
// df385/df386 and df393/df394 (the post-processed first pair of each chain)
// are stored without the eo27 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df385);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df386);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df385);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df386);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df393);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k12+128*m27+32*f28, 255, df394);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df393);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k12+128*m27+32*f28, 65280, df394);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 16;
}
if (rel2 < 19) {
if (rel2 < 18) {
ptrdiff_t h12 = base2+40;
ptrdiff_t w12 = -910+60*rel2;
ptrdiff_t jj6 = 17-rel2+j2;
for (; j2 <= jj6; w12 += 60) {
ptrdiff_t k13 = 3*s1;
ptrdiff_t kk12 = k13+2;
for (; k13 <= kk12; ++k13) {
__m512 bnMul12 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k13+3*i6))[0]);
__m512 bnAdd12 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k13+3*i6))[1]);
for (ptrdiff_t b28 = 0; b28 < 6; ++b28) {
ptrdiff_t m28 = (size_t)b28/2;
ptrdiff_t f29 = (size_t)b28%2;
__m512 dat386 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat386 = _mm512_mask_fmadd_ps(dat386, 65535, bnMul12, bnAdd12);
__m512 dat387 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat387 = _mm512_mask_fmadd_ps(dat387, 65535, bnMul12, bnAdd12);
__m512 dat388 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat388 = _mm512_mask_fmadd_ps(dat388, 65535, bnMul12, bnAdd12);
__m512 dat389 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat389 = _mm512_mask_fmadd_ps(dat389, 65535, bnMul12, bnAdd12);
__m512 dat390 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat390 = _mm512_mask_fmadd_ps(dat390, 65535, bnMul12, bnAdd12);
__m512 dat391 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat391 = _mm512_mask_fmadd_ps(dat391, 65535, bnMul12, bnAdd12);
__m512 dat392 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat392 = _mm512_mask_fmadd_ps(dat392, 65535, bnMul12, bnAdd12);
__m512 dat393 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat393 = _mm512_mask_fmadd_ps(dat393, 65535, bnMul12, bnAdd12);
__m512 dat394 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat394 = _mm512_mask_fmadd_ps(dat394, 65535, bnMul12, bnAdd12);
__m512 dat395 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat395 = _mm512_mask_fmadd_ps(dat395, 65535, bnMul12, bnAdd12);
__m512 dat396 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat396 = _mm512_mask_fmadd_ps(dat396, 65535, bnMul12, bnAdd12);
__m512 dat397 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat397 = _mm512_mask_fmadd_ps(dat397, 65535, bnMul12, bnAdd12);
__m512 dat398 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat398 = _mm512_mask_fmadd_ps(dat398, 65535, bnMul12, bnAdd12);
__m512 dat399 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat399 = _mm512_mask_fmadd_ps(dat399, 65535, bnMul12, bnAdd12);
__m512 dat400 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat400 = _mm512_mask_fmadd_ps(dat400, 65535, bnMul12, bnAdd12);
__m512 dat401 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k13+896*h12+4*w12+40*b28);
dat401 = _mm512_mask_fmadd_ps(dat401, 65535, bnMul12, bnAdd12);
__m512 fft4537 = _mm512_add_ps(dat386, dat394);
__m512 fft4625 = _mm512_add_ps(dat387, dat395);
__m512 fft4538 = _mm512_sub_ps(dat386, dat394);
__m512 fft4626 = _mm512_sub_ps(dat387, dat395);
__m512 fft4539 = _mm512_add_ps(dat388, dat396);
__m512 fft4627 = _mm512_add_ps(dat389, dat397);
__m512 fft4540 = _mm512_sub_ps(dat388, dat396);
__m512 fft4628 = _mm512_sub_ps(dat389, dat397);
__m512 fft4541 = _mm512_add_ps(dat390, dat398);
__m512 fft4629 = _mm512_add_ps(dat391, dat399);
__m512 fft4542 = _mm512_sub_ps(dat390, dat398);
__m512 fft4630 = _mm512_sub_ps(dat391, dat399);
__m512 fft4543 = _mm512_add_ps(dat392, dat400);
__m512 fft4631 = _mm512_add_ps(dat393, dat401);
__m512 fft4544 = _mm512_sub_ps(dat392, dat400);
__m512 fft4632 = _mm512_sub_ps(dat393, dat401);
__m512 fft4545 = _mm512_add_ps(fft4537, fft4541);
__m512 fft4633 = _mm512_add_ps(fft4625, fft4629);
__m512 fft4546 = _mm512_sub_ps(fft4537, fft4541);
__m512 fft4634 = _mm512_sub_ps(fft4625, fft4629);
__m512 fft4547 = _mm512_add_ps(fft4539, fft4543);
__m512 fft4635 = _mm512_add_ps(fft4627, fft4631);
__m512 fft4548 = _mm512_sub_ps(fft4543, fft4539);
__m512 fft4636 = _mm512_sub_ps(fft4631, fft4627);
__m512 fft4549 = _mm512_sub_ps(fft4540, fft4544);
__m512 fft4637 = _mm512_sub_ps(fft4628, fft4632);
__m512 fft4550 = _mm512_add_ps(fft4540, fft4544);
__m512 fft4638 = _mm512_add_ps(fft4628, fft4632);
__m512 fft4551 = _mm512_add_ps(fft4545, fft4547);
__m512 fft4639 = _mm512_add_ps(fft4633, fft4635);
__m512 fft4552 = _mm512_sub_ps(fft4545, fft4547);
__m512 fft4640 = _mm512_sub_ps(fft4633, fft4635);
__m512 fft4553 = _mm512_fmadd_ps(fft4549, _mm512_set1_ps(7.0710677e-01f), fft4538);
__m512 fft4641 = _mm512_fmadd_ps(fft4637, _mm512_set1_ps(7.0710677e-01f), fft4626);
__m512 fft4554 = _mm512_fnmsub_ps(fft4550, _mm512_set1_ps(7.0710677e-01f), fft4542);
__m512 fft4642 = _mm512_fnmsub_ps(fft4638, _mm512_set1_ps(7.0710677e-01f), fft4630);
__m512 fft4555 = _mm512_fnmadd_ps(fft4549, _mm512_set1_ps(7.0710677e-01f), fft4538);
__m512 fft4643 = _mm512_fnmadd_ps(fft4637, _mm512_set1_ps(7.0710677e-01f), fft4626);
__m512 fft4556 = _mm512_fnmadd_ps(fft4550, _mm512_set1_ps(7.0710677e-01f), fft4542);
__m512 fft4644 = _mm512_fnmadd_ps(fft4638, _mm512_set1_ps(7.0710677e-01f), fft4630);
__m512 fft4557 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4558 = _mm512_fmadd_ps(fft4551, fft4557, _mm512_shuffle_f32x4(fft4551, fft4551, 78));
__m512 fft4645 = _mm512_fmadd_ps(fft4639, fft4557, _mm512_shuffle_f32x4(fft4639, fft4639, 78));
__m512 fft4559 = _mm512_fmadd_ps(fft4552, fft4557, _mm512_shuffle_f32x4(fft4552, fft4552, 78));
__m512 fft4646 = _mm512_fmadd_ps(fft4640, fft4557, _mm512_shuffle_f32x4(fft4640, fft4640, 78));
__m512 fft4560 = _mm512_fmadd_ps(fft4553, fft4557, _mm512_shuffle_f32x4(fft4553, fft4553, 78));
__m512 fft4647 = _mm512_fmadd_ps(fft4641, fft4557, _mm512_shuffle_f32x4(fft4641, fft4641, 78));
__m512 fft4561 = _mm512_fmadd_ps(fft4554, fft4557, _mm512_shuffle_f32x4(fft4554, fft4554, 78));
__m512 fft4648 = _mm512_fmadd_ps(fft4642, fft4557, _mm512_shuffle_f32x4(fft4642, fft4642, 78));
__m512 fft4562 = _mm512_fmadd_ps(fft4546, fft4557, _mm512_shuffle_f32x4(fft4546, fft4546, 78));
__m512 fft4649 = _mm512_fmadd_ps(fft4634, fft4557, _mm512_shuffle_f32x4(fft4634, fft4634, 78));
__m512 fft4563 = _mm512_fmadd_ps(fft4548, fft4557, _mm512_shuffle_f32x4(fft4548, fft4548, 78));
__m512 fft4650 = _mm512_fmadd_ps(fft4636, fft4557, _mm512_shuffle_f32x4(fft4636, fft4636, 78));
__m512 fft4564 = _mm512_fmadd_ps(fft4555, fft4557, _mm512_shuffle_f32x4(fft4555, fft4555, 78));
__m512 fft4651 = _mm512_fmadd_ps(fft4643, fft4557, _mm512_shuffle_f32x4(fft4643, fft4643, 78));
__m512 fft4565 = _mm512_fmadd_ps(fft4556, fft4557, _mm512_shuffle_f32x4(fft4556, fft4556, 78));
__m512 fft4652 = _mm512_fmadd_ps(fft4644, fft4557, _mm512_shuffle_f32x4(fft4644, fft4644, 78));
__m512 fft4566 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4567 = _mm512_mul_ps(fft4558, fft4566);
__m512 fft4653 = _mm512_mul_ps(fft4645, fft4566);
__m512 fft4568 = _mm512_mul_ps(fft4559, fft4566);
__m512 fft4654 = _mm512_mul_ps(fft4646, fft4566);
__m512 fft4569 = _mm512_mul_ps(fft4560, fft4566);
__m512 fft4655 = _mm512_mul_ps(fft4647, fft4566);
__m512 fft4570 = _mm512_mul_ps(fft4561, fft4566);
__m512 fft4656 = _mm512_mul_ps(fft4648, fft4566);
__m512 fft4571 = _mm512_mul_ps(fft4562, fft4566);
__m512 fft4657 = _mm512_mul_ps(fft4649, fft4566);
__m512 fft4572 = _mm512_mul_ps(fft4563, fft4566);
__m512 fft4658 = _mm512_mul_ps(fft4650, fft4566);
__m512 fft4573 = _mm512_mul_ps(fft4564, fft4566);
__m512 fft4659 = _mm512_mul_ps(fft4651, fft4566);
__m512 fft4574 = _mm512_mul_ps(fft4565, fft4566);
__m512 fft4660 = _mm512_mul_ps(fft4652, fft4566);
__m512 fft4575 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4576 = _mm512_fmadd_ps(fft4559, fft4575, fft4567);
__m512 fft4661 = _mm512_fmadd_ps(fft4646, fft4575, fft4653);
__m512 fft4577 = _mm512_fnmadd_ps(fft4558, fft4575, fft4568);
__m512 fft4662 = _mm512_fnmadd_ps(fft4645, fft4575, fft4654);
__m512 fft4578 = _mm512_fmadd_ps(fft4561, fft4575, fft4569);
__m512 fft4663 = _mm512_fmadd_ps(fft4648, fft4575, fft4655);
__m512 fft4579 = _mm512_fnmadd_ps(fft4560, fft4575, fft4570);
__m512 fft4664 = _mm512_fnmadd_ps(fft4647, fft4575, fft4656);
__m512 fft4580 = _mm512_fmadd_ps(fft4563, fft4575, fft4571);
__m512 fft4665 = _mm512_fmadd_ps(fft4650, fft4575, fft4657);
__m512 fft4581 = _mm512_fnmadd_ps(fft4562, fft4575, fft4572);
__m512 fft4666 = _mm512_fnmadd_ps(fft4649, fft4575, fft4658);
__m512 fft4582 = _mm512_fmadd_ps(fft4565, fft4575, fft4573);
__m512 fft4667 = _mm512_fmadd_ps(fft4652, fft4575, fft4659);
__m512 fft4583 = _mm512_fnmadd_ps(fft4564, fft4575, fft4574);
__m512 fft4668 = _mm512_fnmadd_ps(fft4651, fft4575, fft4660);
__m512 fft4584 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4585 = _mm512_fmadd_ps(fft4576, fft4584, _mm512_shuffle_f32x4(fft4576, fft4576, 177));
__m512 fft4669 = _mm512_fmadd_ps(fft4661, fft4584, _mm512_shuffle_f32x4(fft4661, fft4661, 177));
__m512 fft4586 = _mm512_fmadd_ps(fft4577, fft4584, _mm512_shuffle_f32x4(fft4577, fft4577, 177));
__m512 fft4670 = _mm512_fmadd_ps(fft4662, fft4584, _mm512_shuffle_f32x4(fft4662, fft4662, 177));
__m512 fft4587 = _mm512_fmadd_ps(fft4578, fft4584, _mm512_shuffle_f32x4(fft4578, fft4578, 177));
__m512 fft4671 = _mm512_fmadd_ps(fft4663, fft4584, _mm512_shuffle_f32x4(fft4663, fft4663, 177));
__m512 fft4588 = _mm512_fmadd_ps(fft4579, fft4584, _mm512_shuffle_f32x4(fft4579, fft4579, 177));
__m512 fft4672 = _mm512_fmadd_ps(fft4664, fft4584, _mm512_shuffle_f32x4(fft4664, fft4664, 177));
__m512 fft4589 = _mm512_fmadd_ps(fft4580, fft4584, _mm512_shuffle_f32x4(fft4580, fft4580, 177));
__m512 fft4673 = _mm512_fmadd_ps(fft4665, fft4584, _mm512_shuffle_f32x4(fft4665, fft4665, 177));
__m512 fft4590 = _mm512_fmadd_ps(fft4581, fft4584, _mm512_shuffle_f32x4(fft4581, fft4581, 177));
__m512 fft4674 = _mm512_fmadd_ps(fft4666, fft4584, _mm512_shuffle_f32x4(fft4666, fft4666, 177));
__m512 fft4591 = _mm512_fmadd_ps(fft4582, fft4584, _mm512_shuffle_f32x4(fft4582, fft4582, 177));
__m512 fft4675 = _mm512_fmadd_ps(fft4667, fft4584, _mm512_shuffle_f32x4(fft4667, fft4667, 177));
__m512 fft4592 = _mm512_fmadd_ps(fft4583, fft4584, _mm512_shuffle_f32x4(fft4583, fft4583, 177));
__m512 fft4676 = _mm512_fmadd_ps(fft4668, fft4584, _mm512_shuffle_f32x4(fft4668, fft4668, 177));
__m512 fft4593 = _mm512_mask_mov_ps(fft4585, 49344, fft4586);
__m512 fft4677 = _mm512_mask_mov_ps(fft4669, 49344, fft4670);
__m512 fft4594 = _mm512_mask_sub_ps(fft4586, 49344, _mm512_setzero_ps(), fft4585);
__m512 fft4678 = _mm512_mask_sub_ps(fft4670, 49344, _mm512_setzero_ps(), fft4669);
__m512 fft4595 = _mm512_mask_mov_ps(fft4587, 49344, fft4588);
__m512 fft4679 = _mm512_mask_mov_ps(fft4671, 49344, fft4672);
__m512 fft4596 = _mm512_mask_sub_ps(fft4588, 49344, _mm512_setzero_ps(), fft4587);
__m512 fft4680 = _mm512_mask_sub_ps(fft4672, 49344, _mm512_setzero_ps(), fft4671);
__m512 fft4597 = _mm512_mask_mov_ps(fft4589, 49344, fft4590);
__m512 fft4681 = _mm512_mask_mov_ps(fft4673, 49344, fft4674);
__m512 fft4598 = _mm512_mask_sub_ps(fft4590, 49344, _mm512_setzero_ps(), fft4589);
__m512 fft4682 = _mm512_mask_sub_ps(fft4674, 49344, _mm512_setzero_ps(), fft4673);
__m512 fft4599 = _mm512_mask_mov_ps(fft4591, 49344, fft4592);
__m512 fft4683 = _mm512_mask_mov_ps(fft4675, 49344, fft4676);
__m512 fft4600 = _mm512_mask_sub_ps(fft4592, 49344, _mm512_setzero_ps(), fft4591);
__m512 fft4684 = _mm512_mask_sub_ps(fft4676, 49344, _mm512_setzero_ps(), fft4675);
__m512 fft4601 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4602 = _mm512_fmadd_ps(fft4593, fft4601, _mm512_shuffle_ps(fft4593, fft4593, 78));
__m512 fft4685 = _mm512_fmadd_ps(fft4677, fft4601, _mm512_shuffle_ps(fft4677, fft4677, 78));
__m512 fft4603 = _mm512_fmadd_ps(fft4594, fft4601, _mm512_shuffle_ps(fft4594, fft4594, 78));
__m512 fft4686 = _mm512_fmadd_ps(fft4678, fft4601, _mm512_shuffle_ps(fft4678, fft4678, 78));
__m512 fft4604 = _mm512_fmadd_ps(fft4595, fft4601, _mm512_shuffle_ps(fft4595, fft4595, 78));
__m512 fft4687 = _mm512_fmadd_ps(fft4679, fft4601, _mm512_shuffle_ps(fft4679, fft4679, 78));
__m512 fft4605 = _mm512_fmadd_ps(fft4596, fft4601, _mm512_shuffle_ps(fft4596, fft4596, 78));
__m512 fft4688 = _mm512_fmadd_ps(fft4680, fft4601, _mm512_shuffle_ps(fft4680, fft4680, 78));
__m512 fft4606 = _mm512_fmadd_ps(fft4597, fft4601, _mm512_shuffle_ps(fft4597, fft4597, 78));
__m512 fft4689 = _mm512_fmadd_ps(fft4681, fft4601, _mm512_shuffle_ps(fft4681, fft4681, 78));
__m512 fft4607 = _mm512_fmadd_ps(fft4598, fft4601, _mm512_shuffle_ps(fft4598, fft4598, 78));
__m512 fft4690 = _mm512_fmadd_ps(fft4682, fft4601, _mm512_shuffle_ps(fft4682, fft4682, 78));
__m512 fft4608 = _mm512_fmadd_ps(fft4599, fft4601, _mm512_shuffle_ps(fft4599, fft4599, 78));
__m512 fft4691 = _mm512_fmadd_ps(fft4683, fft4601, _mm512_shuffle_ps(fft4683, fft4683, 78));
__m512 fft4609 = _mm512_fmadd_ps(fft4600, fft4601, _mm512_shuffle_ps(fft4600, fft4600, 78));
__m512 fft4692 = _mm512_fmadd_ps(fft4684, fft4601, _mm512_shuffle_ps(fft4684, fft4684, 78));
__m512i fft4610 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4611 = _mm512_permutexvar_ps(fft4610, fft4602);
__m512 fft4693 = _mm512_permutexvar_ps(fft4610, fft4685);
__m512i fft4612 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4613 = _mm512_permutexvar_ps(fft4612, fft4602);
__m512 fft4694 = _mm512_permutexvar_ps(fft4612, fft4685);
__m512 fft4614 = _mm512_permutexvar_ps(fft4610, fft4603);
__m512 fft4695 = _mm512_permutexvar_ps(fft4610, fft4686);
__m512 fft4615 = _mm512_permutexvar_ps(fft4612, fft4603);
__m512 fft4696 = _mm512_permutexvar_ps(fft4612, fft4686);
__m512 fft4616 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4617 = _mm512_fmadd_ps(fft4611, fft4616, fft4613);
__m512 fft4697 = _mm512_fmadd_ps(fft4693, fft4616, fft4694);
__m512 fft4618 = _mm512_fnmadd_ps(fft4615, fft4616, fft4614);
__m512 fft4698 = _mm512_fnmadd_ps(fft4696, fft4616, fft4695);
__m512 fft4619 = _mm512_mask_mov_ps(fft4615, 21845, fft4617);
__m512 fft4699 = _mm512_mask_mov_ps(fft4696, 21845, fft4697);
__m512 fft4620 = _mm512_mask_mov_ps(fft4611, 43176, fft4617);
__m512 fft4700 = _mm512_mask_mov_ps(fft4693, 43176, fft4697);
__m512 fft4621 = _mm512_mask_mov_ps(fft4619, 43176, fft4618);
__m512 fft4701 = _mm512_mask_mov_ps(fft4699, 43176, fft4698);
__m512 fft4622 = _mm512_mask_mov_ps(fft4620, 22102, fft4618);
__m512 fft4702 = _mm512_mask_mov_ps(fft4700, 22102, fft4698);
__m512 fft4623 = _mm512_mask_mul_ps(fft4621, 64764, fft4621, _mm512_set1_ps(5e-01f));
__m512 fft4703 = _mm512_mask_mul_ps(fft4701, 64764, fft4701, _mm512_set1_ps(5e-01f));
__m512 fft4624 = _mm512_mask_mul_ps(fft4622, 64764, fft4622, _mm512_set1_ps(5e-01f));
__m512 fft4704 = _mm512_mask_mul_ps(fft4702, 64764, fft4702, _mm512_set1_ps(5e-01f));
__m512 df401 = fft4623;
__m512 df409 = fft4703;
__m512 df402 = fft4624;
__m512 df410 = fft4704;
__m512 df403 = fft4604;
__m512 df411 = fft4687;
__m512 df404 = fft4605;
__m512 df412 = fft4688;
__m512 df405 = fft4606;
__m512 df413 = fft4689;
__m512 df406 = fft4607;
__m512 df414 = fft4690;
__m512 df407 = fft4608;
__m512 df415 = fft4691;
__m512 df408 = fft4609;
__m512 df416 = fft4692;
__m512i eo28 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df403 = _mm512_permutexvar_ps(eo28, df403);
df404 = _mm512_permutexvar_ps(eo28, df404);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df403);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df404);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df403);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df404);
df411 = _mm512_permutexvar_ps(eo28, df411);
df412 = _mm512_permutexvar_ps(eo28, df412);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df411);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df412);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df411);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df412);
df405 = _mm512_permutexvar_ps(eo28, df405);
df406 = _mm512_permutexvar_ps(eo28, df406);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df405);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df406);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df405);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df406);
df413 = _mm512_permutexvar_ps(eo28, df413);
df414 = _mm512_permutexvar_ps(eo28, df414);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df413);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df414);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df413);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df414);
df407 = _mm512_permutexvar_ps(eo28, df407);
df408 = _mm512_permutexvar_ps(eo28, df408);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df407);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df408);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df407);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df408);
df415 = _mm512_permutexvar_ps(eo28, df415);
df416 = _mm512_permutexvar_ps(eo28, df416);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df415);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df416);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df415);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df416);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df401);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df402);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df401);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df402);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df409);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k13+128*m28+32*f29, 255, df410);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df409);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k13+128*m28+32*f29, 65280, df410);
}
}
if (j2 >= last1) return;
++j2;
}
rel2 = 18;
}
ptrdiff_t h13 = base2+40;
ptrdiff_t w13 = 170;
ptrdiff_t k14 = 3*s1;
ptrdiff_t kk13 = k14+2;
for (; k14 <= kk13; ++k14) {
__m512 bnMul13 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k14+3*i6))[0]);
__m512 bnAdd13 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k14+3*i6))[1]);
// Tile loop: for each of 5 tiles (b29 = 0..4) load a 16-row x 16-float window
// from datPtr1, apply the per-channel scale/shift (bnMul13, bnAdd13), run the
// transform implemented by the fft* temporaries below, and write the result
// to dfPtr1. Successive tiles start 40 address units apart (40*b29) while
// each load covers 16 floats, so neighbouring tiles overlap.
// NOTE(review): all constant offsets are presumably in bytes (896 would then
// be one 224-float row) -- the declarations of datPtr1/dfPtr1 are not visible
// in this part of the file; confirm.
for (ptrdiff_t b29 = 0; b29 < 5; ++b29) {
// m29/f30 split the tile index into quotient and remainder by 2; they are
// only used to form the destination offset (128*m29 + 32*f30).
ptrdiff_t m29 = (size_t)b29/2;
ptrdiff_t f30 = (size_t)b29%2;
// Load 16 consecutive rows (offset step 896) with a full 16-lane mask, and
// replace each with dat*bnMul13 + bnAdd13 (mask 65535 = all lanes active).
__m512 dat402 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat402 = _mm512_mask_fmadd_ps(dat402, 65535, bnMul13, bnAdd13);
__m512 dat403 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat403 = _mm512_mask_fmadd_ps(dat403, 65535, bnMul13, bnAdd13);
__m512 dat404 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat404 = _mm512_mask_fmadd_ps(dat404, 65535, bnMul13, bnAdd13);
__m512 dat405 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat405 = _mm512_mask_fmadd_ps(dat405, 65535, bnMul13, bnAdd13);
__m512 dat406 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat406 = _mm512_mask_fmadd_ps(dat406, 65535, bnMul13, bnAdd13);
__m512 dat407 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat407 = _mm512_mask_fmadd_ps(dat407, 65535, bnMul13, bnAdd13);
__m512 dat408 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat408 = _mm512_mask_fmadd_ps(dat408, 65535, bnMul13, bnAdd13);
__m512 dat409 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat409 = _mm512_mask_fmadd_ps(dat409, 65535, bnMul13, bnAdd13);
__m512 dat410 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat410 = _mm512_mask_fmadd_ps(dat410, 65535, bnMul13, bnAdd13);
__m512 dat411 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat411 = _mm512_mask_fmadd_ps(dat411, 65535, bnMul13, bnAdd13);
__m512 dat412 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat412 = _mm512_mask_fmadd_ps(dat412, 65535, bnMul13, bnAdd13);
__m512 dat413 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat413 = _mm512_mask_fmadd_ps(dat413, 65535, bnMul13, bnAdd13);
__m512 dat414 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat414 = _mm512_mask_fmadd_ps(dat414, 65535, bnMul13, bnAdd13);
__m512 dat415 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat415 = _mm512_mask_fmadd_ps(dat415, 65535, bnMul13, bnAdd13);
__m512 dat416 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat416 = _mm512_mask_fmadd_ps(dat416, 65535, bnMul13, bnAdd13);
__m512 dat417 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k14+896*h13+4*w13+40*b29);
dat417 = _mm512_mask_fmadd_ps(dat417, 65535, bnMul13, bnAdd13);
// Across-row stage: two independent, interleaved chains of sum/difference
// butterflies. The fft4705.. chain consumes rows 0,2,...,14 (dat402, dat404,
// ...) and the fft4793.. chain consumes rows 1,3,...,15 (dat403, dat405, ...).
// The first level pairs each row with the row 8 positions later.
__m512 fft4705 = _mm512_add_ps(dat402, dat410);
__m512 fft4793 = _mm512_add_ps(dat403, dat411);
__m512 fft4706 = _mm512_sub_ps(dat402, dat410);
__m512 fft4794 = _mm512_sub_ps(dat403, dat411);
__m512 fft4707 = _mm512_add_ps(dat404, dat412);
__m512 fft4795 = _mm512_add_ps(dat405, dat413);
__m512 fft4708 = _mm512_sub_ps(dat404, dat412);
__m512 fft4796 = _mm512_sub_ps(dat405, dat413);
__m512 fft4709 = _mm512_add_ps(dat406, dat414);
__m512 fft4797 = _mm512_add_ps(dat407, dat415);
__m512 fft4710 = _mm512_sub_ps(dat406, dat414);
__m512 fft4798 = _mm512_sub_ps(dat407, dat415);
__m512 fft4711 = _mm512_add_ps(dat408, dat416);
__m512 fft4799 = _mm512_add_ps(dat409, dat417);
__m512 fft4712 = _mm512_sub_ps(dat408, dat416);
__m512 fft4800 = _mm512_sub_ps(dat409, dat417);
// Second butterfly level on the first-level sums and differences.
__m512 fft4713 = _mm512_add_ps(fft4705, fft4709);
__m512 fft4801 = _mm512_add_ps(fft4793, fft4797);
__m512 fft4714 = _mm512_sub_ps(fft4705, fft4709);
__m512 fft4802 = _mm512_sub_ps(fft4793, fft4797);
__m512 fft4715 = _mm512_add_ps(fft4707, fft4711);
__m512 fft4803 = _mm512_add_ps(fft4795, fft4799);
__m512 fft4716 = _mm512_sub_ps(fft4711, fft4707);
__m512 fft4804 = _mm512_sub_ps(fft4799, fft4795);
__m512 fft4717 = _mm512_sub_ps(fft4708, fft4712);
__m512 fft4805 = _mm512_sub_ps(fft4796, fft4800);
__m512 fft4718 = _mm512_add_ps(fft4708, fft4712);
__m512 fft4806 = _mm512_add_ps(fft4796, fft4800);
// Third level; the 7.0710677e-01f factor (~ sqrt(2)/2) is folded in through
// fused multiply-add variants.
__m512 fft4719 = _mm512_add_ps(fft4713, fft4715);
__m512 fft4807 = _mm512_add_ps(fft4801, fft4803);
__m512 fft4720 = _mm512_sub_ps(fft4713, fft4715);
__m512 fft4808 = _mm512_sub_ps(fft4801, fft4803);
__m512 fft4721 = _mm512_fmadd_ps(fft4717, _mm512_set1_ps(7.0710677e-01f), fft4706);
__m512 fft4809 = _mm512_fmadd_ps(fft4805, _mm512_set1_ps(7.0710677e-01f), fft4794);
__m512 fft4722 = _mm512_fnmsub_ps(fft4718, _mm512_set1_ps(7.0710677e-01f), fft4710);
__m512 fft4810 = _mm512_fnmsub_ps(fft4806, _mm512_set1_ps(7.0710677e-01f), fft4798);
__m512 fft4723 = _mm512_fnmadd_ps(fft4717, _mm512_set1_ps(7.0710677e-01f), fft4706);
__m512 fft4811 = _mm512_fnmadd_ps(fft4805, _mm512_set1_ps(7.0710677e-01f), fft4794);
__m512 fft4724 = _mm512_fnmadd_ps(fft4718, _mm512_set1_ps(7.0710677e-01f), fft4710);
__m512 fft4812 = _mm512_fnmadd_ps(fft4806, _mm512_set1_ps(7.0710677e-01f), fft4798);
// Within-register stage 1: shuffle_f32x4 with imm 78 swaps the two 256-bit
// halves; multiplying by +1 (lanes 0-7) / -1 (lanes 8-15) and adding the
// swapped copy leaves lo+hi in lanes 0-7 and lo-hi in lanes 8-15.
__m512 fft4725 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4726 = _mm512_fmadd_ps(fft4719, fft4725, _mm512_shuffle_f32x4(fft4719, fft4719, 78));
__m512 fft4813 = _mm512_fmadd_ps(fft4807, fft4725, _mm512_shuffle_f32x4(fft4807, fft4807, 78));
__m512 fft4727 = _mm512_fmadd_ps(fft4720, fft4725, _mm512_shuffle_f32x4(fft4720, fft4720, 78));
__m512 fft4814 = _mm512_fmadd_ps(fft4808, fft4725, _mm512_shuffle_f32x4(fft4808, fft4808, 78));
__m512 fft4728 = _mm512_fmadd_ps(fft4721, fft4725, _mm512_shuffle_f32x4(fft4721, fft4721, 78));
__m512 fft4815 = _mm512_fmadd_ps(fft4809, fft4725, _mm512_shuffle_f32x4(fft4809, fft4809, 78));
__m512 fft4729 = _mm512_fmadd_ps(fft4722, fft4725, _mm512_shuffle_f32x4(fft4722, fft4722, 78));
__m512 fft4816 = _mm512_fmadd_ps(fft4810, fft4725, _mm512_shuffle_f32x4(fft4810, fft4810, 78));
__m512 fft4730 = _mm512_fmadd_ps(fft4714, fft4725, _mm512_shuffle_f32x4(fft4714, fft4714, 78));
__m512 fft4817 = _mm512_fmadd_ps(fft4802, fft4725, _mm512_shuffle_f32x4(fft4802, fft4802, 78));
__m512 fft4731 = _mm512_fmadd_ps(fft4716, fft4725, _mm512_shuffle_f32x4(fft4716, fft4716, 78));
__m512 fft4818 = _mm512_fmadd_ps(fft4804, fft4725, _mm512_shuffle_f32x4(fft4804, fft4804, 78));
__m512 fft4732 = _mm512_fmadd_ps(fft4723, fft4725, _mm512_shuffle_f32x4(fft4723, fft4723, 78));
__m512 fft4819 = _mm512_fmadd_ps(fft4811, fft4725, _mm512_shuffle_f32x4(fft4811, fft4811, 78));
__m512 fft4733 = _mm512_fmadd_ps(fft4724, fft4725, _mm512_shuffle_f32x4(fft4724, fft4724, 78));
__m512 fft4820 = _mm512_fmadd_ps(fft4812, fft4725, _mm512_shuffle_f32x4(fft4812, fft4812, 78));
// Per-lane weighting: each register is scaled by fft4734, then its partner
// register scaled by fft4743 is added (fmadd) or subtracted (fnmadd). The
// weights are 1/0 in lanes 0-9, so only lanes 10-15 are actually mixed.
// NOTE(review): this has the shape of a complex twiddle multiply on
// (real, imaginary) register pairs -- confirm against the generator.
__m512 fft4734 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4735 = _mm512_mul_ps(fft4726, fft4734);
__m512 fft4821 = _mm512_mul_ps(fft4813, fft4734);
__m512 fft4736 = _mm512_mul_ps(fft4727, fft4734);
__m512 fft4822 = _mm512_mul_ps(fft4814, fft4734);
__m512 fft4737 = _mm512_mul_ps(fft4728, fft4734);
__m512 fft4823 = _mm512_mul_ps(fft4815, fft4734);
__m512 fft4738 = _mm512_mul_ps(fft4729, fft4734);
__m512 fft4824 = _mm512_mul_ps(fft4816, fft4734);
__m512 fft4739 = _mm512_mul_ps(fft4730, fft4734);
__m512 fft4825 = _mm512_mul_ps(fft4817, fft4734);
__m512 fft4740 = _mm512_mul_ps(fft4731, fft4734);
__m512 fft4826 = _mm512_mul_ps(fft4818, fft4734);
__m512 fft4741 = _mm512_mul_ps(fft4732, fft4734);
__m512 fft4827 = _mm512_mul_ps(fft4819, fft4734);
__m512 fft4742 = _mm512_mul_ps(fft4733, fft4734);
__m512 fft4828 = _mm512_mul_ps(fft4820, fft4734);
__m512 fft4743 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4744 = _mm512_fmadd_ps(fft4727, fft4743, fft4735);
__m512 fft4829 = _mm512_fmadd_ps(fft4814, fft4743, fft4821);
__m512 fft4745 = _mm512_fnmadd_ps(fft4726, fft4743, fft4736);
__m512 fft4830 = _mm512_fnmadd_ps(fft4813, fft4743, fft4822);
__m512 fft4746 = _mm512_fmadd_ps(fft4729, fft4743, fft4737);
__m512 fft4831 = _mm512_fmadd_ps(fft4816, fft4743, fft4823);
__m512 fft4747 = _mm512_fnmadd_ps(fft4728, fft4743, fft4738);
__m512 fft4832 = _mm512_fnmadd_ps(fft4815, fft4743, fft4824);
__m512 fft4748 = _mm512_fmadd_ps(fft4731, fft4743, fft4739);
__m512 fft4833 = _mm512_fmadd_ps(fft4818, fft4743, fft4825);
__m512 fft4749 = _mm512_fnmadd_ps(fft4730, fft4743, fft4740);
__m512 fft4834 = _mm512_fnmadd_ps(fft4817, fft4743, fft4826);
__m512 fft4750 = _mm512_fmadd_ps(fft4733, fft4743, fft4741);
__m512 fft4835 = _mm512_fmadd_ps(fft4820, fft4743, fft4827);
__m512 fft4751 = _mm512_fnmadd_ps(fft4732, fft4743, fft4742);
__m512 fft4836 = _mm512_fnmadd_ps(fft4819, fft4743, fft4828);
// Within-register stage 2: shuffle_f32x4 with imm 177 swaps adjacent 128-bit
// lanes (0<->1, 2<->3); the +1/-1 pattern in groups of four yields the sum
// in the even 128-bit lanes and the difference in the odd ones.
__m512 fft4752 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4753 = _mm512_fmadd_ps(fft4744, fft4752, _mm512_shuffle_f32x4(fft4744, fft4744, 177));
__m512 fft4837 = _mm512_fmadd_ps(fft4829, fft4752, _mm512_shuffle_f32x4(fft4829, fft4829, 177));
__m512 fft4754 = _mm512_fmadd_ps(fft4745, fft4752, _mm512_shuffle_f32x4(fft4745, fft4745, 177));
__m512 fft4838 = _mm512_fmadd_ps(fft4830, fft4752, _mm512_shuffle_f32x4(fft4830, fft4830, 177));
__m512 fft4755 = _mm512_fmadd_ps(fft4746, fft4752, _mm512_shuffle_f32x4(fft4746, fft4746, 177));
__m512 fft4839 = _mm512_fmadd_ps(fft4831, fft4752, _mm512_shuffle_f32x4(fft4831, fft4831, 177));
__m512 fft4756 = _mm512_fmadd_ps(fft4747, fft4752, _mm512_shuffle_f32x4(fft4747, fft4747, 177));
__m512 fft4840 = _mm512_fmadd_ps(fft4832, fft4752, _mm512_shuffle_f32x4(fft4832, fft4832, 177));
__m512 fft4757 = _mm512_fmadd_ps(fft4748, fft4752, _mm512_shuffle_f32x4(fft4748, fft4748, 177));
__m512 fft4841 = _mm512_fmadd_ps(fft4833, fft4752, _mm512_shuffle_f32x4(fft4833, fft4833, 177));
__m512 fft4758 = _mm512_fmadd_ps(fft4749, fft4752, _mm512_shuffle_f32x4(fft4749, fft4749, 177));
__m512 fft4842 = _mm512_fmadd_ps(fft4834, fft4752, _mm512_shuffle_f32x4(fft4834, fft4834, 177));
__m512 fft4759 = _mm512_fmadd_ps(fft4750, fft4752, _mm512_shuffle_f32x4(fft4750, fft4750, 177));
__m512 fft4843 = _mm512_fmadd_ps(fft4835, fft4752, _mm512_shuffle_f32x4(fft4835, fft4835, 177));
__m512 fft4760 = _mm512_fmadd_ps(fft4751, fft4752, _mm512_shuffle_f32x4(fft4751, fft4751, 177));
__m512 fft4844 = _mm512_fmadd_ps(fft4836, fft4752, _mm512_shuffle_f32x4(fft4836, fft4836, 177));
// Mask 49344 (0xC0C0) selects lanes 6,7,14,15. In those lanes each register
// pair (a, b) becomes (b, -a): the mov takes b's value, and the masked sub
// computes 0 - a. All other lanes pass through unchanged.
__m512 fft4761 = _mm512_mask_mov_ps(fft4753, 49344, fft4754);
__m512 fft4845 = _mm512_mask_mov_ps(fft4837, 49344, fft4838);
__m512 fft4762 = _mm512_mask_sub_ps(fft4754, 49344, _mm512_setzero_ps(), fft4753);
__m512 fft4846 = _mm512_mask_sub_ps(fft4838, 49344, _mm512_setzero_ps(), fft4837);
__m512 fft4763 = _mm512_mask_mov_ps(fft4755, 49344, fft4756);
__m512 fft4847 = _mm512_mask_mov_ps(fft4839, 49344, fft4840);
__m512 fft4764 = _mm512_mask_sub_ps(fft4756, 49344, _mm512_setzero_ps(), fft4755);
__m512 fft4848 = _mm512_mask_sub_ps(fft4840, 49344, _mm512_setzero_ps(), fft4839);
__m512 fft4765 = _mm512_mask_mov_ps(fft4757, 49344, fft4758);
__m512 fft4849 = _mm512_mask_mov_ps(fft4841, 49344, fft4842);
__m512 fft4766 = _mm512_mask_sub_ps(fft4758, 49344, _mm512_setzero_ps(), fft4757);
__m512 fft4850 = _mm512_mask_sub_ps(fft4842, 49344, _mm512_setzero_ps(), fft4841);
__m512 fft4767 = _mm512_mask_mov_ps(fft4759, 49344, fft4760);
__m512 fft4851 = _mm512_mask_mov_ps(fft4843, 49344, fft4844);
__m512 fft4768 = _mm512_mask_sub_ps(fft4760, 49344, _mm512_setzero_ps(), fft4759);
__m512 fft4852 = _mm512_mask_sub_ps(fft4844, 49344, _mm512_setzero_ps(), fft4843);
// Within-register stage 3: shuffle_ps with imm 78 swaps the two 64-bit pairs
// inside every 128-bit lane; with the +1,+1,-1,-1 pattern this gives
// pair sums in elements 0-1 and pair differences in elements 2-3 of each lane.
__m512 fft4769 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4770 = _mm512_fmadd_ps(fft4761, fft4769, _mm512_shuffle_ps(fft4761, fft4761, 78));
__m512 fft4853 = _mm512_fmadd_ps(fft4845, fft4769, _mm512_shuffle_ps(fft4845, fft4845, 78));
__m512 fft4771 = _mm512_fmadd_ps(fft4762, fft4769, _mm512_shuffle_ps(fft4762, fft4762, 78));
__m512 fft4854 = _mm512_fmadd_ps(fft4846, fft4769, _mm512_shuffle_ps(fft4846, fft4846, 78));
__m512 fft4772 = _mm512_fmadd_ps(fft4763, fft4769, _mm512_shuffle_ps(fft4763, fft4763, 78));
__m512 fft4855 = _mm512_fmadd_ps(fft4847, fft4769, _mm512_shuffle_ps(fft4847, fft4847, 78));
__m512 fft4773 = _mm512_fmadd_ps(fft4764, fft4769, _mm512_shuffle_ps(fft4764, fft4764, 78));
__m512 fft4856 = _mm512_fmadd_ps(fft4848, fft4769, _mm512_shuffle_ps(fft4848, fft4848, 78));
__m512 fft4774 = _mm512_fmadd_ps(fft4765, fft4769, _mm512_shuffle_ps(fft4765, fft4765, 78));
__m512 fft4857 = _mm512_fmadd_ps(fft4849, fft4769, _mm512_shuffle_ps(fft4849, fft4849, 78));
__m512 fft4775 = _mm512_fmadd_ps(fft4766, fft4769, _mm512_shuffle_ps(fft4766, fft4766, 78));
__m512 fft4858 = _mm512_fmadd_ps(fft4850, fft4769, _mm512_shuffle_ps(fft4850, fft4850, 78));
__m512 fft4776 = _mm512_fmadd_ps(fft4767, fft4769, _mm512_shuffle_ps(fft4767, fft4767, 78));
__m512 fft4859 = _mm512_fmadd_ps(fft4851, fft4769, _mm512_shuffle_ps(fft4851, fft4851, 78));
__m512 fft4777 = _mm512_fmadd_ps(fft4768, fft4769, _mm512_shuffle_ps(fft4768, fft4768, 78));
__m512 fft4860 = _mm512_fmadd_ps(fft4852, fft4769, _mm512_shuffle_ps(fft4852, fft4852, 78));
// Extra post-processing applied only to the first register pair of each
// chain (fft4770/fft4771 and fft4853/fft4854): two fixed permutations, a
// signed combine, lane-selective merges, and a final scale by 0.5 in the
// lanes selected by mask 64764 (0xFCFC, i.e. all but lanes 0,1,8,9).
// NOTE(review): presumably the special-case handling of the first pair of
// transform outputs -- confirm against the generator.
__m512i fft4778 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4779 = _mm512_permutexvar_ps(fft4778, fft4770);
__m512 fft4861 = _mm512_permutexvar_ps(fft4778, fft4853);
__m512i fft4780 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4781 = _mm512_permutexvar_ps(fft4780, fft4770);
__m512 fft4862 = _mm512_permutexvar_ps(fft4780, fft4853);
__m512 fft4782 = _mm512_permutexvar_ps(fft4778, fft4771);
__m512 fft4863 = _mm512_permutexvar_ps(fft4778, fft4854);
__m512 fft4783 = _mm512_permutexvar_ps(fft4780, fft4771);
__m512 fft4864 = _mm512_permutexvar_ps(fft4780, fft4854);
__m512 fft4784 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4785 = _mm512_fmadd_ps(fft4779, fft4784, fft4781);
__m512 fft4865 = _mm512_fmadd_ps(fft4861, fft4784, fft4862);
__m512 fft4786 = _mm512_fnmadd_ps(fft4783, fft4784, fft4782);
__m512 fft4866 = _mm512_fnmadd_ps(fft4864, fft4784, fft4863);
__m512 fft4787 = _mm512_mask_mov_ps(fft4783, 21845, fft4785);
__m512 fft4867 = _mm512_mask_mov_ps(fft4864, 21845, fft4865);
__m512 fft4788 = _mm512_mask_mov_ps(fft4779, 43176, fft4785);
__m512 fft4868 = _mm512_mask_mov_ps(fft4861, 43176, fft4865);
__m512 fft4789 = _mm512_mask_mov_ps(fft4787, 43176, fft4786);
__m512 fft4869 = _mm512_mask_mov_ps(fft4867, 43176, fft4866);
__m512 fft4790 = _mm512_mask_mov_ps(fft4788, 22102, fft4786);
__m512 fft4870 = _mm512_mask_mov_ps(fft4868, 22102, fft4866);
__m512 fft4791 = _mm512_mask_mul_ps(fft4789, 64764, fft4789, _mm512_set1_ps(5e-01f));
__m512 fft4871 = _mm512_mask_mul_ps(fft4869, 64764, fft4869, _mm512_set1_ps(5e-01f));
__m512 fft4792 = _mm512_mask_mul_ps(fft4790, 64764, fft4790, _mm512_set1_ps(5e-01f));
__m512 fft4872 = _mm512_mask_mul_ps(fft4870, 64764, fft4870, _mm512_set1_ps(5e-01f));
// Rename the 16 result registers: df417..df424 come from the fft4705.. chain,
// df425..df432 from the fft4793.. chain. The first two of each chain are the
// post-processed values; the rest come straight from stage 3.
__m512 df417 = fft4791;
__m512 df425 = fft4871;
__m512 df418 = fft4792;
__m512 df426 = fft4872;
__m512 df419 = fft4772;
__m512 df427 = fft4855;
__m512 df420 = fft4773;
__m512 df428 = fft4856;
__m512 df421 = fft4774;
__m512 df429 = fft4857;
__m512 df422 = fft4775;
__m512 df430 = fft4858;
__m512 df423 = fft4776;
__m512 df431 = fft4859;
__m512 df424 = fft4777;
__m512 df432 = fft4860;
// eo29 gathers the even-indexed elements into lanes 0-7 and the odd-indexed
// elements into lanes 8-15. It is applied to every register except the four
// post-processed ones (df417, df418, df425, df426).
__m512i eo29 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each register is written with two masked stores: mask 255 writes lanes 0-7
// at the given offset, mask 65280 writes lanes 8-15. The second store's
// constant is (first + 407040 - 32); since lanes 8-15 sit 32 bytes into the
// vector, they land exactly 407040 past the first half -- assuming dfPtr1 is
// a byte pointer (declaration not visible here; confirm).
df419 = _mm512_permutexvar_ps(eo29, df419);
df420 = _mm512_permutexvar_ps(eo29, df420);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df419);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df420);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df419);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df420);
df427 = _mm512_permutexvar_ps(eo29, df427);
df428 = _mm512_permutexvar_ps(eo29, df428);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df427);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df428);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df427);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df428);
df421 = _mm512_permutexvar_ps(eo29, df421);
df422 = _mm512_permutexvar_ps(eo29, df422);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df421);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df422);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df421);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df422);
df429 = _mm512_permutexvar_ps(eo29, df429);
df430 = _mm512_permutexvar_ps(eo29, df430);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df429);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df430);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df429);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df430);
df423 = _mm512_permutexvar_ps(eo29, df423);
df424 = _mm512_permutexvar_ps(eo29, df424);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df423);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df424);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df423);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df424);
df431 = _mm512_permutexvar_ps(eo29, df431);
df432 = _mm512_permutexvar_ps(eo29, df432);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df431);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df432);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df431);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df432);
// The four post-processed registers are stored as-is (no eo29 permutation).
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df417);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df418);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df417);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df418);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df425);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k14+128*m29+32*f30, 255, df426);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df425);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k14+128*m29+32*f30, 65280, df426);
}
ptrdiff_t b30 = 5;
ptrdiff_t m30 = (size_t)b30/2;
ptrdiff_t f31 = (size_t)b30%2;
__m512 dat418 = _mm512_maskz_loadu_ps(127, datPtr1+200+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat418 = _mm512_mask_fmadd_ps(dat418, 127, bnMul13, bnAdd13);
__m512 dat419 = _mm512_maskz_loadu_ps(127, datPtr1+1096+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat419 = _mm512_mask_fmadd_ps(dat419, 127, bnMul13, bnAdd13);
__m512 dat420 = _mm512_maskz_loadu_ps(127, datPtr1+1992+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat420 = _mm512_mask_fmadd_ps(dat420, 127, bnMul13, bnAdd13);
__m512 dat421 = _mm512_maskz_loadu_ps(127, datPtr1+2888+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat421 = _mm512_mask_fmadd_ps(dat421, 127, bnMul13, bnAdd13);
__m512 dat422 = _mm512_maskz_loadu_ps(127, datPtr1+3784+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat422 = _mm512_mask_fmadd_ps(dat422, 127, bnMul13, bnAdd13);
__m512 dat423 = _mm512_maskz_loadu_ps(127, datPtr1+4680+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat423 = _mm512_mask_fmadd_ps(dat423, 127, bnMul13, bnAdd13);
__m512 dat424 = _mm512_maskz_loadu_ps(127, datPtr1+5576+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat424 = _mm512_mask_fmadd_ps(dat424, 127, bnMul13, bnAdd13);
__m512 dat425 = _mm512_maskz_loadu_ps(127, datPtr1+6472+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat425 = _mm512_mask_fmadd_ps(dat425, 127, bnMul13, bnAdd13);
__m512 dat426 = _mm512_maskz_loadu_ps(127, datPtr1+7368+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat426 = _mm512_mask_fmadd_ps(dat426, 127, bnMul13, bnAdd13);
__m512 dat427 = _mm512_maskz_loadu_ps(127, datPtr1+8264+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat427 = _mm512_mask_fmadd_ps(dat427, 127, bnMul13, bnAdd13);
__m512 dat428 = _mm512_maskz_loadu_ps(127, datPtr1+9160+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat428 = _mm512_mask_fmadd_ps(dat428, 127, bnMul13, bnAdd13);
__m512 dat429 = _mm512_maskz_loadu_ps(127, datPtr1+10056+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat429 = _mm512_mask_fmadd_ps(dat429, 127, bnMul13, bnAdd13);
__m512 dat430 = _mm512_maskz_loadu_ps(127, datPtr1+10952+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat430 = _mm512_mask_fmadd_ps(dat430, 127, bnMul13, bnAdd13);
__m512 dat431 = _mm512_maskz_loadu_ps(127, datPtr1+11848+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat431 = _mm512_mask_fmadd_ps(dat431, 127, bnMul13, bnAdd13);
__m512 dat432 = _mm512_maskz_loadu_ps(127, datPtr1+12744+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat432 = _mm512_mask_fmadd_ps(dat432, 127, bnMul13, bnAdd13);
__m512 dat433 = _mm512_maskz_loadu_ps(127, datPtr1+13640+602112*i6+200704*k14+896*h13+4*w13+0*b30);
dat433 = _mm512_mask_fmadd_ps(dat433, 127, bnMul13, bnAdd13);
__m512 fft4873 = _mm512_add_ps(dat418, dat426);
__m512 fft4961 = _mm512_add_ps(dat419, dat427);
__m512 fft4874 = _mm512_sub_ps(dat418, dat426);
__m512 fft4962 = _mm512_sub_ps(dat419, dat427);
__m512 fft4875 = _mm512_add_ps(dat420, dat428);
__m512 fft4963 = _mm512_add_ps(dat421, dat429);
__m512 fft4876 = _mm512_sub_ps(dat420, dat428);
__m512 fft4964 = _mm512_sub_ps(dat421, dat429);
__m512 fft4877 = _mm512_add_ps(dat422, dat430);
__m512 fft4965 = _mm512_add_ps(dat423, dat431);
__m512 fft4878 = _mm512_sub_ps(dat422, dat430);
__m512 fft4966 = _mm512_sub_ps(dat423, dat431);
__m512 fft4879 = _mm512_add_ps(dat424, dat432);
__m512 fft4967 = _mm512_add_ps(dat425, dat433);
__m512 fft4880 = _mm512_sub_ps(dat424, dat432);
__m512 fft4968 = _mm512_sub_ps(dat425, dat433);
__m512 fft4881 = _mm512_add_ps(fft4873, fft4877);
__m512 fft4969 = _mm512_add_ps(fft4961, fft4965);
__m512 fft4882 = _mm512_sub_ps(fft4873, fft4877);
__m512 fft4970 = _mm512_sub_ps(fft4961, fft4965);
__m512 fft4883 = _mm512_add_ps(fft4875, fft4879);
__m512 fft4971 = _mm512_add_ps(fft4963, fft4967);
__m512 fft4884 = _mm512_sub_ps(fft4879, fft4875);
__m512 fft4972 = _mm512_sub_ps(fft4967, fft4963);
__m512 fft4885 = _mm512_sub_ps(fft4876, fft4880);
__m512 fft4973 = _mm512_sub_ps(fft4964, fft4968);
__m512 fft4886 = _mm512_add_ps(fft4876, fft4880);
__m512 fft4974 = _mm512_add_ps(fft4964, fft4968);
__m512 fft4887 = _mm512_add_ps(fft4881, fft4883);
__m512 fft4975 = _mm512_add_ps(fft4969, fft4971);
__m512 fft4888 = _mm512_sub_ps(fft4881, fft4883);
__m512 fft4976 = _mm512_sub_ps(fft4969, fft4971);
__m512 fft4889 = _mm512_fmadd_ps(fft4885, _mm512_set1_ps(7.0710677e-01f), fft4874);
__m512 fft4977 = _mm512_fmadd_ps(fft4973, _mm512_set1_ps(7.0710677e-01f), fft4962);
__m512 fft4890 = _mm512_fnmsub_ps(fft4886, _mm512_set1_ps(7.0710677e-01f), fft4878);
__m512 fft4978 = _mm512_fnmsub_ps(fft4974, _mm512_set1_ps(7.0710677e-01f), fft4966);
__m512 fft4891 = _mm512_fnmadd_ps(fft4885, _mm512_set1_ps(7.0710677e-01f), fft4874);
__m512 fft4979 = _mm512_fnmadd_ps(fft4973, _mm512_set1_ps(7.0710677e-01f), fft4962);
__m512 fft4892 = _mm512_fnmadd_ps(fft4886, _mm512_set1_ps(7.0710677e-01f), fft4878);
__m512 fft4980 = _mm512_fnmadd_ps(fft4974, _mm512_set1_ps(7.0710677e-01f), fft4966);
__m512 fft4893 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4894 = _mm512_fmadd_ps(fft4887, fft4893, _mm512_shuffle_f32x4(fft4887, fft4887, 78));
__m512 fft4981 = _mm512_fmadd_ps(fft4975, fft4893, _mm512_shuffle_f32x4(fft4975, fft4975, 78));
__m512 fft4895 = _mm512_fmadd_ps(fft4888, fft4893, _mm512_shuffle_f32x4(fft4888, fft4888, 78));
__m512 fft4982 = _mm512_fmadd_ps(fft4976, fft4893, _mm512_shuffle_f32x4(fft4976, fft4976, 78));
__m512 fft4896 = _mm512_fmadd_ps(fft4889, fft4893, _mm512_shuffle_f32x4(fft4889, fft4889, 78));
__m512 fft4983 = _mm512_fmadd_ps(fft4977, fft4893, _mm512_shuffle_f32x4(fft4977, fft4977, 78));
__m512 fft4897 = _mm512_fmadd_ps(fft4890, fft4893, _mm512_shuffle_f32x4(fft4890, fft4890, 78));
__m512 fft4984 = _mm512_fmadd_ps(fft4978, fft4893, _mm512_shuffle_f32x4(fft4978, fft4978, 78));
__m512 fft4898 = _mm512_fmadd_ps(fft4882, fft4893, _mm512_shuffle_f32x4(fft4882, fft4882, 78));
__m512 fft4985 = _mm512_fmadd_ps(fft4970, fft4893, _mm512_shuffle_f32x4(fft4970, fft4970, 78));
__m512 fft4899 = _mm512_fmadd_ps(fft4884, fft4893, _mm512_shuffle_f32x4(fft4884, fft4884, 78));
__m512 fft4986 = _mm512_fmadd_ps(fft4972, fft4893, _mm512_shuffle_f32x4(fft4972, fft4972, 78));
__m512 fft4900 = _mm512_fmadd_ps(fft4891, fft4893, _mm512_shuffle_f32x4(fft4891, fft4891, 78));
__m512 fft4987 = _mm512_fmadd_ps(fft4979, fft4893, _mm512_shuffle_f32x4(fft4979, fft4979, 78));
__m512 fft4901 = _mm512_fmadd_ps(fft4892, fft4893, _mm512_shuffle_f32x4(fft4892, fft4892, 78));
__m512 fft4988 = _mm512_fmadd_ps(fft4980, fft4893, _mm512_shuffle_f32x4(fft4980, fft4980, 78));
__m512 fft4902 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft4903 = _mm512_mul_ps(fft4894, fft4902);
__m512 fft4989 = _mm512_mul_ps(fft4981, fft4902);
__m512 fft4904 = _mm512_mul_ps(fft4895, fft4902);
__m512 fft4990 = _mm512_mul_ps(fft4982, fft4902);
__m512 fft4905 = _mm512_mul_ps(fft4896, fft4902);
__m512 fft4991 = _mm512_mul_ps(fft4983, fft4902);
__m512 fft4906 = _mm512_mul_ps(fft4897, fft4902);
__m512 fft4992 = _mm512_mul_ps(fft4984, fft4902);
__m512 fft4907 = _mm512_mul_ps(fft4898, fft4902);
__m512 fft4993 = _mm512_mul_ps(fft4985, fft4902);
__m512 fft4908 = _mm512_mul_ps(fft4899, fft4902);
__m512 fft4994 = _mm512_mul_ps(fft4986, fft4902);
__m512 fft4909 = _mm512_mul_ps(fft4900, fft4902);
__m512 fft4995 = _mm512_mul_ps(fft4987, fft4902);
__m512 fft4910 = _mm512_mul_ps(fft4901, fft4902);
__m512 fft4996 = _mm512_mul_ps(fft4988, fft4902);
__m512 fft4911 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft4912 = _mm512_fmadd_ps(fft4895, fft4911, fft4903);
__m512 fft4997 = _mm512_fmadd_ps(fft4982, fft4911, fft4989);
__m512 fft4913 = _mm512_fnmadd_ps(fft4894, fft4911, fft4904);
__m512 fft4998 = _mm512_fnmadd_ps(fft4981, fft4911, fft4990);
__m512 fft4914 = _mm512_fmadd_ps(fft4897, fft4911, fft4905);
__m512 fft4999 = _mm512_fmadd_ps(fft4984, fft4911, fft4991);
__m512 fft4915 = _mm512_fnmadd_ps(fft4896, fft4911, fft4906);
__m512 fft5000 = _mm512_fnmadd_ps(fft4983, fft4911, fft4992);
__m512 fft4916 = _mm512_fmadd_ps(fft4899, fft4911, fft4907);
__m512 fft5001 = _mm512_fmadd_ps(fft4986, fft4911, fft4993);
__m512 fft4917 = _mm512_fnmadd_ps(fft4898, fft4911, fft4908);
__m512 fft5002 = _mm512_fnmadd_ps(fft4985, fft4911, fft4994);
__m512 fft4918 = _mm512_fmadd_ps(fft4901, fft4911, fft4909);
__m512 fft5003 = _mm512_fmadd_ps(fft4988, fft4911, fft4995);
__m512 fft4919 = _mm512_fnmadd_ps(fft4900, fft4911, fft4910);
__m512 fft5004 = _mm512_fnmadd_ps(fft4987, fft4911, fft4996);
__m512 fft4920 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft4921 = _mm512_fmadd_ps(fft4912, fft4920, _mm512_shuffle_f32x4(fft4912, fft4912, 177));
__m512 fft5005 = _mm512_fmadd_ps(fft4997, fft4920, _mm512_shuffle_f32x4(fft4997, fft4997, 177));
__m512 fft4922 = _mm512_fmadd_ps(fft4913, fft4920, _mm512_shuffle_f32x4(fft4913, fft4913, 177));
__m512 fft5006 = _mm512_fmadd_ps(fft4998, fft4920, _mm512_shuffle_f32x4(fft4998, fft4998, 177));
__m512 fft4923 = _mm512_fmadd_ps(fft4914, fft4920, _mm512_shuffle_f32x4(fft4914, fft4914, 177));
__m512 fft5007 = _mm512_fmadd_ps(fft4999, fft4920, _mm512_shuffle_f32x4(fft4999, fft4999, 177));
__m512 fft4924 = _mm512_fmadd_ps(fft4915, fft4920, _mm512_shuffle_f32x4(fft4915, fft4915, 177));
__m512 fft5008 = _mm512_fmadd_ps(fft5000, fft4920, _mm512_shuffle_f32x4(fft5000, fft5000, 177));
__m512 fft4925 = _mm512_fmadd_ps(fft4916, fft4920, _mm512_shuffle_f32x4(fft4916, fft4916, 177));
__m512 fft5009 = _mm512_fmadd_ps(fft5001, fft4920, _mm512_shuffle_f32x4(fft5001, fft5001, 177));
__m512 fft4926 = _mm512_fmadd_ps(fft4917, fft4920, _mm512_shuffle_f32x4(fft4917, fft4917, 177));
__m512 fft5010 = _mm512_fmadd_ps(fft5002, fft4920, _mm512_shuffle_f32x4(fft5002, fft5002, 177));
__m512 fft4927 = _mm512_fmadd_ps(fft4918, fft4920, _mm512_shuffle_f32x4(fft4918, fft4918, 177));
__m512 fft5011 = _mm512_fmadd_ps(fft5003, fft4920, _mm512_shuffle_f32x4(fft5003, fft5003, 177));
__m512 fft4928 = _mm512_fmadd_ps(fft4919, fft4920, _mm512_shuffle_f32x4(fft4919, fft4919, 177));
__m512 fft5012 = _mm512_fmadd_ps(fft5004, fft4920, _mm512_shuffle_f32x4(fft5004, fft5004, 177));
__m512 fft4929 = _mm512_mask_mov_ps(fft4921, 49344, fft4922);
__m512 fft5013 = _mm512_mask_mov_ps(fft5005, 49344, fft5006);
__m512 fft4930 = _mm512_mask_sub_ps(fft4922, 49344, _mm512_setzero_ps(), fft4921);
__m512 fft5014 = _mm512_mask_sub_ps(fft5006, 49344, _mm512_setzero_ps(), fft5005);
__m512 fft4931 = _mm512_mask_mov_ps(fft4923, 49344, fft4924);
__m512 fft5015 = _mm512_mask_mov_ps(fft5007, 49344, fft5008);
__m512 fft4932 = _mm512_mask_sub_ps(fft4924, 49344, _mm512_setzero_ps(), fft4923);
__m512 fft5016 = _mm512_mask_sub_ps(fft5008, 49344, _mm512_setzero_ps(), fft5007);
__m512 fft4933 = _mm512_mask_mov_ps(fft4925, 49344, fft4926);
__m512 fft5017 = _mm512_mask_mov_ps(fft5009, 49344, fft5010);
__m512 fft4934 = _mm512_mask_sub_ps(fft4926, 49344, _mm512_setzero_ps(), fft4925);
__m512 fft5018 = _mm512_mask_sub_ps(fft5010, 49344, _mm512_setzero_ps(), fft5009);
__m512 fft4935 = _mm512_mask_mov_ps(fft4927, 49344, fft4928);
__m512 fft5019 = _mm512_mask_mov_ps(fft5011, 49344, fft5012);
__m512 fft4936 = _mm512_mask_sub_ps(fft4928, 49344, _mm512_setzero_ps(), fft4927);
__m512 fft5020 = _mm512_mask_sub_ps(fft5012, 49344, _mm512_setzero_ps(), fft5011);
__m512 fft4937 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft4938 = _mm512_fmadd_ps(fft4929, fft4937, _mm512_shuffle_ps(fft4929, fft4929, 78));
__m512 fft5021 = _mm512_fmadd_ps(fft5013, fft4937, _mm512_shuffle_ps(fft5013, fft5013, 78));
__m512 fft4939 = _mm512_fmadd_ps(fft4930, fft4937, _mm512_shuffle_ps(fft4930, fft4930, 78));
__m512 fft5022 = _mm512_fmadd_ps(fft5014, fft4937, _mm512_shuffle_ps(fft5014, fft5014, 78));
__m512 fft4940 = _mm512_fmadd_ps(fft4931, fft4937, _mm512_shuffle_ps(fft4931, fft4931, 78));
__m512 fft5023 = _mm512_fmadd_ps(fft5015, fft4937, _mm512_shuffle_ps(fft5015, fft5015, 78));
__m512 fft4941 = _mm512_fmadd_ps(fft4932, fft4937, _mm512_shuffle_ps(fft4932, fft4932, 78));
__m512 fft5024 = _mm512_fmadd_ps(fft5016, fft4937, _mm512_shuffle_ps(fft5016, fft5016, 78));
__m512 fft4942 = _mm512_fmadd_ps(fft4933, fft4937, _mm512_shuffle_ps(fft4933, fft4933, 78));
__m512 fft5025 = _mm512_fmadd_ps(fft5017, fft4937, _mm512_shuffle_ps(fft5017, fft5017, 78));
__m512 fft4943 = _mm512_fmadd_ps(fft4934, fft4937, _mm512_shuffle_ps(fft4934, fft4934, 78));
__m512 fft5026 = _mm512_fmadd_ps(fft5018, fft4937, _mm512_shuffle_ps(fft5018, fft5018, 78));
__m512 fft4944 = _mm512_fmadd_ps(fft4935, fft4937, _mm512_shuffle_ps(fft4935, fft4935, 78));
__m512 fft5027 = _mm512_fmadd_ps(fft5019, fft4937, _mm512_shuffle_ps(fft5019, fft5019, 78));
__m512 fft4945 = _mm512_fmadd_ps(fft4936, fft4937, _mm512_shuffle_ps(fft4936, fft4936, 78));
__m512 fft5028 = _mm512_fmadd_ps(fft5020, fft4937, _mm512_shuffle_ps(fft5020, fft5020, 78));
__m512i fft4946 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft4947 = _mm512_permutexvar_ps(fft4946, fft4938);
__m512 fft5029 = _mm512_permutexvar_ps(fft4946, fft5021);
__m512i fft4948 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft4949 = _mm512_permutexvar_ps(fft4948, fft4938);
__m512 fft5030 = _mm512_permutexvar_ps(fft4948, fft5021);
__m512 fft4950 = _mm512_permutexvar_ps(fft4946, fft4939);
__m512 fft5031 = _mm512_permutexvar_ps(fft4946, fft5022);
__m512 fft4951 = _mm512_permutexvar_ps(fft4948, fft4939);
__m512 fft5032 = _mm512_permutexvar_ps(fft4948, fft5022);
__m512 fft4952 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft4953 = _mm512_fmadd_ps(fft4947, fft4952, fft4949);
__m512 fft5033 = _mm512_fmadd_ps(fft5029, fft4952, fft5030);
__m512 fft4954 = _mm512_fnmadd_ps(fft4951, fft4952, fft4950);
__m512 fft5034 = _mm512_fnmadd_ps(fft5032, fft4952, fft5031);
__m512 fft4955 = _mm512_mask_mov_ps(fft4951, 21845, fft4953);
__m512 fft5035 = _mm512_mask_mov_ps(fft5032, 21845, fft5033);
__m512 fft4956 = _mm512_mask_mov_ps(fft4947, 43176, fft4953);
__m512 fft5036 = _mm512_mask_mov_ps(fft5029, 43176, fft5033);
__m512 fft4957 = _mm512_mask_mov_ps(fft4955, 43176, fft4954);
__m512 fft5037 = _mm512_mask_mov_ps(fft5035, 43176, fft5034);
__m512 fft4958 = _mm512_mask_mov_ps(fft4956, 22102, fft4954);
__m512 fft5038 = _mm512_mask_mov_ps(fft5036, 22102, fft5034);
__m512 fft4959 = _mm512_mask_mul_ps(fft4957, 64764, fft4957, _mm512_set1_ps(5e-01f));
__m512 fft5039 = _mm512_mask_mul_ps(fft5037, 64764, fft5037, _mm512_set1_ps(5e-01f));
__m512 fft4960 = _mm512_mask_mul_ps(fft4958, 64764, fft4958, _mm512_set1_ps(5e-01f));
__m512 fft5040 = _mm512_mask_mul_ps(fft5038, 64764, fft5038, _mm512_set1_ps(5e-01f));
__m512 df433 = fft4959;
__m512 df441 = fft5039;
__m512 df434 = fft4960;
__m512 df442 = fft5040;
__m512 df435 = fft4940;
__m512 df443 = fft5023;
__m512 df436 = fft4941;
__m512 df444 = fft5024;
__m512 df437 = fft4942;
__m512 df445 = fft5025;
__m512 df438 = fft4943;
__m512 df446 = fft5026;
__m512 df439 = fft4944;
__m512 df447 = fft5027;
__m512 df440 = fft4945;
__m512 df448 = fft5028;
__m512i eo30 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df435 = _mm512_permutexvar_ps(eo30, df435);
df436 = _mm512_permutexvar_ps(eo30, df436);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df435);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df436);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df435);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df436);
df443 = _mm512_permutexvar_ps(eo30, df443);
df444 = _mm512_permutexvar_ps(eo30, df444);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df443);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df444);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df443);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df444);
df437 = _mm512_permutexvar_ps(eo30, df437);
df438 = _mm512_permutexvar_ps(eo30, df438);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df437);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df438);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df437);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df438);
df445 = _mm512_permutexvar_ps(eo30, df445);
df446 = _mm512_permutexvar_ps(eo30, df446);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df445);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df446);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df445);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df446);
df439 = _mm512_permutexvar_ps(eo30, df439);
df440 = _mm512_permutexvar_ps(eo30, df440);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df439);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df440);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df439);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df440);
df447 = _mm512_permutexvar_ps(eo30, df447);
df448 = _mm512_permutexvar_ps(eo30, df448);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df447);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df448);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df447);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df448);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df433);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df434);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df433);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df434);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df441);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k14+128*m30+32*f31, 255, df442);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df441);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k14+128*m30+32*f31, 65280, df442);
}
if (j2 >= last1) return;
++j2;
rel2 = 19;
}
if (rel2 < 20) {
ptrdiff_t h14 = base2+50;
ptrdiff_t w14 = 0;
ptrdiff_t k15 = 3*s1;
ptrdiff_t kk14 = k15+2;
for (; k15 <= kk14; ++k15) {
// Per-index scale/shift: bnPtr2 is read as float pairs, pair number k15+3*i6.
// Element [0] is used as the multiplier and element [1] as the addend of the
// masked FMAs below; each value is broadcast to all 16 lanes.
__m512 bnMul14 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k15+3*i6))[0]);
__m512 bnAdd14 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k15+3*i6))[1]);
// b31 is pinned to 0 here, so m31 and f32 both evaluate to 0 and the 0*b31
// term in the load addresses vanishes.
ptrdiff_t b31 = 0;
ptrdiff_t m31 = (size_t)b31/2;
ptrdiff_t f32 = (size_t)b31%2;
// Load 16 vectors at constant offsets 0, 896, ..., 13440 (step 896) and apply
// dat = dat*bnMul14 + bnAdd14 to each. Mask 65528 (0xFFF8) selects lanes 3-15:
// the zero-masking load leaves lanes 0-2 at 0.0f, and the masked FMA keeps the
// unselected lanes unchanged, so they stay 0.0f.
// NOTE(review): 896 = 224*4, i.e. one row of 224 floats if datPtr1 is a byte
// pointer -- confirm against the declaration of datPtr1 (not visible here).
__m512 dat434 = _mm512_maskz_loadu_ps(65528, datPtr1+0+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat434 = _mm512_mask_fmadd_ps(dat434, 65528, bnMul14, bnAdd14);
__m512 dat435 = _mm512_maskz_loadu_ps(65528, datPtr1+896+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat435 = _mm512_mask_fmadd_ps(dat435, 65528, bnMul14, bnAdd14);
__m512 dat436 = _mm512_maskz_loadu_ps(65528, datPtr1+1792+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat436 = _mm512_mask_fmadd_ps(dat436, 65528, bnMul14, bnAdd14);
__m512 dat437 = _mm512_maskz_loadu_ps(65528, datPtr1+2688+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat437 = _mm512_mask_fmadd_ps(dat437, 65528, bnMul14, bnAdd14);
__m512 dat438 = _mm512_maskz_loadu_ps(65528, datPtr1+3584+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat438 = _mm512_mask_fmadd_ps(dat438, 65528, bnMul14, bnAdd14);
__m512 dat439 = _mm512_maskz_loadu_ps(65528, datPtr1+4480+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat439 = _mm512_mask_fmadd_ps(dat439, 65528, bnMul14, bnAdd14);
__m512 dat440 = _mm512_maskz_loadu_ps(65528, datPtr1+5376+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat440 = _mm512_mask_fmadd_ps(dat440, 65528, bnMul14, bnAdd14);
__m512 dat441 = _mm512_maskz_loadu_ps(65528, datPtr1+6272+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat441 = _mm512_mask_fmadd_ps(dat441, 65528, bnMul14, bnAdd14);
__m512 dat442 = _mm512_maskz_loadu_ps(65528, datPtr1+7168+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat442 = _mm512_mask_fmadd_ps(dat442, 65528, bnMul14, bnAdd14);
__m512 dat443 = _mm512_maskz_loadu_ps(65528, datPtr1+8064+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat443 = _mm512_mask_fmadd_ps(dat443, 65528, bnMul14, bnAdd14);
__m512 dat444 = _mm512_maskz_loadu_ps(65528, datPtr1+8960+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat444 = _mm512_mask_fmadd_ps(dat444, 65528, bnMul14, bnAdd14);
__m512 dat445 = _mm512_maskz_loadu_ps(65528, datPtr1+9856+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat445 = _mm512_mask_fmadd_ps(dat445, 65528, bnMul14, bnAdd14);
__m512 dat446 = _mm512_maskz_loadu_ps(65528, datPtr1+10752+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat446 = _mm512_mask_fmadd_ps(dat446, 65528, bnMul14, bnAdd14);
__m512 dat447 = _mm512_maskz_loadu_ps(65528, datPtr1+11648+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat447 = _mm512_mask_fmadd_ps(dat447, 65528, bnMul14, bnAdd14);
__m512 dat448 = _mm512_maskz_loadu_ps(65528, datPtr1+12544+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat448 = _mm512_mask_fmadd_ps(dat448, 65528, bnMul14, bnAdd14);
__m512 dat449 = _mm512_maskz_loadu_ps(65528, datPtr1+13440+602112*i6+200704*k15+896*h14+4*w14+0*b31);
dat449 = _mm512_mask_fmadd_ps(dat449, 65528, bnMul14, bnAdd14);
// Add/sub butterfly network across the 16 loaded registers. Two independent
// chains are interleaved line by line: fft5041.. works on dat434, dat436, ...,
// dat448 (every other load, starting with the first) and fft5129.. works on
// dat435, dat437, ..., dat449 (the remaining loads).
// NOTE(review): presumably the cross-register pass of a real FFT over the
// tile, judging by the fft* names -- confirm against the generator.
//
// Level 1: sum and difference of registers 8 loads apart within each chain.
__m512 fft5041 = _mm512_add_ps(dat434, dat442);
__m512 fft5129 = _mm512_add_ps(dat435, dat443);
__m512 fft5042 = _mm512_sub_ps(dat434, dat442);
__m512 fft5130 = _mm512_sub_ps(dat435, dat443);
__m512 fft5043 = _mm512_add_ps(dat436, dat444);
__m512 fft5131 = _mm512_add_ps(dat437, dat445);
__m512 fft5044 = _mm512_sub_ps(dat436, dat444);
__m512 fft5132 = _mm512_sub_ps(dat437, dat445);
__m512 fft5045 = _mm512_add_ps(dat438, dat446);
__m512 fft5133 = _mm512_add_ps(dat439, dat447);
__m512 fft5046 = _mm512_sub_ps(dat438, dat446);
__m512 fft5134 = _mm512_sub_ps(dat439, dat447);
__m512 fft5047 = _mm512_add_ps(dat440, dat448);
__m512 fft5135 = _mm512_add_ps(dat441, dat449);
__m512 fft5048 = _mm512_sub_ps(dat440, dat448);
__m512 fft5136 = _mm512_sub_ps(dat441, dat449);
// Level 2: combine the level-1 sums with each other and the level-1
// differences with each other. Note the reversed operand order in
// fft5052/fft5140 (later term minus earlier term).
__m512 fft5049 = _mm512_add_ps(fft5041, fft5045);
__m512 fft5137 = _mm512_add_ps(fft5129, fft5133);
__m512 fft5050 = _mm512_sub_ps(fft5041, fft5045);
__m512 fft5138 = _mm512_sub_ps(fft5129, fft5133);
__m512 fft5051 = _mm512_add_ps(fft5043, fft5047);
__m512 fft5139 = _mm512_add_ps(fft5131, fft5135);
__m512 fft5052 = _mm512_sub_ps(fft5047, fft5043);
__m512 fft5140 = _mm512_sub_ps(fft5135, fft5131);
__m512 fft5053 = _mm512_sub_ps(fft5044, fft5048);
__m512 fft5141 = _mm512_sub_ps(fft5132, fft5136);
__m512 fft5054 = _mm512_add_ps(fft5044, fft5048);
__m512 fft5142 = _mm512_add_ps(fft5132, fft5136);
// Level 3: final sum/difference of the level-2 sums.
__m512 fft5055 = _mm512_add_ps(fft5049, fft5051);
__m512 fft5143 = _mm512_add_ps(fft5137, fft5139);
__m512 fft5056 = _mm512_sub_ps(fft5049, fft5051);
__m512 fft5144 = _mm512_sub_ps(fft5137, fft5139);
// Terms scaled by c = 7.0710677e-01f (approximately 1/sqrt(2)):
//   fft5057 =  fft5042 + c*fft5053      fft5058 = -fft5046 - c*fft5054
//   fft5059 =  fft5042 - c*fft5053      fft5060 =  fft5046 - c*fft5054
// (and the same formulas for the fft5145..fft5148 chain).
__m512 fft5057 = _mm512_fmadd_ps(fft5053, _mm512_set1_ps(7.0710677e-01f), fft5042);
__m512 fft5145 = _mm512_fmadd_ps(fft5141, _mm512_set1_ps(7.0710677e-01f), fft5130);
__m512 fft5058 = _mm512_fnmsub_ps(fft5054, _mm512_set1_ps(7.0710677e-01f), fft5046);
__m512 fft5146 = _mm512_fnmsub_ps(fft5142, _mm512_set1_ps(7.0710677e-01f), fft5134);
__m512 fft5059 = _mm512_fnmadd_ps(fft5053, _mm512_set1_ps(7.0710677e-01f), fft5042);
__m512 fft5147 = _mm512_fnmadd_ps(fft5141, _mm512_set1_ps(7.0710677e-01f), fft5130);
__m512 fft5060 = _mm512_fnmadd_ps(fft5054, _mm512_set1_ps(7.0710677e-01f), fft5046);
__m512 fft5148 = _mm512_fnmadd_ps(fft5142, _mm512_set1_ps(7.0710677e-01f), fft5134);
// In-register stages: each step below mixes lanes inside a single register
// (or a register pair). Two chains (fft50xx/fft51xx from the first chain,
// fft51xx/fft52xx from the second) stay interleaved line by line.
// NOTE(review): presumably the within-register pass of the FFT -- inferred
// from the fft* names; confirm against the generator.
//
// Step 1: sign vector is +1 in lanes 0-7 and -1 in lanes 8-15; shuffle_f32x4
// with imm 78 swaps the two 256-bit halves. Result: lanes 0-7 = lo + hi,
// lanes 8-15 = lo - hi.
__m512 fft5061 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5062 = _mm512_fmadd_ps(fft5055, fft5061, _mm512_shuffle_f32x4(fft5055, fft5055, 78));
__m512 fft5149 = _mm512_fmadd_ps(fft5143, fft5061, _mm512_shuffle_f32x4(fft5143, fft5143, 78));
__m512 fft5063 = _mm512_fmadd_ps(fft5056, fft5061, _mm512_shuffle_f32x4(fft5056, fft5056, 78));
__m512 fft5150 = _mm512_fmadd_ps(fft5144, fft5061, _mm512_shuffle_f32x4(fft5144, fft5144, 78));
__m512 fft5064 = _mm512_fmadd_ps(fft5057, fft5061, _mm512_shuffle_f32x4(fft5057, fft5057, 78));
__m512 fft5151 = _mm512_fmadd_ps(fft5145, fft5061, _mm512_shuffle_f32x4(fft5145, fft5145, 78));
__m512 fft5065 = _mm512_fmadd_ps(fft5058, fft5061, _mm512_shuffle_f32x4(fft5058, fft5058, 78));
__m512 fft5152 = _mm512_fmadd_ps(fft5146, fft5061, _mm512_shuffle_f32x4(fft5146, fft5146, 78));
__m512 fft5066 = _mm512_fmadd_ps(fft5050, fft5061, _mm512_shuffle_f32x4(fft5050, fft5050, 78));
__m512 fft5153 = _mm512_fmadd_ps(fft5138, fft5061, _mm512_shuffle_f32x4(fft5138, fft5138, 78));
__m512 fft5067 = _mm512_fmadd_ps(fft5052, fft5061, _mm512_shuffle_f32x4(fft5052, fft5052, 78));
__m512 fft5154 = _mm512_fmadd_ps(fft5140, fft5061, _mm512_shuffle_f32x4(fft5140, fft5140, 78));
__m512 fft5068 = _mm512_fmadd_ps(fft5059, fft5061, _mm512_shuffle_f32x4(fft5059, fft5059, 78));
__m512 fft5155 = _mm512_fmadd_ps(fft5147, fft5061, _mm512_shuffle_f32x4(fft5147, fft5147, 78));
__m512 fft5069 = _mm512_fmadd_ps(fft5060, fft5061, _mm512_shuffle_f32x4(fft5060, fft5060, 78));
__m512 fft5156 = _mm512_fmadd_ps(fft5148, fft5061, _mm512_shuffle_f32x4(fft5148, fft5148, 78));
// Step 2: per-lane 2x2 rotation of each register pair (a, b):
//   a' = a*fft5070 + b*fft5079
//   b' = b*fft5070 - a*fft5079
// Per lane, (fft5070, fft5079) is (1, 0) in lanes 0-9 (pass-through),
// (c, c) in lanes 10-11, (0, 1) in lanes 12-13 and (-c, c) in lanes 14-15,
// with c = 7.0710677e-01f. The products with fft5070 are formed first.
__m512 fft5070 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5071 = _mm512_mul_ps(fft5062, fft5070);
__m512 fft5157 = _mm512_mul_ps(fft5149, fft5070);
__m512 fft5072 = _mm512_mul_ps(fft5063, fft5070);
__m512 fft5158 = _mm512_mul_ps(fft5150, fft5070);
__m512 fft5073 = _mm512_mul_ps(fft5064, fft5070);
__m512 fft5159 = _mm512_mul_ps(fft5151, fft5070);
__m512 fft5074 = _mm512_mul_ps(fft5065, fft5070);
__m512 fft5160 = _mm512_mul_ps(fft5152, fft5070);
__m512 fft5075 = _mm512_mul_ps(fft5066, fft5070);
__m512 fft5161 = _mm512_mul_ps(fft5153, fft5070);
__m512 fft5076 = _mm512_mul_ps(fft5067, fft5070);
__m512 fft5162 = _mm512_mul_ps(fft5154, fft5070);
__m512 fft5077 = _mm512_mul_ps(fft5068, fft5070);
__m512 fft5163 = _mm512_mul_ps(fft5155, fft5070);
__m512 fft5078 = _mm512_mul_ps(fft5069, fft5070);
__m512 fft5164 = _mm512_mul_ps(fft5156, fft5070);
// Second half of the rotation: add (fmadd) or subtract (fnmadd) the partner
// register's product with fft5079.
__m512 fft5079 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5080 = _mm512_fmadd_ps(fft5063, fft5079, fft5071);
__m512 fft5165 = _mm512_fmadd_ps(fft5150, fft5079, fft5157);
__m512 fft5081 = _mm512_fnmadd_ps(fft5062, fft5079, fft5072);
__m512 fft5166 = _mm512_fnmadd_ps(fft5149, fft5079, fft5158);
__m512 fft5082 = _mm512_fmadd_ps(fft5065, fft5079, fft5073);
__m512 fft5167 = _mm512_fmadd_ps(fft5152, fft5079, fft5159);
__m512 fft5083 = _mm512_fnmadd_ps(fft5064, fft5079, fft5074);
__m512 fft5168 = _mm512_fnmadd_ps(fft5151, fft5079, fft5160);
__m512 fft5084 = _mm512_fmadd_ps(fft5067, fft5079, fft5075);
__m512 fft5169 = _mm512_fmadd_ps(fft5154, fft5079, fft5161);
__m512 fft5085 = _mm512_fnmadd_ps(fft5066, fft5079, fft5076);
__m512 fft5170 = _mm512_fnmadd_ps(fft5153, fft5079, fft5162);
__m512 fft5086 = _mm512_fmadd_ps(fft5069, fft5079, fft5077);
__m512 fft5171 = _mm512_fmadd_ps(fft5156, fft5079, fft5163);
__m512 fft5087 = _mm512_fnmadd_ps(fft5068, fft5079, fft5078);
__m512 fft5172 = _mm512_fnmadd_ps(fft5155, fft5079, fft5164);
// Step 3: same sum/difference trick one level down. The sign vector is +1 in
// lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15; shuffle_f32x4 with imm 177
// swaps adjacent 128-bit lanes. Within each 256-bit half: low 128 bits =
// lo + hi, high 128 bits = lo - hi.
__m512 fft5088 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5089 = _mm512_fmadd_ps(fft5080, fft5088, _mm512_shuffle_f32x4(fft5080, fft5080, 177));
__m512 fft5173 = _mm512_fmadd_ps(fft5165, fft5088, _mm512_shuffle_f32x4(fft5165, fft5165, 177));
__m512 fft5090 = _mm512_fmadd_ps(fft5081, fft5088, _mm512_shuffle_f32x4(fft5081, fft5081, 177));
__m512 fft5174 = _mm512_fmadd_ps(fft5166, fft5088, _mm512_shuffle_f32x4(fft5166, fft5166, 177));
__m512 fft5091 = _mm512_fmadd_ps(fft5082, fft5088, _mm512_shuffle_f32x4(fft5082, fft5082, 177));
__m512 fft5175 = _mm512_fmadd_ps(fft5167, fft5088, _mm512_shuffle_f32x4(fft5167, fft5167, 177));
__m512 fft5092 = _mm512_fmadd_ps(fft5083, fft5088, _mm512_shuffle_f32x4(fft5083, fft5083, 177));
__m512 fft5176 = _mm512_fmadd_ps(fft5168, fft5088, _mm512_shuffle_f32x4(fft5168, fft5168, 177));
__m512 fft5093 = _mm512_fmadd_ps(fft5084, fft5088, _mm512_shuffle_f32x4(fft5084, fft5084, 177));
__m512 fft5177 = _mm512_fmadd_ps(fft5169, fft5088, _mm512_shuffle_f32x4(fft5169, fft5169, 177));
__m512 fft5094 = _mm512_fmadd_ps(fft5085, fft5088, _mm512_shuffle_f32x4(fft5085, fft5085, 177));
__m512 fft5178 = _mm512_fmadd_ps(fft5170, fft5088, _mm512_shuffle_f32x4(fft5170, fft5170, 177));
__m512 fft5095 = _mm512_fmadd_ps(fft5086, fft5088, _mm512_shuffle_f32x4(fft5086, fft5086, 177));
__m512 fft5179 = _mm512_fmadd_ps(fft5171, fft5088, _mm512_shuffle_f32x4(fft5171, fft5171, 177));
__m512 fft5096 = _mm512_fmadd_ps(fft5087, fft5088, _mm512_shuffle_f32x4(fft5087, fft5087, 177));
__m512 fft5180 = _mm512_fmadd_ps(fft5172, fft5088, _mm512_shuffle_f32x4(fft5172, fft5172, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. For each pair
// (p, q), in those lanes only: p' takes q, and q' takes 0 - p. All other
// lanes pass through unchanged.
__m512 fft5097 = _mm512_mask_mov_ps(fft5089, 49344, fft5090);
__m512 fft5181 = _mm512_mask_mov_ps(fft5173, 49344, fft5174);
__m512 fft5098 = _mm512_mask_sub_ps(fft5090, 49344, _mm512_setzero_ps(), fft5089);
__m512 fft5182 = _mm512_mask_sub_ps(fft5174, 49344, _mm512_setzero_ps(), fft5173);
__m512 fft5099 = _mm512_mask_mov_ps(fft5091, 49344, fft5092);
__m512 fft5183 = _mm512_mask_mov_ps(fft5175, 49344, fft5176);
__m512 fft5100 = _mm512_mask_sub_ps(fft5092, 49344, _mm512_setzero_ps(), fft5091);
__m512 fft5184 = _mm512_mask_sub_ps(fft5176, 49344, _mm512_setzero_ps(), fft5175);
__m512 fft5101 = _mm512_mask_mov_ps(fft5093, 49344, fft5094);
__m512 fft5185 = _mm512_mask_mov_ps(fft5177, 49344, fft5178);
__m512 fft5102 = _mm512_mask_sub_ps(fft5094, 49344, _mm512_setzero_ps(), fft5093);
__m512 fft5186 = _mm512_mask_sub_ps(fft5178, 49344, _mm512_setzero_ps(), fft5177);
__m512 fft5103 = _mm512_mask_mov_ps(fft5095, 49344, fft5096);
__m512 fft5187 = _mm512_mask_mov_ps(fft5179, 49344, fft5180);
__m512 fft5104 = _mm512_mask_sub_ps(fft5096, 49344, _mm512_setzero_ps(), fft5095);
__m512 fft5188 = _mm512_mask_sub_ps(fft5180, 49344, _mm512_setzero_ps(), fft5179);
// Step 5: sum/difference inside each 128-bit lane. The sign vector is +1 in
// elements 0-1 and -1 in elements 2-3 of every group of four; shuffle_ps with
// imm 78 swaps the two element pairs. Result per group: elements 0-1 =
// pair0 + pair1, elements 2-3 = pair0 - pair1.
__m512 fft5105 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5106 = _mm512_fmadd_ps(fft5097, fft5105, _mm512_shuffle_ps(fft5097, fft5097, 78));
__m512 fft5189 = _mm512_fmadd_ps(fft5181, fft5105, _mm512_shuffle_ps(fft5181, fft5181, 78));
__m512 fft5107 = _mm512_fmadd_ps(fft5098, fft5105, _mm512_shuffle_ps(fft5098, fft5098, 78));
__m512 fft5190 = _mm512_fmadd_ps(fft5182, fft5105, _mm512_shuffle_ps(fft5182, fft5182, 78));
__m512 fft5108 = _mm512_fmadd_ps(fft5099, fft5105, _mm512_shuffle_ps(fft5099, fft5099, 78));
__m512 fft5191 = _mm512_fmadd_ps(fft5183, fft5105, _mm512_shuffle_ps(fft5183, fft5183, 78));
__m512 fft5109 = _mm512_fmadd_ps(fft5100, fft5105, _mm512_shuffle_ps(fft5100, fft5100, 78));
__m512 fft5192 = _mm512_fmadd_ps(fft5184, fft5105, _mm512_shuffle_ps(fft5184, fft5184, 78));
__m512 fft5110 = _mm512_fmadd_ps(fft5101, fft5105, _mm512_shuffle_ps(fft5101, fft5101, 78));
__m512 fft5193 = _mm512_fmadd_ps(fft5185, fft5105, _mm512_shuffle_ps(fft5185, fft5185, 78));
__m512 fft5111 = _mm512_fmadd_ps(fft5102, fft5105, _mm512_shuffle_ps(fft5102, fft5102, 78));
__m512 fft5194 = _mm512_fmadd_ps(fft5186, fft5105, _mm512_shuffle_ps(fft5186, fft5186, 78));
__m512 fft5112 = _mm512_fmadd_ps(fft5103, fft5105, _mm512_shuffle_ps(fft5103, fft5103, 78));
__m512 fft5195 = _mm512_fmadd_ps(fft5187, fft5105, _mm512_shuffle_ps(fft5187, fft5187, 78));
__m512 fft5113 = _mm512_fmadd_ps(fft5104, fft5105, _mm512_shuffle_ps(fft5104, fft5104, 78));
__m512 fft5196 = _mm512_fmadd_ps(fft5188, fft5105, _mm512_shuffle_ps(fft5188, fft5188, 78));
// Step 6: extra fix-up applied only to the first register pair of each chain
// (fft5106/fft5107 and fft5189/fft5190); the other step-5 results are used
// as they are. Two index vectors gather lanes (each index duplicated into two
// adjacent output lanes) from both registers of the pair.
// NOTE(review): presumably this separates results that were packed together
// earlier in the transform -- confirm against the generator.
__m512i fft5114 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5115 = _mm512_permutexvar_ps(fft5114, fft5106);
__m512 fft5197 = _mm512_permutexvar_ps(fft5114, fft5189);
__m512i fft5116 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5117 = _mm512_permutexvar_ps(fft5116, fft5106);
__m512 fft5198 = _mm512_permutexvar_ps(fft5116, fft5189);
__m512 fft5118 = _mm512_permutexvar_ps(fft5114, fft5107);
__m512 fft5199 = _mm512_permutexvar_ps(fft5114, fft5190);
__m512 fft5119 = _mm512_permutexvar_ps(fft5116, fft5107);
__m512 fft5200 = _mm512_permutexvar_ps(fft5116, fft5190);
// Coefficient vector: 0 in lanes 0, 1, 8, 9; +1 in the remaining even lanes
// and -1 in the remaining odd lanes.
//   fft5121 = fft5115*coef + fft5117
//   fft5122 = fft5118 - fft5119*coef
__m512 fft5120 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5121 = _mm512_fmadd_ps(fft5115, fft5120, fft5117);
__m512 fft5201 = _mm512_fmadd_ps(fft5197, fft5120, fft5198);
__m512 fft5122 = _mm512_fnmadd_ps(fft5119, fft5120, fft5118);
__m512 fft5202 = _mm512_fnmadd_ps(fft5200, fft5120, fft5199);
// Lane-wise merges. Masks: 21845 (0x5555) = even lanes; 43176 (0xA8A8) =
// lanes 3, 5, 7, 11, 13, 15; 22102 (0x5656) = lanes 1, 2, 4, 6, 9, 10, 12, 14.
// Selected lanes come from the last argument, the rest from the first.
__m512 fft5123 = _mm512_mask_mov_ps(fft5119, 21845, fft5121);
__m512 fft5203 = _mm512_mask_mov_ps(fft5200, 21845, fft5201);
__m512 fft5124 = _mm512_mask_mov_ps(fft5115, 43176, fft5121);
__m512 fft5204 = _mm512_mask_mov_ps(fft5197, 43176, fft5201);
__m512 fft5125 = _mm512_mask_mov_ps(fft5123, 43176, fft5122);
__m512 fft5205 = _mm512_mask_mov_ps(fft5203, 43176, fft5202);
__m512 fft5126 = _mm512_mask_mov_ps(fft5124, 22102, fft5122);
__m512 fft5206 = _mm512_mask_mov_ps(fft5204, 22102, fft5202);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8 and 9 keep
// their value.
__m512 fft5127 = _mm512_mask_mul_ps(fft5125, 64764, fft5125, _mm512_set1_ps(5e-01f));
__m512 fft5207 = _mm512_mask_mul_ps(fft5205, 64764, fft5205, _mm512_set1_ps(5e-01f));
__m512 fft5128 = _mm512_mask_mul_ps(fft5126, 64764, fft5126, _mm512_set1_ps(5e-01f));
__m512 fft5208 = _mm512_mask_mul_ps(fft5206, 64764, fft5206, _mm512_set1_ps(5e-01f));
// Rename the transform results into the df* registers that get stored:
// df449..df456 come from the first chain, df457..df464 from the second.
// df449/df450/df457/df458 are the pairs that went through the extra fix-up
// and 0.5 scaling; the rest are the plain step-5 results.
__m512 df449 = fft5127;
__m512 df457 = fft5207;
__m512 df450 = fft5128;
__m512 df458 = fft5208;
__m512 df451 = fft5108;
__m512 df459 = fft5191;
__m512 df452 = fft5109;
__m512 df460 = fft5192;
__m512 df453 = fft5110;
__m512 df461 = fft5193;
__m512 df454 = fft5111;
__m512 df462 = fft5194;
__m512 df455 = fft5112;
__m512 df463 = fft5195;
__m512 df456 = fft5113;
__m512 df464 = fft5196;
// eo31 gathers the even-numbered lanes (0, 2, ..., 14) into lanes 0-7 and the
// odd-numbered lanes (1, 3, ..., 15) into lanes 8-15. It is applied to every
// register below except df449, df450, df457 and df458, which are stored
// without it.
__m512i eo31 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Every register is written with two masked stores: mask 255 writes lanes
// 0-7 at one constant offset, mask 65280 writes lanes 8-15 at a second
// constant offset that is 407008 larger (e.g. 101760 -> 508768). All stores
// share the index term 407040*i6+1152*j2+384*k15+128*m31+32*f32.
// NOTE(review): if dfPtr1 is a byte pointer, lane 8 of the second store
// lands 32 bytes past its base, i.e. exactly 407040 bytes after lane 0 of the
// first store -- confirm against the declaration of dfPtr1 (not visible here).
df451 = _mm512_permutexvar_ps(eo31, df451);
df452 = _mm512_permutexvar_ps(eo31, df452);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df451);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df452);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df451);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df452);
df459 = _mm512_permutexvar_ps(eo31, df459);
df460 = _mm512_permutexvar_ps(eo31, df460);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df459);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df460);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df459);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df460);
df453 = _mm512_permutexvar_ps(eo31, df453);
df454 = _mm512_permutexvar_ps(eo31, df454);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df453);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df454);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df453);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df454);
df461 = _mm512_permutexvar_ps(eo31, df461);
df462 = _mm512_permutexvar_ps(eo31, df462);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df461);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df462);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df461);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df462);
df455 = _mm512_permutexvar_ps(eo31, df455);
df456 = _mm512_permutexvar_ps(eo31, df456);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df455);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df456);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df455);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df456);
df463 = _mm512_permutexvar_ps(eo31, df463);
df464 = _mm512_permutexvar_ps(eo31, df464);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df463);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df464);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df463);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df464);
// The four fixed-up registers are stored last, without the eo31 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df449);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df450);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df449);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df450);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df457);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k15+128*m31+32*f32, 255, df458);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df457);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k15+128*m31+32*f32, 65280, df458);
// One pass per window index b32 = 1..5 (index 0 is not handled by this loop).
// Each pass loads 16 vectors of 16 floats from datPtr1, applies the affine
// transform dat*bnMul14+bnAdd14, runs the fft* add/sub/weight stages below,
// and writes the results to dfPtr1 with masked stores.
// NOTE(review): the fft* names suggest a forward real FFT of a 16x16 tile;
// that is inferred from naming only -- confirm against the generator.
for (ptrdiff_t b32 = 1; b32 < 6; ++b32) {
// m32 and f33 are used only in the store addresses (128*m32 + 32*f33).
ptrdiff_t m32 = (size_t)b32/2;
ptrdiff_t f33 = (size_t)b32%2;
// Sixteen unaligned loads with mask 65535 (all 16 lanes enabled), at constant
// offsets 0, 896, ..., 13440. Each loaded vector is immediately replaced by
// dat*bnMul14+bnAdd14 in every lane.
// NOTE(review): 896 = 224*4, i.e. one row of 224 float32 values if datPtr1 is
// byte-addressed (pointer type not visible here -- confirm). Under the same
// assumption 40*b32 advances the window by 10 floats per iteration, so
// consecutive 16-float windows overlap.
__m512 dat450 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat450 = _mm512_mask_fmadd_ps(dat450, 65535, bnMul14, bnAdd14);
__m512 dat451 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat451 = _mm512_mask_fmadd_ps(dat451, 65535, bnMul14, bnAdd14);
__m512 dat452 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat452 = _mm512_mask_fmadd_ps(dat452, 65535, bnMul14, bnAdd14);
__m512 dat453 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat453 = _mm512_mask_fmadd_ps(dat453, 65535, bnMul14, bnAdd14);
__m512 dat454 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat454 = _mm512_mask_fmadd_ps(dat454, 65535, bnMul14, bnAdd14);
__m512 dat455 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat455 = _mm512_mask_fmadd_ps(dat455, 65535, bnMul14, bnAdd14);
__m512 dat456 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat456 = _mm512_mask_fmadd_ps(dat456, 65535, bnMul14, bnAdd14);
__m512 dat457 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat457 = _mm512_mask_fmadd_ps(dat457, 65535, bnMul14, bnAdd14);
__m512 dat458 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat458 = _mm512_mask_fmadd_ps(dat458, 65535, bnMul14, bnAdd14);
__m512 dat459 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat459 = _mm512_mask_fmadd_ps(dat459, 65535, bnMul14, bnAdd14);
__m512 dat460 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat460 = _mm512_mask_fmadd_ps(dat460, 65535, bnMul14, bnAdd14);
__m512 dat461 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat461 = _mm512_mask_fmadd_ps(dat461, 65535, bnMul14, bnAdd14);
__m512 dat462 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat462 = _mm512_mask_fmadd_ps(dat462, 65535, bnMul14, bnAdd14);
__m512 dat463 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat463 = _mm512_mask_fmadd_ps(dat463, 65535, bnMul14, bnAdd14);
__m512 dat464 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat464 = _mm512_mask_fmadd_ps(dat464, 65535, bnMul14, bnAdd14);
__m512 dat465 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k15+896*h14+4*w14+40*b32);
dat465 = _mm512_mask_fmadd_ps(dat465, 65535, bnMul14, bnAdd14);
// Lane-wise add/sub stages that combine whole vectors with each other.
// Two independent chains are interleaved line by line: fft5209..fft5296 is
// fed by the even-numbered loads (dat450, dat452, ...), fft5297..fft5376 by
// the odd-numbered loads (dat451, dat453, ...). Vectors eight loads apart are
// paired first (dat450 with dat458, dat451 with dat459, ...).
__m512 fft5209 = _mm512_add_ps(dat450, dat458);
__m512 fft5297 = _mm512_add_ps(dat451, dat459);
__m512 fft5210 = _mm512_sub_ps(dat450, dat458);
__m512 fft5298 = _mm512_sub_ps(dat451, dat459);
__m512 fft5211 = _mm512_add_ps(dat452, dat460);
__m512 fft5299 = _mm512_add_ps(dat453, dat461);
__m512 fft5212 = _mm512_sub_ps(dat452, dat460);
__m512 fft5300 = _mm512_sub_ps(dat453, dat461);
__m512 fft5213 = _mm512_add_ps(dat454, dat462);
__m512 fft5301 = _mm512_add_ps(dat455, dat463);
__m512 fft5214 = _mm512_sub_ps(dat454, dat462);
__m512 fft5302 = _mm512_sub_ps(dat455, dat463);
__m512 fft5215 = _mm512_add_ps(dat456, dat464);
__m512 fft5303 = _mm512_add_ps(dat457, dat465);
__m512 fft5216 = _mm512_sub_ps(dat456, dat464);
__m512 fft5304 = _mm512_sub_ps(dat457, dat465);
__m512 fft5217 = _mm512_add_ps(fft5209, fft5213);
__m512 fft5305 = _mm512_add_ps(fft5297, fft5301);
__m512 fft5218 = _mm512_sub_ps(fft5209, fft5213);
__m512 fft5306 = _mm512_sub_ps(fft5297, fft5301);
__m512 fft5219 = _mm512_add_ps(fft5211, fft5215);
__m512 fft5307 = _mm512_add_ps(fft5299, fft5303);
// Note the reversed operand order here (later minus earlier), unlike the
// other subtractions in this stage.
__m512 fft5220 = _mm512_sub_ps(fft5215, fft5211);
__m512 fft5308 = _mm512_sub_ps(fft5303, fft5299);
__m512 fft5221 = _mm512_sub_ps(fft5212, fft5216);
__m512 fft5309 = _mm512_sub_ps(fft5300, fft5304);
__m512 fft5222 = _mm512_add_ps(fft5212, fft5216);
__m512 fft5310 = _mm512_add_ps(fft5300, fft5304);
__m512 fft5223 = _mm512_add_ps(fft5217, fft5219);
__m512 fft5311 = _mm512_add_ps(fft5305, fft5307);
__m512 fft5224 = _mm512_sub_ps(fft5217, fft5219);
__m512 fft5312 = _mm512_sub_ps(fft5305, fft5307);
// 7.0710677e-01f is sqrt(1/2) rounded to float.
// fmadd = a*b+c, fnmadd = -(a*b)+c, fnmsub = -(a*b)-c.
__m512 fft5225 = _mm512_fmadd_ps(fft5221, _mm512_set1_ps(7.0710677e-01f), fft5210);
__m512 fft5313 = _mm512_fmadd_ps(fft5309, _mm512_set1_ps(7.0710677e-01f), fft5298);
__m512 fft5226 = _mm512_fnmsub_ps(fft5222, _mm512_set1_ps(7.0710677e-01f), fft5214);
__m512 fft5314 = _mm512_fnmsub_ps(fft5310, _mm512_set1_ps(7.0710677e-01f), fft5302);
__m512 fft5227 = _mm512_fnmadd_ps(fft5221, _mm512_set1_ps(7.0710677e-01f), fft5210);
__m512 fft5315 = _mm512_fnmadd_ps(fft5309, _mm512_set1_ps(7.0710677e-01f), fft5298);
__m512 fft5228 = _mm512_fnmadd_ps(fft5222, _mm512_set1_ps(7.0710677e-01f), fft5214);
__m512 fft5316 = _mm512_fnmadd_ps(fft5310, _mm512_set1_ps(7.0710677e-01f), fft5302);
// From here on the stages work inside each vector.
// fft5229 is +1 in lanes 0..7 and -1 in lanes 8..15 (_mm512_set_ps lists
// lane 15 first). _mm512_shuffle_f32x4 with immediate 78 swaps the two
// 256-bit halves, so each fmadd below leaves lo+hi in lanes 0..7 and lo-hi
// in lanes 8..15.
__m512 fft5229 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5230 = _mm512_fmadd_ps(fft5223, fft5229, _mm512_shuffle_f32x4(fft5223, fft5223, 78));
__m512 fft5317 = _mm512_fmadd_ps(fft5311, fft5229, _mm512_shuffle_f32x4(fft5311, fft5311, 78));
__m512 fft5231 = _mm512_fmadd_ps(fft5224, fft5229, _mm512_shuffle_f32x4(fft5224, fft5224, 78));
__m512 fft5318 = _mm512_fmadd_ps(fft5312, fft5229, _mm512_shuffle_f32x4(fft5312, fft5312, 78));
__m512 fft5232 = _mm512_fmadd_ps(fft5225, fft5229, _mm512_shuffle_f32x4(fft5225, fft5225, 78));
__m512 fft5319 = _mm512_fmadd_ps(fft5313, fft5229, _mm512_shuffle_f32x4(fft5313, fft5313, 78));
__m512 fft5233 = _mm512_fmadd_ps(fft5226, fft5229, _mm512_shuffle_f32x4(fft5226, fft5226, 78));
__m512 fft5320 = _mm512_fmadd_ps(fft5314, fft5229, _mm512_shuffle_f32x4(fft5314, fft5314, 78));
__m512 fft5234 = _mm512_fmadd_ps(fft5218, fft5229, _mm512_shuffle_f32x4(fft5218, fft5218, 78));
__m512 fft5321 = _mm512_fmadd_ps(fft5306, fft5229, _mm512_shuffle_f32x4(fft5306, fft5306, 78));
__m512 fft5235 = _mm512_fmadd_ps(fft5220, fft5229, _mm512_shuffle_f32x4(fft5220, fft5220, 78));
__m512 fft5322 = _mm512_fmadd_ps(fft5308, fft5229, _mm512_shuffle_f32x4(fft5308, fft5308, 78));
__m512 fft5236 = _mm512_fmadd_ps(fft5227, fft5229, _mm512_shuffle_f32x4(fft5227, fft5227, 78));
__m512 fft5323 = _mm512_fmadd_ps(fft5315, fft5229, _mm512_shuffle_f32x4(fft5315, fft5315, 78));
__m512 fft5237 = _mm512_fmadd_ps(fft5228, fft5229, _mm512_shuffle_f32x4(fft5228, fft5228, 78));
__m512 fft5324 = _mm512_fmadd_ps(fft5316, fft5229, _mm512_shuffle_f32x4(fft5316, fft5316, 78));
// Per-lane weighting of vector pairs (p, q) using fft5238 (here) and fft5247
// (below): the pair becomes (p*fft5238 + q*fft5247, q*fft5238 - p*fft5247).
// Lanes 0..9 carry weights (1, 0) and therefore pass through unchanged;
// lanes 10..11 use (sqrt(1/2), sqrt(1/2)), lanes 12..13 use (0, 1) and
// lanes 14..15 use (-sqrt(1/2), sqrt(1/2)).
__m512 fft5238 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5239 = _mm512_mul_ps(fft5230, fft5238);
__m512 fft5325 = _mm512_mul_ps(fft5317, fft5238);
__m512 fft5240 = _mm512_mul_ps(fft5231, fft5238);
__m512 fft5326 = _mm512_mul_ps(fft5318, fft5238);
__m512 fft5241 = _mm512_mul_ps(fft5232, fft5238);
__m512 fft5327 = _mm512_mul_ps(fft5319, fft5238);
__m512 fft5242 = _mm512_mul_ps(fft5233, fft5238);
__m512 fft5328 = _mm512_mul_ps(fft5320, fft5238);
__m512 fft5243 = _mm512_mul_ps(fft5234, fft5238);
__m512 fft5329 = _mm512_mul_ps(fft5321, fft5238);
__m512 fft5244 = _mm512_mul_ps(fft5235, fft5238);
__m512 fft5330 = _mm512_mul_ps(fft5322, fft5238);
__m512 fft5245 = _mm512_mul_ps(fft5236, fft5238);
__m512 fft5331 = _mm512_mul_ps(fft5323, fft5238);
__m512 fft5246 = _mm512_mul_ps(fft5237, fft5238);
__m512 fft5332 = _mm512_mul_ps(fft5324, fft5238);
__m512 fft5247 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5248 = _mm512_fmadd_ps(fft5231, fft5247, fft5239);
__m512 fft5333 = _mm512_fmadd_ps(fft5318, fft5247, fft5325);
__m512 fft5249 = _mm512_fnmadd_ps(fft5230, fft5247, fft5240);
__m512 fft5334 = _mm512_fnmadd_ps(fft5317, fft5247, fft5326);
__m512 fft5250 = _mm512_fmadd_ps(fft5233, fft5247, fft5241);
__m512 fft5335 = _mm512_fmadd_ps(fft5320, fft5247, fft5327);
__m512 fft5251 = _mm512_fnmadd_ps(fft5232, fft5247, fft5242);
__m512 fft5336 = _mm512_fnmadd_ps(fft5319, fft5247, fft5328);
__m512 fft5252 = _mm512_fmadd_ps(fft5235, fft5247, fft5243);
__m512 fft5337 = _mm512_fmadd_ps(fft5322, fft5247, fft5329);
__m512 fft5253 = _mm512_fnmadd_ps(fft5234, fft5247, fft5244);
__m512 fft5338 = _mm512_fnmadd_ps(fft5321, fft5247, fft5330);
__m512 fft5254 = _mm512_fmadd_ps(fft5237, fft5247, fft5245);
__m512 fft5339 = _mm512_fmadd_ps(fft5324, fft5247, fft5331);
__m512 fft5255 = _mm512_fnmadd_ps(fft5236, fft5247, fft5246);
__m512 fft5340 = _mm512_fnmadd_ps(fft5323, fft5247, fft5332);
// fft5256 is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15.
// _mm512_shuffle_f32x4 with immediate 177 swaps adjacent 128-bit lanes, so
// each fmadd forms a sum in the first 128-bit lane of every pair and a
// difference in the second.
__m512 fft5256 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5257 = _mm512_fmadd_ps(fft5248, fft5256, _mm512_shuffle_f32x4(fft5248, fft5248, 177));
__m512 fft5341 = _mm512_fmadd_ps(fft5333, fft5256, _mm512_shuffle_f32x4(fft5333, fft5333, 177));
__m512 fft5258 = _mm512_fmadd_ps(fft5249, fft5256, _mm512_shuffle_f32x4(fft5249, fft5249, 177));
__m512 fft5342 = _mm512_fmadd_ps(fft5334, fft5256, _mm512_shuffle_f32x4(fft5334, fft5334, 177));
__m512 fft5259 = _mm512_fmadd_ps(fft5250, fft5256, _mm512_shuffle_f32x4(fft5250, fft5250, 177));
__m512 fft5343 = _mm512_fmadd_ps(fft5335, fft5256, _mm512_shuffle_f32x4(fft5335, fft5335, 177));
__m512 fft5260 = _mm512_fmadd_ps(fft5251, fft5256, _mm512_shuffle_f32x4(fft5251, fft5251, 177));
__m512 fft5344 = _mm512_fmadd_ps(fft5336, fft5256, _mm512_shuffle_f32x4(fft5336, fft5336, 177));
__m512 fft5261 = _mm512_fmadd_ps(fft5252, fft5256, _mm512_shuffle_f32x4(fft5252, fft5252, 177));
__m512 fft5345 = _mm512_fmadd_ps(fft5337, fft5256, _mm512_shuffle_f32x4(fft5337, fft5337, 177));
__m512 fft5262 = _mm512_fmadd_ps(fft5253, fft5256, _mm512_shuffle_f32x4(fft5253, fft5253, 177));
__m512 fft5346 = _mm512_fmadd_ps(fft5338, fft5256, _mm512_shuffle_f32x4(fft5338, fft5338, 177));
__m512 fft5263 = _mm512_fmadd_ps(fft5254, fft5256, _mm512_shuffle_f32x4(fft5254, fft5254, 177));
__m512 fft5347 = _mm512_fmadd_ps(fft5339, fft5256, _mm512_shuffle_f32x4(fft5339, fft5339, 177));
__m512 fft5264 = _mm512_fmadd_ps(fft5255, fft5256, _mm512_shuffle_f32x4(fft5255, fft5255, 177));
__m512 fft5348 = _mm512_fmadd_ps(fft5340, fft5256, _mm512_shuffle_f32x4(fft5340, fft5340, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14 and 15. In those lanes each
// vector pair (p, q) is replaced by (q, -p); all other lanes keep (p, q).
__m512 fft5265 = _mm512_mask_mov_ps(fft5257, 49344, fft5258);
__m512 fft5349 = _mm512_mask_mov_ps(fft5341, 49344, fft5342);
__m512 fft5266 = _mm512_mask_sub_ps(fft5258, 49344, _mm512_setzero_ps(), fft5257);
__m512 fft5350 = _mm512_mask_sub_ps(fft5342, 49344, _mm512_setzero_ps(), fft5341);
__m512 fft5267 = _mm512_mask_mov_ps(fft5259, 49344, fft5260);
__m512 fft5351 = _mm512_mask_mov_ps(fft5343, 49344, fft5344);
__m512 fft5268 = _mm512_mask_sub_ps(fft5260, 49344, _mm512_setzero_ps(), fft5259);
__m512 fft5352 = _mm512_mask_sub_ps(fft5344, 49344, _mm512_setzero_ps(), fft5343);
__m512 fft5269 = _mm512_mask_mov_ps(fft5261, 49344, fft5262);
__m512 fft5353 = _mm512_mask_mov_ps(fft5345, 49344, fft5346);
__m512 fft5270 = _mm512_mask_sub_ps(fft5262, 49344, _mm512_setzero_ps(), fft5261);
__m512 fft5354 = _mm512_mask_sub_ps(fft5346, 49344, _mm512_setzero_ps(), fft5345);
__m512 fft5271 = _mm512_mask_mov_ps(fft5263, 49344, fft5264);
__m512 fft5355 = _mm512_mask_mov_ps(fft5347, 49344, fft5348);
__m512 fft5272 = _mm512_mask_sub_ps(fft5264, 49344, _mm512_setzero_ps(), fft5263);
__m512 fft5356 = _mm512_mask_sub_ps(fft5348, 49344, _mm512_setzero_ps(), fft5347);
// fft5273 is +1 in the lower two lanes and -1 in the upper two lanes of every
// group of four. _mm512_shuffle_ps with immediate 78 swaps the two 64-bit
// halves inside each 128-bit lane, giving sums in the lower two lanes and
// differences in the upper two lanes of each group.
__m512 fft5273 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5274 = _mm512_fmadd_ps(fft5265, fft5273, _mm512_shuffle_ps(fft5265, fft5265, 78));
__m512 fft5357 = _mm512_fmadd_ps(fft5349, fft5273, _mm512_shuffle_ps(fft5349, fft5349, 78));
__m512 fft5275 = _mm512_fmadd_ps(fft5266, fft5273, _mm512_shuffle_ps(fft5266, fft5266, 78));
__m512 fft5358 = _mm512_fmadd_ps(fft5350, fft5273, _mm512_shuffle_ps(fft5350, fft5350, 78));
__m512 fft5276 = _mm512_fmadd_ps(fft5267, fft5273, _mm512_shuffle_ps(fft5267, fft5267, 78));
__m512 fft5359 = _mm512_fmadd_ps(fft5351, fft5273, _mm512_shuffle_ps(fft5351, fft5351, 78));
__m512 fft5277 = _mm512_fmadd_ps(fft5268, fft5273, _mm512_shuffle_ps(fft5268, fft5268, 78));
__m512 fft5360 = _mm512_fmadd_ps(fft5352, fft5273, _mm512_shuffle_ps(fft5352, fft5352, 78));
__m512 fft5278 = _mm512_fmadd_ps(fft5269, fft5273, _mm512_shuffle_ps(fft5269, fft5269, 78));
__m512 fft5361 = _mm512_fmadd_ps(fft5353, fft5273, _mm512_shuffle_ps(fft5353, fft5353, 78));
__m512 fft5279 = _mm512_fmadd_ps(fft5270, fft5273, _mm512_shuffle_ps(fft5270, fft5270, 78));
__m512 fft5362 = _mm512_fmadd_ps(fft5354, fft5273, _mm512_shuffle_ps(fft5354, fft5354, 78));
__m512 fft5280 = _mm512_fmadd_ps(fft5271, fft5273, _mm512_shuffle_ps(fft5271, fft5271, 78));
__m512 fft5363 = _mm512_fmadd_ps(fft5355, fft5273, _mm512_shuffle_ps(fft5355, fft5355, 78));
__m512 fft5281 = _mm512_fmadd_ps(fft5272, fft5273, _mm512_shuffle_ps(fft5272, fft5272, 78));
__m512 fft5364 = _mm512_fmadd_ps(fft5356, fft5273, _mm512_shuffle_ps(fft5356, fft5356, 78));
// Extra recombination applied only to the first pair of each chain
// (fft5274/fft5275 and fft5357/fft5358); the remaining pairs go to the df*
// outputs as they are. Two lane permutations of each vector are blended with
// masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656), then the lanes
// selected by 64764 (0xFCFC: lanes 2..7 and 10..15) are multiplied by 0.5.
__m512i fft5282 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5283 = _mm512_permutexvar_ps(fft5282, fft5274);
__m512 fft5365 = _mm512_permutexvar_ps(fft5282, fft5357);
__m512i fft5284 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5285 = _mm512_permutexvar_ps(fft5284, fft5274);
__m512 fft5366 = _mm512_permutexvar_ps(fft5284, fft5357);
__m512 fft5286 = _mm512_permutexvar_ps(fft5282, fft5275);
__m512 fft5367 = _mm512_permutexvar_ps(fft5282, fft5358);
__m512 fft5287 = _mm512_permutexvar_ps(fft5284, fft5275);
__m512 fft5368 = _mm512_permutexvar_ps(fft5284, fft5358);
__m512 fft5288 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5289 = _mm512_fmadd_ps(fft5283, fft5288, fft5285);
__m512 fft5369 = _mm512_fmadd_ps(fft5365, fft5288, fft5366);
__m512 fft5290 = _mm512_fnmadd_ps(fft5287, fft5288, fft5286);
__m512 fft5370 = _mm512_fnmadd_ps(fft5368, fft5288, fft5367);
__m512 fft5291 = _mm512_mask_mov_ps(fft5287, 21845, fft5289);
__m512 fft5371 = _mm512_mask_mov_ps(fft5368, 21845, fft5369);
__m512 fft5292 = _mm512_mask_mov_ps(fft5283, 43176, fft5289);
__m512 fft5372 = _mm512_mask_mov_ps(fft5365, 43176, fft5369);
__m512 fft5293 = _mm512_mask_mov_ps(fft5291, 43176, fft5290);
__m512 fft5373 = _mm512_mask_mov_ps(fft5371, 43176, fft5370);
__m512 fft5294 = _mm512_mask_mov_ps(fft5292, 22102, fft5290);
__m512 fft5374 = _mm512_mask_mov_ps(fft5372, 22102, fft5370);
__m512 fft5295 = _mm512_mask_mul_ps(fft5293, 64764, fft5293, _mm512_set1_ps(5e-01f));
__m512 fft5375 = _mm512_mask_mul_ps(fft5373, 64764, fft5373, _mm512_set1_ps(5e-01f));
__m512 fft5296 = _mm512_mask_mul_ps(fft5294, 64764, fft5294, _mm512_set1_ps(5e-01f));
__m512 fft5376 = _mm512_mask_mul_ps(fft5374, 64764, fft5374, _mm512_set1_ps(5e-01f));
// Rename the sixteen result vectors: df465..df472 come from the first chain,
// df473..df480 from the second.
__m512 df465 = fft5295;
__m512 df473 = fft5375;
__m512 df466 = fft5296;
__m512 df474 = fft5376;
__m512 df467 = fft5276;
__m512 df475 = fft5359;
__m512 df468 = fft5277;
__m512 df476 = fft5360;
__m512 df469 = fft5278;
__m512 df477 = fft5361;
__m512 df470 = fft5279;
__m512 df478 = fft5362;
__m512 df471 = fft5280;
__m512 df479 = fft5363;
__m512 df472 = fft5281;
__m512 df480 = fft5364;
// eo32 gathers the even-indexed lanes into lanes 0..7 and the odd-indexed
// lanes into lanes 8..15. Each permuted vector is then written with two
// masked stores: mask 255 writes lanes 0..7, mask 65280 writes lanes 8..15.
// A masked store places lane i at address + 4*i bytes, so a 65280 store
// touches memory starting 32 bytes past the address given to it.
// NOTE(review): if dfPtr1 is byte-addressed (type not visible here), each
// 65280 store therefore lands 407040 bytes after its matching 255 store
// (e.g. 508768+32 == 101760+407040) -- confirm the pointer type.
__m512i eo32 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df467 = _mm512_permutexvar_ps(eo32, df467);
df468 = _mm512_permutexvar_ps(eo32, df468);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df467);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df468);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df467);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df468);
df475 = _mm512_permutexvar_ps(eo32, df475);
df476 = _mm512_permutexvar_ps(eo32, df476);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df475);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df476);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df475);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df476);
df469 = _mm512_permutexvar_ps(eo32, df469);
df470 = _mm512_permutexvar_ps(eo32, df470);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df469);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df470);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df469);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df470);
df477 = _mm512_permutexvar_ps(eo32, df477);
df478 = _mm512_permutexvar_ps(eo32, df478);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df477);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df478);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df477);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df478);
df471 = _mm512_permutexvar_ps(eo32, df471);
df472 = _mm512_permutexvar_ps(eo32, df472);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df471);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df472);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df471);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df472);
df479 = _mm512_permutexvar_ps(eo32, df479);
df480 = _mm512_permutexvar_ps(eo32, df480);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df479);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df480);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df479);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df480);
// df465, df466, df473 and df474 (the recombined first pairs) are stored
// without the eo32 permutation, using the same two-mask scheme.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df465);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df466);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df465);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df466);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df473);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k15+128*m32+32*f33, 255, df474);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df473);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k15+128*m32+32*f33, 65280, df474);
}
}
if (j2 >= last1) return;
++j2;
rel2 = 20;
}
if (rel2 < 22) {
ptrdiff_t h15 = base2+50;
ptrdiff_t w15 = -1140+60*rel2;
ptrdiff_t jj7 = 21-rel2+j2;
for (; j2 <= jj7; w15 += 60) {
ptrdiff_t k16 = 3*s1;
ptrdiff_t kk15 = k16+2;
for (; k16 <= kk15; ++k16) {
__m512 bnMul15 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k16+3*i6))[0]);
__m512 bnAdd15 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k16+3*i6))[1]);
for (ptrdiff_t b33 = 0; b33 < 6; ++b33) {
ptrdiff_t m33 = (size_t)b33/2;
ptrdiff_t f34 = (size_t)b33%2;
__m512 dat466 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat466 = _mm512_mask_fmadd_ps(dat466, 65535, bnMul15, bnAdd15);
__m512 dat467 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat467 = _mm512_mask_fmadd_ps(dat467, 65535, bnMul15, bnAdd15);
__m512 dat468 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat468 = _mm512_mask_fmadd_ps(dat468, 65535, bnMul15, bnAdd15);
__m512 dat469 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat469 = _mm512_mask_fmadd_ps(dat469, 65535, bnMul15, bnAdd15);
__m512 dat470 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat470 = _mm512_mask_fmadd_ps(dat470, 65535, bnMul15, bnAdd15);
__m512 dat471 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat471 = _mm512_mask_fmadd_ps(dat471, 65535, bnMul15, bnAdd15);
__m512 dat472 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat472 = _mm512_mask_fmadd_ps(dat472, 65535, bnMul15, bnAdd15);
__m512 dat473 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat473 = _mm512_mask_fmadd_ps(dat473, 65535, bnMul15, bnAdd15);
__m512 dat474 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat474 = _mm512_mask_fmadd_ps(dat474, 65535, bnMul15, bnAdd15);
__m512 dat475 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat475 = _mm512_mask_fmadd_ps(dat475, 65535, bnMul15, bnAdd15);
__m512 dat476 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat476 = _mm512_mask_fmadd_ps(dat476, 65535, bnMul15, bnAdd15);
__m512 dat477 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat477 = _mm512_mask_fmadd_ps(dat477, 65535, bnMul15, bnAdd15);
__m512 dat478 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat478 = _mm512_mask_fmadd_ps(dat478, 65535, bnMul15, bnAdd15);
__m512 dat479 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat479 = _mm512_mask_fmadd_ps(dat479, 65535, bnMul15, bnAdd15);
__m512 dat480 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat480 = _mm512_mask_fmadd_ps(dat480, 65535, bnMul15, bnAdd15);
__m512 dat481 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k16+896*h15+4*w15+40*b33);
dat481 = _mm512_mask_fmadd_ps(dat481, 65535, bnMul15, bnAdd15);
__m512 fft5377 = _mm512_add_ps(dat466, dat474);
__m512 fft5465 = _mm512_add_ps(dat467, dat475);
__m512 fft5378 = _mm512_sub_ps(dat466, dat474);
__m512 fft5466 = _mm512_sub_ps(dat467, dat475);
__m512 fft5379 = _mm512_add_ps(dat468, dat476);
__m512 fft5467 = _mm512_add_ps(dat469, dat477);
__m512 fft5380 = _mm512_sub_ps(dat468, dat476);
__m512 fft5468 = _mm512_sub_ps(dat469, dat477);
__m512 fft5381 = _mm512_add_ps(dat470, dat478);
__m512 fft5469 = _mm512_add_ps(dat471, dat479);
__m512 fft5382 = _mm512_sub_ps(dat470, dat478);
__m512 fft5470 = _mm512_sub_ps(dat471, dat479);
__m512 fft5383 = _mm512_add_ps(dat472, dat480);
__m512 fft5471 = _mm512_add_ps(dat473, dat481);
__m512 fft5384 = _mm512_sub_ps(dat472, dat480);
__m512 fft5472 = _mm512_sub_ps(dat473, dat481);
__m512 fft5385 = _mm512_add_ps(fft5377, fft5381);
__m512 fft5473 = _mm512_add_ps(fft5465, fft5469);
__m512 fft5386 = _mm512_sub_ps(fft5377, fft5381);
__m512 fft5474 = _mm512_sub_ps(fft5465, fft5469);
__m512 fft5387 = _mm512_add_ps(fft5379, fft5383);
__m512 fft5475 = _mm512_add_ps(fft5467, fft5471);
__m512 fft5388 = _mm512_sub_ps(fft5383, fft5379);
__m512 fft5476 = _mm512_sub_ps(fft5471, fft5467);
__m512 fft5389 = _mm512_sub_ps(fft5380, fft5384);
__m512 fft5477 = _mm512_sub_ps(fft5468, fft5472);
__m512 fft5390 = _mm512_add_ps(fft5380, fft5384);
__m512 fft5478 = _mm512_add_ps(fft5468, fft5472);
__m512 fft5391 = _mm512_add_ps(fft5385, fft5387);
__m512 fft5479 = _mm512_add_ps(fft5473, fft5475);
__m512 fft5392 = _mm512_sub_ps(fft5385, fft5387);
__m512 fft5480 = _mm512_sub_ps(fft5473, fft5475);
__m512 fft5393 = _mm512_fmadd_ps(fft5389, _mm512_set1_ps(7.0710677e-01f), fft5378);
__m512 fft5481 = _mm512_fmadd_ps(fft5477, _mm512_set1_ps(7.0710677e-01f), fft5466);
__m512 fft5394 = _mm512_fnmsub_ps(fft5390, _mm512_set1_ps(7.0710677e-01f), fft5382);
__m512 fft5482 = _mm512_fnmsub_ps(fft5478, _mm512_set1_ps(7.0710677e-01f), fft5470);
__m512 fft5395 = _mm512_fnmadd_ps(fft5389, _mm512_set1_ps(7.0710677e-01f), fft5378);
__m512 fft5483 = _mm512_fnmadd_ps(fft5477, _mm512_set1_ps(7.0710677e-01f), fft5466);
__m512 fft5396 = _mm512_fnmadd_ps(fft5390, _mm512_set1_ps(7.0710677e-01f), fft5382);
__m512 fft5484 = _mm512_fnmadd_ps(fft5478, _mm512_set1_ps(7.0710677e-01f), fft5470);
__m512 fft5397 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5398 = _mm512_fmadd_ps(fft5391, fft5397, _mm512_shuffle_f32x4(fft5391, fft5391, 78));
__m512 fft5485 = _mm512_fmadd_ps(fft5479, fft5397, _mm512_shuffle_f32x4(fft5479, fft5479, 78));
__m512 fft5399 = _mm512_fmadd_ps(fft5392, fft5397, _mm512_shuffle_f32x4(fft5392, fft5392, 78));
__m512 fft5486 = _mm512_fmadd_ps(fft5480, fft5397, _mm512_shuffle_f32x4(fft5480, fft5480, 78));
__m512 fft5400 = _mm512_fmadd_ps(fft5393, fft5397, _mm512_shuffle_f32x4(fft5393, fft5393, 78));
__m512 fft5487 = _mm512_fmadd_ps(fft5481, fft5397, _mm512_shuffle_f32x4(fft5481, fft5481, 78));
__m512 fft5401 = _mm512_fmadd_ps(fft5394, fft5397, _mm512_shuffle_f32x4(fft5394, fft5394, 78));
__m512 fft5488 = _mm512_fmadd_ps(fft5482, fft5397, _mm512_shuffle_f32x4(fft5482, fft5482, 78));
__m512 fft5402 = _mm512_fmadd_ps(fft5386, fft5397, _mm512_shuffle_f32x4(fft5386, fft5386, 78));
__m512 fft5489 = _mm512_fmadd_ps(fft5474, fft5397, _mm512_shuffle_f32x4(fft5474, fft5474, 78));
__m512 fft5403 = _mm512_fmadd_ps(fft5388, fft5397, _mm512_shuffle_f32x4(fft5388, fft5388, 78));
__m512 fft5490 = _mm512_fmadd_ps(fft5476, fft5397, _mm512_shuffle_f32x4(fft5476, fft5476, 78));
__m512 fft5404 = _mm512_fmadd_ps(fft5395, fft5397, _mm512_shuffle_f32x4(fft5395, fft5395, 78));
__m512 fft5491 = _mm512_fmadd_ps(fft5483, fft5397, _mm512_shuffle_f32x4(fft5483, fft5483, 78));
__m512 fft5405 = _mm512_fmadd_ps(fft5396, fft5397, _mm512_shuffle_f32x4(fft5396, fft5396, 78));
__m512 fft5492 = _mm512_fmadd_ps(fft5484, fft5397, _mm512_shuffle_f32x4(fft5484, fft5484, 78));
__m512 fft5406 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5407 = _mm512_mul_ps(fft5398, fft5406);
__m512 fft5493 = _mm512_mul_ps(fft5485, fft5406);
__m512 fft5408 = _mm512_mul_ps(fft5399, fft5406);
__m512 fft5494 = _mm512_mul_ps(fft5486, fft5406);
__m512 fft5409 = _mm512_mul_ps(fft5400, fft5406);
__m512 fft5495 = _mm512_mul_ps(fft5487, fft5406);
__m512 fft5410 = _mm512_mul_ps(fft5401, fft5406);
__m512 fft5496 = _mm512_mul_ps(fft5488, fft5406);
__m512 fft5411 = _mm512_mul_ps(fft5402, fft5406);
__m512 fft5497 = _mm512_mul_ps(fft5489, fft5406);
__m512 fft5412 = _mm512_mul_ps(fft5403, fft5406);
__m512 fft5498 = _mm512_mul_ps(fft5490, fft5406);
__m512 fft5413 = _mm512_mul_ps(fft5404, fft5406);
__m512 fft5499 = _mm512_mul_ps(fft5491, fft5406);
__m512 fft5414 = _mm512_mul_ps(fft5405, fft5406);
__m512 fft5500 = _mm512_mul_ps(fft5492, fft5406);
__m512 fft5415 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5416 = _mm512_fmadd_ps(fft5399, fft5415, fft5407);
__m512 fft5501 = _mm512_fmadd_ps(fft5486, fft5415, fft5493);
__m512 fft5417 = _mm512_fnmadd_ps(fft5398, fft5415, fft5408);
__m512 fft5502 = _mm512_fnmadd_ps(fft5485, fft5415, fft5494);
__m512 fft5418 = _mm512_fmadd_ps(fft5401, fft5415, fft5409);
__m512 fft5503 = _mm512_fmadd_ps(fft5488, fft5415, fft5495);
__m512 fft5419 = _mm512_fnmadd_ps(fft5400, fft5415, fft5410);
__m512 fft5504 = _mm512_fnmadd_ps(fft5487, fft5415, fft5496);
__m512 fft5420 = _mm512_fmadd_ps(fft5403, fft5415, fft5411);
__m512 fft5505 = _mm512_fmadd_ps(fft5490, fft5415, fft5497);
__m512 fft5421 = _mm512_fnmadd_ps(fft5402, fft5415, fft5412);
__m512 fft5506 = _mm512_fnmadd_ps(fft5489, fft5415, fft5498);
__m512 fft5422 = _mm512_fmadd_ps(fft5405, fft5415, fft5413);
__m512 fft5507 = _mm512_fmadd_ps(fft5492, fft5415, fft5499);
__m512 fft5423 = _mm512_fnmadd_ps(fft5404, fft5415, fft5414);
__m512 fft5508 = _mm512_fnmadd_ps(fft5491, fft5415, fft5500);
__m512 fft5424 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5425 = _mm512_fmadd_ps(fft5416, fft5424, _mm512_shuffle_f32x4(fft5416, fft5416, 177));
__m512 fft5509 = _mm512_fmadd_ps(fft5501, fft5424, _mm512_shuffle_f32x4(fft5501, fft5501, 177));
__m512 fft5426 = _mm512_fmadd_ps(fft5417, fft5424, _mm512_shuffle_f32x4(fft5417, fft5417, 177));
__m512 fft5510 = _mm512_fmadd_ps(fft5502, fft5424, _mm512_shuffle_f32x4(fft5502, fft5502, 177));
__m512 fft5427 = _mm512_fmadd_ps(fft5418, fft5424, _mm512_shuffle_f32x4(fft5418, fft5418, 177));
__m512 fft5511 = _mm512_fmadd_ps(fft5503, fft5424, _mm512_shuffle_f32x4(fft5503, fft5503, 177));
__m512 fft5428 = _mm512_fmadd_ps(fft5419, fft5424, _mm512_shuffle_f32x4(fft5419, fft5419, 177));
__m512 fft5512 = _mm512_fmadd_ps(fft5504, fft5424, _mm512_shuffle_f32x4(fft5504, fft5504, 177));
__m512 fft5429 = _mm512_fmadd_ps(fft5420, fft5424, _mm512_shuffle_f32x4(fft5420, fft5420, 177));
__m512 fft5513 = _mm512_fmadd_ps(fft5505, fft5424, _mm512_shuffle_f32x4(fft5505, fft5505, 177));
__m512 fft5430 = _mm512_fmadd_ps(fft5421, fft5424, _mm512_shuffle_f32x4(fft5421, fft5421, 177));
__m512 fft5514 = _mm512_fmadd_ps(fft5506, fft5424, _mm512_shuffle_f32x4(fft5506, fft5506, 177));
__m512 fft5431 = _mm512_fmadd_ps(fft5422, fft5424, _mm512_shuffle_f32x4(fft5422, fft5422, 177));
__m512 fft5515 = _mm512_fmadd_ps(fft5507, fft5424, _mm512_shuffle_f32x4(fft5507, fft5507, 177));
__m512 fft5432 = _mm512_fmadd_ps(fft5423, fft5424, _mm512_shuffle_f32x4(fft5423, fft5423, 177));
__m512 fft5516 = _mm512_fmadd_ps(fft5508, fft5424, _mm512_shuffle_f32x4(fft5508, fft5508, 177));
__m512 fft5433 = _mm512_mask_mov_ps(fft5425, 49344, fft5426);
__m512 fft5517 = _mm512_mask_mov_ps(fft5509, 49344, fft5510);
__m512 fft5434 = _mm512_mask_sub_ps(fft5426, 49344, _mm512_setzero_ps(), fft5425);
__m512 fft5518 = _mm512_mask_sub_ps(fft5510, 49344, _mm512_setzero_ps(), fft5509);
__m512 fft5435 = _mm512_mask_mov_ps(fft5427, 49344, fft5428);
__m512 fft5519 = _mm512_mask_mov_ps(fft5511, 49344, fft5512);
__m512 fft5436 = _mm512_mask_sub_ps(fft5428, 49344, _mm512_setzero_ps(), fft5427);
__m512 fft5520 = _mm512_mask_sub_ps(fft5512, 49344, _mm512_setzero_ps(), fft5511);
__m512 fft5437 = _mm512_mask_mov_ps(fft5429, 49344, fft5430);
__m512 fft5521 = _mm512_mask_mov_ps(fft5513, 49344, fft5514);
__m512 fft5438 = _mm512_mask_sub_ps(fft5430, 49344, _mm512_setzero_ps(), fft5429);
__m512 fft5522 = _mm512_mask_sub_ps(fft5514, 49344, _mm512_setzero_ps(), fft5513);
__m512 fft5439 = _mm512_mask_mov_ps(fft5431, 49344, fft5432);
__m512 fft5523 = _mm512_mask_mov_ps(fft5515, 49344, fft5516);
__m512 fft5440 = _mm512_mask_sub_ps(fft5432, 49344, _mm512_setzero_ps(), fft5431);
__m512 fft5524 = _mm512_mask_sub_ps(fft5516, 49344, _mm512_setzero_ps(), fft5515);
__m512 fft5441 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5442 = _mm512_fmadd_ps(fft5433, fft5441, _mm512_shuffle_ps(fft5433, fft5433, 78));
__m512 fft5525 = _mm512_fmadd_ps(fft5517, fft5441, _mm512_shuffle_ps(fft5517, fft5517, 78));
__m512 fft5443 = _mm512_fmadd_ps(fft5434, fft5441, _mm512_shuffle_ps(fft5434, fft5434, 78));
__m512 fft5526 = _mm512_fmadd_ps(fft5518, fft5441, _mm512_shuffle_ps(fft5518, fft5518, 78));
__m512 fft5444 = _mm512_fmadd_ps(fft5435, fft5441, _mm512_shuffle_ps(fft5435, fft5435, 78));
__m512 fft5527 = _mm512_fmadd_ps(fft5519, fft5441, _mm512_shuffle_ps(fft5519, fft5519, 78));
__m512 fft5445 = _mm512_fmadd_ps(fft5436, fft5441, _mm512_shuffle_ps(fft5436, fft5436, 78));
__m512 fft5528 = _mm512_fmadd_ps(fft5520, fft5441, _mm512_shuffle_ps(fft5520, fft5520, 78));
__m512 fft5446 = _mm512_fmadd_ps(fft5437, fft5441, _mm512_shuffle_ps(fft5437, fft5437, 78));
__m512 fft5529 = _mm512_fmadd_ps(fft5521, fft5441, _mm512_shuffle_ps(fft5521, fft5521, 78));
__m512 fft5447 = _mm512_fmadd_ps(fft5438, fft5441, _mm512_shuffle_ps(fft5438, fft5438, 78));
__m512 fft5530 = _mm512_fmadd_ps(fft5522, fft5441, _mm512_shuffle_ps(fft5522, fft5522, 78));
__m512 fft5448 = _mm512_fmadd_ps(fft5439, fft5441, _mm512_shuffle_ps(fft5439, fft5439, 78));
__m512 fft5531 = _mm512_fmadd_ps(fft5523, fft5441, _mm512_shuffle_ps(fft5523, fft5523, 78));
__m512 fft5449 = _mm512_fmadd_ps(fft5440, fft5441, _mm512_shuffle_ps(fft5440, fft5440, 78));
__m512 fft5532 = _mm512_fmadd_ps(fft5524, fft5441, _mm512_shuffle_ps(fft5524, fft5524, 78));
__m512i fft5450 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5451 = _mm512_permutexvar_ps(fft5450, fft5442);
__m512 fft5533 = _mm512_permutexvar_ps(fft5450, fft5525);
__m512i fft5452 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5453 = _mm512_permutexvar_ps(fft5452, fft5442);
__m512 fft5534 = _mm512_permutexvar_ps(fft5452, fft5525);
__m512 fft5454 = _mm512_permutexvar_ps(fft5450, fft5443);
__m512 fft5535 = _mm512_permutexvar_ps(fft5450, fft5526);
__m512 fft5455 = _mm512_permutexvar_ps(fft5452, fft5443);
__m512 fft5536 = _mm512_permutexvar_ps(fft5452, fft5526);
__m512 fft5456 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5457 = _mm512_fmadd_ps(fft5451, fft5456, fft5453);
__m512 fft5537 = _mm512_fmadd_ps(fft5533, fft5456, fft5534);
__m512 fft5458 = _mm512_fnmadd_ps(fft5455, fft5456, fft5454);
__m512 fft5538 = _mm512_fnmadd_ps(fft5536, fft5456, fft5535);
__m512 fft5459 = _mm512_mask_mov_ps(fft5455, 21845, fft5457);
__m512 fft5539 = _mm512_mask_mov_ps(fft5536, 21845, fft5537);
__m512 fft5460 = _mm512_mask_mov_ps(fft5451, 43176, fft5457);
__m512 fft5540 = _mm512_mask_mov_ps(fft5533, 43176, fft5537);
__m512 fft5461 = _mm512_mask_mov_ps(fft5459, 43176, fft5458);
__m512 fft5541 = _mm512_mask_mov_ps(fft5539, 43176, fft5538);
__m512 fft5462 = _mm512_mask_mov_ps(fft5460, 22102, fft5458);
__m512 fft5542 = _mm512_mask_mov_ps(fft5540, 22102, fft5538);
__m512 fft5463 = _mm512_mask_mul_ps(fft5461, 64764, fft5461, _mm512_set1_ps(5e-01f));
__m512 fft5543 = _mm512_mask_mul_ps(fft5541, 64764, fft5541, _mm512_set1_ps(5e-01f));
__m512 fft5464 = _mm512_mask_mul_ps(fft5462, 64764, fft5462, _mm512_set1_ps(5e-01f));
__m512 fft5544 = _mm512_mask_mul_ps(fft5542, 64764, fft5542, _mm512_set1_ps(5e-01f));
__m512 df481 = fft5463;
__m512 df489 = fft5543;
__m512 df482 = fft5464;
__m512 df490 = fft5544;
__m512 df483 = fft5444;
__m512 df491 = fft5527;
__m512 df484 = fft5445;
__m512 df492 = fft5528;
__m512 df485 = fft5446;
__m512 df493 = fft5529;
__m512 df486 = fft5447;
__m512 df494 = fft5530;
__m512 df487 = fft5448;
__m512 df495 = fft5531;
__m512 df488 = fft5449;
__m512 df496 = fft5532;
__m512i eo33 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df483 = _mm512_permutexvar_ps(eo33, df483);
df484 = _mm512_permutexvar_ps(eo33, df484);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df483);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df484);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df483);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df484);
df491 = _mm512_permutexvar_ps(eo33, df491);
df492 = _mm512_permutexvar_ps(eo33, df492);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df491);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df492);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df491);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df492);
df485 = _mm512_permutexvar_ps(eo33, df485);
df486 = _mm512_permutexvar_ps(eo33, df486);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df485);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df486);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df485);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df486);
df493 = _mm512_permutexvar_ps(eo33, df493);
df494 = _mm512_permutexvar_ps(eo33, df494);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df493);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df494);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df493);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df494);
df487 = _mm512_permutexvar_ps(eo33, df487);
df488 = _mm512_permutexvar_ps(eo33, df488);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df487);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df488);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df487);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df488);
df495 = _mm512_permutexvar_ps(eo33, df495);
df496 = _mm512_permutexvar_ps(eo33, df496);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df495);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df496);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df495);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df496);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df481);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df482);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df481);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df482);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df489);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k16+128*m33+32*f34, 255, df490);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df489);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k16+128*m33+32*f34, 65280, df490);
}
}
if (j2 >= last1) return;
++j2;
}
rel2 = 22;
}
ptrdiff_t h16 = base2+50;
ptrdiff_t w16 = 180;
ptrdiff_t k17 = 3*s1;
ptrdiff_t kk16 = k17+2;
for (; k17 <= kk16; ++k17) {
__m512 bnMul16 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k17+3*i6))[0]);
__m512 bnAdd16 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k17+3*i6))[1]);
// Handle four full 16-lane windows (b34 = 0..3) for the current k17.
// Per iteration: load 16 vectors, apply the bnMul16/bnAdd16 scale-and-shift,
// run the fft-named add/sub network on two interleaved chains, and store the
// results to dfPtr1. The window start advances by 40 address units per b34
// (10 floats if datPtr1 is a byte pointer -- its declaration is outside this
// span, TODO confirm).
for (ptrdiff_t b34 = 0; b34 < 4; ++b34) {
// Quotient and remainder of b34 by 2; used only in the dfPtr1 store offsets
// at the bottom of the loop body.
ptrdiff_t m34 = (size_t)b34/2;
ptrdiff_t f35 = (size_t)b34%2;
// Load 16 vectors at offsets 0, 896, ..., 13440 with mask 65535 (all 16
// lanes) and compute dat = dat*bnMul16 + bnAdd16 in every lane.
// NOTE(review): 896 looks like one row of 224 floats in bytes, i.e. 16
// consecutive rows -- confirm against the tensor layout.
__m512 dat482 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat482 = _mm512_mask_fmadd_ps(dat482, 65535, bnMul16, bnAdd16);
__m512 dat483 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat483 = _mm512_mask_fmadd_ps(dat483, 65535, bnMul16, bnAdd16);
__m512 dat484 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat484 = _mm512_mask_fmadd_ps(dat484, 65535, bnMul16, bnAdd16);
__m512 dat485 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat485 = _mm512_mask_fmadd_ps(dat485, 65535, bnMul16, bnAdd16);
__m512 dat486 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat486 = _mm512_mask_fmadd_ps(dat486, 65535, bnMul16, bnAdd16);
__m512 dat487 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat487 = _mm512_mask_fmadd_ps(dat487, 65535, bnMul16, bnAdd16);
__m512 dat488 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat488 = _mm512_mask_fmadd_ps(dat488, 65535, bnMul16, bnAdd16);
__m512 dat489 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat489 = _mm512_mask_fmadd_ps(dat489, 65535, bnMul16, bnAdd16);
__m512 dat490 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat490 = _mm512_mask_fmadd_ps(dat490, 65535, bnMul16, bnAdd16);
__m512 dat491 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat491 = _mm512_mask_fmadd_ps(dat491, 65535, bnMul16, bnAdd16);
__m512 dat492 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat492 = _mm512_mask_fmadd_ps(dat492, 65535, bnMul16, bnAdd16);
__m512 dat493 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat493 = _mm512_mask_fmadd_ps(dat493, 65535, bnMul16, bnAdd16);
__m512 dat494 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat494 = _mm512_mask_fmadd_ps(dat494, 65535, bnMul16, bnAdd16);
__m512 dat495 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat495 = _mm512_mask_fmadd_ps(dat495, 65535, bnMul16, bnAdd16);
__m512 dat496 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat496 = _mm512_mask_fmadd_ps(dat496, 65535, bnMul16, bnAdd16);
__m512 dat497 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k17+896*h16+4*w16+40*b34);
dat497 = _mm512_mask_fmadd_ps(dat497, 65535, bnMul16, bnAdd16);
// Cross-vector layer 1: sum and difference of vectors eight loads apart.
// Two independent chains are interleaved line by line from here on:
//   chain A (fft5545..fft5632) uses the even-positioned loads dat482, dat484, ...
//   chain B (fft5633..fft5712) uses the odd-positioned loads dat483, dat485, ...
__m512 fft5545 = _mm512_add_ps(dat482, dat490);
__m512 fft5633 = _mm512_add_ps(dat483, dat491);
__m512 fft5546 = _mm512_sub_ps(dat482, dat490);
__m512 fft5634 = _mm512_sub_ps(dat483, dat491);
__m512 fft5547 = _mm512_add_ps(dat484, dat492);
__m512 fft5635 = _mm512_add_ps(dat485, dat493);
__m512 fft5548 = _mm512_sub_ps(dat484, dat492);
__m512 fft5636 = _mm512_sub_ps(dat485, dat493);
__m512 fft5549 = _mm512_add_ps(dat486, dat494);
__m512 fft5637 = _mm512_add_ps(dat487, dat495);
__m512 fft5550 = _mm512_sub_ps(dat486, dat494);
__m512 fft5638 = _mm512_sub_ps(dat487, dat495);
__m512 fft5551 = _mm512_add_ps(dat488, dat496);
__m512 fft5639 = _mm512_add_ps(dat489, dat497);
__m512 fft5552 = _mm512_sub_ps(dat488, dat496);
__m512 fft5640 = _mm512_sub_ps(dat489, dat497);
// Cross-vector layer 2. Note fft5556/fft5644 subtract in the opposite operand
// order (later minus earlier) from the other differences.
__m512 fft5553 = _mm512_add_ps(fft5545, fft5549);
__m512 fft5641 = _mm512_add_ps(fft5633, fft5637);
__m512 fft5554 = _mm512_sub_ps(fft5545, fft5549);
__m512 fft5642 = _mm512_sub_ps(fft5633, fft5637);
__m512 fft5555 = _mm512_add_ps(fft5547, fft5551);
__m512 fft5643 = _mm512_add_ps(fft5635, fft5639);
__m512 fft5556 = _mm512_sub_ps(fft5551, fft5547);
__m512 fft5644 = _mm512_sub_ps(fft5639, fft5635);
__m512 fft5557 = _mm512_sub_ps(fft5548, fft5552);
__m512 fft5645 = _mm512_sub_ps(fft5636, fft5640);
__m512 fft5558 = _mm512_add_ps(fft5548, fft5552);
__m512 fft5646 = _mm512_add_ps(fft5636, fft5640);
// Cross-vector layer 3. The constant 7.0710677e-01f is approximately
// 1/sqrt(2). fmadd(a,b,c) = a*b+c, fnmadd(a,b,c) = c-a*b,
// fnmsub(a,b,c) = -(a*b)-c.
__m512 fft5559 = _mm512_add_ps(fft5553, fft5555);
__m512 fft5647 = _mm512_add_ps(fft5641, fft5643);
__m512 fft5560 = _mm512_sub_ps(fft5553, fft5555);
__m512 fft5648 = _mm512_sub_ps(fft5641, fft5643);
__m512 fft5561 = _mm512_fmadd_ps(fft5557, _mm512_set1_ps(7.0710677e-01f), fft5546);
__m512 fft5649 = _mm512_fmadd_ps(fft5645, _mm512_set1_ps(7.0710677e-01f), fft5634);
__m512 fft5562 = _mm512_fnmsub_ps(fft5558, _mm512_set1_ps(7.0710677e-01f), fft5550);
__m512 fft5650 = _mm512_fnmsub_ps(fft5646, _mm512_set1_ps(7.0710677e-01f), fft5638);
__m512 fft5563 = _mm512_fnmadd_ps(fft5557, _mm512_set1_ps(7.0710677e-01f), fft5546);
__m512 fft5651 = _mm512_fnmadd_ps(fft5645, _mm512_set1_ps(7.0710677e-01f), fft5634);
__m512 fft5564 = _mm512_fnmadd_ps(fft5558, _mm512_set1_ps(7.0710677e-01f), fft5550);
__m512 fft5652 = _mm512_fnmadd_ps(fft5646, _mm512_set1_ps(7.0710677e-01f), fft5638);
// In-register layer over 256-bit halves: shuffle_f32x4 with imm 78 swaps the
// two 256-bit halves; with signs +1 (lanes 0-7) / -1 (lanes 8-15) the result
// holds lo+hi in lanes 0-7 and lo-hi in lanes 8-15.
__m512 fft5565 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5566 = _mm512_fmadd_ps(fft5559, fft5565, _mm512_shuffle_f32x4(fft5559, fft5559, 78));
__m512 fft5653 = _mm512_fmadd_ps(fft5647, fft5565, _mm512_shuffle_f32x4(fft5647, fft5647, 78));
__m512 fft5567 = _mm512_fmadd_ps(fft5560, fft5565, _mm512_shuffle_f32x4(fft5560, fft5560, 78));
__m512 fft5654 = _mm512_fmadd_ps(fft5648, fft5565, _mm512_shuffle_f32x4(fft5648, fft5648, 78));
__m512 fft5568 = _mm512_fmadd_ps(fft5561, fft5565, _mm512_shuffle_f32x4(fft5561, fft5561, 78));
__m512 fft5655 = _mm512_fmadd_ps(fft5649, fft5565, _mm512_shuffle_f32x4(fft5649, fft5649, 78));
__m512 fft5569 = _mm512_fmadd_ps(fft5562, fft5565, _mm512_shuffle_f32x4(fft5562, fft5562, 78));
__m512 fft5656 = _mm512_fmadd_ps(fft5650, fft5565, _mm512_shuffle_f32x4(fft5650, fft5650, 78));
__m512 fft5570 = _mm512_fmadd_ps(fft5554, fft5565, _mm512_shuffle_f32x4(fft5554, fft5554, 78));
__m512 fft5657 = _mm512_fmadd_ps(fft5642, fft5565, _mm512_shuffle_f32x4(fft5642, fft5642, 78));
__m512 fft5571 = _mm512_fmadd_ps(fft5556, fft5565, _mm512_shuffle_f32x4(fft5556, fft5556, 78));
__m512 fft5658 = _mm512_fmadd_ps(fft5644, fft5565, _mm512_shuffle_f32x4(fft5644, fft5644, 78));
__m512 fft5572 = _mm512_fmadd_ps(fft5563, fft5565, _mm512_shuffle_f32x4(fft5563, fft5563, 78));
__m512 fft5659 = _mm512_fmadd_ps(fft5651, fft5565, _mm512_shuffle_f32x4(fft5651, fft5651, 78));
__m512 fft5573 = _mm512_fmadd_ps(fft5564, fft5565, _mm512_shuffle_f32x4(fft5564, fft5564, 78));
__m512 fft5660 = _mm512_fmadd_ps(fft5652, fft5565, _mm512_shuffle_f32x4(fft5652, fft5652, 78));
// Pairwise coefficient step. With c = fft5574 and s = fft5583 (next section),
// each consecutive pair (a, b) becomes (a*c + b*s, b*c - a*s). Lanes 0-9 have
// c = 1, s = 0 and pass through unchanged; only lanes 10-15 are altered.
// This first half forms the products with c.
__m512 fft5574 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5575 = _mm512_mul_ps(fft5566, fft5574);
__m512 fft5661 = _mm512_mul_ps(fft5653, fft5574);
__m512 fft5576 = _mm512_mul_ps(fft5567, fft5574);
__m512 fft5662 = _mm512_mul_ps(fft5654, fft5574);
__m512 fft5577 = _mm512_mul_ps(fft5568, fft5574);
__m512 fft5663 = _mm512_mul_ps(fft5655, fft5574);
__m512 fft5578 = _mm512_mul_ps(fft5569, fft5574);
__m512 fft5664 = _mm512_mul_ps(fft5656, fft5574);
__m512 fft5579 = _mm512_mul_ps(fft5570, fft5574);
__m512 fft5665 = _mm512_mul_ps(fft5657, fft5574);
__m512 fft5580 = _mm512_mul_ps(fft5571, fft5574);
__m512 fft5666 = _mm512_mul_ps(fft5658, fft5574);
__m512 fft5581 = _mm512_mul_ps(fft5572, fft5574);
__m512 fft5667 = _mm512_mul_ps(fft5659, fft5574);
__m512 fft5582 = _mm512_mul_ps(fft5573, fft5574);
__m512 fft5668 = _mm512_mul_ps(fft5660, fft5574);
// Second half of the coefficient step: add (fmadd) or subtract (fnmadd) the
// partner vector scaled by s.
__m512 fft5583 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5584 = _mm512_fmadd_ps(fft5567, fft5583, fft5575);
__m512 fft5669 = _mm512_fmadd_ps(fft5654, fft5583, fft5661);
__m512 fft5585 = _mm512_fnmadd_ps(fft5566, fft5583, fft5576);
__m512 fft5670 = _mm512_fnmadd_ps(fft5653, fft5583, fft5662);
__m512 fft5586 = _mm512_fmadd_ps(fft5569, fft5583, fft5577);
__m512 fft5671 = _mm512_fmadd_ps(fft5656, fft5583, fft5663);
__m512 fft5587 = _mm512_fnmadd_ps(fft5568, fft5583, fft5578);
__m512 fft5672 = _mm512_fnmadd_ps(fft5655, fft5583, fft5664);
__m512 fft5588 = _mm512_fmadd_ps(fft5571, fft5583, fft5579);
__m512 fft5673 = _mm512_fmadd_ps(fft5658, fft5583, fft5665);
__m512 fft5589 = _mm512_fnmadd_ps(fft5570, fft5583, fft5580);
__m512 fft5674 = _mm512_fnmadd_ps(fft5657, fft5583, fft5666);
__m512 fft5590 = _mm512_fmadd_ps(fft5573, fft5583, fft5581);
__m512 fft5675 = _mm512_fmadd_ps(fft5660, fft5583, fft5667);
__m512 fft5591 = _mm512_fnmadd_ps(fft5572, fft5583, fft5582);
__m512 fft5676 = _mm512_fnmadd_ps(fft5659, fft5583, fft5668);
// In-register layer over 128-bit lanes: shuffle_f32x4 with imm 177 swaps
// adjacent 128-bit lanes, so each 256-bit half ends up with the sum in its
// lower 128 bits and the difference in its upper 128 bits.
__m512 fft5592 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5593 = _mm512_fmadd_ps(fft5584, fft5592, _mm512_shuffle_f32x4(fft5584, fft5584, 177));
__m512 fft5677 = _mm512_fmadd_ps(fft5669, fft5592, _mm512_shuffle_f32x4(fft5669, fft5669, 177));
__m512 fft5594 = _mm512_fmadd_ps(fft5585, fft5592, _mm512_shuffle_f32x4(fft5585, fft5585, 177));
__m512 fft5678 = _mm512_fmadd_ps(fft5670, fft5592, _mm512_shuffle_f32x4(fft5670, fft5670, 177));
__m512 fft5595 = _mm512_fmadd_ps(fft5586, fft5592, _mm512_shuffle_f32x4(fft5586, fft5586, 177));
__m512 fft5679 = _mm512_fmadd_ps(fft5671, fft5592, _mm512_shuffle_f32x4(fft5671, fft5671, 177));
__m512 fft5596 = _mm512_fmadd_ps(fft5587, fft5592, _mm512_shuffle_f32x4(fft5587, fft5587, 177));
__m512 fft5680 = _mm512_fmadd_ps(fft5672, fft5592, _mm512_shuffle_f32x4(fft5672, fft5672, 177));
__m512 fft5597 = _mm512_fmadd_ps(fft5588, fft5592, _mm512_shuffle_f32x4(fft5588, fft5588, 177));
__m512 fft5681 = _mm512_fmadd_ps(fft5673, fft5592, _mm512_shuffle_f32x4(fft5673, fft5673, 177));
__m512 fft5598 = _mm512_fmadd_ps(fft5589, fft5592, _mm512_shuffle_f32x4(fft5589, fft5589, 177));
__m512 fft5682 = _mm512_fmadd_ps(fft5674, fft5592, _mm512_shuffle_f32x4(fft5674, fft5674, 177));
__m512 fft5599 = _mm512_fmadd_ps(fft5590, fft5592, _mm512_shuffle_f32x4(fft5590, fft5590, 177));
__m512 fft5683 = _mm512_fmadd_ps(fft5675, fft5592, _mm512_shuffle_f32x4(fft5675, fft5675, 177));
__m512 fft5600 = _mm512_fmadd_ps(fft5591, fft5592, _mm512_shuffle_f32x4(fft5591, fft5591, 177));
__m512 fft5684 = _mm512_fmadd_ps(fft5676, fft5592, _mm512_shuffle_f32x4(fft5676, fft5676, 177));
// Lane exchange between the two vectors of each pair. Mask 49344 (0xC0C0)
// selects lanes 6, 7, 14, 15: the first result takes those lanes from its
// partner, the second takes the negation (0 - x) of the first's lanes there.
__m512 fft5601 = _mm512_mask_mov_ps(fft5593, 49344, fft5594);
__m512 fft5685 = _mm512_mask_mov_ps(fft5677, 49344, fft5678);
__m512 fft5602 = _mm512_mask_sub_ps(fft5594, 49344, _mm512_setzero_ps(), fft5593);
__m512 fft5686 = _mm512_mask_sub_ps(fft5678, 49344, _mm512_setzero_ps(), fft5677);
__m512 fft5603 = _mm512_mask_mov_ps(fft5595, 49344, fft5596);
__m512 fft5687 = _mm512_mask_mov_ps(fft5679, 49344, fft5680);
__m512 fft5604 = _mm512_mask_sub_ps(fft5596, 49344, _mm512_setzero_ps(), fft5595);
__m512 fft5688 = _mm512_mask_sub_ps(fft5680, 49344, _mm512_setzero_ps(), fft5679);
__m512 fft5605 = _mm512_mask_mov_ps(fft5597, 49344, fft5598);
__m512 fft5689 = _mm512_mask_mov_ps(fft5681, 49344, fft5682);
__m512 fft5606 = _mm512_mask_sub_ps(fft5598, 49344, _mm512_setzero_ps(), fft5597);
__m512 fft5690 = _mm512_mask_sub_ps(fft5682, 49344, _mm512_setzero_ps(), fft5681);
__m512 fft5607 = _mm512_mask_mov_ps(fft5599, 49344, fft5600);
__m512 fft5691 = _mm512_mask_mov_ps(fft5683, 49344, fft5684);
__m512 fft5608 = _mm512_mask_sub_ps(fft5600, 49344, _mm512_setzero_ps(), fft5599);
__m512 fft5692 = _mm512_mask_sub_ps(fft5684, 49344, _mm512_setzero_ps(), fft5683);
// In-register layer inside each 128-bit lane: shuffle_ps with imm 78 swaps
// the two 64-bit float pairs, giving the sum in floats 0-1 and the
// difference in floats 2-3 of every 128-bit lane.
__m512 fft5609 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5610 = _mm512_fmadd_ps(fft5601, fft5609, _mm512_shuffle_ps(fft5601, fft5601, 78));
__m512 fft5693 = _mm512_fmadd_ps(fft5685, fft5609, _mm512_shuffle_ps(fft5685, fft5685, 78));
__m512 fft5611 = _mm512_fmadd_ps(fft5602, fft5609, _mm512_shuffle_ps(fft5602, fft5602, 78));
__m512 fft5694 = _mm512_fmadd_ps(fft5686, fft5609, _mm512_shuffle_ps(fft5686, fft5686, 78));
__m512 fft5612 = _mm512_fmadd_ps(fft5603, fft5609, _mm512_shuffle_ps(fft5603, fft5603, 78));
__m512 fft5695 = _mm512_fmadd_ps(fft5687, fft5609, _mm512_shuffle_ps(fft5687, fft5687, 78));
__m512 fft5613 = _mm512_fmadd_ps(fft5604, fft5609, _mm512_shuffle_ps(fft5604, fft5604, 78));
__m512 fft5696 = _mm512_fmadd_ps(fft5688, fft5609, _mm512_shuffle_ps(fft5688, fft5688, 78));
__m512 fft5614 = _mm512_fmadd_ps(fft5605, fft5609, _mm512_shuffle_ps(fft5605, fft5605, 78));
__m512 fft5697 = _mm512_fmadd_ps(fft5689, fft5609, _mm512_shuffle_ps(fft5689, fft5689, 78));
__m512 fft5615 = _mm512_fmadd_ps(fft5606, fft5609, _mm512_shuffle_ps(fft5606, fft5606, 78));
__m512 fft5698 = _mm512_fmadd_ps(fft5690, fft5609, _mm512_shuffle_ps(fft5690, fft5690, 78));
__m512 fft5616 = _mm512_fmadd_ps(fft5607, fft5609, _mm512_shuffle_ps(fft5607, fft5607, 78));
__m512 fft5699 = _mm512_fmadd_ps(fft5691, fft5609, _mm512_shuffle_ps(fft5691, fft5691, 78));
__m512 fft5617 = _mm512_fmadd_ps(fft5608, fft5609, _mm512_shuffle_ps(fft5608, fft5608, 78));
__m512 fft5700 = _mm512_fmadd_ps(fft5692, fft5609, _mm512_shuffle_ps(fft5692, fft5692, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft5610/fft5611 and fft5693/fft5694): two lane permutations of each
// vector are combined via the +/-1/0 pattern in fft5624, merged with mask
// moves, and finally scaled by 0.5 in the lanes of mask 64764 (0xFCFC, i.e.
// every lane except 0, 1, 8, 9).
// NOTE(review): this resembles the usual fix-up for a real-input transform,
// but that intent is not stated anywhere in this span.
__m512i fft5618 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5619 = _mm512_permutexvar_ps(fft5618, fft5610);
__m512 fft5701 = _mm512_permutexvar_ps(fft5618, fft5693);
__m512i fft5620 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5621 = _mm512_permutexvar_ps(fft5620, fft5610);
__m512 fft5702 = _mm512_permutexvar_ps(fft5620, fft5693);
__m512 fft5622 = _mm512_permutexvar_ps(fft5618, fft5611);
__m512 fft5703 = _mm512_permutexvar_ps(fft5618, fft5694);
__m512 fft5623 = _mm512_permutexvar_ps(fft5620, fft5611);
__m512 fft5704 = _mm512_permutexvar_ps(fft5620, fft5694);
__m512 fft5624 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5625 = _mm512_fmadd_ps(fft5619, fft5624, fft5621);
__m512 fft5705 = _mm512_fmadd_ps(fft5701, fft5624, fft5702);
__m512 fft5626 = _mm512_fnmadd_ps(fft5623, fft5624, fft5622);
__m512 fft5706 = _mm512_fnmadd_ps(fft5704, fft5624, fft5703);
// Merge masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3, 5, 7,
// 11, 13, 15), 22102 = 0x5656 (lanes 1, 2, 4, 6, 9, 10, 12, 14).
__m512 fft5627 = _mm512_mask_mov_ps(fft5623, 21845, fft5625);
__m512 fft5707 = _mm512_mask_mov_ps(fft5704, 21845, fft5705);
__m512 fft5628 = _mm512_mask_mov_ps(fft5619, 43176, fft5625);
__m512 fft5708 = _mm512_mask_mov_ps(fft5701, 43176, fft5705);
__m512 fft5629 = _mm512_mask_mov_ps(fft5627, 43176, fft5626);
__m512 fft5709 = _mm512_mask_mov_ps(fft5707, 43176, fft5706);
__m512 fft5630 = _mm512_mask_mov_ps(fft5628, 22102, fft5626);
__m512 fft5710 = _mm512_mask_mov_ps(fft5708, 22102, fft5706);
__m512 fft5631 = _mm512_mask_mul_ps(fft5629, 64764, fft5629, _mm512_set1_ps(5e-01f));
__m512 fft5711 = _mm512_mask_mul_ps(fft5709, 64764, fft5709, _mm512_set1_ps(5e-01f));
__m512 fft5632 = _mm512_mask_mul_ps(fft5630, 64764, fft5630, _mm512_set1_ps(5e-01f));
__m512 fft5712 = _mm512_mask_mul_ps(fft5710, 64764, fft5710, _mm512_set1_ps(5e-01f));
// Name the 16 output vectors. df497/df498 (chain A) and df505/df506 (chain B)
// are the post-processed first pairs; the others are the unmodified outputs
// of the last in-register layer.
__m512 df497 = fft5631;
__m512 df505 = fft5711;
__m512 df498 = fft5632;
__m512 df506 = fft5712;
__m512 df499 = fft5612;
__m512 df507 = fft5695;
__m512 df500 = fft5613;
__m512 df508 = fft5696;
__m512 df501 = fft5614;
__m512 df509 = fft5697;
__m512 df502 = fft5615;
__m512 df510 = fft5698;
__m512 df503 = fft5616;
__m512 df511 = fft5699;
__m512 df504 = fft5617;
__m512 df512 = fft5700;
// Store phase. eo34 moves even-numbered source lanes into lanes 0-7 and
// odd-numbered source lanes into lanes 8-15. Each permuted vector is then
// written twice: mask 255 stores lanes 0-7 at one dfPtr1 offset, mask 65280
// stores lanes 8-15 at a second, distant offset. All destinations share the
// term 407040*i6+1152*j2+384*k17+128*m34+32*f35.
// NOTE(review): if dfPtr1 is a byte pointer, the 65280 store lands exactly
// 407040 bytes after the matching 255 store -- confirm against dfPtr1's
// declaration, which is outside this span.
__m512i eo34 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df499 = _mm512_permutexvar_ps(eo34, df499);
df500 = _mm512_permutexvar_ps(eo34, df500);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df499);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df500);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df499);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df500);
df507 = _mm512_permutexvar_ps(eo34, df507);
df508 = _mm512_permutexvar_ps(eo34, df508);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df507);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df508);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df507);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df508);
df501 = _mm512_permutexvar_ps(eo34, df501);
df502 = _mm512_permutexvar_ps(eo34, df502);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df501);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df502);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df501);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df502);
df509 = _mm512_permutexvar_ps(eo34, df509);
df510 = _mm512_permutexvar_ps(eo34, df510);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df509);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df510);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df509);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df510);
df503 = _mm512_permutexvar_ps(eo34, df503);
df504 = _mm512_permutexvar_ps(eo34, df504);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df503);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df504);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df503);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df504);
df511 = _mm512_permutexvar_ps(eo34, df511);
df512 = _mm512_permutexvar_ps(eo34, df512);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df511);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df512);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df511);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df512);
// The post-processed pairs are stored with the same 255 / 65280 split but
// without the eo34 lane permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df497);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df498);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df497);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df498);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df505);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m34+32*f35, 255, df506);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df505);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m34+32*f35, 65280, df506);
}
ptrdiff_t b35 = 4;
ptrdiff_t m35 = (size_t)b35/2;
ptrdiff_t f36 = (size_t)b35%2;
__m512 dat498 = _mm512_maskz_loadu_ps(127, datPtr1+160+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat498 = _mm512_mask_fmadd_ps(dat498, 127, bnMul16, bnAdd16);
__m512 dat499 = _mm512_maskz_loadu_ps(127, datPtr1+1056+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat499 = _mm512_mask_fmadd_ps(dat499, 127, bnMul16, bnAdd16);
__m512 dat500 = _mm512_maskz_loadu_ps(127, datPtr1+1952+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat500 = _mm512_mask_fmadd_ps(dat500, 127, bnMul16, bnAdd16);
__m512 dat501 = _mm512_maskz_loadu_ps(127, datPtr1+2848+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat501 = _mm512_mask_fmadd_ps(dat501, 127, bnMul16, bnAdd16);
__m512 dat502 = _mm512_maskz_loadu_ps(127, datPtr1+3744+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat502 = _mm512_mask_fmadd_ps(dat502, 127, bnMul16, bnAdd16);
__m512 dat503 = _mm512_maskz_loadu_ps(127, datPtr1+4640+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat503 = _mm512_mask_fmadd_ps(dat503, 127, bnMul16, bnAdd16);
__m512 dat504 = _mm512_maskz_loadu_ps(127, datPtr1+5536+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat504 = _mm512_mask_fmadd_ps(dat504, 127, bnMul16, bnAdd16);
__m512 dat505 = _mm512_maskz_loadu_ps(127, datPtr1+6432+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat505 = _mm512_mask_fmadd_ps(dat505, 127, bnMul16, bnAdd16);
__m512 dat506 = _mm512_maskz_loadu_ps(127, datPtr1+7328+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat506 = _mm512_mask_fmadd_ps(dat506, 127, bnMul16, bnAdd16);
__m512 dat507 = _mm512_maskz_loadu_ps(127, datPtr1+8224+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat507 = _mm512_mask_fmadd_ps(dat507, 127, bnMul16, bnAdd16);
__m512 dat508 = _mm512_maskz_loadu_ps(127, datPtr1+9120+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat508 = _mm512_mask_fmadd_ps(dat508, 127, bnMul16, bnAdd16);
__m512 dat509 = _mm512_maskz_loadu_ps(127, datPtr1+10016+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat509 = _mm512_mask_fmadd_ps(dat509, 127, bnMul16, bnAdd16);
__m512 dat510 = _mm512_maskz_loadu_ps(127, datPtr1+10912+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat510 = _mm512_mask_fmadd_ps(dat510, 127, bnMul16, bnAdd16);
__m512 dat511 = _mm512_maskz_loadu_ps(127, datPtr1+11808+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat511 = _mm512_mask_fmadd_ps(dat511, 127, bnMul16, bnAdd16);
__m512 dat512 = _mm512_maskz_loadu_ps(127, datPtr1+12704+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat512 = _mm512_mask_fmadd_ps(dat512, 127, bnMul16, bnAdd16);
__m512 dat513 = _mm512_maskz_loadu_ps(127, datPtr1+13600+602112*i6+200704*k17+896*h16+4*w16+0*b35);
dat513 = _mm512_mask_fmadd_ps(dat513, 127, bnMul16, bnAdd16);
__m512 fft5713 = _mm512_add_ps(dat498, dat506);
__m512 fft5801 = _mm512_add_ps(dat499, dat507);
__m512 fft5714 = _mm512_sub_ps(dat498, dat506);
__m512 fft5802 = _mm512_sub_ps(dat499, dat507);
__m512 fft5715 = _mm512_add_ps(dat500, dat508);
__m512 fft5803 = _mm512_add_ps(dat501, dat509);
__m512 fft5716 = _mm512_sub_ps(dat500, dat508);
__m512 fft5804 = _mm512_sub_ps(dat501, dat509);
__m512 fft5717 = _mm512_add_ps(dat502, dat510);
__m512 fft5805 = _mm512_add_ps(dat503, dat511);
__m512 fft5718 = _mm512_sub_ps(dat502, dat510);
__m512 fft5806 = _mm512_sub_ps(dat503, dat511);
__m512 fft5719 = _mm512_add_ps(dat504, dat512);
__m512 fft5807 = _mm512_add_ps(dat505, dat513);
__m512 fft5720 = _mm512_sub_ps(dat504, dat512);
__m512 fft5808 = _mm512_sub_ps(dat505, dat513);
__m512 fft5721 = _mm512_add_ps(fft5713, fft5717);
__m512 fft5809 = _mm512_add_ps(fft5801, fft5805);
__m512 fft5722 = _mm512_sub_ps(fft5713, fft5717);
__m512 fft5810 = _mm512_sub_ps(fft5801, fft5805);
__m512 fft5723 = _mm512_add_ps(fft5715, fft5719);
__m512 fft5811 = _mm512_add_ps(fft5803, fft5807);
__m512 fft5724 = _mm512_sub_ps(fft5719, fft5715);
__m512 fft5812 = _mm512_sub_ps(fft5807, fft5803);
__m512 fft5725 = _mm512_sub_ps(fft5716, fft5720);
__m512 fft5813 = _mm512_sub_ps(fft5804, fft5808);
__m512 fft5726 = _mm512_add_ps(fft5716, fft5720);
__m512 fft5814 = _mm512_add_ps(fft5804, fft5808);
__m512 fft5727 = _mm512_add_ps(fft5721, fft5723);
__m512 fft5815 = _mm512_add_ps(fft5809, fft5811);
__m512 fft5728 = _mm512_sub_ps(fft5721, fft5723);
__m512 fft5816 = _mm512_sub_ps(fft5809, fft5811);
__m512 fft5729 = _mm512_fmadd_ps(fft5725, _mm512_set1_ps(7.0710677e-01f), fft5714);
__m512 fft5817 = _mm512_fmadd_ps(fft5813, _mm512_set1_ps(7.0710677e-01f), fft5802);
__m512 fft5730 = _mm512_fnmsub_ps(fft5726, _mm512_set1_ps(7.0710677e-01f), fft5718);
__m512 fft5818 = _mm512_fnmsub_ps(fft5814, _mm512_set1_ps(7.0710677e-01f), fft5806);
__m512 fft5731 = _mm512_fnmadd_ps(fft5725, _mm512_set1_ps(7.0710677e-01f), fft5714);
__m512 fft5819 = _mm512_fnmadd_ps(fft5813, _mm512_set1_ps(7.0710677e-01f), fft5802);
__m512 fft5732 = _mm512_fnmadd_ps(fft5726, _mm512_set1_ps(7.0710677e-01f), fft5718);
__m512 fft5820 = _mm512_fnmadd_ps(fft5814, _mm512_set1_ps(7.0710677e-01f), fft5806);
__m512 fft5733 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5734 = _mm512_fmadd_ps(fft5727, fft5733, _mm512_shuffle_f32x4(fft5727, fft5727, 78));
__m512 fft5821 = _mm512_fmadd_ps(fft5815, fft5733, _mm512_shuffle_f32x4(fft5815, fft5815, 78));
__m512 fft5735 = _mm512_fmadd_ps(fft5728, fft5733, _mm512_shuffle_f32x4(fft5728, fft5728, 78));
__m512 fft5822 = _mm512_fmadd_ps(fft5816, fft5733, _mm512_shuffle_f32x4(fft5816, fft5816, 78));
__m512 fft5736 = _mm512_fmadd_ps(fft5729, fft5733, _mm512_shuffle_f32x4(fft5729, fft5729, 78));
__m512 fft5823 = _mm512_fmadd_ps(fft5817, fft5733, _mm512_shuffle_f32x4(fft5817, fft5817, 78));
__m512 fft5737 = _mm512_fmadd_ps(fft5730, fft5733, _mm512_shuffle_f32x4(fft5730, fft5730, 78));
__m512 fft5824 = _mm512_fmadd_ps(fft5818, fft5733, _mm512_shuffle_f32x4(fft5818, fft5818, 78));
__m512 fft5738 = _mm512_fmadd_ps(fft5722, fft5733, _mm512_shuffle_f32x4(fft5722, fft5722, 78));
__m512 fft5825 = _mm512_fmadd_ps(fft5810, fft5733, _mm512_shuffle_f32x4(fft5810, fft5810, 78));
__m512 fft5739 = _mm512_fmadd_ps(fft5724, fft5733, _mm512_shuffle_f32x4(fft5724, fft5724, 78));
__m512 fft5826 = _mm512_fmadd_ps(fft5812, fft5733, _mm512_shuffle_f32x4(fft5812, fft5812, 78));
__m512 fft5740 = _mm512_fmadd_ps(fft5731, fft5733, _mm512_shuffle_f32x4(fft5731, fft5731, 78));
__m512 fft5827 = _mm512_fmadd_ps(fft5819, fft5733, _mm512_shuffle_f32x4(fft5819, fft5819, 78));
__m512 fft5741 = _mm512_fmadd_ps(fft5732, fft5733, _mm512_shuffle_f32x4(fft5732, fft5732, 78));
__m512 fft5828 = _mm512_fmadd_ps(fft5820, fft5733, _mm512_shuffle_f32x4(fft5820, fft5820, 78));
__m512 fft5742 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5743 = _mm512_mul_ps(fft5734, fft5742);
__m512 fft5829 = _mm512_mul_ps(fft5821, fft5742);
__m512 fft5744 = _mm512_mul_ps(fft5735, fft5742);
__m512 fft5830 = _mm512_mul_ps(fft5822, fft5742);
__m512 fft5745 = _mm512_mul_ps(fft5736, fft5742);
__m512 fft5831 = _mm512_mul_ps(fft5823, fft5742);
__m512 fft5746 = _mm512_mul_ps(fft5737, fft5742);
__m512 fft5832 = _mm512_mul_ps(fft5824, fft5742);
__m512 fft5747 = _mm512_mul_ps(fft5738, fft5742);
__m512 fft5833 = _mm512_mul_ps(fft5825, fft5742);
__m512 fft5748 = _mm512_mul_ps(fft5739, fft5742);
__m512 fft5834 = _mm512_mul_ps(fft5826, fft5742);
__m512 fft5749 = _mm512_mul_ps(fft5740, fft5742);
__m512 fft5835 = _mm512_mul_ps(fft5827, fft5742);
__m512 fft5750 = _mm512_mul_ps(fft5741, fft5742);
__m512 fft5836 = _mm512_mul_ps(fft5828, fft5742);
__m512 fft5751 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5752 = _mm512_fmadd_ps(fft5735, fft5751, fft5743);
__m512 fft5837 = _mm512_fmadd_ps(fft5822, fft5751, fft5829);
__m512 fft5753 = _mm512_fnmadd_ps(fft5734, fft5751, fft5744);
__m512 fft5838 = _mm512_fnmadd_ps(fft5821, fft5751, fft5830);
__m512 fft5754 = _mm512_fmadd_ps(fft5737, fft5751, fft5745);
__m512 fft5839 = _mm512_fmadd_ps(fft5824, fft5751, fft5831);
__m512 fft5755 = _mm512_fnmadd_ps(fft5736, fft5751, fft5746);
__m512 fft5840 = _mm512_fnmadd_ps(fft5823, fft5751, fft5832);
__m512 fft5756 = _mm512_fmadd_ps(fft5739, fft5751, fft5747);
__m512 fft5841 = _mm512_fmadd_ps(fft5826, fft5751, fft5833);
__m512 fft5757 = _mm512_fnmadd_ps(fft5738, fft5751, fft5748);
__m512 fft5842 = _mm512_fnmadd_ps(fft5825, fft5751, fft5834);
__m512 fft5758 = _mm512_fmadd_ps(fft5741, fft5751, fft5749);
__m512 fft5843 = _mm512_fmadd_ps(fft5828, fft5751, fft5835);
__m512 fft5759 = _mm512_fnmadd_ps(fft5740, fft5751, fft5750);
__m512 fft5844 = _mm512_fnmadd_ps(fft5827, fft5751, fft5836);
__m512 fft5760 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5761 = _mm512_fmadd_ps(fft5752, fft5760, _mm512_shuffle_f32x4(fft5752, fft5752, 177));
__m512 fft5845 = _mm512_fmadd_ps(fft5837, fft5760, _mm512_shuffle_f32x4(fft5837, fft5837, 177));
__m512 fft5762 = _mm512_fmadd_ps(fft5753, fft5760, _mm512_shuffle_f32x4(fft5753, fft5753, 177));
__m512 fft5846 = _mm512_fmadd_ps(fft5838, fft5760, _mm512_shuffle_f32x4(fft5838, fft5838, 177));
__m512 fft5763 = _mm512_fmadd_ps(fft5754, fft5760, _mm512_shuffle_f32x4(fft5754, fft5754, 177));
__m512 fft5847 = _mm512_fmadd_ps(fft5839, fft5760, _mm512_shuffle_f32x4(fft5839, fft5839, 177));
__m512 fft5764 = _mm512_fmadd_ps(fft5755, fft5760, _mm512_shuffle_f32x4(fft5755, fft5755, 177));
__m512 fft5848 = _mm512_fmadd_ps(fft5840, fft5760, _mm512_shuffle_f32x4(fft5840, fft5840, 177));
__m512 fft5765 = _mm512_fmadd_ps(fft5756, fft5760, _mm512_shuffle_f32x4(fft5756, fft5756, 177));
__m512 fft5849 = _mm512_fmadd_ps(fft5841, fft5760, _mm512_shuffle_f32x4(fft5841, fft5841, 177));
__m512 fft5766 = _mm512_fmadd_ps(fft5757, fft5760, _mm512_shuffle_f32x4(fft5757, fft5757, 177));
__m512 fft5850 = _mm512_fmadd_ps(fft5842, fft5760, _mm512_shuffle_f32x4(fft5842, fft5842, 177));
__m512 fft5767 = _mm512_fmadd_ps(fft5758, fft5760, _mm512_shuffle_f32x4(fft5758, fft5758, 177));
__m512 fft5851 = _mm512_fmadd_ps(fft5843, fft5760, _mm512_shuffle_f32x4(fft5843, fft5843, 177));
__m512 fft5768 = _mm512_fmadd_ps(fft5759, fft5760, _mm512_shuffle_f32x4(fft5759, fft5759, 177));
__m512 fft5852 = _mm512_fmadd_ps(fft5844, fft5760, _mm512_shuffle_f32x4(fft5844, fft5844, 177));
__m512 fft5769 = _mm512_mask_mov_ps(fft5761, 49344, fft5762);
__m512 fft5853 = _mm512_mask_mov_ps(fft5845, 49344, fft5846);
__m512 fft5770 = _mm512_mask_sub_ps(fft5762, 49344, _mm512_setzero_ps(), fft5761);
__m512 fft5854 = _mm512_mask_sub_ps(fft5846, 49344, _mm512_setzero_ps(), fft5845);
__m512 fft5771 = _mm512_mask_mov_ps(fft5763, 49344, fft5764);
__m512 fft5855 = _mm512_mask_mov_ps(fft5847, 49344, fft5848);
__m512 fft5772 = _mm512_mask_sub_ps(fft5764, 49344, _mm512_setzero_ps(), fft5763);
__m512 fft5856 = _mm512_mask_sub_ps(fft5848, 49344, _mm512_setzero_ps(), fft5847);
__m512 fft5773 = _mm512_mask_mov_ps(fft5765, 49344, fft5766);
__m512 fft5857 = _mm512_mask_mov_ps(fft5849, 49344, fft5850);
__m512 fft5774 = _mm512_mask_sub_ps(fft5766, 49344, _mm512_setzero_ps(), fft5765);
__m512 fft5858 = _mm512_mask_sub_ps(fft5850, 49344, _mm512_setzero_ps(), fft5849);
__m512 fft5775 = _mm512_mask_mov_ps(fft5767, 49344, fft5768);
__m512 fft5859 = _mm512_mask_mov_ps(fft5851, 49344, fft5852);
__m512 fft5776 = _mm512_mask_sub_ps(fft5768, 49344, _mm512_setzero_ps(), fft5767);
__m512 fft5860 = _mm512_mask_sub_ps(fft5852, 49344, _mm512_setzero_ps(), fft5851);
__m512 fft5777 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5778 = _mm512_fmadd_ps(fft5769, fft5777, _mm512_shuffle_ps(fft5769, fft5769, 78));
__m512 fft5861 = _mm512_fmadd_ps(fft5853, fft5777, _mm512_shuffle_ps(fft5853, fft5853, 78));
__m512 fft5779 = _mm512_fmadd_ps(fft5770, fft5777, _mm512_shuffle_ps(fft5770, fft5770, 78));
__m512 fft5862 = _mm512_fmadd_ps(fft5854, fft5777, _mm512_shuffle_ps(fft5854, fft5854, 78));
__m512 fft5780 = _mm512_fmadd_ps(fft5771, fft5777, _mm512_shuffle_ps(fft5771, fft5771, 78));
__m512 fft5863 = _mm512_fmadd_ps(fft5855, fft5777, _mm512_shuffle_ps(fft5855, fft5855, 78));
__m512 fft5781 = _mm512_fmadd_ps(fft5772, fft5777, _mm512_shuffle_ps(fft5772, fft5772, 78));
__m512 fft5864 = _mm512_fmadd_ps(fft5856, fft5777, _mm512_shuffle_ps(fft5856, fft5856, 78));
__m512 fft5782 = _mm512_fmadd_ps(fft5773, fft5777, _mm512_shuffle_ps(fft5773, fft5773, 78));
__m512 fft5865 = _mm512_fmadd_ps(fft5857, fft5777, _mm512_shuffle_ps(fft5857, fft5857, 78));
__m512 fft5783 = _mm512_fmadd_ps(fft5774, fft5777, _mm512_shuffle_ps(fft5774, fft5774, 78));
__m512 fft5866 = _mm512_fmadd_ps(fft5858, fft5777, _mm512_shuffle_ps(fft5858, fft5858, 78));
__m512 fft5784 = _mm512_fmadd_ps(fft5775, fft5777, _mm512_shuffle_ps(fft5775, fft5775, 78));
__m512 fft5867 = _mm512_fmadd_ps(fft5859, fft5777, _mm512_shuffle_ps(fft5859, fft5859, 78));
__m512 fft5785 = _mm512_fmadd_ps(fft5776, fft5777, _mm512_shuffle_ps(fft5776, fft5776, 78));
__m512 fft5868 = _mm512_fmadd_ps(fft5860, fft5777, _mm512_shuffle_ps(fft5860, fft5860, 78));
__m512i fft5786 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5787 = _mm512_permutexvar_ps(fft5786, fft5778);
__m512 fft5869 = _mm512_permutexvar_ps(fft5786, fft5861);
__m512i fft5788 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5789 = _mm512_permutexvar_ps(fft5788, fft5778);
__m512 fft5870 = _mm512_permutexvar_ps(fft5788, fft5861);
__m512 fft5790 = _mm512_permutexvar_ps(fft5786, fft5779);
__m512 fft5871 = _mm512_permutexvar_ps(fft5786, fft5862);
__m512 fft5791 = _mm512_permutexvar_ps(fft5788, fft5779);
__m512 fft5872 = _mm512_permutexvar_ps(fft5788, fft5862);
__m512 fft5792 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5793 = _mm512_fmadd_ps(fft5787, fft5792, fft5789);
__m512 fft5873 = _mm512_fmadd_ps(fft5869, fft5792, fft5870);
__m512 fft5794 = _mm512_fnmadd_ps(fft5791, fft5792, fft5790);
__m512 fft5874 = _mm512_fnmadd_ps(fft5872, fft5792, fft5871);
__m512 fft5795 = _mm512_mask_mov_ps(fft5791, 21845, fft5793);
__m512 fft5875 = _mm512_mask_mov_ps(fft5872, 21845, fft5873);
__m512 fft5796 = _mm512_mask_mov_ps(fft5787, 43176, fft5793);
__m512 fft5876 = _mm512_mask_mov_ps(fft5869, 43176, fft5873);
__m512 fft5797 = _mm512_mask_mov_ps(fft5795, 43176, fft5794);
__m512 fft5877 = _mm512_mask_mov_ps(fft5875, 43176, fft5874);
__m512 fft5798 = _mm512_mask_mov_ps(fft5796, 22102, fft5794);
__m512 fft5878 = _mm512_mask_mov_ps(fft5876, 22102, fft5874);
__m512 fft5799 = _mm512_mask_mul_ps(fft5797, 64764, fft5797, _mm512_set1_ps(5e-01f));
__m512 fft5879 = _mm512_mask_mul_ps(fft5877, 64764, fft5877, _mm512_set1_ps(5e-01f));
__m512 fft5800 = _mm512_mask_mul_ps(fft5798, 64764, fft5798, _mm512_set1_ps(5e-01f));
__m512 fft5880 = _mm512_mask_mul_ps(fft5878, 64764, fft5878, _mm512_set1_ps(5e-01f));
/* Store phase for the tile indexed by m35/f36.
 * df513/df514 and df521/df522 take the recombined registers whose lanes
 * selected by mask 64764 were scaled by 0.5 (fft5799/fft5800, fft5879/fft5880);
 * the other df values are taken directly from the preceding shuffle stage. */
__m512 df513 = fft5799;
__m512 df521 = fft5879;
__m512 df514 = fft5800;
__m512 df522 = fft5880;
__m512 df515 = fft5780;
__m512 df523 = fft5863;
__m512 df516 = fft5781;
__m512 df524 = fft5864;
__m512 df517 = fft5782;
__m512 df525 = fft5865;
__m512 df518 = fft5783;
__m512 df526 = fft5866;
__m512 df519 = fft5784;
__m512 df527 = fft5867;
__m512 df520 = fft5785;
__m512 df528 = fft5868;
/* eo35 moves the even-indexed source lanes (0,2,...,14) into lanes 0..7 and
 * the odd-indexed source lanes (1,3,...,15) into lanes 8..15. */
__m512i eo35 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
/* Every register is written as two 8-lane halves: mask 255 writes lanes 0..7
 * starting at the given address, mask 65280 writes lanes 8..15, which land
 * 32 bytes past the given address. The constant offsets of the mask-65280
 * stores equal the matching mask-255 offsets plus 407040 minus 32.
 * NOTE(review): if dfPtr1 is a byte pointer (declaration not visible here),
 * the upper half therefore lands exactly 407040 bytes after the lower half,
 * and 128*m35 + 32*f36 selects this tile's 32-byte slot - confirm. */
df515 = _mm512_permutexvar_ps(eo35, df515);
df516 = _mm512_permutexvar_ps(eo35, df516);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df515);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df516);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df515);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df516);
df523 = _mm512_permutexvar_ps(eo35, df523);
df524 = _mm512_permutexvar_ps(eo35, df524);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df523);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df524);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df523);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df524);
df517 = _mm512_permutexvar_ps(eo35, df517);
df518 = _mm512_permutexvar_ps(eo35, df518);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df517);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df518);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df517);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df518);
df525 = _mm512_permutexvar_ps(eo35, df525);
df526 = _mm512_permutexvar_ps(eo35, df526);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df525);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df526);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df525);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df526);
df519 = _mm512_permutexvar_ps(eo35, df519);
df520 = _mm512_permutexvar_ps(eo35, df520);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df519);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df520);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df519);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df520);
df527 = _mm512_permutexvar_ps(eo35, df527);
df528 = _mm512_permutexvar_ps(eo35, df528);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df527);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df528);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df527);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df528);
/* df513/df514 and df521/df522 are stored without the eo35 permutation. */
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df513);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df514);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df513);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df514);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df521);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m35+32*f36, 255, df522);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df521);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m35+32*f36, 65280, df522);
/* Tile b36 = 5. m36 (= 2) and f37 (= 1) are used later as the 128*m36 and
 * 32*f37 terms of the dfPtr1 store offsets. */
ptrdiff_t b36 = 5;
ptrdiff_t m36 = (size_t)b36/2;
ptrdiff_t f37 = (size_t)b36%2;
/* Load 16 registers from datPtr1 at constant offsets 8240, 9136, ..., 21680
 * (step 896) and apply dat*bnMul16 + bnAdd16 to each.
 * Mask 65528 (0xFFF8) enables lanes 3..15 only: lanes 0..2 are zeroed by the
 * maskz load and left untouched (still zero) by the masked fmadd, so bnAdd16
 * is not added to them. The trailing 0*b36 term contributes nothing.
 * NOTE(review): 896 = 224*4 would be one 224-float row of the 224-wide input
 * if datPtr1 is a byte pointer - its declaration is not visible here, confirm. */
__m512 dat514 = _mm512_maskz_loadu_ps(65528, datPtr1+8240+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat514 = _mm512_mask_fmadd_ps(dat514, 65528, bnMul16, bnAdd16);
__m512 dat515 = _mm512_maskz_loadu_ps(65528, datPtr1+9136+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat515 = _mm512_mask_fmadd_ps(dat515, 65528, bnMul16, bnAdd16);
__m512 dat516 = _mm512_maskz_loadu_ps(65528, datPtr1+10032+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat516 = _mm512_mask_fmadd_ps(dat516, 65528, bnMul16, bnAdd16);
__m512 dat517 = _mm512_maskz_loadu_ps(65528, datPtr1+10928+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat517 = _mm512_mask_fmadd_ps(dat517, 65528, bnMul16, bnAdd16);
__m512 dat518 = _mm512_maskz_loadu_ps(65528, datPtr1+11824+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat518 = _mm512_mask_fmadd_ps(dat518, 65528, bnMul16, bnAdd16);
__m512 dat519 = _mm512_maskz_loadu_ps(65528, datPtr1+12720+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat519 = _mm512_mask_fmadd_ps(dat519, 65528, bnMul16, bnAdd16);
__m512 dat520 = _mm512_maskz_loadu_ps(65528, datPtr1+13616+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat520 = _mm512_mask_fmadd_ps(dat520, 65528, bnMul16, bnAdd16);
__m512 dat521 = _mm512_maskz_loadu_ps(65528, datPtr1+14512+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat521 = _mm512_mask_fmadd_ps(dat521, 65528, bnMul16, bnAdd16);
__m512 dat522 = _mm512_maskz_loadu_ps(65528, datPtr1+15408+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat522 = _mm512_mask_fmadd_ps(dat522, 65528, bnMul16, bnAdd16);
__m512 dat523 = _mm512_maskz_loadu_ps(65528, datPtr1+16304+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat523 = _mm512_mask_fmadd_ps(dat523, 65528, bnMul16, bnAdd16);
__m512 dat524 = _mm512_maskz_loadu_ps(65528, datPtr1+17200+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat524 = _mm512_mask_fmadd_ps(dat524, 65528, bnMul16, bnAdd16);
__m512 dat525 = _mm512_maskz_loadu_ps(65528, datPtr1+18096+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat525 = _mm512_mask_fmadd_ps(dat525, 65528, bnMul16, bnAdd16);
__m512 dat526 = _mm512_maskz_loadu_ps(65528, datPtr1+18992+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat526 = _mm512_mask_fmadd_ps(dat526, 65528, bnMul16, bnAdd16);
__m512 dat527 = _mm512_maskz_loadu_ps(65528, datPtr1+19888+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat527 = _mm512_mask_fmadd_ps(dat527, 65528, bnMul16, bnAdd16);
__m512 dat528 = _mm512_maskz_loadu_ps(65528, datPtr1+20784+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat528 = _mm512_mask_fmadd_ps(dat528, 65528, bnMul16, bnAdd16);
__m512 dat529 = _mm512_maskz_loadu_ps(65528, datPtr1+21680+602112*i6+200704*k17+896*h16+4*w16+0*b36);
dat529 = _mm512_mask_fmadd_ps(dat529, 65528, bnMul16, bnAdd16);
/* Cross-register butterfly stages, applied lane-wise to the 16 registers
 * dat514..dat529. The registers are handled as two interleaved streams:
 * dat514, dat516, ..., dat528 feed fft5881..fft5900, and
 * dat515, dat517, ..., dat529 feed fft5969..fft5988.
 * NOTE(review): the add/sub structure and the 1/sqrt(2) factor have the
 * shape of an 8-point real DFT per stream - confirm against the generator. */
/* Stage 1: pair each register with the one eight loads later
 * (dat514 with dat522, dat515 with dat523, ...) and form sum and difference. */
__m512 fft5881 = _mm512_add_ps(dat514, dat522);
__m512 fft5969 = _mm512_add_ps(dat515, dat523);
__m512 fft5882 = _mm512_sub_ps(dat514, dat522);
__m512 fft5970 = _mm512_sub_ps(dat515, dat523);
__m512 fft5883 = _mm512_add_ps(dat516, dat524);
__m512 fft5971 = _mm512_add_ps(dat517, dat525);
__m512 fft5884 = _mm512_sub_ps(dat516, dat524);
__m512 fft5972 = _mm512_sub_ps(dat517, dat525);
__m512 fft5885 = _mm512_add_ps(dat518, dat526);
__m512 fft5973 = _mm512_add_ps(dat519, dat527);
__m512 fft5886 = _mm512_sub_ps(dat518, dat526);
__m512 fft5974 = _mm512_sub_ps(dat519, dat527);
__m512 fft5887 = _mm512_add_ps(dat520, dat528);
__m512 fft5975 = _mm512_add_ps(dat521, dat529);
__m512 fft5888 = _mm512_sub_ps(dat520, dat528);
__m512 fft5976 = _mm512_sub_ps(dat521, dat529);
/* Stage 2: combine the stage-1 sums and differences pairwise. Note the
 * reversed operand order in fft5892/fft5980 (second sum minus first). */
__m512 fft5889 = _mm512_add_ps(fft5881, fft5885);
__m512 fft5977 = _mm512_add_ps(fft5969, fft5973);
__m512 fft5890 = _mm512_sub_ps(fft5881, fft5885);
__m512 fft5978 = _mm512_sub_ps(fft5969, fft5973);
__m512 fft5891 = _mm512_add_ps(fft5883, fft5887);
__m512 fft5979 = _mm512_add_ps(fft5971, fft5975);
__m512 fft5892 = _mm512_sub_ps(fft5887, fft5883);
__m512 fft5980 = _mm512_sub_ps(fft5975, fft5971);
__m512 fft5893 = _mm512_sub_ps(fft5884, fft5888);
__m512 fft5981 = _mm512_sub_ps(fft5972, fft5976);
__m512 fft5894 = _mm512_add_ps(fft5884, fft5888);
__m512 fft5982 = _mm512_add_ps(fft5972, fft5976);
/* Stage 3: final sums/differences and the terms scaled by 7.0710677e-01f
 * (approximately 1/sqrt(2)). Intrinsic semantics used below:
 *   fmadd(a,b,c)  =  a*b + c
 *   fnmadd(a,b,c) = -(a*b) + c
 *   fnmsub(a,b,c) = -(a*b) - c */
__m512 fft5895 = _mm512_add_ps(fft5889, fft5891);
__m512 fft5983 = _mm512_add_ps(fft5977, fft5979);
__m512 fft5896 = _mm512_sub_ps(fft5889, fft5891);
__m512 fft5984 = _mm512_sub_ps(fft5977, fft5979);
__m512 fft5897 = _mm512_fmadd_ps(fft5893, _mm512_set1_ps(7.0710677e-01f), fft5882);
__m512 fft5985 = _mm512_fmadd_ps(fft5981, _mm512_set1_ps(7.0710677e-01f), fft5970);
__m512 fft5898 = _mm512_fnmsub_ps(fft5894, _mm512_set1_ps(7.0710677e-01f), fft5886);
__m512 fft5986 = _mm512_fnmsub_ps(fft5982, _mm512_set1_ps(7.0710677e-01f), fft5974);
__m512 fft5899 = _mm512_fnmadd_ps(fft5893, _mm512_set1_ps(7.0710677e-01f), fft5882);
__m512 fft5987 = _mm512_fnmadd_ps(fft5981, _mm512_set1_ps(7.0710677e-01f), fft5970);
__m512 fft5900 = _mm512_fnmadd_ps(fft5894, _mm512_set1_ps(7.0710677e-01f), fft5886);
__m512 fft5988 = _mm512_fnmadd_ps(fft5982, _mm512_set1_ps(7.0710677e-01f), fft5974);
/* In-register stages: each of the 16 registers produced by the cross-register
 * butterflies is now transformed across its own 16 lanes, using shuffles plus
 * sign/coefficient vectors. _mm512_set_ps lists lane 15 first and lane 0 last.
 * NOTE(review): this looks like the lane-direction pass of a 2-D real FFT -
 * confirm against the generator. */
/* Butterfly between the 256-bit halves: shuffle_f32x4(x, x, 78) swaps the two
 * halves; fft5901 is +1 in lanes 0..7 and -1 in lanes 8..15, so the result
 * holds lo+hi in lanes 0..7 and lo-hi in lanes 8..15. */
__m512 fft5901 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5902 = _mm512_fmadd_ps(fft5895, fft5901, _mm512_shuffle_f32x4(fft5895, fft5895, 78));
__m512 fft5989 = _mm512_fmadd_ps(fft5983, fft5901, _mm512_shuffle_f32x4(fft5983, fft5983, 78));
__m512 fft5903 = _mm512_fmadd_ps(fft5896, fft5901, _mm512_shuffle_f32x4(fft5896, fft5896, 78));
__m512 fft5990 = _mm512_fmadd_ps(fft5984, fft5901, _mm512_shuffle_f32x4(fft5984, fft5984, 78));
__m512 fft5904 = _mm512_fmadd_ps(fft5897, fft5901, _mm512_shuffle_f32x4(fft5897, fft5897, 78));
__m512 fft5991 = _mm512_fmadd_ps(fft5985, fft5901, _mm512_shuffle_f32x4(fft5985, fft5985, 78));
__m512 fft5905 = _mm512_fmadd_ps(fft5898, fft5901, _mm512_shuffle_f32x4(fft5898, fft5898, 78));
__m512 fft5992 = _mm512_fmadd_ps(fft5986, fft5901, _mm512_shuffle_f32x4(fft5986, fft5986, 78));
__m512 fft5906 = _mm512_fmadd_ps(fft5890, fft5901, _mm512_shuffle_f32x4(fft5890, fft5890, 78));
__m512 fft5993 = _mm512_fmadd_ps(fft5978, fft5901, _mm512_shuffle_f32x4(fft5978, fft5978, 78));
__m512 fft5907 = _mm512_fmadd_ps(fft5892, fft5901, _mm512_shuffle_f32x4(fft5892, fft5892, 78));
__m512 fft5994 = _mm512_fmadd_ps(fft5980, fft5901, _mm512_shuffle_f32x4(fft5980, fft5980, 78));
__m512 fft5908 = _mm512_fmadd_ps(fft5899, fft5901, _mm512_shuffle_f32x4(fft5899, fft5899, 78));
__m512 fft5995 = _mm512_fmadd_ps(fft5987, fft5901, _mm512_shuffle_f32x4(fft5987, fft5987, 78));
__m512 fft5909 = _mm512_fmadd_ps(fft5900, fft5901, _mm512_shuffle_f32x4(fft5900, fft5900, 78));
__m512 fft5996 = _mm512_fmadd_ps(fft5988, fft5901, _mm512_shuffle_f32x4(fft5988, fft5988, 78));
/* Per-lane coefficient step on register pairs (p, q) = (fft5902, fft5903),
 * (fft5904, fft5905), ...:
 *   first  = q*fft5919 + p*fft5910
 *   second = -(p*fft5919) + q*fft5910
 * fft5910 by lane: 0..9 -> 1, 10..11 -> 0.70710677, 12..13 -> 0,
 *                  14..15 -> -0.70710677.
 * fft5919 by lane: 0..9 -> 0, 10..11 -> 0.70710677, 12..13 -> 1,
 *                  14..15 -> 0.70710677.
 * Lanes 0..9 therefore pass through unchanged.
 * NOTE(review): this has the form of a complex rotation with real and
 * imaginary parts held in consecutive registers - confirm. */
__m512 fft5910 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft5911 = _mm512_mul_ps(fft5902, fft5910);
__m512 fft5997 = _mm512_mul_ps(fft5989, fft5910);
__m512 fft5912 = _mm512_mul_ps(fft5903, fft5910);
__m512 fft5998 = _mm512_mul_ps(fft5990, fft5910);
__m512 fft5913 = _mm512_mul_ps(fft5904, fft5910);
__m512 fft5999 = _mm512_mul_ps(fft5991, fft5910);
__m512 fft5914 = _mm512_mul_ps(fft5905, fft5910);
__m512 fft6000 = _mm512_mul_ps(fft5992, fft5910);
__m512 fft5915 = _mm512_mul_ps(fft5906, fft5910);
__m512 fft6001 = _mm512_mul_ps(fft5993, fft5910);
__m512 fft5916 = _mm512_mul_ps(fft5907, fft5910);
__m512 fft6002 = _mm512_mul_ps(fft5994, fft5910);
__m512 fft5917 = _mm512_mul_ps(fft5908, fft5910);
__m512 fft6003 = _mm512_mul_ps(fft5995, fft5910);
__m512 fft5918 = _mm512_mul_ps(fft5909, fft5910);
__m512 fft6004 = _mm512_mul_ps(fft5996, fft5910);
__m512 fft5919 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft5920 = _mm512_fmadd_ps(fft5903, fft5919, fft5911);
__m512 fft6005 = _mm512_fmadd_ps(fft5990, fft5919, fft5997);
__m512 fft5921 = _mm512_fnmadd_ps(fft5902, fft5919, fft5912);
__m512 fft6006 = _mm512_fnmadd_ps(fft5989, fft5919, fft5998);
__m512 fft5922 = _mm512_fmadd_ps(fft5905, fft5919, fft5913);
__m512 fft6007 = _mm512_fmadd_ps(fft5992, fft5919, fft5999);
__m512 fft5923 = _mm512_fnmadd_ps(fft5904, fft5919, fft5914);
__m512 fft6008 = _mm512_fnmadd_ps(fft5991, fft5919, fft6000);
__m512 fft5924 = _mm512_fmadd_ps(fft5907, fft5919, fft5915);
__m512 fft6009 = _mm512_fmadd_ps(fft5994, fft5919, fft6001);
__m512 fft5925 = _mm512_fnmadd_ps(fft5906, fft5919, fft5916);
__m512 fft6010 = _mm512_fnmadd_ps(fft5993, fft5919, fft6002);
__m512 fft5926 = _mm512_fmadd_ps(fft5909, fft5919, fft5917);
__m512 fft6011 = _mm512_fmadd_ps(fft5996, fft5919, fft6003);
__m512 fft5927 = _mm512_fnmadd_ps(fft5908, fft5919, fft5918);
__m512 fft6012 = _mm512_fnmadd_ps(fft5995, fft5919, fft6004);
/* Butterfly between neighbouring 128-bit blocks: shuffle_f32x4(x, x, 177)
 * swaps blocks 0<->1 and 2<->3; fft5928 is +1 in lanes 0..3 and 8..11,
 * -1 in lanes 4..7 and 12..15. */
__m512 fft5928 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft5929 = _mm512_fmadd_ps(fft5920, fft5928, _mm512_shuffle_f32x4(fft5920, fft5920, 177));
__m512 fft6013 = _mm512_fmadd_ps(fft6005, fft5928, _mm512_shuffle_f32x4(fft6005, fft6005, 177));
__m512 fft5930 = _mm512_fmadd_ps(fft5921, fft5928, _mm512_shuffle_f32x4(fft5921, fft5921, 177));
__m512 fft6014 = _mm512_fmadd_ps(fft6006, fft5928, _mm512_shuffle_f32x4(fft6006, fft6006, 177));
__m512 fft5931 = _mm512_fmadd_ps(fft5922, fft5928, _mm512_shuffle_f32x4(fft5922, fft5922, 177));
__m512 fft6015 = _mm512_fmadd_ps(fft6007, fft5928, _mm512_shuffle_f32x4(fft6007, fft6007, 177));
__m512 fft5932 = _mm512_fmadd_ps(fft5923, fft5928, _mm512_shuffle_f32x4(fft5923, fft5923, 177));
__m512 fft6016 = _mm512_fmadd_ps(fft6008, fft5928, _mm512_shuffle_f32x4(fft6008, fft6008, 177));
__m512 fft5933 = _mm512_fmadd_ps(fft5924, fft5928, _mm512_shuffle_f32x4(fft5924, fft5924, 177));
__m512 fft6017 = _mm512_fmadd_ps(fft6009, fft5928, _mm512_shuffle_f32x4(fft6009, fft6009, 177));
__m512 fft5934 = _mm512_fmadd_ps(fft5925, fft5928, _mm512_shuffle_f32x4(fft5925, fft5925, 177));
__m512 fft6018 = _mm512_fmadd_ps(fft6010, fft5928, _mm512_shuffle_f32x4(fft6010, fft6010, 177));
__m512 fft5935 = _mm512_fmadd_ps(fft5926, fft5928, _mm512_shuffle_f32x4(fft5926, fft5926, 177));
__m512 fft6019 = _mm512_fmadd_ps(fft6011, fft5928, _mm512_shuffle_f32x4(fft6011, fft6011, 177));
__m512 fft5936 = _mm512_fmadd_ps(fft5927, fft5928, _mm512_shuffle_f32x4(fft5927, fft5927, 177));
__m512 fft6020 = _mm512_fmadd_ps(fft6012, fft5928, _mm512_shuffle_f32x4(fft6012, fft6012, 177));
/* Lane exchange on register pairs (a, b) under mask 49344 (0xC0C0, lanes
 * 6, 7, 14, 15): the first result is a with those lanes taken from b; the
 * second is b with those lanes replaced by 0 - a, i.e. -a. */
__m512 fft5937 = _mm512_mask_mov_ps(fft5929, 49344, fft5930);
__m512 fft6021 = _mm512_mask_mov_ps(fft6013, 49344, fft6014);
__m512 fft5938 = _mm512_mask_sub_ps(fft5930, 49344, _mm512_setzero_ps(), fft5929);
__m512 fft6022 = _mm512_mask_sub_ps(fft6014, 49344, _mm512_setzero_ps(), fft6013);
__m512 fft5939 = _mm512_mask_mov_ps(fft5931, 49344, fft5932);
__m512 fft6023 = _mm512_mask_mov_ps(fft6015, 49344, fft6016);
__m512 fft5940 = _mm512_mask_sub_ps(fft5932, 49344, _mm512_setzero_ps(), fft5931);
__m512 fft6024 = _mm512_mask_sub_ps(fft6016, 49344, _mm512_setzero_ps(), fft6015);
__m512 fft5941 = _mm512_mask_mov_ps(fft5933, 49344, fft5934);
__m512 fft6025 = _mm512_mask_mov_ps(fft6017, 49344, fft6018);
__m512 fft5942 = _mm512_mask_sub_ps(fft5934, 49344, _mm512_setzero_ps(), fft5933);
__m512 fft6026 = _mm512_mask_sub_ps(fft6018, 49344, _mm512_setzero_ps(), fft6017);
__m512 fft5943 = _mm512_mask_mov_ps(fft5935, 49344, fft5936);
__m512 fft6027 = _mm512_mask_mov_ps(fft6019, 49344, fft6020);
__m512 fft5944 = _mm512_mask_sub_ps(fft5936, 49344, _mm512_setzero_ps(), fft5935);
__m512 fft6028 = _mm512_mask_sub_ps(fft6020, 49344, _mm512_setzero_ps(), fft6019);
/* Butterfly inside every 128-bit block: shuffle_ps(x, x, 78) swaps the two
 * 64-bit pairs of each block; fft5945 is +1, +1, -1, -1 per group of 4 lanes. */
__m512 fft5945 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft5946 = _mm512_fmadd_ps(fft5937, fft5945, _mm512_shuffle_ps(fft5937, fft5937, 78));
__m512 fft6029 = _mm512_fmadd_ps(fft6021, fft5945, _mm512_shuffle_ps(fft6021, fft6021, 78));
__m512 fft5947 = _mm512_fmadd_ps(fft5938, fft5945, _mm512_shuffle_ps(fft5938, fft5938, 78));
__m512 fft6030 = _mm512_fmadd_ps(fft6022, fft5945, _mm512_shuffle_ps(fft6022, fft6022, 78));
__m512 fft5948 = _mm512_fmadd_ps(fft5939, fft5945, _mm512_shuffle_ps(fft5939, fft5939, 78));
__m512 fft6031 = _mm512_fmadd_ps(fft6023, fft5945, _mm512_shuffle_ps(fft6023, fft6023, 78));
__m512 fft5949 = _mm512_fmadd_ps(fft5940, fft5945, _mm512_shuffle_ps(fft5940, fft5940, 78));
__m512 fft6032 = _mm512_fmadd_ps(fft6024, fft5945, _mm512_shuffle_ps(fft6024, fft6024, 78));
__m512 fft5950 = _mm512_fmadd_ps(fft5941, fft5945, _mm512_shuffle_ps(fft5941, fft5941, 78));
__m512 fft6033 = _mm512_fmadd_ps(fft6025, fft5945, _mm512_shuffle_ps(fft6025, fft6025, 78));
__m512 fft5951 = _mm512_fmadd_ps(fft5942, fft5945, _mm512_shuffle_ps(fft5942, fft5942, 78));
__m512 fft6034 = _mm512_fmadd_ps(fft6026, fft5945, _mm512_shuffle_ps(fft6026, fft6026, 78));
__m512 fft5952 = _mm512_fmadd_ps(fft5943, fft5945, _mm512_shuffle_ps(fft5943, fft5943, 78));
__m512 fft6035 = _mm512_fmadd_ps(fft6027, fft5945, _mm512_shuffle_ps(fft6027, fft6027, 78));
__m512 fft5953 = _mm512_fmadd_ps(fft5944, fft5945, _mm512_shuffle_ps(fft5944, fft5944, 78));
__m512 fft6036 = _mm512_fmadd_ps(fft6028, fft5945, _mm512_shuffle_ps(fft6028, fft6028, 78));
/* Extra recombination applied only to the first register pair of each
 * stream (fft5946/fft5947 and fft6029/fft6030): two lane permutations of each
 * register are merged via fft5960 and masked moves, then the lanes selected
 * by mask 64764 (0xFCFC, lanes 2..7 and 10..15) are multiplied by 0.5.
 * The other registers of this stage are used as they are. */
__m512i fft5954 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft5955 = _mm512_permutexvar_ps(fft5954, fft5946);
__m512 fft6037 = _mm512_permutexvar_ps(fft5954, fft6029);
__m512i fft5956 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft5957 = _mm512_permutexvar_ps(fft5956, fft5946);
__m512 fft6038 = _mm512_permutexvar_ps(fft5956, fft6029);
__m512 fft5958 = _mm512_permutexvar_ps(fft5954, fft5947);
__m512 fft6039 = _mm512_permutexvar_ps(fft5954, fft6030);
__m512 fft5959 = _mm512_permutexvar_ps(fft5956, fft5947);
__m512 fft6040 = _mm512_permutexvar_ps(fft5956, fft6030);
__m512 fft5960 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft5961 = _mm512_fmadd_ps(fft5955, fft5960, fft5957);
__m512 fft6041 = _mm512_fmadd_ps(fft6037, fft5960, fft6038);
__m512 fft5962 = _mm512_fnmadd_ps(fft5959, fft5960, fft5958);
__m512 fft6042 = _mm512_fnmadd_ps(fft6040, fft5960, fft6039);
/* Masks below: 21845 = 0x5555, 43176 = 0xA8A8, 22102 = 0x5656. */
__m512 fft5963 = _mm512_mask_mov_ps(fft5959, 21845, fft5961);
__m512 fft6043 = _mm512_mask_mov_ps(fft6040, 21845, fft6041);
__m512 fft5964 = _mm512_mask_mov_ps(fft5955, 43176, fft5961);
__m512 fft6044 = _mm512_mask_mov_ps(fft6037, 43176, fft6041);
__m512 fft5965 = _mm512_mask_mov_ps(fft5963, 43176, fft5962);
__m512 fft6045 = _mm512_mask_mov_ps(fft6043, 43176, fft6042);
__m512 fft5966 = _mm512_mask_mov_ps(fft5964, 22102, fft5962);
__m512 fft6046 = _mm512_mask_mov_ps(fft6044, 22102, fft6042);
__m512 fft5967 = _mm512_mask_mul_ps(fft5965, 64764, fft5965, _mm512_set1_ps(5e-01f));
__m512 fft6047 = _mm512_mask_mul_ps(fft6045, 64764, fft6045, _mm512_set1_ps(5e-01f));
__m512 fft5968 = _mm512_mask_mul_ps(fft5966, 64764, fft5966, _mm512_set1_ps(5e-01f));
__m512 fft6048 = _mm512_mask_mul_ps(fft6046, 64764, fft6046, _mm512_set1_ps(5e-01f));
/* Store phase for the tile indexed by m36/f37.
 * df529/df530 and df537/df538 take the recombined registers whose lanes
 * selected by mask 64764 were scaled by 0.5 (fft5967/fft5968, fft6047/fft6048);
 * the other df values are taken directly from the preceding shuffle stage. */
__m512 df529 = fft5967;
__m512 df537 = fft6047;
__m512 df530 = fft5968;
__m512 df538 = fft6048;
__m512 df531 = fft5948;
__m512 df539 = fft6031;
__m512 df532 = fft5949;
__m512 df540 = fft6032;
__m512 df533 = fft5950;
__m512 df541 = fft6033;
__m512 df534 = fft5951;
__m512 df542 = fft6034;
__m512 df535 = fft5952;
__m512 df543 = fft6035;
__m512 df536 = fft5953;
__m512 df544 = fft6036;
/* eo36 moves the even-indexed source lanes (0,2,...,14) into lanes 0..7 and
 * the odd-indexed source lanes (1,3,...,15) into lanes 8..15. */
__m512i eo36 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
/* Every register is written as two 8-lane halves: mask 255 writes lanes 0..7
 * starting at the given address, mask 65280 writes lanes 8..15, which land
 * 32 bytes past the given address. The constant offsets of the mask-65280
 * stores equal the matching mask-255 offsets plus 407040 minus 32.
 * NOTE(review): if dfPtr1 is a byte pointer (declaration not visible here),
 * the upper half therefore lands exactly 407040 bytes after the lower half,
 * and 128*m36 + 32*f37 selects this tile's 32-byte slot - confirm. */
df531 = _mm512_permutexvar_ps(eo36, df531);
df532 = _mm512_permutexvar_ps(eo36, df532);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df531);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df532);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df531);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df532);
df539 = _mm512_permutexvar_ps(eo36, df539);
df540 = _mm512_permutexvar_ps(eo36, df540);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df539);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df540);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df539);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df540);
df533 = _mm512_permutexvar_ps(eo36, df533);
df534 = _mm512_permutexvar_ps(eo36, df534);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df533);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df534);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df533);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df534);
df541 = _mm512_permutexvar_ps(eo36, df541);
df542 = _mm512_permutexvar_ps(eo36, df542);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df541);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df542);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df541);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df542);
df535 = _mm512_permutexvar_ps(eo36, df535);
df536 = _mm512_permutexvar_ps(eo36, df536);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df535);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df536);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df535);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df536);
df543 = _mm512_permutexvar_ps(eo36, df543);
df544 = _mm512_permutexvar_ps(eo36, df544);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df543);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df544);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df543);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df544);
/* df529/df530 and df537/df538 are stored without the eo36 permutation. */
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df529);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df530);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df529);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df530);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df537);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k17+128*m36+32*f37, 255, df538);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df537);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k17+128*m36+32*f37, 65280, df538);
}
if (j2 >= last1) return;
++j2;
}
j2 = 84;
}
ptrdiff_t rel3 = j2-84;
ptrdiff_t base3 = 210;
if (rel3 < 1) {
ptrdiff_t h17 = base3+0;
ptrdiff_t w17 = 210;
ptrdiff_t k18 = 3*s1;
ptrdiff_t kk17 = k18+2;
for (; k18 <= kk17; ++k18) {
__m512 bnMul17 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k18+3*i6))[0]);
__m512 bnAdd17 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k18+3*i6))[1]);
ptrdiff_t b37 = 0;
ptrdiff_t m37 = (size_t)b37/2;
ptrdiff_t f38 = (size_t)b37%2;
__m512 dat530 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat530 = _mm512_mask_fmadd_ps(dat530, 65535, bnMul17, bnAdd17);
__m512 dat531 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat531 = _mm512_mask_fmadd_ps(dat531, 65535, bnMul17, bnAdd17);
__m512 dat532 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat532 = _mm512_mask_fmadd_ps(dat532, 65535, bnMul17, bnAdd17);
__m512 dat533 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat533 = _mm512_mask_fmadd_ps(dat533, 65535, bnMul17, bnAdd17);
__m512 dat534 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat534 = _mm512_mask_fmadd_ps(dat534, 65535, bnMul17, bnAdd17);
__m512 dat535 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat535 = _mm512_mask_fmadd_ps(dat535, 65535, bnMul17, bnAdd17);
// Remainder of the load sequence for the b37 tile. dat530..dat535, b37, m37
// and f38 are defined earlier in the function (not repeated here). Each load
// reads all 16 lanes (mask 65535) and the constant offset advances by 896
// from one load to the next. The 0*b37 term contributes nothing.
// NOTE(review): offsets look like byte offsets on a byte pointer (896 = 224*4
// would be one 224-float row) -- confirm against the declaration of datPtr1.
// Every loaded vector is then replaced by dat*bnMul17 + bnAdd17.
// NOTE(review): bnMul17/bnAdd17 are presumably a batch-norm scale/shift pair
// computed earlier -- confirm.
__m512 dat536 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat536 = _mm512_mask_fmadd_ps(dat536, 65535, bnMul17, bnAdd17);
__m512 dat537 = _mm512_maskz_loadu_ps(65535, datPtr1+6272+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat537 = _mm512_mask_fmadd_ps(dat537, 65535, bnMul17, bnAdd17);
__m512 dat538 = _mm512_maskz_loadu_ps(65535, datPtr1+7168+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat538 = _mm512_mask_fmadd_ps(dat538, 65535, bnMul17, bnAdd17);
__m512 dat539 = _mm512_maskz_loadu_ps(65535, datPtr1+8064+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat539 = _mm512_mask_fmadd_ps(dat539, 65535, bnMul17, bnAdd17);
__m512 dat540 = _mm512_maskz_loadu_ps(65535, datPtr1+8960+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat540 = _mm512_mask_fmadd_ps(dat540, 65535, bnMul17, bnAdd17);
__m512 dat541 = _mm512_maskz_loadu_ps(65535, datPtr1+9856+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat541 = _mm512_mask_fmadd_ps(dat541, 65535, bnMul17, bnAdd17);
__m512 dat542 = _mm512_maskz_loadu_ps(65535, datPtr1+10752+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat542 = _mm512_mask_fmadd_ps(dat542, 65535, bnMul17, bnAdd17);
__m512 dat543 = _mm512_maskz_loadu_ps(65535, datPtr1+11648+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat543 = _mm512_mask_fmadd_ps(dat543, 65535, bnMul17, bnAdd17);
__m512 dat544 = _mm512_maskz_loadu_ps(65535, datPtr1+12544+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat544 = _mm512_mask_fmadd_ps(dat544, 65535, bnMul17, bnAdd17);
__m512 dat545 = _mm512_maskz_loadu_ps(65535, datPtr1+13440+602112*i6+200704*k18+896*h17+4*w17+0*b37);
dat545 = _mm512_mask_fmadd_ps(dat545, 65535, bnMul17, bnAdd17);
// First add/sub level: vector n is paired with vector n+8. Two independent
// chains are interleaved line by line: one built from dat530,532,...,544 and
// one built from dat531,533,...,545.
// NOTE(review): the fft* naming suggests these are the column-direction
// butterflies of a 16x16 real FFT -- confirm against the generator.
__m512 fft6049 = _mm512_add_ps(dat530, dat538);
__m512 fft6137 = _mm512_add_ps(dat531, dat539);
__m512 fft6050 = _mm512_sub_ps(dat530, dat538);
__m512 fft6138 = _mm512_sub_ps(dat531, dat539);
__m512 fft6051 = _mm512_add_ps(dat532, dat540);
__m512 fft6139 = _mm512_add_ps(dat533, dat541);
__m512 fft6052 = _mm512_sub_ps(dat532, dat540);
__m512 fft6140 = _mm512_sub_ps(dat533, dat541);
__m512 fft6053 = _mm512_add_ps(dat534, dat542);
__m512 fft6141 = _mm512_add_ps(dat535, dat543);
__m512 fft6054 = _mm512_sub_ps(dat534, dat542);
__m512 fft6142 = _mm512_sub_ps(dat535, dat543);
__m512 fft6055 = _mm512_add_ps(dat536, dat544);
__m512 fft6143 = _mm512_add_ps(dat537, dat545);
__m512 fft6056 = _mm512_sub_ps(dat536, dat544);
__m512 fft6144 = _mm512_sub_ps(dat537, dat545);
// Second add/sub level on the results of the first level. Note the operand
// order of fft6060/fft6148 (later term minus earlier term).
__m512 fft6057 = _mm512_add_ps(fft6049, fft6053);
__m512 fft6145 = _mm512_add_ps(fft6137, fft6141);
__m512 fft6058 = _mm512_sub_ps(fft6049, fft6053);
__m512 fft6146 = _mm512_sub_ps(fft6137, fft6141);
__m512 fft6059 = _mm512_add_ps(fft6051, fft6055);
__m512 fft6147 = _mm512_add_ps(fft6139, fft6143);
__m512 fft6060 = _mm512_sub_ps(fft6055, fft6051);
__m512 fft6148 = _mm512_sub_ps(fft6143, fft6139);
__m512 fft6061 = _mm512_sub_ps(fft6052, fft6056);
__m512 fft6149 = _mm512_sub_ps(fft6140, fft6144);
__m512 fft6062 = _mm512_add_ps(fft6052, fft6056);
__m512 fft6150 = _mm512_add_ps(fft6140, fft6144);
// Third level. The fused forms combine a term scaled by 7.0710677e-01f
// (approximately sqrt(1/2)) with an unscaled term:
//   fmadd(a,c,b) = a*c + b, fnmadd(a,c,b) = -(a*c) + b, fnmsub(a,c,b) = -(a*c) - b.
__m512 fft6063 = _mm512_add_ps(fft6057, fft6059);
__m512 fft6151 = _mm512_add_ps(fft6145, fft6147);
__m512 fft6064 = _mm512_sub_ps(fft6057, fft6059);
__m512 fft6152 = _mm512_sub_ps(fft6145, fft6147);
__m512 fft6065 = _mm512_fmadd_ps(fft6061, _mm512_set1_ps(7.0710677e-01f), fft6050);
__m512 fft6153 = _mm512_fmadd_ps(fft6149, _mm512_set1_ps(7.0710677e-01f), fft6138);
__m512 fft6066 = _mm512_fnmsub_ps(fft6062, _mm512_set1_ps(7.0710677e-01f), fft6054);
__m512 fft6154 = _mm512_fnmsub_ps(fft6150, _mm512_set1_ps(7.0710677e-01f), fft6142);
__m512 fft6067 = _mm512_fnmadd_ps(fft6061, _mm512_set1_ps(7.0710677e-01f), fft6050);
__m512 fft6155 = _mm512_fnmadd_ps(fft6149, _mm512_set1_ps(7.0710677e-01f), fft6138);
__m512 fft6068 = _mm512_fnmadd_ps(fft6062, _mm512_set1_ps(7.0710677e-01f), fft6054);
__m512 fft6156 = _mm512_fnmadd_ps(fft6150, _mm512_set1_ps(7.0710677e-01f), fft6142);
// Within-vector add/sub across the two 256-bit halves. The shuffle with
// immediate 78 swaps the halves; the sign vector is +1 in lanes 0-7 and -1 in
// lanes 8-15, so lanes 0-7 become lo+hi and lanes 8-15 become lo-hi.
__m512 fft6069 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6070 = _mm512_fmadd_ps(fft6063, fft6069, _mm512_shuffle_f32x4(fft6063, fft6063, 78));
__m512 fft6157 = _mm512_fmadd_ps(fft6151, fft6069, _mm512_shuffle_f32x4(fft6151, fft6151, 78));
__m512 fft6071 = _mm512_fmadd_ps(fft6064, fft6069, _mm512_shuffle_f32x4(fft6064, fft6064, 78));
__m512 fft6158 = _mm512_fmadd_ps(fft6152, fft6069, _mm512_shuffle_f32x4(fft6152, fft6152, 78));
__m512 fft6072 = _mm512_fmadd_ps(fft6065, fft6069, _mm512_shuffle_f32x4(fft6065, fft6065, 78));
__m512 fft6159 = _mm512_fmadd_ps(fft6153, fft6069, _mm512_shuffle_f32x4(fft6153, fft6153, 78));
__m512 fft6073 = _mm512_fmadd_ps(fft6066, fft6069, _mm512_shuffle_f32x4(fft6066, fft6066, 78));
__m512 fft6160 = _mm512_fmadd_ps(fft6154, fft6069, _mm512_shuffle_f32x4(fft6154, fft6154, 78));
__m512 fft6074 = _mm512_fmadd_ps(fft6058, fft6069, _mm512_shuffle_f32x4(fft6058, fft6058, 78));
__m512 fft6161 = _mm512_fmadd_ps(fft6146, fft6069, _mm512_shuffle_f32x4(fft6146, fft6146, 78));
__m512 fft6075 = _mm512_fmadd_ps(fft6060, fft6069, _mm512_shuffle_f32x4(fft6060, fft6060, 78));
__m512 fft6162 = _mm512_fmadd_ps(fft6148, fft6069, _mm512_shuffle_f32x4(fft6148, fft6148, 78));
__m512 fft6076 = _mm512_fmadd_ps(fft6067, fft6069, _mm512_shuffle_f32x4(fft6067, fft6067, 78));
__m512 fft6163 = _mm512_fmadd_ps(fft6155, fft6069, _mm512_shuffle_f32x4(fft6155, fft6155, 78));
__m512 fft6077 = _mm512_fmadd_ps(fft6068, fft6069, _mm512_shuffle_f32x4(fft6068, fft6068, 78));
__m512 fft6164 = _mm512_fmadd_ps(fft6156, fft6069, _mm512_shuffle_f32x4(fft6156, fft6156, 78));
// Per-lane weighting, part 1: multiply by fft6078, which is 1 in lanes 0-9,
// 7.0710677e-01f in lanes 10-11, 0 in lanes 12-13 and -7.0710677e-01f in
// lanes 14-15.
__m512 fft6078 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6079 = _mm512_mul_ps(fft6070, fft6078);
__m512 fft6165 = _mm512_mul_ps(fft6157, fft6078);
__m512 fft6080 = _mm512_mul_ps(fft6071, fft6078);
__m512 fft6166 = _mm512_mul_ps(fft6158, fft6078);
__m512 fft6081 = _mm512_mul_ps(fft6072, fft6078);
__m512 fft6167 = _mm512_mul_ps(fft6159, fft6078);
__m512 fft6082 = _mm512_mul_ps(fft6073, fft6078);
__m512 fft6168 = _mm512_mul_ps(fft6160, fft6078);
__m512 fft6083 = _mm512_mul_ps(fft6074, fft6078);
__m512 fft6169 = _mm512_mul_ps(fft6161, fft6078);
__m512 fft6084 = _mm512_mul_ps(fft6075, fft6078);
__m512 fft6170 = _mm512_mul_ps(fft6162, fft6078);
__m512 fft6085 = _mm512_mul_ps(fft6076, fft6078);
__m512 fft6171 = _mm512_mul_ps(fft6163, fft6078);
__m512 fft6086 = _mm512_mul_ps(fft6077, fft6078);
__m512 fft6172 = _mm512_mul_ps(fft6164, fft6078);
// Per-lane weighting, part 2: vectors are handled in pairs (x, y). With
// a = fft6078 and b = fft6087 the outputs are x*a + y*b and y*a - x*b.
// b is 0 in lanes 0-9, so those lanes pass through unchanged.
__m512 fft6087 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6088 = _mm512_fmadd_ps(fft6071, fft6087, fft6079);
__m512 fft6173 = _mm512_fmadd_ps(fft6158, fft6087, fft6165);
__m512 fft6089 = _mm512_fnmadd_ps(fft6070, fft6087, fft6080);
__m512 fft6174 = _mm512_fnmadd_ps(fft6157, fft6087, fft6166);
__m512 fft6090 = _mm512_fmadd_ps(fft6073, fft6087, fft6081);
__m512 fft6175 = _mm512_fmadd_ps(fft6160, fft6087, fft6167);
__m512 fft6091 = _mm512_fnmadd_ps(fft6072, fft6087, fft6082);
__m512 fft6176 = _mm512_fnmadd_ps(fft6159, fft6087, fft6168);
__m512 fft6092 = _mm512_fmadd_ps(fft6075, fft6087, fft6083);
__m512 fft6177 = _mm512_fmadd_ps(fft6162, fft6087, fft6169);
__m512 fft6093 = _mm512_fnmadd_ps(fft6074, fft6087, fft6084);
__m512 fft6178 = _mm512_fnmadd_ps(fft6161, fft6087, fft6170);
__m512 fft6094 = _mm512_fmadd_ps(fft6077, fft6087, fft6085);
__m512 fft6179 = _mm512_fmadd_ps(fft6164, fft6087, fft6171);
__m512 fft6095 = _mm512_fnmadd_ps(fft6076, fft6087, fft6086);
__m512 fft6180 = _mm512_fnmadd_ps(fft6163, fft6087, fft6172);
// Within-vector add/sub across neighbouring 128-bit blocks. Immediate 177
// swaps block 0 with 1 and block 2 with 3; the sign vector is +1 in lanes
// 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
__m512 fft6096 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6097 = _mm512_fmadd_ps(fft6088, fft6096, _mm512_shuffle_f32x4(fft6088, fft6088, 177));
__m512 fft6181 = _mm512_fmadd_ps(fft6173, fft6096, _mm512_shuffle_f32x4(fft6173, fft6173, 177));
__m512 fft6098 = _mm512_fmadd_ps(fft6089, fft6096, _mm512_shuffle_f32x4(fft6089, fft6089, 177));
__m512 fft6182 = _mm512_fmadd_ps(fft6174, fft6096, _mm512_shuffle_f32x4(fft6174, fft6174, 177));
__m512 fft6099 = _mm512_fmadd_ps(fft6090, fft6096, _mm512_shuffle_f32x4(fft6090, fft6090, 177));
__m512 fft6183 = _mm512_fmadd_ps(fft6175, fft6096, _mm512_shuffle_f32x4(fft6175, fft6175, 177));
__m512 fft6100 = _mm512_fmadd_ps(fft6091, fft6096, _mm512_shuffle_f32x4(fft6091, fft6091, 177));
__m512 fft6184 = _mm512_fmadd_ps(fft6176, fft6096, _mm512_shuffle_f32x4(fft6176, fft6176, 177));
__m512 fft6101 = _mm512_fmadd_ps(fft6092, fft6096, _mm512_shuffle_f32x4(fft6092, fft6092, 177));
__m512 fft6185 = _mm512_fmadd_ps(fft6177, fft6096, _mm512_shuffle_f32x4(fft6177, fft6177, 177));
__m512 fft6102 = _mm512_fmadd_ps(fft6093, fft6096, _mm512_shuffle_f32x4(fft6093, fft6093, 177));
__m512 fft6186 = _mm512_fmadd_ps(fft6178, fft6096, _mm512_shuffle_f32x4(fft6178, fft6178, 177));
__m512 fft6103 = _mm512_fmadd_ps(fft6094, fft6096, _mm512_shuffle_f32x4(fft6094, fft6094, 177));
__m512 fft6187 = _mm512_fmadd_ps(fft6179, fft6096, _mm512_shuffle_f32x4(fft6179, fft6179, 177));
__m512 fft6104 = _mm512_fmadd_ps(fft6095, fft6096, _mm512_shuffle_f32x4(fft6095, fft6095, 177));
__m512 fft6188 = _mm512_fmadd_ps(fft6180, fft6096, _mm512_shuffle_f32x4(fft6180, fft6180, 177));
// Exchange lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) between the two vectors
// of each pair (x, y): the first result is x with those lanes taken from y,
// the second is y with those lanes replaced by 0 - x.
__m512 fft6105 = _mm512_mask_mov_ps(fft6097, 49344, fft6098);
__m512 fft6189 = _mm512_mask_mov_ps(fft6181, 49344, fft6182);
__m512 fft6106 = _mm512_mask_sub_ps(fft6098, 49344, _mm512_setzero_ps(), fft6097);
__m512 fft6190 = _mm512_mask_sub_ps(fft6182, 49344, _mm512_setzero_ps(), fft6181);
__m512 fft6107 = _mm512_mask_mov_ps(fft6099, 49344, fft6100);
__m512 fft6191 = _mm512_mask_mov_ps(fft6183, 49344, fft6184);
__m512 fft6108 = _mm512_mask_sub_ps(fft6100, 49344, _mm512_setzero_ps(), fft6099);
__m512 fft6192 = _mm512_mask_sub_ps(fft6184, 49344, _mm512_setzero_ps(), fft6183);
__m512 fft6109 = _mm512_mask_mov_ps(fft6101, 49344, fft6102);
__m512 fft6193 = _mm512_mask_mov_ps(fft6185, 49344, fft6186);
__m512 fft6110 = _mm512_mask_sub_ps(fft6102, 49344, _mm512_setzero_ps(), fft6101);
__m512 fft6194 = _mm512_mask_sub_ps(fft6186, 49344, _mm512_setzero_ps(), fft6185);
__m512 fft6111 = _mm512_mask_mov_ps(fft6103, 49344, fft6104);
__m512 fft6195 = _mm512_mask_mov_ps(fft6187, 49344, fft6188);
__m512 fft6112 = _mm512_mask_sub_ps(fft6104, 49344, _mm512_setzero_ps(), fft6103);
__m512 fft6196 = _mm512_mask_sub_ps(fft6188, 49344, _mm512_setzero_ps(), fft6187);
// Add/sub inside every 128-bit block: _mm512_shuffle_ps with immediate 78
// swaps the two 64-bit halves of each block; signs are +1 for the lower two
// lanes of a block and -1 for the upper two.
__m512 fft6113 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6114 = _mm512_fmadd_ps(fft6105, fft6113, _mm512_shuffle_ps(fft6105, fft6105, 78));
__m512 fft6197 = _mm512_fmadd_ps(fft6189, fft6113, _mm512_shuffle_ps(fft6189, fft6189, 78));
__m512 fft6115 = _mm512_fmadd_ps(fft6106, fft6113, _mm512_shuffle_ps(fft6106, fft6106, 78));
__m512 fft6198 = _mm512_fmadd_ps(fft6190, fft6113, _mm512_shuffle_ps(fft6190, fft6190, 78));
__m512 fft6116 = _mm512_fmadd_ps(fft6107, fft6113, _mm512_shuffle_ps(fft6107, fft6107, 78));
__m512 fft6199 = _mm512_fmadd_ps(fft6191, fft6113, _mm512_shuffle_ps(fft6191, fft6191, 78));
__m512 fft6117 = _mm512_fmadd_ps(fft6108, fft6113, _mm512_shuffle_ps(fft6108, fft6108, 78));
__m512 fft6200 = _mm512_fmadd_ps(fft6192, fft6113, _mm512_shuffle_ps(fft6192, fft6192, 78));
__m512 fft6118 = _mm512_fmadd_ps(fft6109, fft6113, _mm512_shuffle_ps(fft6109, fft6109, 78));
__m512 fft6201 = _mm512_fmadd_ps(fft6193, fft6113, _mm512_shuffle_ps(fft6193, fft6193, 78));
__m512 fft6119 = _mm512_fmadd_ps(fft6110, fft6113, _mm512_shuffle_ps(fft6110, fft6110, 78));
__m512 fft6202 = _mm512_fmadd_ps(fft6194, fft6113, _mm512_shuffle_ps(fft6194, fft6194, 78));
__m512 fft6120 = _mm512_fmadd_ps(fft6111, fft6113, _mm512_shuffle_ps(fft6111, fft6111, 78));
__m512 fft6203 = _mm512_fmadd_ps(fft6195, fft6113, _mm512_shuffle_ps(fft6195, fft6195, 78));
__m512 fft6121 = _mm512_fmadd_ps(fft6112, fft6113, _mm512_shuffle_ps(fft6112, fft6112, 78));
__m512 fft6204 = _mm512_fmadd_ps(fft6196, fft6113, _mm512_shuffle_ps(fft6196, fft6196, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft6114/fft6115 and fft6197/fft6198); the remaining vectors go to the
// store stage as they are. Two fixed lane permutations of each vector are
// taken first.
__m512i fft6122 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6123 = _mm512_permutexvar_ps(fft6122, fft6114);
__m512 fft6205 = _mm512_permutexvar_ps(fft6122, fft6197);
__m512i fft6124 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6125 = _mm512_permutexvar_ps(fft6124, fft6114);
__m512 fft6206 = _mm512_permutexvar_ps(fft6124, fft6197);
__m512 fft6126 = _mm512_permutexvar_ps(fft6122, fft6115);
__m512 fft6207 = _mm512_permutexvar_ps(fft6122, fft6198);
__m512 fft6127 = _mm512_permutexvar_ps(fft6124, fft6115);
__m512 fft6208 = _mm512_permutexvar_ps(fft6124, fft6198);
// Combine the permuted copies. The weight vector is 0 in lanes 0, 1, 8, 9 and
// otherwise +1 in even lanes and -1 in odd lanes.
__m512 fft6128 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6129 = _mm512_fmadd_ps(fft6123, fft6128, fft6125);
__m512 fft6209 = _mm512_fmadd_ps(fft6205, fft6128, fft6206);
__m512 fft6130 = _mm512_fnmadd_ps(fft6127, fft6128, fft6126);
__m512 fft6210 = _mm512_fnmadd_ps(fft6208, fft6128, fft6207);
// Lane-wise selection of the final values. Masks: 21845 = 0x5555 (even
// lanes), 43176 = 0xA8A8 (lanes 3, 5, 7, 11, 13, 15), 22102 = 0x5656 (lanes
// 1, 2, 4, 6, 9, 10, 12, 14).
__m512 fft6131 = _mm512_mask_mov_ps(fft6127, 21845, fft6129);
__m512 fft6211 = _mm512_mask_mov_ps(fft6208, 21845, fft6209);
__m512 fft6132 = _mm512_mask_mov_ps(fft6123, 43176, fft6129);
__m512 fft6212 = _mm512_mask_mov_ps(fft6205, 43176, fft6209);
__m512 fft6133 = _mm512_mask_mov_ps(fft6131, 43176, fft6130);
__m512 fft6213 = _mm512_mask_mov_ps(fft6211, 43176, fft6210);
__m512 fft6134 = _mm512_mask_mov_ps(fft6132, 22102, fft6130);
__m512 fft6214 = _mm512_mask_mov_ps(fft6212, 22102, fft6210);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 are kept.
__m512 fft6135 = _mm512_mask_mul_ps(fft6133, 64764, fft6133, _mm512_set1_ps(5e-01f));
__m512 fft6215 = _mm512_mask_mul_ps(fft6213, 64764, fft6213, _mm512_set1_ps(5e-01f));
__m512 fft6136 = _mm512_mask_mul_ps(fft6134, 64764, fft6134, _mm512_set1_ps(5e-01f));
__m512 fft6216 = _mm512_mask_mul_ps(fft6214, 64764, fft6214, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store stage (df545..df552 from the
// first chain, df553..df560 from the second).
__m512 df545 = fft6135;
__m512 df553 = fft6215;
__m512 df546 = fft6136;
__m512 df554 = fft6216;
__m512 df547 = fft6116;
__m512 df555 = fft6199;
__m512 df548 = fft6117;
__m512 df556 = fft6200;
__m512 df549 = fft6118;
__m512 df557 = fft6201;
__m512 df550 = fft6119;
__m512 df558 = fft6202;
__m512 df551 = fft6120;
__m512 df559 = fft6203;
__m512 df552 = fft6121;
__m512 df560 = fft6204;
// Store stage. eo37 moves even-indexed lanes into lanes 0-7 and odd-indexed
// lanes into lanes 8-15. Each vector is then written with two masked stores:
// lanes 0-7 (mask 255) and lanes 8-15 (mask 65280) to separate destinations.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 start 32 bytes past
// the store address, so the second store of each pair lands exactly 407040
// bytes after the first (e.g. 508768+32 == 101760+407040) -- confirm.
__m512i eo37 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df547 = _mm512_permutexvar_ps(eo37, df547);
df548 = _mm512_permutexvar_ps(eo37, df548);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df547);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df548);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df547);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df548);
df555 = _mm512_permutexvar_ps(eo37, df555);
df556 = _mm512_permutexvar_ps(eo37, df556);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df555);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df556);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df555);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df556);
df549 = _mm512_permutexvar_ps(eo37, df549);
df550 = _mm512_permutexvar_ps(eo37, df550);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df549);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df550);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df549);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df550);
df557 = _mm512_permutexvar_ps(eo37, df557);
df558 = _mm512_permutexvar_ps(eo37, df558);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df557);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df558);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df557);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df558);
df551 = _mm512_permutexvar_ps(eo37, df551);
df552 = _mm512_permutexvar_ps(eo37, df552);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df551);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df552);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df551);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df552);
df559 = _mm512_permutexvar_ps(eo37, df559);
df560 = _mm512_permutexvar_ps(eo37, df560);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df559);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df560);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df559);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df560);
// The four post-processed vectors (df545, df546, df553, df554) are stored
// without the eo37 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df545);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df546);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df545);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df546);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df553);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m37+32*f38, 255, df554);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df553);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m37+32*f38, 65280, df554);
// ---- Tile b38 = 1 -------------------------------------------------------
// Loads 16 partially-filled vectors, applies the bnMul17/bnAdd17 affine
// step, runs the same transform stages as the preceding tile, and stores the
// 16 results through dfPtr1. The indices derived here evaluate to m38 = 0
// and f39 = 1 and are used only in the store addresses.
ptrdiff_t b38 = 1;
ptrdiff_t m38 = (size_t)b38/2;
ptrdiff_t f39 = (size_t)b38%2;
// Masked loads: only lanes 0-6 are read (mask 127); lanes 7-15 are zeroed by
// the maskz form and stay zero after the masked fmadd, which copies its
// first operand in unselected lanes. The constant offset starts at 40 and
// advances by 896 per load. The 0*b38 term contributes nothing.
// NOTE(review): offsets look like byte offsets on a byte pointer (896 = 224*4
// would be one 224-float row, 40 = ten floats into the row) -- confirm
// against the declaration of datPtr1.
// NOTE(review): bnMul17/bnAdd17 are presumably a batch-norm scale/shift pair
// computed earlier -- confirm.
__m512 dat546 = _mm512_maskz_loadu_ps(127, datPtr1+40+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat546 = _mm512_mask_fmadd_ps(dat546, 127, bnMul17, bnAdd17);
__m512 dat547 = _mm512_maskz_loadu_ps(127, datPtr1+936+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat547 = _mm512_mask_fmadd_ps(dat547, 127, bnMul17, bnAdd17);
__m512 dat548 = _mm512_maskz_loadu_ps(127, datPtr1+1832+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat548 = _mm512_mask_fmadd_ps(dat548, 127, bnMul17, bnAdd17);
__m512 dat549 = _mm512_maskz_loadu_ps(127, datPtr1+2728+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat549 = _mm512_mask_fmadd_ps(dat549, 127, bnMul17, bnAdd17);
__m512 dat550 = _mm512_maskz_loadu_ps(127, datPtr1+3624+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat550 = _mm512_mask_fmadd_ps(dat550, 127, bnMul17, bnAdd17);
__m512 dat551 = _mm512_maskz_loadu_ps(127, datPtr1+4520+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat551 = _mm512_mask_fmadd_ps(dat551, 127, bnMul17, bnAdd17);
__m512 dat552 = _mm512_maskz_loadu_ps(127, datPtr1+5416+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat552 = _mm512_mask_fmadd_ps(dat552, 127, bnMul17, bnAdd17);
__m512 dat553 = _mm512_maskz_loadu_ps(127, datPtr1+6312+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat553 = _mm512_mask_fmadd_ps(dat553, 127, bnMul17, bnAdd17);
__m512 dat554 = _mm512_maskz_loadu_ps(127, datPtr1+7208+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat554 = _mm512_mask_fmadd_ps(dat554, 127, bnMul17, bnAdd17);
__m512 dat555 = _mm512_maskz_loadu_ps(127, datPtr1+8104+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat555 = _mm512_mask_fmadd_ps(dat555, 127, bnMul17, bnAdd17);
__m512 dat556 = _mm512_maskz_loadu_ps(127, datPtr1+9000+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat556 = _mm512_mask_fmadd_ps(dat556, 127, bnMul17, bnAdd17);
__m512 dat557 = _mm512_maskz_loadu_ps(127, datPtr1+9896+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat557 = _mm512_mask_fmadd_ps(dat557, 127, bnMul17, bnAdd17);
__m512 dat558 = _mm512_maskz_loadu_ps(127, datPtr1+10792+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat558 = _mm512_mask_fmadd_ps(dat558, 127, bnMul17, bnAdd17);
__m512 dat559 = _mm512_maskz_loadu_ps(127, datPtr1+11688+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat559 = _mm512_mask_fmadd_ps(dat559, 127, bnMul17, bnAdd17);
__m512 dat560 = _mm512_maskz_loadu_ps(127, datPtr1+12584+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat560 = _mm512_mask_fmadd_ps(dat560, 127, bnMul17, bnAdd17);
__m512 dat561 = _mm512_maskz_loadu_ps(127, datPtr1+13480+602112*i6+200704*k18+896*h17+4*w17+0*b38);
dat561 = _mm512_mask_fmadd_ps(dat561, 127, bnMul17, bnAdd17);
// First add/sub level: vector n is paired with vector n+8. Two independent
// chains are interleaved line by line: one built from dat546,548,...,560 and
// one built from dat547,549,...,561.
// NOTE(review): the fft* naming suggests these are the column-direction
// butterflies of a 16x16 real FFT -- confirm against the generator.
__m512 fft6217 = _mm512_add_ps(dat546, dat554);
__m512 fft6305 = _mm512_add_ps(dat547, dat555);
__m512 fft6218 = _mm512_sub_ps(dat546, dat554);
__m512 fft6306 = _mm512_sub_ps(dat547, dat555);
__m512 fft6219 = _mm512_add_ps(dat548, dat556);
__m512 fft6307 = _mm512_add_ps(dat549, dat557);
__m512 fft6220 = _mm512_sub_ps(dat548, dat556);
__m512 fft6308 = _mm512_sub_ps(dat549, dat557);
__m512 fft6221 = _mm512_add_ps(dat550, dat558);
__m512 fft6309 = _mm512_add_ps(dat551, dat559);
__m512 fft6222 = _mm512_sub_ps(dat550, dat558);
__m512 fft6310 = _mm512_sub_ps(dat551, dat559);
__m512 fft6223 = _mm512_add_ps(dat552, dat560);
__m512 fft6311 = _mm512_add_ps(dat553, dat561);
__m512 fft6224 = _mm512_sub_ps(dat552, dat560);
__m512 fft6312 = _mm512_sub_ps(dat553, dat561);
// Second add/sub level on the results of the first level. Note the operand
// order of fft6228/fft6316 (later term minus earlier term).
__m512 fft6225 = _mm512_add_ps(fft6217, fft6221);
__m512 fft6313 = _mm512_add_ps(fft6305, fft6309);
__m512 fft6226 = _mm512_sub_ps(fft6217, fft6221);
__m512 fft6314 = _mm512_sub_ps(fft6305, fft6309);
__m512 fft6227 = _mm512_add_ps(fft6219, fft6223);
__m512 fft6315 = _mm512_add_ps(fft6307, fft6311);
__m512 fft6228 = _mm512_sub_ps(fft6223, fft6219);
__m512 fft6316 = _mm512_sub_ps(fft6311, fft6307);
__m512 fft6229 = _mm512_sub_ps(fft6220, fft6224);
__m512 fft6317 = _mm512_sub_ps(fft6308, fft6312);
__m512 fft6230 = _mm512_add_ps(fft6220, fft6224);
__m512 fft6318 = _mm512_add_ps(fft6308, fft6312);
// Third level. The fused forms combine a term scaled by 7.0710677e-01f
// (approximately sqrt(1/2)) with an unscaled term:
//   fmadd(a,c,b) = a*c + b, fnmadd(a,c,b) = -(a*c) + b, fnmsub(a,c,b) = -(a*c) - b.
__m512 fft6231 = _mm512_add_ps(fft6225, fft6227);
__m512 fft6319 = _mm512_add_ps(fft6313, fft6315);
__m512 fft6232 = _mm512_sub_ps(fft6225, fft6227);
__m512 fft6320 = _mm512_sub_ps(fft6313, fft6315);
__m512 fft6233 = _mm512_fmadd_ps(fft6229, _mm512_set1_ps(7.0710677e-01f), fft6218);
__m512 fft6321 = _mm512_fmadd_ps(fft6317, _mm512_set1_ps(7.0710677e-01f), fft6306);
__m512 fft6234 = _mm512_fnmsub_ps(fft6230, _mm512_set1_ps(7.0710677e-01f), fft6222);
__m512 fft6322 = _mm512_fnmsub_ps(fft6318, _mm512_set1_ps(7.0710677e-01f), fft6310);
__m512 fft6235 = _mm512_fnmadd_ps(fft6229, _mm512_set1_ps(7.0710677e-01f), fft6218);
__m512 fft6323 = _mm512_fnmadd_ps(fft6317, _mm512_set1_ps(7.0710677e-01f), fft6306);
__m512 fft6236 = _mm512_fnmadd_ps(fft6230, _mm512_set1_ps(7.0710677e-01f), fft6222);
__m512 fft6324 = _mm512_fnmadd_ps(fft6318, _mm512_set1_ps(7.0710677e-01f), fft6310);
// Within-vector add/sub across the two 256-bit halves. The shuffle with
// immediate 78 swaps the halves; the sign vector is +1 in lanes 0-7 and -1 in
// lanes 8-15, so lanes 0-7 become lo+hi and lanes 8-15 become lo-hi.
__m512 fft6237 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6238 = _mm512_fmadd_ps(fft6231, fft6237, _mm512_shuffle_f32x4(fft6231, fft6231, 78));
__m512 fft6325 = _mm512_fmadd_ps(fft6319, fft6237, _mm512_shuffle_f32x4(fft6319, fft6319, 78));
__m512 fft6239 = _mm512_fmadd_ps(fft6232, fft6237, _mm512_shuffle_f32x4(fft6232, fft6232, 78));
__m512 fft6326 = _mm512_fmadd_ps(fft6320, fft6237, _mm512_shuffle_f32x4(fft6320, fft6320, 78));
__m512 fft6240 = _mm512_fmadd_ps(fft6233, fft6237, _mm512_shuffle_f32x4(fft6233, fft6233, 78));
__m512 fft6327 = _mm512_fmadd_ps(fft6321, fft6237, _mm512_shuffle_f32x4(fft6321, fft6321, 78));
__m512 fft6241 = _mm512_fmadd_ps(fft6234, fft6237, _mm512_shuffle_f32x4(fft6234, fft6234, 78));
__m512 fft6328 = _mm512_fmadd_ps(fft6322, fft6237, _mm512_shuffle_f32x4(fft6322, fft6322, 78));
__m512 fft6242 = _mm512_fmadd_ps(fft6226, fft6237, _mm512_shuffle_f32x4(fft6226, fft6226, 78));
__m512 fft6329 = _mm512_fmadd_ps(fft6314, fft6237, _mm512_shuffle_f32x4(fft6314, fft6314, 78));
__m512 fft6243 = _mm512_fmadd_ps(fft6228, fft6237, _mm512_shuffle_f32x4(fft6228, fft6228, 78));
__m512 fft6330 = _mm512_fmadd_ps(fft6316, fft6237, _mm512_shuffle_f32x4(fft6316, fft6316, 78));
__m512 fft6244 = _mm512_fmadd_ps(fft6235, fft6237, _mm512_shuffle_f32x4(fft6235, fft6235, 78));
__m512 fft6331 = _mm512_fmadd_ps(fft6323, fft6237, _mm512_shuffle_f32x4(fft6323, fft6323, 78));
__m512 fft6245 = _mm512_fmadd_ps(fft6236, fft6237, _mm512_shuffle_f32x4(fft6236, fft6236, 78));
__m512 fft6332 = _mm512_fmadd_ps(fft6324, fft6237, _mm512_shuffle_f32x4(fft6324, fft6324, 78));
// Per-lane weighting, part 1: multiply by fft6246, which is 1 in lanes 0-9,
// 7.0710677e-01f in lanes 10-11, 0 in lanes 12-13 and -7.0710677e-01f in
// lanes 14-15.
__m512 fft6246 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6247 = _mm512_mul_ps(fft6238, fft6246);
__m512 fft6333 = _mm512_mul_ps(fft6325, fft6246);
__m512 fft6248 = _mm512_mul_ps(fft6239, fft6246);
__m512 fft6334 = _mm512_mul_ps(fft6326, fft6246);
__m512 fft6249 = _mm512_mul_ps(fft6240, fft6246);
__m512 fft6335 = _mm512_mul_ps(fft6327, fft6246);
__m512 fft6250 = _mm512_mul_ps(fft6241, fft6246);
__m512 fft6336 = _mm512_mul_ps(fft6328, fft6246);
__m512 fft6251 = _mm512_mul_ps(fft6242, fft6246);
__m512 fft6337 = _mm512_mul_ps(fft6329, fft6246);
__m512 fft6252 = _mm512_mul_ps(fft6243, fft6246);
__m512 fft6338 = _mm512_mul_ps(fft6330, fft6246);
__m512 fft6253 = _mm512_mul_ps(fft6244, fft6246);
__m512 fft6339 = _mm512_mul_ps(fft6331, fft6246);
__m512 fft6254 = _mm512_mul_ps(fft6245, fft6246);
__m512 fft6340 = _mm512_mul_ps(fft6332, fft6246);
// Per-lane weighting, part 2: vectors are handled in pairs (x, y). With
// a = fft6246 and b = fft6255 the outputs are x*a + y*b and y*a - x*b.
// b is 0 in lanes 0-9, so those lanes pass through unchanged.
__m512 fft6255 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6256 = _mm512_fmadd_ps(fft6239, fft6255, fft6247);
__m512 fft6341 = _mm512_fmadd_ps(fft6326, fft6255, fft6333);
__m512 fft6257 = _mm512_fnmadd_ps(fft6238, fft6255, fft6248);
__m512 fft6342 = _mm512_fnmadd_ps(fft6325, fft6255, fft6334);
__m512 fft6258 = _mm512_fmadd_ps(fft6241, fft6255, fft6249);
__m512 fft6343 = _mm512_fmadd_ps(fft6328, fft6255, fft6335);
__m512 fft6259 = _mm512_fnmadd_ps(fft6240, fft6255, fft6250);
__m512 fft6344 = _mm512_fnmadd_ps(fft6327, fft6255, fft6336);
__m512 fft6260 = _mm512_fmadd_ps(fft6243, fft6255, fft6251);
__m512 fft6345 = _mm512_fmadd_ps(fft6330, fft6255, fft6337);
__m512 fft6261 = _mm512_fnmadd_ps(fft6242, fft6255, fft6252);
__m512 fft6346 = _mm512_fnmadd_ps(fft6329, fft6255, fft6338);
__m512 fft6262 = _mm512_fmadd_ps(fft6245, fft6255, fft6253);
__m512 fft6347 = _mm512_fmadd_ps(fft6332, fft6255, fft6339);
__m512 fft6263 = _mm512_fnmadd_ps(fft6244, fft6255, fft6254);
__m512 fft6348 = _mm512_fnmadd_ps(fft6331, fft6255, fft6340);
// Within-vector add/sub across neighbouring 128-bit blocks. Immediate 177
// swaps block 0 with 1 and block 2 with 3; the sign vector is +1 in lanes
// 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
__m512 fft6264 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6265 = _mm512_fmadd_ps(fft6256, fft6264, _mm512_shuffle_f32x4(fft6256, fft6256, 177));
__m512 fft6349 = _mm512_fmadd_ps(fft6341, fft6264, _mm512_shuffle_f32x4(fft6341, fft6341, 177));
__m512 fft6266 = _mm512_fmadd_ps(fft6257, fft6264, _mm512_shuffle_f32x4(fft6257, fft6257, 177));
__m512 fft6350 = _mm512_fmadd_ps(fft6342, fft6264, _mm512_shuffle_f32x4(fft6342, fft6342, 177));
__m512 fft6267 = _mm512_fmadd_ps(fft6258, fft6264, _mm512_shuffle_f32x4(fft6258, fft6258, 177));
__m512 fft6351 = _mm512_fmadd_ps(fft6343, fft6264, _mm512_shuffle_f32x4(fft6343, fft6343, 177));
__m512 fft6268 = _mm512_fmadd_ps(fft6259, fft6264, _mm512_shuffle_f32x4(fft6259, fft6259, 177));
__m512 fft6352 = _mm512_fmadd_ps(fft6344, fft6264, _mm512_shuffle_f32x4(fft6344, fft6344, 177));
__m512 fft6269 = _mm512_fmadd_ps(fft6260, fft6264, _mm512_shuffle_f32x4(fft6260, fft6260, 177));
__m512 fft6353 = _mm512_fmadd_ps(fft6345, fft6264, _mm512_shuffle_f32x4(fft6345, fft6345, 177));
__m512 fft6270 = _mm512_fmadd_ps(fft6261, fft6264, _mm512_shuffle_f32x4(fft6261, fft6261, 177));
__m512 fft6354 = _mm512_fmadd_ps(fft6346, fft6264, _mm512_shuffle_f32x4(fft6346, fft6346, 177));
__m512 fft6271 = _mm512_fmadd_ps(fft6262, fft6264, _mm512_shuffle_f32x4(fft6262, fft6262, 177));
__m512 fft6355 = _mm512_fmadd_ps(fft6347, fft6264, _mm512_shuffle_f32x4(fft6347, fft6347, 177));
__m512 fft6272 = _mm512_fmadd_ps(fft6263, fft6264, _mm512_shuffle_f32x4(fft6263, fft6263, 177));
__m512 fft6356 = _mm512_fmadd_ps(fft6348, fft6264, _mm512_shuffle_f32x4(fft6348, fft6348, 177));
// Exchange lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) between the two vectors
// of each pair (x, y): the first result is x with those lanes taken from y,
// the second is y with those lanes replaced by 0 - x.
__m512 fft6273 = _mm512_mask_mov_ps(fft6265, 49344, fft6266);
__m512 fft6357 = _mm512_mask_mov_ps(fft6349, 49344, fft6350);
__m512 fft6274 = _mm512_mask_sub_ps(fft6266, 49344, _mm512_setzero_ps(), fft6265);
__m512 fft6358 = _mm512_mask_sub_ps(fft6350, 49344, _mm512_setzero_ps(), fft6349);
__m512 fft6275 = _mm512_mask_mov_ps(fft6267, 49344, fft6268);
__m512 fft6359 = _mm512_mask_mov_ps(fft6351, 49344, fft6352);
__m512 fft6276 = _mm512_mask_sub_ps(fft6268, 49344, _mm512_setzero_ps(), fft6267);
__m512 fft6360 = _mm512_mask_sub_ps(fft6352, 49344, _mm512_setzero_ps(), fft6351);
__m512 fft6277 = _mm512_mask_mov_ps(fft6269, 49344, fft6270);
__m512 fft6361 = _mm512_mask_mov_ps(fft6353, 49344, fft6354);
__m512 fft6278 = _mm512_mask_sub_ps(fft6270, 49344, _mm512_setzero_ps(), fft6269);
__m512 fft6362 = _mm512_mask_sub_ps(fft6354, 49344, _mm512_setzero_ps(), fft6353);
__m512 fft6279 = _mm512_mask_mov_ps(fft6271, 49344, fft6272);
__m512 fft6363 = _mm512_mask_mov_ps(fft6355, 49344, fft6356);
__m512 fft6280 = _mm512_mask_sub_ps(fft6272, 49344, _mm512_setzero_ps(), fft6271);
__m512 fft6364 = _mm512_mask_sub_ps(fft6356, 49344, _mm512_setzero_ps(), fft6355);
// Add/sub inside every 128-bit block: _mm512_shuffle_ps with immediate 78
// swaps the two 64-bit halves of each block; signs are +1 for the lower two
// lanes of a block and -1 for the upper two.
__m512 fft6281 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6282 = _mm512_fmadd_ps(fft6273, fft6281, _mm512_shuffle_ps(fft6273, fft6273, 78));
__m512 fft6365 = _mm512_fmadd_ps(fft6357, fft6281, _mm512_shuffle_ps(fft6357, fft6357, 78));
__m512 fft6283 = _mm512_fmadd_ps(fft6274, fft6281, _mm512_shuffle_ps(fft6274, fft6274, 78));
__m512 fft6366 = _mm512_fmadd_ps(fft6358, fft6281, _mm512_shuffle_ps(fft6358, fft6358, 78));
__m512 fft6284 = _mm512_fmadd_ps(fft6275, fft6281, _mm512_shuffle_ps(fft6275, fft6275, 78));
__m512 fft6367 = _mm512_fmadd_ps(fft6359, fft6281, _mm512_shuffle_ps(fft6359, fft6359, 78));
__m512 fft6285 = _mm512_fmadd_ps(fft6276, fft6281, _mm512_shuffle_ps(fft6276, fft6276, 78));
__m512 fft6368 = _mm512_fmadd_ps(fft6360, fft6281, _mm512_shuffle_ps(fft6360, fft6360, 78));
__m512 fft6286 = _mm512_fmadd_ps(fft6277, fft6281, _mm512_shuffle_ps(fft6277, fft6277, 78));
__m512 fft6369 = _mm512_fmadd_ps(fft6361, fft6281, _mm512_shuffle_ps(fft6361, fft6361, 78));
__m512 fft6287 = _mm512_fmadd_ps(fft6278, fft6281, _mm512_shuffle_ps(fft6278, fft6278, 78));
__m512 fft6370 = _mm512_fmadd_ps(fft6362, fft6281, _mm512_shuffle_ps(fft6362, fft6362, 78));
__m512 fft6288 = _mm512_fmadd_ps(fft6279, fft6281, _mm512_shuffle_ps(fft6279, fft6279, 78));
__m512 fft6371 = _mm512_fmadd_ps(fft6363, fft6281, _mm512_shuffle_ps(fft6363, fft6363, 78));
__m512 fft6289 = _mm512_fmadd_ps(fft6280, fft6281, _mm512_shuffle_ps(fft6280, fft6280, 78));
__m512 fft6372 = _mm512_fmadd_ps(fft6364, fft6281, _mm512_shuffle_ps(fft6364, fft6364, 78));
// Extra post-processing applied only to the first pair of each chain
// (fft6282/fft6283 and fft6365/fft6366); the remaining vectors go to the
// store stage as they are. Two fixed lane permutations of each vector are
// taken first.
__m512i fft6290 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6291 = _mm512_permutexvar_ps(fft6290, fft6282);
__m512 fft6373 = _mm512_permutexvar_ps(fft6290, fft6365);
__m512i fft6292 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6293 = _mm512_permutexvar_ps(fft6292, fft6282);
__m512 fft6374 = _mm512_permutexvar_ps(fft6292, fft6365);
__m512 fft6294 = _mm512_permutexvar_ps(fft6290, fft6283);
__m512 fft6375 = _mm512_permutexvar_ps(fft6290, fft6366);
__m512 fft6295 = _mm512_permutexvar_ps(fft6292, fft6283);
__m512 fft6376 = _mm512_permutexvar_ps(fft6292, fft6366);
// Combine the permuted copies. The weight vector is 0 in lanes 0, 1, 8, 9 and
// otherwise +1 in even lanes and -1 in odd lanes.
__m512 fft6296 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6297 = _mm512_fmadd_ps(fft6291, fft6296, fft6293);
__m512 fft6377 = _mm512_fmadd_ps(fft6373, fft6296, fft6374);
__m512 fft6298 = _mm512_fnmadd_ps(fft6295, fft6296, fft6294);
__m512 fft6378 = _mm512_fnmadd_ps(fft6376, fft6296, fft6375);
// Lane-wise selection of the final values. Masks: 21845 = 0x5555 (even
// lanes), 43176 = 0xA8A8 (lanes 3, 5, 7, 11, 13, 15), 22102 = 0x5656 (lanes
// 1, 2, 4, 6, 9, 10, 12, 14).
__m512 fft6299 = _mm512_mask_mov_ps(fft6295, 21845, fft6297);
__m512 fft6379 = _mm512_mask_mov_ps(fft6376, 21845, fft6377);
__m512 fft6300 = _mm512_mask_mov_ps(fft6291, 43176, fft6297);
__m512 fft6380 = _mm512_mask_mov_ps(fft6373, 43176, fft6377);
__m512 fft6301 = _mm512_mask_mov_ps(fft6299, 43176, fft6298);
__m512 fft6381 = _mm512_mask_mov_ps(fft6379, 43176, fft6378);
__m512 fft6302 = _mm512_mask_mov_ps(fft6300, 22102, fft6298);
__m512 fft6382 = _mm512_mask_mov_ps(fft6380, 22102, fft6378);
// Halve lanes 2-7 and 10-15 (mask 64764 = 0xFCFC); lanes 0, 1, 8, 9 are kept.
__m512 fft6303 = _mm512_mask_mul_ps(fft6301, 64764, fft6301, _mm512_set1_ps(5e-01f));
__m512 fft6383 = _mm512_mask_mul_ps(fft6381, 64764, fft6381, _mm512_set1_ps(5e-01f));
__m512 fft6304 = _mm512_mask_mul_ps(fft6302, 64764, fft6302, _mm512_set1_ps(5e-01f));
__m512 fft6384 = _mm512_mask_mul_ps(fft6382, 64764, fft6382, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store stage (df561..df568 from the
// first chain, df569..df576 from the second).
__m512 df561 = fft6303;
__m512 df569 = fft6383;
__m512 df562 = fft6304;
__m512 df570 = fft6384;
__m512 df563 = fft6284;
__m512 df571 = fft6367;
__m512 df564 = fft6285;
__m512 df572 = fft6368;
__m512 df565 = fft6286;
__m512 df573 = fft6369;
__m512 df566 = fft6287;
__m512 df574 = fft6370;
__m512 df567 = fft6288;
__m512 df575 = fft6371;
__m512 df568 = fft6289;
__m512 df576 = fft6372;
// Store stage. eo38 moves even-indexed lanes into lanes 0-7 and odd-indexed
// lanes into lanes 8-15. Each vector is then written with two masked stores:
// lanes 0-7 (mask 255) and lanes 8-15 (mask 65280) to separate destinations.
// NOTE(review): if dfPtr1 is a byte pointer, lanes 8-15 start 32 bytes past
// the store address, so the second store of each pair lands exactly 407040
// bytes after the first (e.g. 508768+32 == 101760+407040) -- confirm.
__m512i eo38 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df563 = _mm512_permutexvar_ps(eo38, df563);
df564 = _mm512_permutexvar_ps(eo38, df564);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df563);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df564);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df563);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df564);
df571 = _mm512_permutexvar_ps(eo38, df571);
df572 = _mm512_permutexvar_ps(eo38, df572);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df571);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df572);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df571);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df572);
df565 = _mm512_permutexvar_ps(eo38, df565);
df566 = _mm512_permutexvar_ps(eo38, df566);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df565);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df566);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df565);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df566);
df573 = _mm512_permutexvar_ps(eo38, df573);
df574 = _mm512_permutexvar_ps(eo38, df574);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df573);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df574);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df573);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df574);
df567 = _mm512_permutexvar_ps(eo38, df567);
df568 = _mm512_permutexvar_ps(eo38, df568);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df567);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df568);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df567);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df568);
df575 = _mm512_permutexvar_ps(eo38, df575);
df576 = _mm512_permutexvar_ps(eo38, df576);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df575);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df576);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df575);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df576);
// The four post-processed vectors (df561, df562, df569, df570) are stored
// without the eo38 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df561);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df562);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df561);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df562);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df569);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m38+32*f39, 255, df570);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df569);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m38+32*f39, 65280, df570);
// ---------------------------------------------------------------------------
// Unrolled step for b39 = 2.
// Loads seven vectors from datPtr1, applies the bnMul17/bnAdd17 multiply-add,
// pushes them through a fixed sequence of add/sub, shuffle and permute stages
// (the fft* temporaries), and writes the results to dfPtr1.
// NOTE(review): the fft* naming and the stage structure suggest a forward FFT
// of an input tile — confirm against the generator before relying on that.
// ---------------------------------------------------------------------------
ptrdiff_t b39 = 2;
// For b39 = 2 these evaluate to m39 = 1 and f40 = 0; they are used only in the
// dfPtr1 store offsets below (128*m39 + 32*f40).
ptrdiff_t m39 = (size_t)b39/2;
ptrdiff_t f40 = (size_t)b39%2;
// Seven masked loads whose constant offsets step by 896 (8120, 9016, ...,
// 13496). Mask 65528 (0xFFF8) enables lanes 3..15; maskz_loadu zeroes lanes
// 0..2 and the masked fmadd (dat*bnMul17 + bnAdd17) leaves those lanes at zero.
// The trailing `0*b39` term contributes nothing to the address.
// NOTE(review): the zeroed low lanes look like left-edge padding, and the
// offsets look like byte offsets — confirm at the datPtr1 declaration.
__m512 dat562 = _mm512_maskz_loadu_ps(65528, datPtr1+8120+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat562 = _mm512_mask_fmadd_ps(dat562, 65528, bnMul17, bnAdd17);
__m512 dat563 = _mm512_maskz_loadu_ps(65528, datPtr1+9016+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat563 = _mm512_mask_fmadd_ps(dat563, 65528, bnMul17, bnAdd17);
__m512 dat564 = _mm512_maskz_loadu_ps(65528, datPtr1+9912+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat564 = _mm512_mask_fmadd_ps(dat564, 65528, bnMul17, bnAdd17);
__m512 dat565 = _mm512_maskz_loadu_ps(65528, datPtr1+10808+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat565 = _mm512_mask_fmadd_ps(dat565, 65528, bnMul17, bnAdd17);
__m512 dat566 = _mm512_maskz_loadu_ps(65528, datPtr1+11704+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat566 = _mm512_mask_fmadd_ps(dat566, 65528, bnMul17, bnAdd17);
__m512 dat567 = _mm512_maskz_loadu_ps(65528, datPtr1+12600+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat567 = _mm512_mask_fmadd_ps(dat567, 65528, bnMul17, bnAdd17);
__m512 dat568 = _mm512_maskz_loadu_ps(65528, datPtr1+13496+602112*i6+200704*k18+896*h17+4*w17+0*b39);
dat568 = _mm512_mask_fmadd_ps(dat568, 65528, bnMul17, bnAdd17);
// First stage: every loaded vector is added to / subtracted from an all-zero
// partner. Two independent chains are interleaved line by line from here on:
//   chain A (fft6385..fft6472) consumes dat562, dat564, dat566, dat568;
//   chain B (fft6473..fft6552) consumes dat563, dat565, dat567 and an all-zero
//   fourth input (fft6479/fft6480).
__m512 fft6385 = _mm512_add_ps(dat562, _mm512_setzero_ps());
__m512 fft6473 = _mm512_add_ps(dat563, _mm512_setzero_ps());
__m512 fft6386 = _mm512_sub_ps(dat562, _mm512_setzero_ps());
__m512 fft6474 = _mm512_sub_ps(dat563, _mm512_setzero_ps());
__m512 fft6387 = _mm512_add_ps(dat564, _mm512_setzero_ps());
__m512 fft6475 = _mm512_add_ps(dat565, _mm512_setzero_ps());
__m512 fft6388 = _mm512_sub_ps(dat564, _mm512_setzero_ps());
__m512 fft6476 = _mm512_sub_ps(dat565, _mm512_setzero_ps());
__m512 fft6389 = _mm512_add_ps(dat566, _mm512_setzero_ps());
__m512 fft6477 = _mm512_add_ps(dat567, _mm512_setzero_ps());
__m512 fft6390 = _mm512_sub_ps(dat566, _mm512_setzero_ps());
__m512 fft6478 = _mm512_sub_ps(dat567, _mm512_setzero_ps());
__m512 fft6391 = _mm512_add_ps(dat568, _mm512_setzero_ps());
__m512 fft6479 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6392 = _mm512_sub_ps(dat568, _mm512_setzero_ps());
__m512 fft6480 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
// Cross-register sums and differences within each chain.
__m512 fft6393 = _mm512_add_ps(fft6385, fft6389);
__m512 fft6481 = _mm512_add_ps(fft6473, fft6477);
__m512 fft6394 = _mm512_sub_ps(fft6385, fft6389);
__m512 fft6482 = _mm512_sub_ps(fft6473, fft6477);
__m512 fft6395 = _mm512_add_ps(fft6387, fft6391);
__m512 fft6483 = _mm512_add_ps(fft6475, fft6479);
__m512 fft6396 = _mm512_sub_ps(fft6391, fft6387);
__m512 fft6484 = _mm512_sub_ps(fft6479, fft6475);
__m512 fft6397 = _mm512_sub_ps(fft6388, fft6392);
__m512 fft6485 = _mm512_sub_ps(fft6476, fft6480);
__m512 fft6398 = _mm512_add_ps(fft6388, fft6392);
__m512 fft6486 = _mm512_add_ps(fft6476, fft6480);
__m512 fft6399 = _mm512_add_ps(fft6393, fft6395);
__m512 fft6487 = _mm512_add_ps(fft6481, fft6483);
__m512 fft6400 = _mm512_sub_ps(fft6393, fft6395);
__m512 fft6488 = _mm512_sub_ps(fft6481, fft6483);
// Mixed terms scaled by 7.0710677e-01f (approximately 1/sqrt(2)) and combined
// with the unscaled differences via fused multiply-add variants.
__m512 fft6401 = _mm512_fmadd_ps(fft6397, _mm512_set1_ps(7.0710677e-01f), fft6386);
__m512 fft6489 = _mm512_fmadd_ps(fft6485, _mm512_set1_ps(7.0710677e-01f), fft6474);
__m512 fft6402 = _mm512_fnmsub_ps(fft6398, _mm512_set1_ps(7.0710677e-01f), fft6390);
__m512 fft6490 = _mm512_fnmsub_ps(fft6486, _mm512_set1_ps(7.0710677e-01f), fft6478);
__m512 fft6403 = _mm512_fnmadd_ps(fft6397, _mm512_set1_ps(7.0710677e-01f), fft6386);
__m512 fft6491 = _mm512_fnmadd_ps(fft6485, _mm512_set1_ps(7.0710677e-01f), fft6474);
__m512 fft6404 = _mm512_fnmadd_ps(fft6398, _mm512_set1_ps(7.0710677e-01f), fft6390);
__m512 fft6492 = _mm512_fnmadd_ps(fft6486, _mm512_set1_ps(7.0710677e-01f), fft6478);
// In-register butterfly across the two 8-lane halves: shuffle_f32x4 with
// immediate 78 swaps the 256-bit halves, and the sign vector (+1 in lanes 0..7,
// -1 in lanes 8..15) yields lo+hi in the low half and lo-hi in the high half.
__m512 fft6405 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6406 = _mm512_fmadd_ps(fft6399, fft6405, _mm512_shuffle_f32x4(fft6399, fft6399, 78));
__m512 fft6493 = _mm512_fmadd_ps(fft6487, fft6405, _mm512_shuffle_f32x4(fft6487, fft6487, 78));
__m512 fft6407 = _mm512_fmadd_ps(fft6400, fft6405, _mm512_shuffle_f32x4(fft6400, fft6400, 78));
__m512 fft6494 = _mm512_fmadd_ps(fft6488, fft6405, _mm512_shuffle_f32x4(fft6488, fft6488, 78));
__m512 fft6408 = _mm512_fmadd_ps(fft6401, fft6405, _mm512_shuffle_f32x4(fft6401, fft6401, 78));
__m512 fft6495 = _mm512_fmadd_ps(fft6489, fft6405, _mm512_shuffle_f32x4(fft6489, fft6489, 78));
__m512 fft6409 = _mm512_fmadd_ps(fft6402, fft6405, _mm512_shuffle_f32x4(fft6402, fft6402, 78));
__m512 fft6496 = _mm512_fmadd_ps(fft6490, fft6405, _mm512_shuffle_f32x4(fft6490, fft6490, 78));
__m512 fft6410 = _mm512_fmadd_ps(fft6394, fft6405, _mm512_shuffle_f32x4(fft6394, fft6394, 78));
__m512 fft6497 = _mm512_fmadd_ps(fft6482, fft6405, _mm512_shuffle_f32x4(fft6482, fft6482, 78));
__m512 fft6411 = _mm512_fmadd_ps(fft6396, fft6405, _mm512_shuffle_f32x4(fft6396, fft6396, 78));
__m512 fft6498 = _mm512_fmadd_ps(fft6484, fft6405, _mm512_shuffle_f32x4(fft6484, fft6484, 78));
__m512 fft6412 = _mm512_fmadd_ps(fft6403, fft6405, _mm512_shuffle_f32x4(fft6403, fft6403, 78));
__m512 fft6499 = _mm512_fmadd_ps(fft6491, fft6405, _mm512_shuffle_f32x4(fft6491, fft6491, 78));
__m512 fft6413 = _mm512_fmadd_ps(fft6404, fft6405, _mm512_shuffle_f32x4(fft6404, fft6404, 78));
__m512 fft6500 = _mm512_fmadd_ps(fft6492, fft6405, _mm512_shuffle_f32x4(fft6492, fft6492, 78));
// Per-lane factor pair applied to consecutive register pairs (a, b):
//   a' = a*fft6414 + b*fft6423,   b' = b*fft6414 - a*fft6423.
// Lanes 0..9 have factors (1, 0) and pass through unchanged; lanes 10..15 carry
// the 0 / +-7.0710677e-01f / 1 entries.
// NOTE(review): this has the shape of a complex twiddle multiplication with a
// and b as real/imaginary parts — confirm.
__m512 fft6414 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6415 = _mm512_mul_ps(fft6406, fft6414);
__m512 fft6501 = _mm512_mul_ps(fft6493, fft6414);
__m512 fft6416 = _mm512_mul_ps(fft6407, fft6414);
__m512 fft6502 = _mm512_mul_ps(fft6494, fft6414);
__m512 fft6417 = _mm512_mul_ps(fft6408, fft6414);
__m512 fft6503 = _mm512_mul_ps(fft6495, fft6414);
__m512 fft6418 = _mm512_mul_ps(fft6409, fft6414);
__m512 fft6504 = _mm512_mul_ps(fft6496, fft6414);
__m512 fft6419 = _mm512_mul_ps(fft6410, fft6414);
__m512 fft6505 = _mm512_mul_ps(fft6497, fft6414);
__m512 fft6420 = _mm512_mul_ps(fft6411, fft6414);
__m512 fft6506 = _mm512_mul_ps(fft6498, fft6414);
__m512 fft6421 = _mm512_mul_ps(fft6412, fft6414);
__m512 fft6507 = _mm512_mul_ps(fft6499, fft6414);
__m512 fft6422 = _mm512_mul_ps(fft6413, fft6414);
__m512 fft6508 = _mm512_mul_ps(fft6500, fft6414);
__m512 fft6423 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6424 = _mm512_fmadd_ps(fft6407, fft6423, fft6415);
__m512 fft6509 = _mm512_fmadd_ps(fft6494, fft6423, fft6501);
__m512 fft6425 = _mm512_fnmadd_ps(fft6406, fft6423, fft6416);
__m512 fft6510 = _mm512_fnmadd_ps(fft6493, fft6423, fft6502);
__m512 fft6426 = _mm512_fmadd_ps(fft6409, fft6423, fft6417);
__m512 fft6511 = _mm512_fmadd_ps(fft6496, fft6423, fft6503);
__m512 fft6427 = _mm512_fnmadd_ps(fft6408, fft6423, fft6418);
__m512 fft6512 = _mm512_fnmadd_ps(fft6495, fft6423, fft6504);
__m512 fft6428 = _mm512_fmadd_ps(fft6411, fft6423, fft6419);
__m512 fft6513 = _mm512_fmadd_ps(fft6498, fft6423, fft6505);
__m512 fft6429 = _mm512_fnmadd_ps(fft6410, fft6423, fft6420);
__m512 fft6514 = _mm512_fnmadd_ps(fft6497, fft6423, fft6506);
__m512 fft6430 = _mm512_fmadd_ps(fft6413, fft6423, fft6421);
__m512 fft6515 = _mm512_fmadd_ps(fft6500, fft6423, fft6507);
__m512 fft6431 = _mm512_fnmadd_ps(fft6412, fft6423, fft6422);
__m512 fft6516 = _mm512_fnmadd_ps(fft6499, fft6423, fft6508);
// In-register butterfly across adjacent 4-lane groups: shuffle_f32x4 with
// immediate 177 swaps neighbouring 128-bit lanes; the sign vector is +1 on the
// lower group and -1 on the upper group of each 8-lane half.
__m512 fft6432 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6433 = _mm512_fmadd_ps(fft6424, fft6432, _mm512_shuffle_f32x4(fft6424, fft6424, 177));
__m512 fft6517 = _mm512_fmadd_ps(fft6509, fft6432, _mm512_shuffle_f32x4(fft6509, fft6509, 177));
__m512 fft6434 = _mm512_fmadd_ps(fft6425, fft6432, _mm512_shuffle_f32x4(fft6425, fft6425, 177));
__m512 fft6518 = _mm512_fmadd_ps(fft6510, fft6432, _mm512_shuffle_f32x4(fft6510, fft6510, 177));
__m512 fft6435 = _mm512_fmadd_ps(fft6426, fft6432, _mm512_shuffle_f32x4(fft6426, fft6426, 177));
__m512 fft6519 = _mm512_fmadd_ps(fft6511, fft6432, _mm512_shuffle_f32x4(fft6511, fft6511, 177));
__m512 fft6436 = _mm512_fmadd_ps(fft6427, fft6432, _mm512_shuffle_f32x4(fft6427, fft6427, 177));
__m512 fft6520 = _mm512_fmadd_ps(fft6512, fft6432, _mm512_shuffle_f32x4(fft6512, fft6512, 177));
__m512 fft6437 = _mm512_fmadd_ps(fft6428, fft6432, _mm512_shuffle_f32x4(fft6428, fft6428, 177));
__m512 fft6521 = _mm512_fmadd_ps(fft6513, fft6432, _mm512_shuffle_f32x4(fft6513, fft6513, 177));
__m512 fft6438 = _mm512_fmadd_ps(fft6429, fft6432, _mm512_shuffle_f32x4(fft6429, fft6429, 177));
__m512 fft6522 = _mm512_fmadd_ps(fft6514, fft6432, _mm512_shuffle_f32x4(fft6514, fft6514, 177));
__m512 fft6439 = _mm512_fmadd_ps(fft6430, fft6432, _mm512_shuffle_f32x4(fft6430, fft6430, 177));
__m512 fft6523 = _mm512_fmadd_ps(fft6515, fft6432, _mm512_shuffle_f32x4(fft6515, fft6515, 177));
__m512 fft6440 = _mm512_fmadd_ps(fft6431, fft6432, _mm512_shuffle_f32x4(fft6431, fft6431, 177));
__m512 fft6524 = _mm512_fmadd_ps(fft6516, fft6432, _mm512_shuffle_f32x4(fft6516, fft6516, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes each register
// pair (a, b) becomes (b, -a); all other lanes pass through unchanged.
__m512 fft6441 = _mm512_mask_mov_ps(fft6433, 49344, fft6434);
__m512 fft6525 = _mm512_mask_mov_ps(fft6517, 49344, fft6518);
__m512 fft6442 = _mm512_mask_sub_ps(fft6434, 49344, _mm512_setzero_ps(), fft6433);
__m512 fft6526 = _mm512_mask_sub_ps(fft6518, 49344, _mm512_setzero_ps(), fft6517);
__m512 fft6443 = _mm512_mask_mov_ps(fft6435, 49344, fft6436);
__m512 fft6527 = _mm512_mask_mov_ps(fft6519, 49344, fft6520);
__m512 fft6444 = _mm512_mask_sub_ps(fft6436, 49344, _mm512_setzero_ps(), fft6435);
__m512 fft6528 = _mm512_mask_sub_ps(fft6520, 49344, _mm512_setzero_ps(), fft6519);
__m512 fft6445 = _mm512_mask_mov_ps(fft6437, 49344, fft6438);
__m512 fft6529 = _mm512_mask_mov_ps(fft6521, 49344, fft6522);
__m512 fft6446 = _mm512_mask_sub_ps(fft6438, 49344, _mm512_setzero_ps(), fft6437);
__m512 fft6530 = _mm512_mask_sub_ps(fft6522, 49344, _mm512_setzero_ps(), fft6521);
__m512 fft6447 = _mm512_mask_mov_ps(fft6439, 49344, fft6440);
__m512 fft6531 = _mm512_mask_mov_ps(fft6523, 49344, fft6524);
__m512 fft6448 = _mm512_mask_sub_ps(fft6440, 49344, _mm512_setzero_ps(), fft6439);
__m512 fft6532 = _mm512_mask_sub_ps(fft6524, 49344, _mm512_setzero_ps(), fft6523);
// In-register butterfly across 2-lane pairs: shuffle_ps with immediate 78 swaps
// the two 64-bit halves inside every 128-bit lane; the sign vector alternates
// +1,+1,-1,-1.
__m512 fft6449 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6450 = _mm512_fmadd_ps(fft6441, fft6449, _mm512_shuffle_ps(fft6441, fft6441, 78));
__m512 fft6533 = _mm512_fmadd_ps(fft6525, fft6449, _mm512_shuffle_ps(fft6525, fft6525, 78));
__m512 fft6451 = _mm512_fmadd_ps(fft6442, fft6449, _mm512_shuffle_ps(fft6442, fft6442, 78));
__m512 fft6534 = _mm512_fmadd_ps(fft6526, fft6449, _mm512_shuffle_ps(fft6526, fft6526, 78));
__m512 fft6452 = _mm512_fmadd_ps(fft6443, fft6449, _mm512_shuffle_ps(fft6443, fft6443, 78));
__m512 fft6535 = _mm512_fmadd_ps(fft6527, fft6449, _mm512_shuffle_ps(fft6527, fft6527, 78));
__m512 fft6453 = _mm512_fmadd_ps(fft6444, fft6449, _mm512_shuffle_ps(fft6444, fft6444, 78));
__m512 fft6536 = _mm512_fmadd_ps(fft6528, fft6449, _mm512_shuffle_ps(fft6528, fft6528, 78));
__m512 fft6454 = _mm512_fmadd_ps(fft6445, fft6449, _mm512_shuffle_ps(fft6445, fft6445, 78));
__m512 fft6537 = _mm512_fmadd_ps(fft6529, fft6449, _mm512_shuffle_ps(fft6529, fft6529, 78));
__m512 fft6455 = _mm512_fmadd_ps(fft6446, fft6449, _mm512_shuffle_ps(fft6446, fft6446, 78));
__m512 fft6538 = _mm512_fmadd_ps(fft6530, fft6449, _mm512_shuffle_ps(fft6530, fft6530, 78));
__m512 fft6456 = _mm512_fmadd_ps(fft6447, fft6449, _mm512_shuffle_ps(fft6447, fft6447, 78));
__m512 fft6539 = _mm512_fmadd_ps(fft6531, fft6449, _mm512_shuffle_ps(fft6531, fft6531, 78));
__m512 fft6457 = _mm512_fmadd_ps(fft6448, fft6449, _mm512_shuffle_ps(fft6448, fft6448, 78));
__m512 fft6540 = _mm512_fmadd_ps(fft6532, fft6449, _mm512_shuffle_ps(fft6532, fft6532, 78));
// Extra step applied only to the first register pair of each chain
// (fft6450/fft6451 for chain A, fft6533/fft6534 for chain B): two lane
// permutations, a multiply-add against the 0/+-1 pattern fft6464, merges under
// masks 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656), then a 0.5 scale in
// lanes 2..7 and 10..15 (mask 64764 = 0xFCFC).
// NOTE(review): presumably the real-input FFT recombination of the first pair —
// confirm against the generator.
__m512i fft6458 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6459 = _mm512_permutexvar_ps(fft6458, fft6450);
__m512 fft6541 = _mm512_permutexvar_ps(fft6458, fft6533);
__m512i fft6460 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6461 = _mm512_permutexvar_ps(fft6460, fft6450);
__m512 fft6542 = _mm512_permutexvar_ps(fft6460, fft6533);
__m512 fft6462 = _mm512_permutexvar_ps(fft6458, fft6451);
__m512 fft6543 = _mm512_permutexvar_ps(fft6458, fft6534);
__m512 fft6463 = _mm512_permutexvar_ps(fft6460, fft6451);
__m512 fft6544 = _mm512_permutexvar_ps(fft6460, fft6534);
__m512 fft6464 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6465 = _mm512_fmadd_ps(fft6459, fft6464, fft6461);
__m512 fft6545 = _mm512_fmadd_ps(fft6541, fft6464, fft6542);
__m512 fft6466 = _mm512_fnmadd_ps(fft6463, fft6464, fft6462);
__m512 fft6546 = _mm512_fnmadd_ps(fft6544, fft6464, fft6543);
__m512 fft6467 = _mm512_mask_mov_ps(fft6463, 21845, fft6465);
__m512 fft6547 = _mm512_mask_mov_ps(fft6544, 21845, fft6545);
__m512 fft6468 = _mm512_mask_mov_ps(fft6459, 43176, fft6465);
__m512 fft6548 = _mm512_mask_mov_ps(fft6541, 43176, fft6545);
__m512 fft6469 = _mm512_mask_mov_ps(fft6467, 43176, fft6466);
__m512 fft6549 = _mm512_mask_mov_ps(fft6547, 43176, fft6546);
__m512 fft6470 = _mm512_mask_mov_ps(fft6468, 22102, fft6466);
__m512 fft6550 = _mm512_mask_mov_ps(fft6548, 22102, fft6546);
__m512 fft6471 = _mm512_mask_mul_ps(fft6469, 64764, fft6469, _mm512_set1_ps(5e-01f));
__m512 fft6551 = _mm512_mask_mul_ps(fft6549, 64764, fft6549, _mm512_set1_ps(5e-01f));
__m512 fft6472 = _mm512_mask_mul_ps(fft6470, 64764, fft6470, _mm512_set1_ps(5e-01f));
__m512 fft6552 = _mm512_mask_mul_ps(fft6550, 64764, fft6550, _mm512_set1_ps(5e-01f));
// Rename the results for the store phase: df577..df584 come from chain A,
// df585..df592 from chain B. df577/df578 and df585/df586 are the specially
// processed first pairs.
__m512 df577 = fft6471;
__m512 df585 = fft6551;
__m512 df578 = fft6472;
__m512 df586 = fft6552;
__m512 df579 = fft6452;
__m512 df587 = fft6535;
__m512 df580 = fft6453;
__m512 df588 = fft6536;
__m512 df581 = fft6454;
__m512 df589 = fft6537;
__m512 df582 = fft6455;
__m512 df590 = fft6538;
__m512 df583 = fft6456;
__m512 df591 = fft6539;
__m512 df584 = fft6457;
__m512 df592 = fft6540;
// eo39 gathers even-indexed elements into lanes 0..7 and odd-indexed elements
// into lanes 8..15. Each permuted register is then written in two halves:
// mask 255 stores lanes 0..7, mask 65280 stores lanes 8..15 at a different
// base offset.
// NOTE(review): if dfPtr1 is byte-addressed, a mask-65280 store lands its data
// 32 bytes past the given address, i.e. exactly 407040 past the matching
// mask-255 store (e.g. 508768 + 32 = 101760 + 407040) — confirm.
__m512i eo39 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df579 = _mm512_permutexvar_ps(eo39, df579);
df580 = _mm512_permutexvar_ps(eo39, df580);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df579);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df580);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df579);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df580);
df587 = _mm512_permutexvar_ps(eo39, df587);
df588 = _mm512_permutexvar_ps(eo39, df588);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df587);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df588);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df587);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df588);
df581 = _mm512_permutexvar_ps(eo39, df581);
df582 = _mm512_permutexvar_ps(eo39, df582);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df581);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df582);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df581);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df582);
df589 = _mm512_permutexvar_ps(eo39, df589);
df590 = _mm512_permutexvar_ps(eo39, df590);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df589);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df590);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df589);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df590);
df583 = _mm512_permutexvar_ps(eo39, df583);
df584 = _mm512_permutexvar_ps(eo39, df584);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df583);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df584);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df583);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df584);
df591 = _mm512_permutexvar_ps(eo39, df591);
df592 = _mm512_permutexvar_ps(eo39, df592);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df591);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df592);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df591);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df592);
// The first pairs (df577/df578, df585/df586) are stored without the eo39
// permutation, again split into a lanes-0..7 store and a lanes-8..15 store.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df577);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df578);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df577);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df578);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df585);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m39+32*f40, 255, df586);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df585);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m39+32*f40, 65280, df586);
for (ptrdiff_t b40 = 3; b40 < 6; ++b40) {
ptrdiff_t m40 = (size_t)b40/2;
ptrdiff_t f41 = (size_t)b40%2;
__m512 dat569 = _mm512_maskz_loadu_ps(65535, datPtr1+8040+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat569 = _mm512_mask_fmadd_ps(dat569, 65535, bnMul17, bnAdd17);
__m512 dat570 = _mm512_maskz_loadu_ps(65535, datPtr1+8936+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat570 = _mm512_mask_fmadd_ps(dat570, 65535, bnMul17, bnAdd17);
__m512 dat571 = _mm512_maskz_loadu_ps(65535, datPtr1+9832+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat571 = _mm512_mask_fmadd_ps(dat571, 65535, bnMul17, bnAdd17);
__m512 dat572 = _mm512_maskz_loadu_ps(65535, datPtr1+10728+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat572 = _mm512_mask_fmadd_ps(dat572, 65535, bnMul17, bnAdd17);
__m512 dat573 = _mm512_maskz_loadu_ps(65535, datPtr1+11624+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat573 = _mm512_mask_fmadd_ps(dat573, 65535, bnMul17, bnAdd17);
__m512 dat574 = _mm512_maskz_loadu_ps(65535, datPtr1+12520+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat574 = _mm512_mask_fmadd_ps(dat574, 65535, bnMul17, bnAdd17);
__m512 dat575 = _mm512_maskz_loadu_ps(65535, datPtr1+13416+602112*i6+200704*k18+896*h17+4*w17+40*b40);
dat575 = _mm512_mask_fmadd_ps(dat575, 65535, bnMul17, bnAdd17);
__m512 fft6553 = _mm512_add_ps(dat569, _mm512_setzero_ps());
__m512 fft6641 = _mm512_add_ps(dat570, _mm512_setzero_ps());
__m512 fft6554 = _mm512_sub_ps(dat569, _mm512_setzero_ps());
__m512 fft6642 = _mm512_sub_ps(dat570, _mm512_setzero_ps());
__m512 fft6555 = _mm512_add_ps(dat571, _mm512_setzero_ps());
__m512 fft6643 = _mm512_add_ps(dat572, _mm512_setzero_ps());
__m512 fft6556 = _mm512_sub_ps(dat571, _mm512_setzero_ps());
__m512 fft6644 = _mm512_sub_ps(dat572, _mm512_setzero_ps());
__m512 fft6557 = _mm512_add_ps(dat573, _mm512_setzero_ps());
__m512 fft6645 = _mm512_add_ps(dat574, _mm512_setzero_ps());
__m512 fft6558 = _mm512_sub_ps(dat573, _mm512_setzero_ps());
__m512 fft6646 = _mm512_sub_ps(dat574, _mm512_setzero_ps());
__m512 fft6559 = _mm512_add_ps(dat575, _mm512_setzero_ps());
__m512 fft6647 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6560 = _mm512_sub_ps(dat575, _mm512_setzero_ps());
__m512 fft6648 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6561 = _mm512_add_ps(fft6553, fft6557);
__m512 fft6649 = _mm512_add_ps(fft6641, fft6645);
__m512 fft6562 = _mm512_sub_ps(fft6553, fft6557);
__m512 fft6650 = _mm512_sub_ps(fft6641, fft6645);
__m512 fft6563 = _mm512_add_ps(fft6555, fft6559);
__m512 fft6651 = _mm512_add_ps(fft6643, fft6647);
__m512 fft6564 = _mm512_sub_ps(fft6559, fft6555);
__m512 fft6652 = _mm512_sub_ps(fft6647, fft6643);
__m512 fft6565 = _mm512_sub_ps(fft6556, fft6560);
__m512 fft6653 = _mm512_sub_ps(fft6644, fft6648);
__m512 fft6566 = _mm512_add_ps(fft6556, fft6560);
__m512 fft6654 = _mm512_add_ps(fft6644, fft6648);
__m512 fft6567 = _mm512_add_ps(fft6561, fft6563);
__m512 fft6655 = _mm512_add_ps(fft6649, fft6651);
__m512 fft6568 = _mm512_sub_ps(fft6561, fft6563);
__m512 fft6656 = _mm512_sub_ps(fft6649, fft6651);
__m512 fft6569 = _mm512_fmadd_ps(fft6565, _mm512_set1_ps(7.0710677e-01f), fft6554);
__m512 fft6657 = _mm512_fmadd_ps(fft6653, _mm512_set1_ps(7.0710677e-01f), fft6642);
__m512 fft6570 = _mm512_fnmsub_ps(fft6566, _mm512_set1_ps(7.0710677e-01f), fft6558);
__m512 fft6658 = _mm512_fnmsub_ps(fft6654, _mm512_set1_ps(7.0710677e-01f), fft6646);
__m512 fft6571 = _mm512_fnmadd_ps(fft6565, _mm512_set1_ps(7.0710677e-01f), fft6554);
__m512 fft6659 = _mm512_fnmadd_ps(fft6653, _mm512_set1_ps(7.0710677e-01f), fft6642);
__m512 fft6572 = _mm512_fnmadd_ps(fft6566, _mm512_set1_ps(7.0710677e-01f), fft6558);
__m512 fft6660 = _mm512_fnmadd_ps(fft6654, _mm512_set1_ps(7.0710677e-01f), fft6646);
__m512 fft6573 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6574 = _mm512_fmadd_ps(fft6567, fft6573, _mm512_shuffle_f32x4(fft6567, fft6567, 78));
__m512 fft6661 = _mm512_fmadd_ps(fft6655, fft6573, _mm512_shuffle_f32x4(fft6655, fft6655, 78));
__m512 fft6575 = _mm512_fmadd_ps(fft6568, fft6573, _mm512_shuffle_f32x4(fft6568, fft6568, 78));
__m512 fft6662 = _mm512_fmadd_ps(fft6656, fft6573, _mm512_shuffle_f32x4(fft6656, fft6656, 78));
__m512 fft6576 = _mm512_fmadd_ps(fft6569, fft6573, _mm512_shuffle_f32x4(fft6569, fft6569, 78));
__m512 fft6663 = _mm512_fmadd_ps(fft6657, fft6573, _mm512_shuffle_f32x4(fft6657, fft6657, 78));
__m512 fft6577 = _mm512_fmadd_ps(fft6570, fft6573, _mm512_shuffle_f32x4(fft6570, fft6570, 78));
__m512 fft6664 = _mm512_fmadd_ps(fft6658, fft6573, _mm512_shuffle_f32x4(fft6658, fft6658, 78));
__m512 fft6578 = _mm512_fmadd_ps(fft6562, fft6573, _mm512_shuffle_f32x4(fft6562, fft6562, 78));
__m512 fft6665 = _mm512_fmadd_ps(fft6650, fft6573, _mm512_shuffle_f32x4(fft6650, fft6650, 78));
__m512 fft6579 = _mm512_fmadd_ps(fft6564, fft6573, _mm512_shuffle_f32x4(fft6564, fft6564, 78));
__m512 fft6666 = _mm512_fmadd_ps(fft6652, fft6573, _mm512_shuffle_f32x4(fft6652, fft6652, 78));
__m512 fft6580 = _mm512_fmadd_ps(fft6571, fft6573, _mm512_shuffle_f32x4(fft6571, fft6571, 78));
__m512 fft6667 = _mm512_fmadd_ps(fft6659, fft6573, _mm512_shuffle_f32x4(fft6659, fft6659, 78));
__m512 fft6581 = _mm512_fmadd_ps(fft6572, fft6573, _mm512_shuffle_f32x4(fft6572, fft6572, 78));
__m512 fft6668 = _mm512_fmadd_ps(fft6660, fft6573, _mm512_shuffle_f32x4(fft6660, fft6660, 78));
__m512 fft6582 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6583 = _mm512_mul_ps(fft6574, fft6582);
__m512 fft6669 = _mm512_mul_ps(fft6661, fft6582);
__m512 fft6584 = _mm512_mul_ps(fft6575, fft6582);
__m512 fft6670 = _mm512_mul_ps(fft6662, fft6582);
__m512 fft6585 = _mm512_mul_ps(fft6576, fft6582);
__m512 fft6671 = _mm512_mul_ps(fft6663, fft6582);
__m512 fft6586 = _mm512_mul_ps(fft6577, fft6582);
__m512 fft6672 = _mm512_mul_ps(fft6664, fft6582);
__m512 fft6587 = _mm512_mul_ps(fft6578, fft6582);
__m512 fft6673 = _mm512_mul_ps(fft6665, fft6582);
__m512 fft6588 = _mm512_mul_ps(fft6579, fft6582);
__m512 fft6674 = _mm512_mul_ps(fft6666, fft6582);
__m512 fft6589 = _mm512_mul_ps(fft6580, fft6582);
__m512 fft6675 = _mm512_mul_ps(fft6667, fft6582);
__m512 fft6590 = _mm512_mul_ps(fft6581, fft6582);
__m512 fft6676 = _mm512_mul_ps(fft6668, fft6582);
__m512 fft6591 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6592 = _mm512_fmadd_ps(fft6575, fft6591, fft6583);
__m512 fft6677 = _mm512_fmadd_ps(fft6662, fft6591, fft6669);
__m512 fft6593 = _mm512_fnmadd_ps(fft6574, fft6591, fft6584);
__m512 fft6678 = _mm512_fnmadd_ps(fft6661, fft6591, fft6670);
__m512 fft6594 = _mm512_fmadd_ps(fft6577, fft6591, fft6585);
__m512 fft6679 = _mm512_fmadd_ps(fft6664, fft6591, fft6671);
__m512 fft6595 = _mm512_fnmadd_ps(fft6576, fft6591, fft6586);
__m512 fft6680 = _mm512_fnmadd_ps(fft6663, fft6591, fft6672);
__m512 fft6596 = _mm512_fmadd_ps(fft6579, fft6591, fft6587);
__m512 fft6681 = _mm512_fmadd_ps(fft6666, fft6591, fft6673);
__m512 fft6597 = _mm512_fnmadd_ps(fft6578, fft6591, fft6588);
__m512 fft6682 = _mm512_fnmadd_ps(fft6665, fft6591, fft6674);
__m512 fft6598 = _mm512_fmadd_ps(fft6581, fft6591, fft6589);
__m512 fft6683 = _mm512_fmadd_ps(fft6668, fft6591, fft6675);
__m512 fft6599 = _mm512_fnmadd_ps(fft6580, fft6591, fft6590);
__m512 fft6684 = _mm512_fnmadd_ps(fft6667, fft6591, fft6676);
__m512 fft6600 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6601 = _mm512_fmadd_ps(fft6592, fft6600, _mm512_shuffle_f32x4(fft6592, fft6592, 177));
__m512 fft6685 = _mm512_fmadd_ps(fft6677, fft6600, _mm512_shuffle_f32x4(fft6677, fft6677, 177));
__m512 fft6602 = _mm512_fmadd_ps(fft6593, fft6600, _mm512_shuffle_f32x4(fft6593, fft6593, 177));
__m512 fft6686 = _mm512_fmadd_ps(fft6678, fft6600, _mm512_shuffle_f32x4(fft6678, fft6678, 177));
__m512 fft6603 = _mm512_fmadd_ps(fft6594, fft6600, _mm512_shuffle_f32x4(fft6594, fft6594, 177));
__m512 fft6687 = _mm512_fmadd_ps(fft6679, fft6600, _mm512_shuffle_f32x4(fft6679, fft6679, 177));
__m512 fft6604 = _mm512_fmadd_ps(fft6595, fft6600, _mm512_shuffle_f32x4(fft6595, fft6595, 177));
__m512 fft6688 = _mm512_fmadd_ps(fft6680, fft6600, _mm512_shuffle_f32x4(fft6680, fft6680, 177));
__m512 fft6605 = _mm512_fmadd_ps(fft6596, fft6600, _mm512_shuffle_f32x4(fft6596, fft6596, 177));
__m512 fft6689 = _mm512_fmadd_ps(fft6681, fft6600, _mm512_shuffle_f32x4(fft6681, fft6681, 177));
__m512 fft6606 = _mm512_fmadd_ps(fft6597, fft6600, _mm512_shuffle_f32x4(fft6597, fft6597, 177));
__m512 fft6690 = _mm512_fmadd_ps(fft6682, fft6600, _mm512_shuffle_f32x4(fft6682, fft6682, 177));
__m512 fft6607 = _mm512_fmadd_ps(fft6598, fft6600, _mm512_shuffle_f32x4(fft6598, fft6598, 177));
__m512 fft6691 = _mm512_fmadd_ps(fft6683, fft6600, _mm512_shuffle_f32x4(fft6683, fft6683, 177));
__m512 fft6608 = _mm512_fmadd_ps(fft6599, fft6600, _mm512_shuffle_f32x4(fft6599, fft6599, 177));
__m512 fft6692 = _mm512_fmadd_ps(fft6684, fft6600, _mm512_shuffle_f32x4(fft6684, fft6684, 177));
__m512 fft6609 = _mm512_mask_mov_ps(fft6601, 49344, fft6602);
__m512 fft6693 = _mm512_mask_mov_ps(fft6685, 49344, fft6686);
__m512 fft6610 = _mm512_mask_sub_ps(fft6602, 49344, _mm512_setzero_ps(), fft6601);
__m512 fft6694 = _mm512_mask_sub_ps(fft6686, 49344, _mm512_setzero_ps(), fft6685);
__m512 fft6611 = _mm512_mask_mov_ps(fft6603, 49344, fft6604);
__m512 fft6695 = _mm512_mask_mov_ps(fft6687, 49344, fft6688);
__m512 fft6612 = _mm512_mask_sub_ps(fft6604, 49344, _mm512_setzero_ps(), fft6603);
__m512 fft6696 = _mm512_mask_sub_ps(fft6688, 49344, _mm512_setzero_ps(), fft6687);
__m512 fft6613 = _mm512_mask_mov_ps(fft6605, 49344, fft6606);
__m512 fft6697 = _mm512_mask_mov_ps(fft6689, 49344, fft6690);
__m512 fft6614 = _mm512_mask_sub_ps(fft6606, 49344, _mm512_setzero_ps(), fft6605);
__m512 fft6698 = _mm512_mask_sub_ps(fft6690, 49344, _mm512_setzero_ps(), fft6689);
__m512 fft6615 = _mm512_mask_mov_ps(fft6607, 49344, fft6608);
__m512 fft6699 = _mm512_mask_mov_ps(fft6691, 49344, fft6692);
__m512 fft6616 = _mm512_mask_sub_ps(fft6608, 49344, _mm512_setzero_ps(), fft6607);
__m512 fft6700 = _mm512_mask_sub_ps(fft6692, 49344, _mm512_setzero_ps(), fft6691);
__m512 fft6617 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6618 = _mm512_fmadd_ps(fft6609, fft6617, _mm512_shuffle_ps(fft6609, fft6609, 78));
__m512 fft6701 = _mm512_fmadd_ps(fft6693, fft6617, _mm512_shuffle_ps(fft6693, fft6693, 78));
__m512 fft6619 = _mm512_fmadd_ps(fft6610, fft6617, _mm512_shuffle_ps(fft6610, fft6610, 78));
__m512 fft6702 = _mm512_fmadd_ps(fft6694, fft6617, _mm512_shuffle_ps(fft6694, fft6694, 78));
__m512 fft6620 = _mm512_fmadd_ps(fft6611, fft6617, _mm512_shuffle_ps(fft6611, fft6611, 78));
__m512 fft6703 = _mm512_fmadd_ps(fft6695, fft6617, _mm512_shuffle_ps(fft6695, fft6695, 78));
__m512 fft6621 = _mm512_fmadd_ps(fft6612, fft6617, _mm512_shuffle_ps(fft6612, fft6612, 78));
__m512 fft6704 = _mm512_fmadd_ps(fft6696, fft6617, _mm512_shuffle_ps(fft6696, fft6696, 78));
__m512 fft6622 = _mm512_fmadd_ps(fft6613, fft6617, _mm512_shuffle_ps(fft6613, fft6613, 78));
__m512 fft6705 = _mm512_fmadd_ps(fft6697, fft6617, _mm512_shuffle_ps(fft6697, fft6697, 78));
__m512 fft6623 = _mm512_fmadd_ps(fft6614, fft6617, _mm512_shuffle_ps(fft6614, fft6614, 78));
__m512 fft6706 = _mm512_fmadd_ps(fft6698, fft6617, _mm512_shuffle_ps(fft6698, fft6698, 78));
__m512 fft6624 = _mm512_fmadd_ps(fft6615, fft6617, _mm512_shuffle_ps(fft6615, fft6615, 78));
__m512 fft6707 = _mm512_fmadd_ps(fft6699, fft6617, _mm512_shuffle_ps(fft6699, fft6699, 78));
__m512 fft6625 = _mm512_fmadd_ps(fft6616, fft6617, _mm512_shuffle_ps(fft6616, fft6616, 78));
__m512 fft6708 = _mm512_fmadd_ps(fft6700, fft6617, _mm512_shuffle_ps(fft6700, fft6700, 78));
__m512i fft6626 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6627 = _mm512_permutexvar_ps(fft6626, fft6618);
__m512 fft6709 = _mm512_permutexvar_ps(fft6626, fft6701);
__m512i fft6628 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6629 = _mm512_permutexvar_ps(fft6628, fft6618);
__m512 fft6710 = _mm512_permutexvar_ps(fft6628, fft6701);
__m512 fft6630 = _mm512_permutexvar_ps(fft6626, fft6619);
__m512 fft6711 = _mm512_permutexvar_ps(fft6626, fft6702);
__m512 fft6631 = _mm512_permutexvar_ps(fft6628, fft6619);
__m512 fft6712 = _mm512_permutexvar_ps(fft6628, fft6702);
__m512 fft6632 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6633 = _mm512_fmadd_ps(fft6627, fft6632, fft6629);
__m512 fft6713 = _mm512_fmadd_ps(fft6709, fft6632, fft6710);
__m512 fft6634 = _mm512_fnmadd_ps(fft6631, fft6632, fft6630);
__m512 fft6714 = _mm512_fnmadd_ps(fft6712, fft6632, fft6711);
__m512 fft6635 = _mm512_mask_mov_ps(fft6631, 21845, fft6633);
__m512 fft6715 = _mm512_mask_mov_ps(fft6712, 21845, fft6713);
__m512 fft6636 = _mm512_mask_mov_ps(fft6627, 43176, fft6633);
__m512 fft6716 = _mm512_mask_mov_ps(fft6709, 43176, fft6713);
__m512 fft6637 = _mm512_mask_mov_ps(fft6635, 43176, fft6634);
__m512 fft6717 = _mm512_mask_mov_ps(fft6715, 43176, fft6714);
__m512 fft6638 = _mm512_mask_mov_ps(fft6636, 22102, fft6634);
__m512 fft6718 = _mm512_mask_mov_ps(fft6716, 22102, fft6714);
__m512 fft6639 = _mm512_mask_mul_ps(fft6637, 64764, fft6637, _mm512_set1_ps(5e-01f));
__m512 fft6719 = _mm512_mask_mul_ps(fft6717, 64764, fft6717, _mm512_set1_ps(5e-01f));
__m512 fft6640 = _mm512_mask_mul_ps(fft6638, 64764, fft6638, _mm512_set1_ps(5e-01f));
__m512 fft6720 = _mm512_mask_mul_ps(fft6718, 64764, fft6718, _mm512_set1_ps(5e-01f));
__m512 df593 = fft6639;
__m512 df601 = fft6719;
__m512 df594 = fft6640;
__m512 df602 = fft6720;
__m512 df595 = fft6620;
__m512 df603 = fft6703;
__m512 df596 = fft6621;
__m512 df604 = fft6704;
__m512 df597 = fft6622;
__m512 df605 = fft6705;
__m512 df598 = fft6623;
__m512 df606 = fft6706;
__m512 df599 = fft6624;
__m512 df607 = fft6707;
__m512 df600 = fft6625;
__m512 df608 = fft6708;
__m512i eo40 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df595 = _mm512_permutexvar_ps(eo40, df595);
df596 = _mm512_permutexvar_ps(eo40, df596);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df595);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df596);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df595);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df596);
df603 = _mm512_permutexvar_ps(eo40, df603);
df604 = _mm512_permutexvar_ps(eo40, df604);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df603);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df604);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df603);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df604);
df597 = _mm512_permutexvar_ps(eo40, df597);
df598 = _mm512_permutexvar_ps(eo40, df598);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df597);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df598);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df597);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df598);
df605 = _mm512_permutexvar_ps(eo40, df605);
df606 = _mm512_permutexvar_ps(eo40, df606);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df605);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df606);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df605);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df606);
df599 = _mm512_permutexvar_ps(eo40, df599);
df600 = _mm512_permutexvar_ps(eo40, df600);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df599);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df600);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df599);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df600);
df607 = _mm512_permutexvar_ps(eo40, df607);
df608 = _mm512_permutexvar_ps(eo40, df608);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df607);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df608);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df607);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df608);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df593);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df594);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df593);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df594);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df601);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k18+128*m40+32*f41, 255, df602);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df601);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k18+128*m40+32*f41, 65280, df602);
}
}
if (j2 >= last1) return;
++j2;
rel3 = 1;
}
// Branch taken while rel3 is below 4. For each j2 step it reads a window of
// input from datPtr1 (seven loads spaced 896 apart, starting at the h18/w18
// offsets), applies the per-k19 multiply/add taken from bnPtr2, pushes the
// values through the fft* add/sub/shuffle network and scatters the results
// into dfPtr1. Returns from the enclosing function as soon as j2 reaches last1.
// NOTE(review): the fft*/df* names suggest a forward FFT of a zero-padded tile
// for an FFT-based convolution -- confirm against the generator.
// NOTE(review): the offsets look like byte offsets (896 = 4*224, which would be
// one row of 224 floats) -- confirm that datPtr1/dfPtr1 are char pointers.
if (rel3 < 4) {
ptrdiff_t h18 = base3+10;
// Starting w offset for this rel3; it advances by 60 on every j2 step.
ptrdiff_t w18 = -20+60*rel3;
// Last j2 handled by this branch, i.e. at most 4-rel3 steps.
ptrdiff_t jj8 = 3-rel3+j2;
// j2 itself is incremented at the bottom of the loop body, not in the header.
for (; j2 <= jj8; w18 += 60) {
// Three consecutive k19 values per s1: 3*s1, 3*s1+1, 3*s1+2.
ptrdiff_t k19 = 3*s1;
ptrdiff_t kk18 = k19+2;
for (; k19 <= kk18; ++k19) {
// bnPtr2 is read as pairs of floats indexed by k19+3*i6:
// element [0] is broadcast as the multiplier, element [1] as the addend.
__m512 bnMul18 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k19+3*i6))[0]);
__m512 bnAdd18 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k19+3*i6))[1]);
// Six sub-blocks per (j2, k19); consecutive sub-blocks read 40 further along.
for (ptrdiff_t b41 = 0; b41 < 6; ++b41) {
// b41 is split into (b41/2, b41%2) for the dfPtr1 store address below.
ptrdiff_t m41 = (size_t)b41/2;
ptrdiff_t f42 = (size_t)b41%2;
// Seven 16-lane loads (mask 65535 enables every lane), each followed by an
// in-place dat*bnMul18+bnAdd18.
__m512 dat576 = _mm512_maskz_loadu_ps(65535, datPtr1+0+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat576 = _mm512_mask_fmadd_ps(dat576, 65535, bnMul18, bnAdd18);
__m512 dat577 = _mm512_maskz_loadu_ps(65535, datPtr1+896+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat577 = _mm512_mask_fmadd_ps(dat577, 65535, bnMul18, bnAdd18);
__m512 dat578 = _mm512_maskz_loadu_ps(65535, datPtr1+1792+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat578 = _mm512_mask_fmadd_ps(dat578, 65535, bnMul18, bnAdd18);
__m512 dat579 = _mm512_maskz_loadu_ps(65535, datPtr1+2688+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat579 = _mm512_mask_fmadd_ps(dat579, 65535, bnMul18, bnAdd18);
__m512 dat580 = _mm512_maskz_loadu_ps(65535, datPtr1+3584+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat580 = _mm512_mask_fmadd_ps(dat580, 65535, bnMul18, bnAdd18);
__m512 dat581 = _mm512_maskz_loadu_ps(65535, datPtr1+4480+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat581 = _mm512_mask_fmadd_ps(dat581, 65535, bnMul18, bnAdd18);
__m512 dat582 = _mm512_maskz_loadu_ps(65535, datPtr1+5376+602112*i6+200704*k19+896*h18+4*w18+40*b41);
dat582 = _mm512_mask_fmadd_ps(dat582, 65535, bnMul18, bnAdd18);
// Two independent chains are interleaved line by line from here on:
// one built from dat576/578/580/582 (fft6721..fft6808) and one built from
// dat577/579/581 plus an all-zero input (fft6809..fft6888). The constant
// vectors (fft6741, fft6750, fft6759, fft6768, fft6785, fft6794, fft6796,
// fft6800) are shared by both chains.
// First stage: every loaded vector is added to / subtracted from zero; the
// fourth input of the second chain (fft6815/fft6816) is zero +/- zero.
__m512 fft6721 = _mm512_add_ps(dat576, _mm512_setzero_ps());
__m512 fft6809 = _mm512_add_ps(dat577, _mm512_setzero_ps());
__m512 fft6722 = _mm512_sub_ps(dat576, _mm512_setzero_ps());
__m512 fft6810 = _mm512_sub_ps(dat577, _mm512_setzero_ps());
__m512 fft6723 = _mm512_add_ps(dat578, _mm512_setzero_ps());
__m512 fft6811 = _mm512_add_ps(dat579, _mm512_setzero_ps());
__m512 fft6724 = _mm512_sub_ps(dat578, _mm512_setzero_ps());
__m512 fft6812 = _mm512_sub_ps(dat579, _mm512_setzero_ps());
__m512 fft6725 = _mm512_add_ps(dat580, _mm512_setzero_ps());
__m512 fft6813 = _mm512_add_ps(dat581, _mm512_setzero_ps());
__m512 fft6726 = _mm512_sub_ps(dat580, _mm512_setzero_ps());
__m512 fft6814 = _mm512_sub_ps(dat581, _mm512_setzero_ps());
__m512 fft6727 = _mm512_add_ps(dat582, _mm512_setzero_ps());
__m512 fft6815 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6728 = _mm512_sub_ps(dat582, _mm512_setzero_ps());
__m512 fft6816 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
// Add/sub combinations across whole vectors, including scaling by
// 7.0710677e-01f (approximately 1/sqrt(2)).
__m512 fft6729 = _mm512_add_ps(fft6721, fft6725);
__m512 fft6817 = _mm512_add_ps(fft6809, fft6813);
__m512 fft6730 = _mm512_sub_ps(fft6721, fft6725);
__m512 fft6818 = _mm512_sub_ps(fft6809, fft6813);
__m512 fft6731 = _mm512_add_ps(fft6723, fft6727);
__m512 fft6819 = _mm512_add_ps(fft6811, fft6815);
__m512 fft6732 = _mm512_sub_ps(fft6727, fft6723);
__m512 fft6820 = _mm512_sub_ps(fft6815, fft6811);
__m512 fft6733 = _mm512_sub_ps(fft6724, fft6728);
__m512 fft6821 = _mm512_sub_ps(fft6812, fft6816);
__m512 fft6734 = _mm512_add_ps(fft6724, fft6728);
__m512 fft6822 = _mm512_add_ps(fft6812, fft6816);
__m512 fft6735 = _mm512_add_ps(fft6729, fft6731);
__m512 fft6823 = _mm512_add_ps(fft6817, fft6819);
__m512 fft6736 = _mm512_sub_ps(fft6729, fft6731);
__m512 fft6824 = _mm512_sub_ps(fft6817, fft6819);
__m512 fft6737 = _mm512_fmadd_ps(fft6733, _mm512_set1_ps(7.0710677e-01f), fft6722);
__m512 fft6825 = _mm512_fmadd_ps(fft6821, _mm512_set1_ps(7.0710677e-01f), fft6810);
__m512 fft6738 = _mm512_fnmsub_ps(fft6734, _mm512_set1_ps(7.0710677e-01f), fft6726);
__m512 fft6826 = _mm512_fnmsub_ps(fft6822, _mm512_set1_ps(7.0710677e-01f), fft6814);
__m512 fft6739 = _mm512_fnmadd_ps(fft6733, _mm512_set1_ps(7.0710677e-01f), fft6722);
__m512 fft6827 = _mm512_fnmadd_ps(fft6821, _mm512_set1_ps(7.0710677e-01f), fft6810);
__m512 fft6740 = _mm512_fnmadd_ps(fft6734, _mm512_set1_ps(7.0710677e-01f), fft6726);
__m512 fft6828 = _mm512_fnmadd_ps(fft6822, _mm512_set1_ps(7.0710677e-01f), fft6814);
// Within-vector stage: shuffle_f32x4 with 78 swaps the two 256-bit halves;
// the sign vector keeps the low half positive and negates the high half
// before the swapped copy is added.
__m512 fft6741 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6742 = _mm512_fmadd_ps(fft6735, fft6741, _mm512_shuffle_f32x4(fft6735, fft6735, 78));
__m512 fft6829 = _mm512_fmadd_ps(fft6823, fft6741, _mm512_shuffle_f32x4(fft6823, fft6823, 78));
__m512 fft6743 = _mm512_fmadd_ps(fft6736, fft6741, _mm512_shuffle_f32x4(fft6736, fft6736, 78));
__m512 fft6830 = _mm512_fmadd_ps(fft6824, fft6741, _mm512_shuffle_f32x4(fft6824, fft6824, 78));
__m512 fft6744 = _mm512_fmadd_ps(fft6737, fft6741, _mm512_shuffle_f32x4(fft6737, fft6737, 78));
__m512 fft6831 = _mm512_fmadd_ps(fft6825, fft6741, _mm512_shuffle_f32x4(fft6825, fft6825, 78));
__m512 fft6745 = _mm512_fmadd_ps(fft6738, fft6741, _mm512_shuffle_f32x4(fft6738, fft6738, 78));
__m512 fft6832 = _mm512_fmadd_ps(fft6826, fft6741, _mm512_shuffle_f32x4(fft6826, fft6826, 78));
__m512 fft6746 = _mm512_fmadd_ps(fft6730, fft6741, _mm512_shuffle_f32x4(fft6730, fft6730, 78));
__m512 fft6833 = _mm512_fmadd_ps(fft6818, fft6741, _mm512_shuffle_f32x4(fft6818, fft6818, 78));
__m512 fft6747 = _mm512_fmadd_ps(fft6732, fft6741, _mm512_shuffle_f32x4(fft6732, fft6732, 78));
__m512 fft6834 = _mm512_fmadd_ps(fft6820, fft6741, _mm512_shuffle_f32x4(fft6820, fft6820, 78));
__m512 fft6748 = _mm512_fmadd_ps(fft6739, fft6741, _mm512_shuffle_f32x4(fft6739, fft6739, 78));
__m512 fft6835 = _mm512_fmadd_ps(fft6827, fft6741, _mm512_shuffle_f32x4(fft6827, fft6827, 78));
__m512 fft6749 = _mm512_fmadd_ps(fft6740, fft6741, _mm512_shuffle_f32x4(fft6740, fft6740, 78));
__m512 fft6836 = _mm512_fmadd_ps(fft6828, fft6741, _mm512_shuffle_f32x4(fft6828, fft6828, 78));
// Per-lane weighting: each vector is multiplied by fft6750, then a partner
// vector weighted by fft6759 is added (fmadd) or subtracted (fnmadd). The low
// eight lanes pass through unchanged (weights 1 and 0).
// NOTE(review): presumably the twiddle-factor multiplication -- confirm.
__m512 fft6750 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6751 = _mm512_mul_ps(fft6742, fft6750);
__m512 fft6837 = _mm512_mul_ps(fft6829, fft6750);
__m512 fft6752 = _mm512_mul_ps(fft6743, fft6750);
__m512 fft6838 = _mm512_mul_ps(fft6830, fft6750);
__m512 fft6753 = _mm512_mul_ps(fft6744, fft6750);
__m512 fft6839 = _mm512_mul_ps(fft6831, fft6750);
__m512 fft6754 = _mm512_mul_ps(fft6745, fft6750);
__m512 fft6840 = _mm512_mul_ps(fft6832, fft6750);
__m512 fft6755 = _mm512_mul_ps(fft6746, fft6750);
__m512 fft6841 = _mm512_mul_ps(fft6833, fft6750);
__m512 fft6756 = _mm512_mul_ps(fft6747, fft6750);
__m512 fft6842 = _mm512_mul_ps(fft6834, fft6750);
__m512 fft6757 = _mm512_mul_ps(fft6748, fft6750);
__m512 fft6843 = _mm512_mul_ps(fft6835, fft6750);
__m512 fft6758 = _mm512_mul_ps(fft6749, fft6750);
__m512 fft6844 = _mm512_mul_ps(fft6836, fft6750);
__m512 fft6759 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6760 = _mm512_fmadd_ps(fft6743, fft6759, fft6751);
__m512 fft6845 = _mm512_fmadd_ps(fft6830, fft6759, fft6837);
__m512 fft6761 = _mm512_fnmadd_ps(fft6742, fft6759, fft6752);
__m512 fft6846 = _mm512_fnmadd_ps(fft6829, fft6759, fft6838);
__m512 fft6762 = _mm512_fmadd_ps(fft6745, fft6759, fft6753);
__m512 fft6847 = _mm512_fmadd_ps(fft6832, fft6759, fft6839);
__m512 fft6763 = _mm512_fnmadd_ps(fft6744, fft6759, fft6754);
__m512 fft6848 = _mm512_fnmadd_ps(fft6831, fft6759, fft6840);
__m512 fft6764 = _mm512_fmadd_ps(fft6747, fft6759, fft6755);
__m512 fft6849 = _mm512_fmadd_ps(fft6834, fft6759, fft6841);
__m512 fft6765 = _mm512_fnmadd_ps(fft6746, fft6759, fft6756);
__m512 fft6850 = _mm512_fnmadd_ps(fft6833, fft6759, fft6842);
__m512 fft6766 = _mm512_fmadd_ps(fft6749, fft6759, fft6757);
__m512 fft6851 = _mm512_fmadd_ps(fft6836, fft6759, fft6843);
__m512 fft6767 = _mm512_fnmadd_ps(fft6748, fft6759, fft6758);
__m512 fft6852 = _mm512_fnmadd_ps(fft6835, fft6759, fft6844);
// Next within-vector stage: shuffle_f32x4 with 177 swaps adjacent 128-bit
// blocks; signs alternate per block of four lanes.
__m512 fft6768 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6769 = _mm512_fmadd_ps(fft6760, fft6768, _mm512_shuffle_f32x4(fft6760, fft6760, 177));
__m512 fft6853 = _mm512_fmadd_ps(fft6845, fft6768, _mm512_shuffle_f32x4(fft6845, fft6845, 177));
__m512 fft6770 = _mm512_fmadd_ps(fft6761, fft6768, _mm512_shuffle_f32x4(fft6761, fft6761, 177));
__m512 fft6854 = _mm512_fmadd_ps(fft6846, fft6768, _mm512_shuffle_f32x4(fft6846, fft6846, 177));
__m512 fft6771 = _mm512_fmadd_ps(fft6762, fft6768, _mm512_shuffle_f32x4(fft6762, fft6762, 177));
__m512 fft6855 = _mm512_fmadd_ps(fft6847, fft6768, _mm512_shuffle_f32x4(fft6847, fft6847, 177));
__m512 fft6772 = _mm512_fmadd_ps(fft6763, fft6768, _mm512_shuffle_f32x4(fft6763, fft6763, 177));
__m512 fft6856 = _mm512_fmadd_ps(fft6848, fft6768, _mm512_shuffle_f32x4(fft6848, fft6848, 177));
__m512 fft6773 = _mm512_fmadd_ps(fft6764, fft6768, _mm512_shuffle_f32x4(fft6764, fft6764, 177));
__m512 fft6857 = _mm512_fmadd_ps(fft6849, fft6768, _mm512_shuffle_f32x4(fft6849, fft6849, 177));
__m512 fft6774 = _mm512_fmadd_ps(fft6765, fft6768, _mm512_shuffle_f32x4(fft6765, fft6765, 177));
__m512 fft6858 = _mm512_fmadd_ps(fft6850, fft6768, _mm512_shuffle_f32x4(fft6850, fft6850, 177));
__m512 fft6775 = _mm512_fmadd_ps(fft6766, fft6768, _mm512_shuffle_f32x4(fft6766, fft6766, 177));
__m512 fft6859 = _mm512_fmadd_ps(fft6851, fft6768, _mm512_shuffle_f32x4(fft6851, fft6851, 177));
__m512 fft6776 = _mm512_fmadd_ps(fft6767, fft6768, _mm512_shuffle_f32x4(fft6767, fft6767, 177));
__m512 fft6860 = _mm512_fmadd_ps(fft6852, fft6768, _mm512_shuffle_f32x4(fft6852, fft6852, 177));
// Mask 49344 selects lanes 6, 7, 14 and 15: in those lanes each pair of
// vectors is exchanged, with the second one receiving 0 - (first).
__m512 fft6777 = _mm512_mask_mov_ps(fft6769, 49344, fft6770);
__m512 fft6861 = _mm512_mask_mov_ps(fft6853, 49344, fft6854);
__m512 fft6778 = _mm512_mask_sub_ps(fft6770, 49344, _mm512_setzero_ps(), fft6769);
__m512 fft6862 = _mm512_mask_sub_ps(fft6854, 49344, _mm512_setzero_ps(), fft6853);
__m512 fft6779 = _mm512_mask_mov_ps(fft6771, 49344, fft6772);
__m512 fft6863 = _mm512_mask_mov_ps(fft6855, 49344, fft6856);
__m512 fft6780 = _mm512_mask_sub_ps(fft6772, 49344, _mm512_setzero_ps(), fft6771);
__m512 fft6864 = _mm512_mask_sub_ps(fft6856, 49344, _mm512_setzero_ps(), fft6855);
__m512 fft6781 = _mm512_mask_mov_ps(fft6773, 49344, fft6774);
__m512 fft6865 = _mm512_mask_mov_ps(fft6857, 49344, fft6858);
__m512 fft6782 = _mm512_mask_sub_ps(fft6774, 49344, _mm512_setzero_ps(), fft6773);
__m512 fft6866 = _mm512_mask_sub_ps(fft6858, 49344, _mm512_setzero_ps(), fft6857);
__m512 fft6783 = _mm512_mask_mov_ps(fft6775, 49344, fft6776);
__m512 fft6867 = _mm512_mask_mov_ps(fft6859, 49344, fft6860);
__m512 fft6784 = _mm512_mask_sub_ps(fft6776, 49344, _mm512_setzero_ps(), fft6775);
__m512 fft6868 = _mm512_mask_sub_ps(fft6860, 49344, _mm512_setzero_ps(), fft6859);
// Last add/sub stage: shuffle_ps with 78 swaps the two lane pairs inside each
// 128-bit block; signs alternate per pair of lanes.
__m512 fft6785 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6786 = _mm512_fmadd_ps(fft6777, fft6785, _mm512_shuffle_ps(fft6777, fft6777, 78));
__m512 fft6869 = _mm512_fmadd_ps(fft6861, fft6785, _mm512_shuffle_ps(fft6861, fft6861, 78));
__m512 fft6787 = _mm512_fmadd_ps(fft6778, fft6785, _mm512_shuffle_ps(fft6778, fft6778, 78));
__m512 fft6870 = _mm512_fmadd_ps(fft6862, fft6785, _mm512_shuffle_ps(fft6862, fft6862, 78));
__m512 fft6788 = _mm512_fmadd_ps(fft6779, fft6785, _mm512_shuffle_ps(fft6779, fft6779, 78));
__m512 fft6871 = _mm512_fmadd_ps(fft6863, fft6785, _mm512_shuffle_ps(fft6863, fft6863, 78));
__m512 fft6789 = _mm512_fmadd_ps(fft6780, fft6785, _mm512_shuffle_ps(fft6780, fft6780, 78));
__m512 fft6872 = _mm512_fmadd_ps(fft6864, fft6785, _mm512_shuffle_ps(fft6864, fft6864, 78));
__m512 fft6790 = _mm512_fmadd_ps(fft6781, fft6785, _mm512_shuffle_ps(fft6781, fft6781, 78));
__m512 fft6873 = _mm512_fmadd_ps(fft6865, fft6785, _mm512_shuffle_ps(fft6865, fft6865, 78));
__m512 fft6791 = _mm512_fmadd_ps(fft6782, fft6785, _mm512_shuffle_ps(fft6782, fft6782, 78));
__m512 fft6874 = _mm512_fmadd_ps(fft6866, fft6785, _mm512_shuffle_ps(fft6866, fft6866, 78));
__m512 fft6792 = _mm512_fmadd_ps(fft6783, fft6785, _mm512_shuffle_ps(fft6783, fft6783, 78));
__m512 fft6875 = _mm512_fmadd_ps(fft6867, fft6785, _mm512_shuffle_ps(fft6867, fft6867, 78));
__m512 fft6793 = _mm512_fmadd_ps(fft6784, fft6785, _mm512_shuffle_ps(fft6784, fft6784, 78));
__m512 fft6876 = _mm512_fmadd_ps(fft6868, fft6785, _mm512_shuffle_ps(fft6868, fft6868, 78));
// Extra post-processing applied only to the first two vectors of each chain
// (fft6786/fft6787 and fft6869/fft6870): permute, recombine with +/-1 weights,
// merge lanes by mask, then halve the lanes selected by mask 64764.
// NOTE(review): looks like the real-FFT untangling step -- confirm.
__m512i fft6794 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6795 = _mm512_permutexvar_ps(fft6794, fft6786);
__m512 fft6877 = _mm512_permutexvar_ps(fft6794, fft6869);
__m512i fft6796 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6797 = _mm512_permutexvar_ps(fft6796, fft6786);
__m512 fft6878 = _mm512_permutexvar_ps(fft6796, fft6869);
__m512 fft6798 = _mm512_permutexvar_ps(fft6794, fft6787);
__m512 fft6879 = _mm512_permutexvar_ps(fft6794, fft6870);
__m512 fft6799 = _mm512_permutexvar_ps(fft6796, fft6787);
__m512 fft6880 = _mm512_permutexvar_ps(fft6796, fft6870);
__m512 fft6800 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6801 = _mm512_fmadd_ps(fft6795, fft6800, fft6797);
__m512 fft6881 = _mm512_fmadd_ps(fft6877, fft6800, fft6878);
__m512 fft6802 = _mm512_fnmadd_ps(fft6799, fft6800, fft6798);
__m512 fft6882 = _mm512_fnmadd_ps(fft6880, fft6800, fft6879);
__m512 fft6803 = _mm512_mask_mov_ps(fft6799, 21845, fft6801);
__m512 fft6883 = _mm512_mask_mov_ps(fft6880, 21845, fft6881);
__m512 fft6804 = _mm512_mask_mov_ps(fft6795, 43176, fft6801);
__m512 fft6884 = _mm512_mask_mov_ps(fft6877, 43176, fft6881);
__m512 fft6805 = _mm512_mask_mov_ps(fft6803, 43176, fft6802);
__m512 fft6885 = _mm512_mask_mov_ps(fft6883, 43176, fft6882);
__m512 fft6806 = _mm512_mask_mov_ps(fft6804, 22102, fft6802);
__m512 fft6886 = _mm512_mask_mov_ps(fft6884, 22102, fft6882);
__m512 fft6807 = _mm512_mask_mul_ps(fft6805, 64764, fft6805, _mm512_set1_ps(5e-01f));
__m512 fft6887 = _mm512_mask_mul_ps(fft6885, 64764, fft6885, _mm512_set1_ps(5e-01f));
__m512 fft6808 = _mm512_mask_mul_ps(fft6806, 64764, fft6806, _mm512_set1_ps(5e-01f));
__m512 fft6888 = _mm512_mask_mul_ps(fft6886, 64764, fft6886, _mm512_set1_ps(5e-01f));
// Rename the sixteen result vectors: df609..df616 come from the first chain,
// df617..df624 from the second.
__m512 df609 = fft6807;
__m512 df617 = fft6887;
__m512 df610 = fft6808;
__m512 df618 = fft6888;
__m512 df611 = fft6788;
__m512 df619 = fft6871;
__m512 df612 = fft6789;
__m512 df620 = fft6872;
__m512 df613 = fft6790;
__m512 df621 = fft6873;
__m512 df614 = fft6791;
__m512 df622 = fft6874;
__m512 df615 = fft6792;
__m512 df623 = fft6875;
__m512 df616 = fft6793;
__m512 df624 = fft6876;
// eo41 gathers even-indexed source lanes into the low eight lanes and
// odd-indexed source lanes into the high eight lanes.
__m512i eo41 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern used for every vector below: mask 255 writes lanes 0-7 at the
// given address; mask 65280 writes lanes 8-15, which land 32 bytes past the
// given address (so e.g. offset 508768 places them at 508800).
df611 = _mm512_permutexvar_ps(eo41, df611);
df612 = _mm512_permutexvar_ps(eo41, df612);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df611);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df612);
_mm512_mask_storeu_ps(dfPtr1+508768+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df611);
_mm512_mask_storeu_ps(dfPtr1+508832+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df612);
df619 = _mm512_permutexvar_ps(eo41, df619);
df620 = _mm512_permutexvar_ps(eo41, df620);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df619);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df620);
_mm512_mask_storeu_ps(dfPtr1+1322848+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df619);
_mm512_mask_storeu_ps(dfPtr1+1322912+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df620);
df613 = _mm512_permutexvar_ps(eo41, df613);
df614 = _mm512_permutexvar_ps(eo41, df614);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df613);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df614);
_mm512_mask_storeu_ps(dfPtr1+610528+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df613);
_mm512_mask_storeu_ps(dfPtr1+610592+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df614);
df621 = _mm512_permutexvar_ps(eo41, df621);
df622 = _mm512_permutexvar_ps(eo41, df622);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df621);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df622);
_mm512_mask_storeu_ps(dfPtr1+1424608+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df621);
_mm512_mask_storeu_ps(dfPtr1+1424672+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df622);
df615 = _mm512_permutexvar_ps(eo41, df615);
df616 = _mm512_permutexvar_ps(eo41, df616);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df615);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df616);
_mm512_mask_storeu_ps(dfPtr1+712288+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df615);
_mm512_mask_storeu_ps(dfPtr1+712352+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df616);
df623 = _mm512_permutexvar_ps(eo41, df623);
df624 = _mm512_permutexvar_ps(eo41, df624);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df623);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df624);
_mm512_mask_storeu_ps(dfPtr1+1526368+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df623);
_mm512_mask_storeu_ps(dfPtr1+1526432+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df624);
// df609/df610/df617/df618 are stored as they are, without the eo41 permutation.
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df609);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df610);
_mm512_mask_storeu_ps(dfPtr1+407008+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df609);
_mm512_mask_storeu_ps(dfPtr1+407072+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df610);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df617);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+384*k19+128*m41+32*f42, 255, df618);
_mm512_mask_storeu_ps(dfPtr1+1221088+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df617);
_mm512_mask_storeu_ps(dfPtr1+1221152+407040*i6+1152*j2+384*k19+128*m41+32*f42, 65280, df618);
}
}
// Leave the enclosing function once the final j2 (last1) has been processed;
// otherwise move on to the next j2.
if (j2 >= last1) return;
++j2;
}
// Every step of this branch has run; record that rel3 has reached 4.
rel3 = 4;
}
ptrdiff_t h19 = base3+10;
ptrdiff_t w19 = 220;
ptrdiff_t k20 = 3*s1;
ptrdiff_t kk19 = k20+2;
for (; k20 <= kk19; ++k20) {
__m512 bnMul19 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k20+3*i6))[0]);
__m512 bnAdd19 = _mm512_set1_ps(((float*)bnPtr2+(ptrdiff_t)2*(k20+3*i6))[1]);
ptrdiff_t b42 = 0;
ptrdiff_t m42 = (size_t)b42/2;
ptrdiff_t f43 = (size_t)b42%2;
__m512 dat583 = _mm512_maskz_loadu_ps(127, datPtr1+0+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat583 = _mm512_mask_fmadd_ps(dat583, 127, bnMul19, bnAdd19);
__m512 dat584 = _mm512_maskz_loadu_ps(127, datPtr1+896+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat584 = _mm512_mask_fmadd_ps(dat584, 127, bnMul19, bnAdd19);
__m512 dat585 = _mm512_maskz_loadu_ps(127, datPtr1+1792+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat585 = _mm512_mask_fmadd_ps(dat585, 127, bnMul19, bnAdd19);
__m512 dat586 = _mm512_maskz_loadu_ps(127, datPtr1+2688+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat586 = _mm512_mask_fmadd_ps(dat586, 127, bnMul19, bnAdd19);
__m512 dat587 = _mm512_maskz_loadu_ps(127, datPtr1+3584+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat587 = _mm512_mask_fmadd_ps(dat587, 127, bnMul19, bnAdd19);
__m512 dat588 = _mm512_maskz_loadu_ps(127, datPtr1+4480+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat588 = _mm512_mask_fmadd_ps(dat588, 127, bnMul19, bnAdd19);
__m512 dat589 = _mm512_maskz_loadu_ps(127, datPtr1+5376+602112*i6+200704*k20+896*h19+4*w19+0*b42);
dat589 = _mm512_mask_fmadd_ps(dat589, 127, bnMul19, bnAdd19);
__m512 fft6889 = _mm512_add_ps(dat583, _mm512_setzero_ps());
__m512 fft6977 = _mm512_add_ps(dat584, _mm512_setzero_ps());
__m512 fft6890 = _mm512_sub_ps(dat583, _mm512_setzero_ps());
__m512 fft6978 = _mm512_sub_ps(dat584, _mm512_setzero_ps());
__m512 fft6891 = _mm512_add_ps(dat585, _mm512_setzero_ps());
__m512 fft6979 = _mm512_add_ps(dat586, _mm512_setzero_ps());
__m512 fft6892 = _mm512_sub_ps(dat585, _mm512_setzero_ps());
__m512 fft6980 = _mm512_sub_ps(dat586, _mm512_setzero_ps());
__m512 fft6893 = _mm512_add_ps(dat587, _mm512_setzero_ps());
__m512 fft6981 = _mm512_add_ps(dat588, _mm512_setzero_ps());
__m512 fft6894 = _mm512_sub_ps(dat587, _mm512_setzero_ps());
__m512 fft6982 = _mm512_sub_ps(dat588, _mm512_setzero_ps());
__m512 fft6895 = _mm512_add_ps(dat589, _mm512_setzero_ps());
__m512 fft6983 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6896 = _mm512_sub_ps(dat589, _mm512_setzero_ps());
__m512 fft6984 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft6897 = _mm512_add_ps(fft6889, fft6893);
__m512 fft6985 = _mm512_add_ps(fft6977, fft6981);
__m512 fft6898 = _mm512_sub_ps(fft6889, fft6893);
__m512 fft6986 = _mm512_sub_ps(fft6977, fft6981);
__m512 fft6899 = _mm512_add_ps(fft6891, fft6895);
__m512 fft6987 = _mm512_add_ps(fft6979, fft6983);
__m512 fft6900 = _mm512_sub_ps(fft6895, fft6891);
__m512 fft6988 = _mm512_sub_ps(fft6983, fft6979);
__m512 fft6901 = _mm512_sub_ps(fft6892, fft6896);
__m512 fft6989 = _mm512_sub_ps(fft6980, fft6984);
__m512 fft6902 = _mm512_add_ps(fft6892, fft6896);
__m512 fft6990 = _mm512_add_ps(fft6980, fft6984);
__m512 fft6903 = _mm512_add_ps(fft6897, fft6899);
__m512 fft6991 = _mm512_add_ps(fft6985, fft6987);
__m512 fft6904 = _mm512_sub_ps(fft6897, fft6899);
__m512 fft6992 = _mm512_sub_ps(fft6985, fft6987);
__m512 fft6905 = _mm512_fmadd_ps(fft6901, _mm512_set1_ps(7.0710677e-01f), fft6890);
__m512 fft6993 = _mm512_fmadd_ps(fft6989, _mm512_set1_ps(7.0710677e-01f), fft6978);
__m512 fft6906 = _mm512_fnmsub_ps(fft6902, _mm512_set1_ps(7.0710677e-01f), fft6894);
__m512 fft6994 = _mm512_fnmsub_ps(fft6990, _mm512_set1_ps(7.0710677e-01f), fft6982);
__m512 fft6907 = _mm512_fnmadd_ps(fft6901, _mm512_set1_ps(7.0710677e-01f), fft6890);
__m512 fft6995 = _mm512_fnmadd_ps(fft6989, _mm512_set1_ps(7.0710677e-01f), fft6978);
__m512 fft6908 = _mm512_fnmadd_ps(fft6902, _mm512_set1_ps(7.0710677e-01f), fft6894);
__m512 fft6996 = _mm512_fnmadd_ps(fft6990, _mm512_set1_ps(7.0710677e-01f), fft6982);
__m512 fft6909 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6910 = _mm512_fmadd_ps(fft6903, fft6909, _mm512_shuffle_f32x4(fft6903, fft6903, 78));
__m512 fft6997 = _mm512_fmadd_ps(fft6991, fft6909, _mm512_shuffle_f32x4(fft6991, fft6991, 78));
__m512 fft6911 = _mm512_fmadd_ps(fft6904, fft6909, _mm512_shuffle_f32x4(fft6904, fft6904, 78));
__m512 fft6998 = _mm512_fmadd_ps(fft6992, fft6909, _mm512_shuffle_f32x4(fft6992, fft6992, 78));
__m512 fft6912 = _mm512_fmadd_ps(fft6905, fft6909, _mm512_shuffle_f32x4(fft6905, fft6905, 78));
__m512 fft6999 = _mm512_fmadd_ps(fft6993, fft6909, _mm512_shuffle_f32x4(fft6993, fft6993, 78));
__m512 fft6913 = _mm512_fmadd_ps(fft6906, fft6909, _mm512_shuffle_f32x4(fft6906, fft6906, 78));
__m512 fft7000 = _mm512_fmadd_ps(fft6994, fft6909, _mm512_shuffle_f32x4(fft6994, fft6994, 78));
__m512 fft6914 = _mm512_fmadd_ps(fft6898, fft6909, _mm512_shuffle_f32x4(fft6898, fft6898, 78));
__m512 fft7001 = _mm512_fmadd_ps(fft6986, fft6909, _mm512_shuffle_f32x4(fft6986, fft6986, 78));
__m512 fft6915 = _mm512_fmadd_ps(fft6900, fft6909, _mm512_shuffle_f32x4(fft6900, fft6900, 78));
__m512 fft7002 = _mm512_fmadd_ps(fft6988, fft6909, _mm512_shuffle_f32x4(fft6988, fft6988, 78));
__m512 fft6916 = _mm512_fmadd_ps(fft6907, fft6909, _mm512_shuffle_f32x4(fft6907, fft6907, 78));
__m512 fft7003 = _mm512_fmadd_ps(fft6995, fft6909, _mm512_shuffle_f32x4(fft6995, fft6995, 78));
__m512 fft6917 = _mm512_fmadd_ps(fft6908, fft6909, _mm512_shuffle_f32x4(fft6908, fft6908, 78));
__m512 fft7004 = _mm512_fmadd_ps(fft6996, fft6909, _mm512_shuffle_f32x4(fft6996, fft6996, 78));
__m512 fft6918 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft6919 = _mm512_mul_ps(fft6910, fft6918);
__m512 fft7005 = _mm512_mul_ps(fft6997, fft6918);
__m512 fft6920 = _mm512_mul_ps(fft6911, fft6918);
__m512 fft7006 = _mm512_mul_ps(fft6998, fft6918);
__m512 fft6921 = _mm512_mul_ps(fft6912, fft6918);
__m512 fft7007 = _mm512_mul_ps(fft6999, fft6918);
__m512 fft6922 = _mm512_mul_ps(fft6913, fft6918);
__m512 fft7008 = _mm512_mul_ps(fft7000, fft6918);
__m512 fft6923 = _mm512_mul_ps(fft6914, fft6918);
__m512 fft7009 = _mm512_mul_ps(fft7001, fft6918);
__m512 fft6924 = _mm512_mul_ps(fft6915, fft6918);
__m512 fft7010 = _mm512_mul_ps(fft7002, fft6918);
__m512 fft6925 = _mm512_mul_ps(fft6916, fft6918);
__m512 fft7011 = _mm512_mul_ps(fft7003, fft6918);
__m512 fft6926 = _mm512_mul_ps(fft6917, fft6918);
__m512 fft7012 = _mm512_mul_ps(fft7004, fft6918);
__m512 fft6927 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft6928 = _mm512_fmadd_ps(fft6911, fft6927, fft6919);
__m512 fft7013 = _mm512_fmadd_ps(fft6998, fft6927, fft7005);
__m512 fft6929 = _mm512_fnmadd_ps(fft6910, fft6927, fft6920);
__m512 fft7014 = _mm512_fnmadd_ps(fft6997, fft6927, fft7006);
__m512 fft6930 = _mm512_fmadd_ps(fft6913, fft6927, fft6921);
__m512 fft7015 = _mm512_fmadd_ps(fft7000, fft6927, fft7007);
__m512 fft6931 = _mm512_fnmadd_ps(fft6912, fft6927, fft6922);
__m512 fft7016 = _mm512_fnmadd_ps(fft6999, fft6927, fft7008);
__m512 fft6932 = _mm512_fmadd_ps(fft6915, fft6927, fft6923);
__m512 fft7017 = _mm512_fmadd_ps(fft7002, fft6927, fft7009);
__m512 fft6933 = _mm512_fnmadd_ps(fft6914, fft6927, fft6924);
__m512 fft7018 = _mm512_fnmadd_ps(fft7001, fft6927, fft7010);
__m512 fft6934 = _mm512_fmadd_ps(fft6917, fft6927, fft6925);
__m512 fft7019 = _mm512_fmadd_ps(fft7004, fft6927, fft7011);
__m512 fft6935 = _mm512_fnmadd_ps(fft6916, fft6927, fft6926);
__m512 fft7020 = _mm512_fnmadd_ps(fft7003, fft6927, fft7012);
__m512 fft6936 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft6937 = _mm512_fmadd_ps(fft6928, fft6936, _mm512_shuffle_f32x4(fft6928, fft6928, 177));
__m512 fft7021 = _mm512_fmadd_ps(fft7013, fft6936, _mm512_shuffle_f32x4(fft7013, fft7013, 177));
__m512 fft6938 = _mm512_fmadd_ps(fft6929, fft6936, _mm512_shuffle_f32x4(fft6929, fft6929, 177));
__m512 fft7022 = _mm512_fmadd_ps(fft7014, fft6936, _mm512_shuffle_f32x4(fft7014, fft7014, 177));
__m512 fft6939 = _mm512_fmadd_ps(fft6930, fft6936, _mm512_shuffle_f32x4(fft6930, fft6930, 177));
__m512 fft7023 = _mm512_fmadd_ps(fft7015, fft6936, _mm512_shuffle_f32x4(fft7015, fft7015, 177));
__m512 fft6940 = _mm512_fmadd_ps(fft6931, fft6936, _mm512_shuffle_f32x4(fft6931, fft6931, 177));
__m512 fft7024 = _mm512_fmadd_ps(fft7016, fft6936, _mm512_shuffle_f32x4(fft7016, fft7016, 177));
__m512 fft6941 = _mm512_fmadd_ps(fft6932, fft6936, _mm512_shuffle_f32x4(fft6932, fft6932, 177));
__m512 fft7025 = _mm512_fmadd_ps(fft7017, fft6936, _mm512_shuffle_f32x4(fft7017, fft7017, 177));
__m512 fft6942 = _mm512_fmadd_ps(fft6933, fft6936, _mm512_shuffle_f32x4(fft6933, fft6933, 177));
__m512 fft7026 = _mm512_fmadd_ps(fft7018, fft6936, _mm512_shuffle_f32x4(fft7018, fft7018, 177));
__m512 fft6943 = _mm512_fmadd_ps(fft6934, fft6936, _mm512_shuffle_f32x4(fft6934, fft6934, 177));
__m512 fft7027 = _mm512_fmadd_ps(fft7019, fft6936, _mm512_shuffle_f32x4(fft7019, fft7019, 177));
__m512 fft6944 = _mm512_fmadd_ps(fft6935, fft6936, _mm512_shuffle_f32x4(fft6935, fft6935, 177));
__m512 fft7028 = _mm512_fmadd_ps(fft7020, fft6936, _mm512_shuffle_f32x4(fft7020, fft7020, 177));
__m512 fft6945 = _mm512_mask_mov_ps(fft6937, 49344, fft6938);
__m512 fft7029 = _mm512_mask_mov_ps(fft7021, 49344, fft7022);
__m512 fft6946 = _mm512_mask_sub_ps(fft6938, 49344, _mm512_setzero_ps(), fft6937);
__m512 fft7030 = _mm512_mask_sub_ps(fft7022, 49344, _mm512_setzero_ps(), fft7021);
__m512 fft6947 = _mm512_mask_mov_ps(fft6939, 49344, fft6940);
__m512 fft7031 = _mm512_mask_mov_ps(fft7023, 49344, fft7024);
__m512 fft6948 = _mm512_mask_sub_ps(fft6940, 49344, _mm512_setzero_ps(), fft6939);
__m512 fft7032 = _mm512_mask_sub_ps(fft7024, 49344, _mm512_setzero_ps(), fft7023);
__m512 fft6949 = _mm512_mask_mov_ps(fft6941, 49344, fft6942);
__m512 fft7033 = _mm512_mask_mov_ps(fft7025, 49344, fft7026);
__m512 fft6950 = _mm512_mask_sub_ps(fft6942, 49344, _mm512_setzero_ps(), fft6941);
__m512 fft7034 = _mm512_mask_sub_ps(fft7026, 49344, _mm512_setzero_ps(), fft7025);
__m512 fft6951 = _mm512_mask_mov_ps(fft6943, 49344, fft6944);
__m512 fft7035 = _mm512_mask_mov_ps(fft7027, 49344, fft7028);
__m512 fft6952 = _mm512_mask_sub_ps(fft6944, 49344, _mm512_setzero_ps(), fft6943);
__m512 fft7036 = _mm512_mask_sub_ps(fft7028, 49344, _mm512_setzero_ps(), fft7027);
__m512 fft6953 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft6954 = _mm512_fmadd_ps(fft6945, fft6953, _mm512_shuffle_ps(fft6945, fft6945, 78));
__m512 fft7037 = _mm512_fmadd_ps(fft7029, fft6953, _mm512_shuffle_ps(fft7029, fft7029, 78));
__m512 fft6955 = _mm512_fmadd_ps(fft6946, fft6953, _mm512_shuffle_ps(fft6946, fft6946, 78));
__m512 fft7038 = _mm512_fmadd_ps(fft7030, fft6953, _mm512_shuffle_ps(fft7030, fft7030, 78));
__m512 fft6956 = _mm512_fmadd_ps(fft6947, fft6953, _mm512_shuffle_ps(fft6947, fft6947, 78));
__m512 fft7039 = _mm512_fmadd_ps(fft7031, fft6953, _mm512_shuffle_ps(fft7031, fft7031, 78));
__m512 fft6957 = _mm512_fmadd_ps(fft6948, fft6953, _mm512_shuffle_ps(fft6948, fft6948, 78));
__m512 fft7040 = _mm512_fmadd_ps(fft7032, fft6953, _mm512_shuffle_ps(fft7032, fft7032, 78));
__m512 fft6958 = _mm512_fmadd_ps(fft6949, fft6953, _mm512_shuffle_ps(fft6949, fft6949, 78));
__m512 fft7041 = _mm512_fmadd_ps(fft7033, fft6953, _mm512_shuffle_ps(fft7033, fft7033, 78));
__m512 fft6959 = _mm512_fmadd_ps(fft6950, fft6953, _mm512_shuffle_ps(fft6950, fft6950, 78));
__m512 fft7042 = _mm512_fmadd_ps(fft7034, fft6953, _mm512_shuffle_ps(fft7034, fft7034, 78));
__m512 fft6960 = _mm512_fmadd_ps(fft6951, fft6953, _mm512_shuffle_ps(fft6951, fft6951, 78));
__m512 fft7043 = _mm512_fmadd_ps(fft7035, fft6953, _mm512_shuffle_ps(fft7035, fft7035, 78));
__m512 fft6961 = _mm512_fmadd_ps(fft6952, fft6953, _mm512_shuffle_ps(fft6952, fft6952, 78));
__m512 fft7044 = _mm512_fmadd_ps(fft7036, fft6953, _mm512_shuffle_ps(fft7036, fft7036, 78));
__m512i fft6962 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft6963 = _mm512_permutexvar_ps(fft6962, fft6954);
__m512 fft7045 = _mm512_permutexvar_ps(fft6962, fft7037);
__m512i fft6964 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft6965 = _mm512_permutexvar_ps(fft6964, fft6954);
__m512 fft7046 = _mm512_permutexvar_ps(fft6964, fft7037);
__m512 fft6966 = _mm512_permutexvar_ps(fft6962, fft6955);
__m512 fft7047 = _mm512_permutexvar_ps(fft6962, fft7038);
__m512 fft6967 = _mm512_permutexvar_ps(fft6964, fft6955);
__m512 fft7048 = _mm512_permutexvar_ps(fft6964, fft7038);
__m512 fft6968 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft6969 = _mm512_fmadd_ps(fft6963, fft6968, fft6965);
__m512 fft7049 = _mm512_fmadd_ps(fft7045, fft6968, fft7046);
__m512 fft6970 = _mm512_fnmadd_ps(fft6967, fft6968, fft6966);
__m512 fft7050 = _mm512_fnmadd_ps(fft7048, fft6968, fft7047);
__m512 fft6971 = _mm512_mask_mov_ps(fft6967, 21845, fft6969);
__m512 fft7051 = _mm512_mask_mov_ps(fft7048, 21845, fft7049);
__m512 fft6972 = _mm512_mask_mov_ps(fft6963, 43176, fft6969);
__m512 fft7052 = _mm512_mask_mov_ps(fft7045, 43176, fft7049);
__m512 fft6973 = _mm512_mask_mov_ps(fft6971, 43176, fft6970);
__m512 fft7053 = _mm512_mask_mov_ps(fft7051, 43176, fft7050);
__m512 fft6974 = _mm512_mask_mov_ps(fft6972, 22102, fft6970);
__m512 fft7054 = _mm512_mask_mov_ps(fft7052, 22102, fft7050);
__m512 fft6975 = _mm512_mask_mul_ps(fft6973, 64764, fft6973, _mm512_set1_ps(5e-01f));
__m512 fft7055 = _mm512_mask_mul_ps(fft7053, 64764, fft7053, _mm512_set1_ps(5e-01f));
__m512 fft6976 = _mm512_mask_mul_ps(fft6974, 64764, fft6974, _mm512_set1_ps(5e-01f));
__m512 fft7056 = _mm512_mask_mul_ps(fft7054, 64764, fft7054, _mm512_set1_ps(5e-01f));
__m512 df625 = fft6975;
__m512 df633 = fft7055;
__m512 df626 = fft6976;
__m512 df634 = fft7056;
__m512 df627 = fft6956;
__m512 df635 = fft7039;
__m512 df628 = fft6957;
__m512 df636 = fft7040;
__m512 df629 = fft6958;
__m512 df637 = fft7041;
__m512 df630 = fft6959;
__m512 df638 = fft7042;
__m512 df631 = fft6960;
__m512 df639 = fft7043;
__m512 df632 = fft6961;
__m512 df640 = fft7044;
__m512i eo42 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df627 = _mm512_permutexvar_ps(eo42, df627);
df628 = _mm512_permutexvar_ps(eo42, df628);
__m512 rep1 = _mm512_shuffle_f32x4(df627, df627, 68);
_mm512_mask_storeu_ps(dfPtr1+101760+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep1);
__m512 rep2 = _mm512_shuffle_f32x4(df628, df628, 68);
_mm512_mask_storeu_ps(dfPtr1+101824+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep2);
__m512 rep3 = _mm512_shuffle_f32x4(df627, df627, 238);
_mm512_mask_storeu_ps(dfPtr1+508800+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep3);
__m512 rep4 = _mm512_shuffle_f32x4(df628, df628, 238);
_mm512_mask_storeu_ps(dfPtr1+508864+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep4);
df635 = _mm512_permutexvar_ps(eo42, df635);
df636 = _mm512_permutexvar_ps(eo42, df636);
__m512 rep5 = _mm512_shuffle_f32x4(df635, df635, 68);
_mm512_mask_storeu_ps(dfPtr1+915840+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep5);
__m512 rep6 = _mm512_shuffle_f32x4(df636, df636, 68);
_mm512_mask_storeu_ps(dfPtr1+915904+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep6);
__m512 rep7 = _mm512_shuffle_f32x4(df635, df635, 238);
_mm512_mask_storeu_ps(dfPtr1+1322880+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep7);
__m512 rep8 = _mm512_shuffle_f32x4(df636, df636, 238);
_mm512_mask_storeu_ps(dfPtr1+1322944+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep8);
df629 = _mm512_permutexvar_ps(eo42, df629);
df630 = _mm512_permutexvar_ps(eo42, df630);
__m512 rep9 = _mm512_shuffle_f32x4(df629, df629, 68);
_mm512_mask_storeu_ps(dfPtr1+203520+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep9);
__m512 rep10 = _mm512_shuffle_f32x4(df630, df630, 68);
_mm512_mask_storeu_ps(dfPtr1+203584+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep10);
__m512 rep11 = _mm512_shuffle_f32x4(df629, df629, 238);
_mm512_mask_storeu_ps(dfPtr1+610560+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep11);
__m512 rep12 = _mm512_shuffle_f32x4(df630, df630, 238);
_mm512_mask_storeu_ps(dfPtr1+610624+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep12);
df637 = _mm512_permutexvar_ps(eo42, df637);
df638 = _mm512_permutexvar_ps(eo42, df638);
__m512 rep13 = _mm512_shuffle_f32x4(df637, df637, 68);
_mm512_mask_storeu_ps(dfPtr1+1017600+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep13);
__m512 rep14 = _mm512_shuffle_f32x4(df638, df638, 68);
_mm512_mask_storeu_ps(dfPtr1+1017664+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep14);
__m512 rep15 = _mm512_shuffle_f32x4(df637, df637, 238);
_mm512_mask_storeu_ps(dfPtr1+1424640+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep15);
__m512 rep16 = _mm512_shuffle_f32x4(df638, df638, 238);
_mm512_mask_storeu_ps(dfPtr1+1424704+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep16);
df631 = _mm512_permutexvar_ps(eo42, df631);
df632 = _mm512_permutexvar_ps(eo42, df632);
__m512 rep17 = _mm512_shuffle_f32x4(df631, df631, 68);
_mm512_mask_storeu_ps(dfPtr1+305280+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep17);
__m512 rep18 = _mm512_shuffle_f32x4(df632, df632, 68);
_mm512_mask_storeu_ps(dfPtr1+305344+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep18);
__m512 rep19 = _mm512_shuffle_f32x4(df631, df631, 238);
_mm512_mask_storeu_ps(dfPtr1+712320+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep19);
__m512 rep20 = _mm512_shuffle_f32x4(df632, df632, 238);
_mm512_mask_storeu_ps(dfPtr1+712384+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep20);
df639 = _mm512_permutexvar_ps(eo42, df639);
df640 = _mm512_permutexvar_ps(eo42, df640);
__m512 rep21 = _mm512_shuffle_f32x4(df639, df639, 68);
_mm512_mask_storeu_ps(dfPtr1+1119360+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep21);
__m512 rep22 = _mm512_shuffle_f32x4(df640, df640, 68);
_mm512_mask_storeu_ps(dfPtr1+1119424+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep22);
__m512 rep23 = _mm512_shuffle_f32x4(df639, df639, 238);
_mm512_mask_storeu_ps(dfPtr1+1526400+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep23);
__m512 rep24 = _mm512_shuffle_f32x4(df640, df640, 238);
_mm512_mask_storeu_ps(dfPtr1+1526464+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep24);
__m512 rep25 = _mm512_shuffle_f32x4(df625, df625, 68);
_mm512_mask_storeu_ps(dfPtr1+0+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep25);
__m512 rep26 = _mm512_shuffle_f32x4(df626, df626, 68);
_mm512_mask_storeu_ps(dfPtr1+64+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep26);
__m512 rep27 = _mm512_shuffle_f32x4(df625, df625, 238);
_mm512_mask_storeu_ps(dfPtr1+407040+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep27);
__m512 rep28 = _mm512_shuffle_f32x4(df626, df626, 238);
_mm512_mask_storeu_ps(dfPtr1+407104+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep28);
__m512 rep29 = _mm512_shuffle_f32x4(df633, df633, 68);
_mm512_mask_storeu_ps(dfPtr1+814080+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep29);
__m512 rep30 = _mm512_shuffle_f32x4(df634, df634, 68);
_mm512_mask_storeu_ps(dfPtr1+814144+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep30);
__m512 rep31 = _mm512_shuffle_f32x4(df633, df633, 238);
_mm512_mask_storeu_ps(dfPtr1+1221120+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep31);
__m512 rep32 = _mm512_shuffle_f32x4(df634, df634, 238);
_mm512_mask_storeu_ps(dfPtr1+1221184+407040*i6+1152*j2+128*k20+128*m42+32*f43, 65535, rep32);
}
if (j2 >= last1) return;
++j2;
}

/*
 * Builds a threader task that points at ResNeXt50StriderArrangeDats1Callee1
 * and submits it to the given team via ResNeXt50ThreaderDo1.
 *
 * team15   - team handle forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors3 - stored in the task's any1 field for the callee to read.
 */
static void ResNeXt50StriderArrangeDats1(ResNeXt50ThreaderTeam1* team15, char** tensors3) {
    /* Values written to hull1[0..3]; only index 1 is larger than one. */
    /* NOTE(review): presumably the per-dimension extent of the task grid - confirm against the threader. */
    static const int hullExtents[4] = {1, 11, 1, 1};
    ResNeXt50ThreaderTask1 arrangeTask;
    int dim;

    arrangeTask.nd1 = 4;
    for (dim = 0; dim < 4; ++dim) {
        arrangeTask.hull1[dim] = hullExtents[dim];
    }
    arrangeTask.any1 = tensors3;
    arrangeTask.callee1 = ResNeXt50StriderArrangeDats1Callee1;

    ResNeXt50ThreaderDo1(team15, &arrangeTask);
}

static void ResNeXt50StriderProduceSums1Callee1(ResNeXt50ThreaderTask1* task8, int64_t* pt9) {
void** tuple2 = task8->any1;
char** tensors6 = tuple2[0];
ptrdiff_t e3 = 0;
ptrdiff_t z2 = (ptrdiff_t)tuple2[2];
ptrdiff_t g4 = 0;
ptrdiff_t p1 = pt9[2];
ptrdiff_t d1 = pt9[1];
ptrdiff_t w20 = 0;
if (__builtin_expect(!(e3|z2), 0)) {
z2 = 0;
char*restrict bfPtr2 = tensors6[0]+256*e3;
char*restrict wfPtr2 = tensors6[0]+256+12976128*e3+24576*z2;
char*restrict dfPtr2 = tensors6[1]+214917120*e3+407040*z2;
char*restrict sfPtr1 = tensors6[2];
ptrdiff_t i7 = 1*g4;
ptrdiff_t j3 = 1*p1;
ptrdiff_t jj9 = j3+0;
if (__builtin_expect(!j3, 0)) {
ptrdiff_t k21 = 6*d1;
ptrdiff_t kk20 = k21+(d1 < 13 ? 5 : 10);
for (; k21 != 88; ++k21) {
ptrdiff_t l1 = 16*w20;
for (; l1 != 16; ++l1) {
__m512 sfRe1 = _mm512_setzero_ps();
__m512 sfIm1 = _mm512_setzero_ps();
__m512 sfRe7 = _mm512_setzero_ps();
__m512 sfIm7 = _mm512_setzero_ps();
sfRe1 = _mm512_mask_mov_ps(sfRe1, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+256*i7+16*l1)));
sfRe1 = _mm512_mask_mov_ps(sfRe1, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+256*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+256*i7+16*l1)));
sfRe7 = _mm512_mask_mov_ps(sfRe7, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+256*i7+16*l1)));
__m512 sfRe2 = sfRe1;
__m512 sfIm2 = sfIm1;
__m512 sfRe3 = sfRe1;
__m512 sfIm3 = sfIm1;
__m512 sfRe4 = sfRe1;
__m512 sfIm4 = sfIm1;
__m512 sfRe5 = sfRe1;
__m512 sfIm5 = sfIm1;
__m512 sfRe6 = sfRe1;
__m512 sfIm6 = sfIm1;
__m512 sfRe8 = sfRe7;
__m512 sfIm8 = sfIm7;
__m512 sfRe9 = sfRe7;
__m512 sfIm9 = sfIm7;
__m512 sfRe10 = sfRe7;
__m512 sfIm10 = sfIm7;
__m512 sfRe11 = sfRe7;
__m512 sfIm11 = sfIm7;
__m512 sfRe12 = sfRe7;
__m512 sfIm12 = sfIm7;
for (ptrdiff_t s2 = 0; s2 < 3; ++s2) {
__m512i wfLd1 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l1+128*s2);
__m512 wfRe1 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd1));
__m512 wfIm1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd1, 1));
__m512 wfMx1 = _mm512_mask_mov_ps(wfIm1, 64764, wfRe1);
__m512i wfLd2 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l1+128*s2);
__m512 wfRe2 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd2));
__m512 wfIm2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd2, 1));
__m512 wfMx2 = _mm512_mask_mov_ps(wfIm2, 64764, wfRe2);
__m512 dfRe1 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm1 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k21+384*s2);
sfRe1 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe1);
sfRe1 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe1, 64764);
sfIm1 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm1);
sfIm1 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm1, 64764);
sfRe7 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe7);
sfRe7 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe7, 64764);
sfIm7 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm7);
sfIm7 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm7, 64764);
dfRe1 = _mm512_shuffle_f32x4(dfRe1, dfRe1, 78);
dfIm1 = _mm512_shuffle_f32x4(dfIm1, dfIm1, 78);
sfRe2 = _mm512_fmadd_ps(wfRe1, dfRe1, sfRe2);
sfRe2 = _mm512_mask3_fmadd_ps(wfIm1, dfIm1, sfRe2, 64764);
sfIm2 = _mm512_fmadd_ps(wfMx1, dfIm1, sfIm2);
sfIm2 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe1, sfIm2, 64764);
sfRe8 = _mm512_fmadd_ps(wfRe2, dfRe1, sfRe8);
sfRe8 = _mm512_mask3_fmadd_ps(wfIm2, dfIm1, sfRe8, 64764);
sfIm8 = _mm512_fmadd_ps(wfMx2, dfIm1, sfIm8);
sfIm8 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe1, sfIm8, 64764);
__m512 dfRe2 = _mm512_loadu_ps(dfPtr2+128+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm2 = _mm512_loadu_ps(dfPtr2+192+407040*i7+101760*j3+1152*k21+384*s2);
sfRe3 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe3);
sfRe3 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe3, 64764);
sfIm3 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm3);
sfIm3 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm3, 64764);
sfRe9 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe9);
sfRe9 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe9, 64764);
sfIm9 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm9);
sfIm9 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm9, 64764);
dfRe2 = _mm512_shuffle_f32x4(dfRe2, dfRe2, 78);
dfIm2 = _mm512_shuffle_f32x4(dfIm2, dfIm2, 78);
sfRe4 = _mm512_fmadd_ps(wfRe1, dfRe2, sfRe4);
sfRe4 = _mm512_mask3_fmadd_ps(wfIm1, dfIm2, sfRe4, 64764);
sfIm4 = _mm512_fmadd_ps(wfMx1, dfIm2, sfIm4);
sfIm4 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe2, sfIm4, 64764);
sfRe10 = _mm512_fmadd_ps(wfRe2, dfRe2, sfRe10);
sfRe10 = _mm512_mask3_fmadd_ps(wfIm2, dfIm2, sfRe10, 64764);
sfIm10 = _mm512_fmadd_ps(wfMx2, dfIm2, sfIm10);
sfIm10 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe2, sfIm10, 64764);
__m512 dfRe3 = _mm512_loadu_ps(dfPtr2+256+407040*i7+101760*j3+1152*k21+384*s2);
__m512 dfIm3 = _mm512_loadu_ps(dfPtr2+320+407040*i7+101760*j3+1152*k21+384*s2);
sfRe5 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe5);
sfRe5 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe5, 64764);
sfIm5 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm5);
sfIm5 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm5, 64764);
sfRe11 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe11);
sfRe11 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe11, 64764);
sfIm11 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm11);
sfIm11 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm11, 64764);
dfRe3 = _mm512_shuffle_f32x4(dfRe3, dfRe3, 78);
dfIm3 = _mm512_shuffle_f32x4(dfIm3, dfIm3, 78);
sfRe6 = _mm512_fmadd_ps(wfRe1, dfRe3, sfRe6);
sfRe6 = _mm512_mask3_fmadd_ps(wfIm1, dfIm3, sfRe6, 64764);
sfIm6 = _mm512_fmadd_ps(wfMx1, dfIm3, sfIm6);
sfIm6 = _mm512_mask3_fnmadd_ps(wfIm1, dfRe3, sfIm6, 64764);
sfRe12 = _mm512_fmadd_ps(wfRe2, dfRe3, sfRe12);
sfRe12 = _mm512_mask3_fmadd_ps(wfIm2, dfIm3, sfRe12, 64764);
sfIm12 = _mm512_fmadd_ps(wfMx2, dfIm3, sfIm12);
sfIm12 = _mm512_mask3_fnmadd_ps(wfIm2, dfRe3, sfIm12, 64764);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe1);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm1);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe2);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm2);
_mm512_storeu_ps(sfPtr1+256+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe3);
_mm512_storeu_ps(sfPtr1+320+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm3);
_mm512_storeu_ps(sfPtr1+384+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe4);
_mm512_storeu_ps(sfPtr1+448+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm4);
_mm512_storeu_ps(sfPtr1+512+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe5);
_mm512_storeu_ps(sfPtr1+576+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm5);
_mm512_storeu_ps(sfPtr1+640+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe6);
_mm512_storeu_ps(sfPtr1+704+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm6);
_mm512_storeu_ps(sfPtr1+768+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe7);
_mm512_storeu_ps(sfPtr1+832+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm7);
_mm512_storeu_ps(sfPtr1+896+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe8);
_mm512_storeu_ps(sfPtr1+960+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm8);
_mm512_storeu_ps(sfPtr1+1024+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe9);
_mm512_storeu_ps(sfPtr1+1088+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm9);
_mm512_storeu_ps(sfPtr1+1152+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe10);
_mm512_storeu_ps(sfPtr1+1216+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm10);
_mm512_storeu_ps(sfPtr1+1280+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe11);
_mm512_storeu_ps(sfPtr1+1344+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm11);
_mm512_storeu_ps(sfPtr1+1408+8667136*i7+2166784*j3+24576*k21+1536*l1, sfRe12);
_mm512_storeu_ps(sfPtr1+1472+8667136*i7+2166784*j3+24576*k21+1536*l1, sfIm12);
}
if (k21 >= kk20) return;
}
ptrdiff_t l2 = 16*w20;
for (; l2 != 16; ++l2) {
__m512 sfRe13 = _mm512_setzero_ps();
__m512 sfIm13 = _mm512_setzero_ps();
__m512 sfRe14 = _mm512_setzero_ps();
__m512 sfIm14 = _mm512_setzero_ps();
sfRe13 = _mm512_mask_mov_ps(sfRe13, 1, _mm512_set1_ps(*(float*)(bfPtr2+0+256*i7+16*l2)));
sfRe13 = _mm512_mask_mov_ps(sfRe13, 256, _mm512_set1_ps(*(float*)(bfPtr2+4+256*i7+16*l2)));
sfRe14 = _mm512_mask_mov_ps(sfRe14, 1, _mm512_set1_ps(*(float*)(bfPtr2+8+256*i7+16*l2)));
sfRe14 = _mm512_mask_mov_ps(sfRe14, 256, _mm512_set1_ps(*(float*)(bfPtr2+12+256*i7+16*l2)));
for (ptrdiff_t s3 = 0; s3 < 3; ++s3) {
__m512i wfLd3 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l2+128*s3);
__m512 wfRe3 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd3));
__m512 wfIm3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd3, 1));
__m512 wfMx3 = _mm512_mask_mov_ps(wfIm3, 64764, wfRe3);
__m512i wfLd4 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l2+128*s3);
__m512 wfRe4 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd4));
__m512 wfIm4 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd4, 1));
__m512 wfMx4 = _mm512_mask_mov_ps(wfIm4, 64764, wfRe4);
__m512 dfRe4 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k21+128*s3);
__m512 dfIm4 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k21+128*s3);
sfRe13 = _mm512_fmadd_ps(wfRe3, dfRe4, sfRe13);
sfRe13 = _mm512_mask3_fmadd_ps(wfIm3, dfIm4, sfRe13, 64764);
sfIm13 = _mm512_fmadd_ps(wfMx3, dfIm4, sfIm13);
sfIm13 = _mm512_mask3_fnmadd_ps(wfIm3, dfRe4, sfIm13, 64764);
sfRe14 = _mm512_fmadd_ps(wfRe4, dfRe4, sfRe14);
sfRe14 = _mm512_mask3_fmadd_ps(wfIm4, dfIm4, sfRe14, 64764);
sfIm14 = _mm512_fmadd_ps(wfMx4, dfIm4, sfIm14);
sfIm14 = _mm512_mask3_fnmadd_ps(wfIm4, dfRe4, sfIm14, 64764);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k21+256*l2, sfRe13);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k21+256*l2, sfIm13);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k21+256*l2, sfRe14);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k21+256*l2, sfIm14);
}
j3 = 1;
}
for (; j3 <= jj9; ++j3) {
ptrdiff_t k22 = 6*d1;
ptrdiff_t kk21 = k22+(d1 < 13 ? 5 : 10);
for (; k22 != 88; ++k22) {
ptrdiff_t l3 = 16*w20;
for (; l3 != 16; ++l3) {
__m512 sfRe15 = _mm512_setzero_ps();
__m512 sfIm15 = _mm512_setzero_ps();
__m512 sfRe21 = _mm512_setzero_ps();
__m512 sfIm21 = _mm512_setzero_ps();
(void)bfPtr2;
__m512 sfRe16 = sfRe15;
__m512 sfIm16 = sfIm15;
__m512 sfRe17 = sfRe15;
__m512 sfIm17 = sfIm15;
__m512 sfRe18 = sfRe15;
__m512 sfIm18 = sfIm15;
__m512 sfRe19 = sfRe15;
__m512 sfIm19 = sfIm15;
__m512 sfRe20 = sfRe15;
__m512 sfIm20 = sfIm15;
__m512 sfRe22 = sfRe21;
__m512 sfIm22 = sfIm21;
__m512 sfRe23 = sfRe21;
__m512 sfIm23 = sfIm21;
__m512 sfRe24 = sfRe21;
__m512 sfIm24 = sfIm21;
__m512 sfRe25 = sfRe21;
__m512 sfIm25 = sfIm21;
__m512 sfRe26 = sfRe21;
__m512 sfIm26 = sfIm21;
for (ptrdiff_t s4 = 0; s4 < 3; ++s4) {
__m512i wfLd5 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l3+128*s4);
__m512 wfRe5 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd5));
__m512 wfIm5 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd5, 1));
__m512i wfLd6 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l3+128*s4);
__m512 wfRe6 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd6));
__m512 wfIm6 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd6, 1));
__m512 dfRe5 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm5 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k22+384*s4);
sfRe15 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe15);
sfRe15 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe15);
sfIm15 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm15);
sfIm15 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm15);
sfRe21 = _mm512_fmadd_ps(wfRe6, dfRe5, sfRe21);
sfRe21 = _mm512_fmadd_ps(wfIm6, dfIm5, sfRe21);
sfIm21 = _mm512_fmadd_ps(wfRe6, dfIm5, sfIm21);
sfIm21 = _mm512_fnmadd_ps(wfIm6, dfRe5, sfIm21);
dfRe5 = _mm512_shuffle_f32x4(dfRe5, dfRe5, 78);
dfIm5 = _mm512_shuffle_f32x4(dfIm5, dfIm5, 78);
sfRe16 = _mm512_fmadd_ps(wfRe5, dfRe5, sfRe16);
sfRe16 = _mm512_fmadd_ps(wfIm5, dfIm5, sfRe16);
sfIm16 = _mm512_fmadd_ps(wfRe5, dfIm5, sfIm16);
sfIm16 = _mm512_fnmadd_ps(wfIm5, dfRe5, sfIm16);
sfRe22 = _mm512_fmadd_ps(wfRe6, dfRe5, sfRe22);
sfRe22 = _mm512_fmadd_ps(wfIm6, dfIm5, sfRe22);
sfIm22 = _mm512_fmadd_ps(wfRe6, dfIm5, sfIm22);
sfIm22 = _mm512_fnmadd_ps(wfIm6, dfRe5, sfIm22);
__m512 dfRe6 = _mm512_loadu_ps(dfPtr2+128+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm6 = _mm512_loadu_ps(dfPtr2+192+407040*i7+101760*j3+1152*k22+384*s4);
sfRe17 = _mm512_fmadd_ps(wfRe5, dfRe6, sfRe17);
sfRe17 = _mm512_fmadd_ps(wfIm5, dfIm6, sfRe17);
sfIm17 = _mm512_fmadd_ps(wfRe5, dfIm6, sfIm17);
sfIm17 = _mm512_fnmadd_ps(wfIm5, dfRe6, sfIm17);
sfRe23 = _mm512_fmadd_ps(wfRe6, dfRe6, sfRe23);
sfRe23 = _mm512_fmadd_ps(wfIm6, dfIm6, sfRe23);
sfIm23 = _mm512_fmadd_ps(wfRe6, dfIm6, sfIm23);
sfIm23 = _mm512_fnmadd_ps(wfIm6, dfRe6, sfIm23);
dfRe6 = _mm512_shuffle_f32x4(dfRe6, dfRe6, 78);
dfIm6 = _mm512_shuffle_f32x4(dfIm6, dfIm6, 78);
sfRe18 = _mm512_fmadd_ps(wfRe5, dfRe6, sfRe18);
sfRe18 = _mm512_fmadd_ps(wfIm5, dfIm6, sfRe18);
sfIm18 = _mm512_fmadd_ps(wfRe5, dfIm6, sfIm18);
sfIm18 = _mm512_fnmadd_ps(wfIm5, dfRe6, sfIm18);
sfRe24 = _mm512_fmadd_ps(wfRe6, dfRe6, sfRe24);
sfRe24 = _mm512_fmadd_ps(wfIm6, dfIm6, sfRe24);
sfIm24 = _mm512_fmadd_ps(wfRe6, dfIm6, sfIm24);
sfIm24 = _mm512_fnmadd_ps(wfIm6, dfRe6, sfIm24);
__m512 dfRe7 = _mm512_loadu_ps(dfPtr2+256+407040*i7+101760*j3+1152*k22+384*s4);
__m512 dfIm7 = _mm512_loadu_ps(dfPtr2+320+407040*i7+101760*j3+1152*k22+384*s4);
sfRe19 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe19);
sfRe19 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe19);
sfIm19 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm19);
sfIm19 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm19);
sfRe25 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe25);
sfRe25 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe25);
sfIm25 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm25);
sfIm25 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm25);
dfRe7 = _mm512_shuffle_f32x4(dfRe7, dfRe7, 78);
dfIm7 = _mm512_shuffle_f32x4(dfIm7, dfIm7, 78);
sfRe20 = _mm512_fmadd_ps(wfRe5, dfRe7, sfRe20);
sfRe20 = _mm512_fmadd_ps(wfIm5, dfIm7, sfRe20);
sfIm20 = _mm512_fmadd_ps(wfRe5, dfIm7, sfIm20);
sfIm20 = _mm512_fnmadd_ps(wfIm5, dfRe7, sfIm20);
sfRe26 = _mm512_fmadd_ps(wfRe6, dfRe7, sfRe26);
sfRe26 = _mm512_fmadd_ps(wfIm6, dfIm7, sfRe26);
sfIm26 = _mm512_fmadd_ps(wfRe6, dfIm7, sfIm26);
sfIm26 = _mm512_fnmadd_ps(wfIm6, dfRe7, sfIm26);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe15);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm15);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe16);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm16);
_mm512_storeu_ps(sfPtr1+256+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe17);
_mm512_storeu_ps(sfPtr1+320+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm17);
_mm512_storeu_ps(sfPtr1+384+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe18);
_mm512_storeu_ps(sfPtr1+448+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm18);
_mm512_storeu_ps(sfPtr1+512+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe19);
_mm512_storeu_ps(sfPtr1+576+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm19);
_mm512_storeu_ps(sfPtr1+640+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe20);
_mm512_storeu_ps(sfPtr1+704+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm20);
_mm512_storeu_ps(sfPtr1+768+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe21);
_mm512_storeu_ps(sfPtr1+832+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm21);
_mm512_storeu_ps(sfPtr1+896+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe22);
_mm512_storeu_ps(sfPtr1+960+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm22);
_mm512_storeu_ps(sfPtr1+1024+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe23);
_mm512_storeu_ps(sfPtr1+1088+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm23);
_mm512_storeu_ps(sfPtr1+1152+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe24);
_mm512_storeu_ps(sfPtr1+1216+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm24);
_mm512_storeu_ps(sfPtr1+1280+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe25);
_mm512_storeu_ps(sfPtr1+1344+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm25);
_mm512_storeu_ps(sfPtr1+1408+8667136*i7+2166784*j3+24576*k22+1536*l3, sfRe26);
_mm512_storeu_ps(sfPtr1+1472+8667136*i7+2166784*j3+24576*k22+1536*l3, sfIm26);
}
if (k22 >= kk21) return;
}
ptrdiff_t l4 = 16*w20;
for (; l4 != 16; ++l4) {
__m512 sfRe27 = _mm512_setzero_ps();
__m512 sfIm27 = _mm512_setzero_ps();
__m512 sfRe28 = _mm512_setzero_ps();
__m512 sfIm28 = _mm512_setzero_ps();
(void)bfPtr2;
for (ptrdiff_t s5 = 0; s5 < 3; ++s5) {
__m512i wfLd7 = _mm512_loadu_si512(wfPtr2+0+24576*i7+6144*j3+384*l4+128*s5);
__m512 wfRe7 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd7));
__m512 wfIm7 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd7, 1));
__m512i wfLd8 = _mm512_loadu_si512(wfPtr2+64+24576*i7+6144*j3+384*l4+128*s5);
__m512 wfRe8 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd8));
__m512 wfIm8 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd8, 1));
__m512 dfRe8 = _mm512_loadu_ps(dfPtr2+0+407040*i7+101760*j3+1152*k22+128*s5);
__m512 dfIm8 = _mm512_loadu_ps(dfPtr2+64+407040*i7+101760*j3+1152*k22+128*s5);
sfRe27 = _mm512_fmadd_ps(wfRe7, dfRe8, sfRe27);
sfRe27 = _mm512_fmadd_ps(wfIm7, dfIm8, sfRe27);
sfIm27 = _mm512_fmadd_ps(wfRe7, dfIm8, sfIm27);
sfIm27 = _mm512_fnmadd_ps(wfIm7, dfRe8, sfIm27);
sfRe28 = _mm512_fmadd_ps(wfRe8, dfRe8, sfRe28);
sfRe28 = _mm512_fmadd_ps(wfIm8, dfIm8, sfRe28);
sfIm28 = _mm512_fmadd_ps(wfRe8, dfIm8, sfIm28);
sfIm28 = _mm512_fnmadd_ps(wfIm8, dfRe8, sfIm28);
}
_mm512_storeu_ps(sfPtr1+0+8667136*i7+2166784*j3+24576*k22+256*l4, sfRe27);
_mm512_storeu_ps(sfPtr1+64+8667136*i7+2166784*j3+24576*k22+256*l4, sfIm27);
_mm512_storeu_ps(sfPtr1+128+8667136*i7+2166784*j3+24576*k22+256*l4, sfRe28);
_mm512_storeu_ps(sfPtr1+192+8667136*i7+2166784*j3+24576*k22+256*l4, sfIm28);
}
}
return;
}
char*restrict bfPtr3 = tensors6[0]+256*e3;
char*restrict wfPtr3 = tensors6[0]+256+12976128*e3+24576*z2;
char*restrict dfPtr3 = tensors6[1]+214917120*e3+407040*z2;
char*restrict sfPtr2 = tensors6[2];
ptrdiff_t i8 = 1*g4;
ptrdiff_t j4 = 1*p1;
ptrdiff_t jj10 = j4+0;
if (__builtin_expect(!j4, 0)) {
ptrdiff_t k23 = 6*d1;
ptrdiff_t kk22 = k23+(d1 < 13 ? 5 : 10);
for (; k23 != 88; ++k23) {
ptrdiff_t l5 = 16*w20;
for (; l5 != 16; ++l5) {
__m512 sfRe29 = _mm512_setzero_ps();
__m512 sfIm29 = _mm512_setzero_ps();
__m512 sfRe35 = _mm512_setzero_ps();
__m512 sfIm35 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe30 = sfRe29;
__m512 sfIm30 = sfIm29;
__m512 sfRe31 = sfRe29;
__m512 sfIm31 = sfIm29;
__m512 sfRe32 = sfRe29;
__m512 sfIm32 = sfIm29;
__m512 sfRe33 = sfRe29;
__m512 sfIm33 = sfIm29;
__m512 sfRe34 = sfRe29;
__m512 sfIm34 = sfIm29;
__m512 sfRe36 = sfRe35;
__m512 sfIm36 = sfIm35;
__m512 sfRe37 = sfRe35;
__m512 sfIm37 = sfIm35;
__m512 sfRe38 = sfRe35;
__m512 sfIm38 = sfIm35;
__m512 sfRe39 = sfRe35;
__m512 sfIm39 = sfIm35;
__m512 sfRe40 = sfRe35;
__m512 sfIm40 = sfIm35;
for (ptrdiff_t s6 = 0; s6 < 3; ++s6) {
__m512i wfLd9 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l5+128*s6);
__m512 wfRe9 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd9));
__m512 wfIm9 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd9, 1));
__m512 wfMx5 = _mm512_mask_mov_ps(wfIm9, 64764, wfRe9);
__m512i wfLd10 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l5+128*s6);
__m512 wfRe10 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd10));
__m512 wfIm10 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd10, 1));
__m512 wfMx6 = _mm512_mask_mov_ps(wfIm10, 64764, wfRe10);
__m512 dfRe9 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm9 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k23+384*s6);
sfRe29 = _mm512_fmadd_ps(wfRe9, dfRe9, sfRe29);
sfRe29 = _mm512_mask3_fmadd_ps(wfIm9, dfIm9, sfRe29, 64764);
sfIm29 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm29);
sfIm29 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe9, sfIm29, 64764);
sfRe35 = _mm512_fmadd_ps(wfRe10, dfRe9, sfRe35);
sfRe35 = _mm512_mask3_fmadd_ps(wfIm10, dfIm9, sfRe35, 64764);
sfIm35 = _mm512_fmadd_ps(wfMx6, dfIm9, sfIm35);
sfIm35 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe9, sfIm35, 64764);
dfRe9 = _mm512_shuffle_f32x4(dfRe9, dfRe9, 78);
dfIm9 = _mm512_shuffle_f32x4(dfIm9, dfIm9, 78);
sfRe30 = _mm512_fmadd_ps(wfRe9, dfRe9, sfRe30);
sfRe30 = _mm512_mask3_fmadd_ps(wfIm9, dfIm9, sfRe30, 64764);
sfIm30 = _mm512_fmadd_ps(wfMx5, dfIm9, sfIm30);
sfIm30 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe9, sfIm30, 64764);
sfRe36 = _mm512_fmadd_ps(wfRe10, dfRe9, sfRe36);
sfRe36 = _mm512_mask3_fmadd_ps(wfIm10, dfIm9, sfRe36, 64764);
sfIm36 = _mm512_fmadd_ps(wfMx6, dfIm9, sfIm36);
sfIm36 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe9, sfIm36, 64764);
__m512 dfRe10 = _mm512_loadu_ps(dfPtr3+128+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm10 = _mm512_loadu_ps(dfPtr3+192+407040*i8+101760*j4+1152*k23+384*s6);
sfRe31 = _mm512_fmadd_ps(wfRe9, dfRe10, sfRe31);
sfRe31 = _mm512_mask3_fmadd_ps(wfIm9, dfIm10, sfRe31, 64764);
sfIm31 = _mm512_fmadd_ps(wfMx5, dfIm10, sfIm31);
sfIm31 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe10, sfIm31, 64764);
sfRe37 = _mm512_fmadd_ps(wfRe10, dfRe10, sfRe37);
sfRe37 = _mm512_mask3_fmadd_ps(wfIm10, dfIm10, sfRe37, 64764);
sfIm37 = _mm512_fmadd_ps(wfMx6, dfIm10, sfIm37);
sfIm37 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe10, sfIm37, 64764);
dfRe10 = _mm512_shuffle_f32x4(dfRe10, dfRe10, 78);
dfIm10 = _mm512_shuffle_f32x4(dfIm10, dfIm10, 78);
sfRe32 = _mm512_fmadd_ps(wfRe9, dfRe10, sfRe32);
sfRe32 = _mm512_mask3_fmadd_ps(wfIm9, dfIm10, sfRe32, 64764);
sfIm32 = _mm512_fmadd_ps(wfMx5, dfIm10, sfIm32);
sfIm32 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe10, sfIm32, 64764);
sfRe38 = _mm512_fmadd_ps(wfRe10, dfRe10, sfRe38);
sfRe38 = _mm512_mask3_fmadd_ps(wfIm10, dfIm10, sfRe38, 64764);
sfIm38 = _mm512_fmadd_ps(wfMx6, dfIm10, sfIm38);
sfIm38 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe10, sfIm38, 64764);
__m512 dfRe11 = _mm512_loadu_ps(dfPtr3+256+407040*i8+101760*j4+1152*k23+384*s6);
__m512 dfIm11 = _mm512_loadu_ps(dfPtr3+320+407040*i8+101760*j4+1152*k23+384*s6);
sfRe33 = _mm512_fmadd_ps(wfRe9, dfRe11, sfRe33);
sfRe33 = _mm512_mask3_fmadd_ps(wfIm9, dfIm11, sfRe33, 64764);
sfIm33 = _mm512_fmadd_ps(wfMx5, dfIm11, sfIm33);
sfIm33 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe11, sfIm33, 64764);
sfRe39 = _mm512_fmadd_ps(wfRe10, dfRe11, sfRe39);
sfRe39 = _mm512_mask3_fmadd_ps(wfIm10, dfIm11, sfRe39, 64764);
sfIm39 = _mm512_fmadd_ps(wfMx6, dfIm11, sfIm39);
sfIm39 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe11, sfIm39, 64764);
dfRe11 = _mm512_shuffle_f32x4(dfRe11, dfRe11, 78);
dfIm11 = _mm512_shuffle_f32x4(dfIm11, dfIm11, 78);
sfRe34 = _mm512_fmadd_ps(wfRe9, dfRe11, sfRe34);
sfRe34 = _mm512_mask3_fmadd_ps(wfIm9, dfIm11, sfRe34, 64764);
sfIm34 = _mm512_fmadd_ps(wfMx5, dfIm11, sfIm34);
sfIm34 = _mm512_mask3_fnmadd_ps(wfIm9, dfRe11, sfIm34, 64764);
sfRe40 = _mm512_fmadd_ps(wfRe10, dfRe11, sfRe40);
sfRe40 = _mm512_mask3_fmadd_ps(wfIm10, dfIm11, sfRe40, 64764);
sfIm40 = _mm512_fmadd_ps(wfMx6, dfIm11, sfIm40);
sfIm40 = _mm512_mask3_fnmadd_ps(wfIm10, dfRe11, sfIm40, 64764);
}
sfRe29 = _mm512_add_ps(sfRe29, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm29 = _mm512_add_ps(sfIm29, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe30 = _mm512_add_ps(sfRe30, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm30 = _mm512_add_ps(sfIm30, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe31 = _mm512_add_ps(sfRe31, _mm512_loadu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm31 = _mm512_add_ps(sfIm31, _mm512_loadu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe32 = _mm512_add_ps(sfRe32, _mm512_loadu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm32 = _mm512_add_ps(sfIm32, _mm512_loadu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe33 = _mm512_add_ps(sfRe33, _mm512_loadu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm33 = _mm512_add_ps(sfIm33, _mm512_loadu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe34 = _mm512_add_ps(sfRe34, _mm512_loadu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm34 = _mm512_add_ps(sfIm34, _mm512_loadu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe35 = _mm512_add_ps(sfRe35, _mm512_loadu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm35 = _mm512_add_ps(sfIm35, _mm512_loadu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe36 = _mm512_add_ps(sfRe36, _mm512_loadu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm36 = _mm512_add_ps(sfIm36, _mm512_loadu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe37 = _mm512_add_ps(sfRe37, _mm512_loadu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm37 = _mm512_add_ps(sfIm37, _mm512_loadu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe38 = _mm512_add_ps(sfRe38, _mm512_loadu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm38 = _mm512_add_ps(sfIm38, _mm512_loadu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe39 = _mm512_add_ps(sfRe39, _mm512_loadu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm39 = _mm512_add_ps(sfIm39, _mm512_loadu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfRe40 = _mm512_add_ps(sfRe40, _mm512_loadu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k23+1536*l5));
sfIm40 = _mm512_add_ps(sfIm40, _mm512_loadu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k23+1536*l5));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe29);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm29);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe30);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm30);
_mm512_storeu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe31);
_mm512_storeu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm31);
_mm512_storeu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe32);
_mm512_storeu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm32);
_mm512_storeu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe33);
_mm512_storeu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm33);
_mm512_storeu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe34);
_mm512_storeu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm34);
_mm512_storeu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe35);
_mm512_storeu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm35);
_mm512_storeu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe36);
_mm512_storeu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm36);
_mm512_storeu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe37);
_mm512_storeu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm37);
_mm512_storeu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe38);
_mm512_storeu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm38);
_mm512_storeu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe39);
_mm512_storeu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm39);
_mm512_storeu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k23+1536*l5, sfRe40);
_mm512_storeu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k23+1536*l5, sfIm40);
}
if (k23 >= kk22) return;
}
ptrdiff_t l6 = 16*w20;
for (; l6 != 16; ++l6) {
__m512 sfRe41 = _mm512_setzero_ps();
__m512 sfIm41 = _mm512_setzero_ps();
__m512 sfRe42 = _mm512_setzero_ps();
__m512 sfIm42 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s7 = 0; s7 < 3; ++s7) {
__m512i wfLd11 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l6+128*s7);
__m512 wfRe11 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd11));
__m512 wfIm11 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd11, 1));
__m512 wfMx7 = _mm512_mask_mov_ps(wfIm11, 64764, wfRe11);
__m512i wfLd12 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l6+128*s7);
__m512 wfRe12 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd12));
__m512 wfIm12 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd12, 1));
__m512 wfMx8 = _mm512_mask_mov_ps(wfIm12, 64764, wfRe12);
__m512 dfRe12 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k23+128*s7);
__m512 dfIm12 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k23+128*s7);
sfRe41 = _mm512_fmadd_ps(wfRe11, dfRe12, sfRe41);
sfRe41 = _mm512_mask3_fmadd_ps(wfIm11, dfIm12, sfRe41, 64764);
sfIm41 = _mm512_fmadd_ps(wfMx7, dfIm12, sfIm41);
sfIm41 = _mm512_mask3_fnmadd_ps(wfIm11, dfRe12, sfIm41, 64764);
sfRe42 = _mm512_fmadd_ps(wfRe12, dfRe12, sfRe42);
sfRe42 = _mm512_mask3_fmadd_ps(wfIm12, dfIm12, sfRe42, 64764);
sfIm42 = _mm512_fmadd_ps(wfMx8, dfIm12, sfIm42);
sfIm42 = _mm512_mask3_fnmadd_ps(wfIm12, dfRe12, sfIm42, 64764);
}
sfRe41 = _mm512_add_ps(sfRe41, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+256*l6));
sfIm41 = _mm512_add_ps(sfIm41, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+256*l6));
sfRe42 = _mm512_add_ps(sfRe42, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+256*l6));
sfIm42 = _mm512_add_ps(sfIm42, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+256*l6));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k23+256*l6, sfRe41);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k23+256*l6, sfIm41);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k23+256*l6, sfRe42);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k23+256*l6, sfIm42);
}
j4 = 1;
}
for (; j4 <= jj10; ++j4) {
ptrdiff_t k24 = 6*d1;
ptrdiff_t kk23 = k24+(d1 < 13 ? 5 : 10);
for (; k24 != 88; ++k24) {
ptrdiff_t l7 = 16*w20;
for (; l7 != 16; ++l7) {
__m512 sfRe43 = _mm512_setzero_ps();
__m512 sfIm43 = _mm512_setzero_ps();
__m512 sfRe49 = _mm512_setzero_ps();
__m512 sfIm49 = _mm512_setzero_ps();
(void)bfPtr3;
__m512 sfRe44 = sfRe43;
__m512 sfIm44 = sfIm43;
__m512 sfRe45 = sfRe43;
__m512 sfIm45 = sfIm43;
__m512 sfRe46 = sfRe43;
__m512 sfIm46 = sfIm43;
__m512 sfRe47 = sfRe43;
__m512 sfIm47 = sfIm43;
__m512 sfRe48 = sfRe43;
__m512 sfIm48 = sfIm43;
__m512 sfRe50 = sfRe49;
__m512 sfIm50 = sfIm49;
__m512 sfRe51 = sfRe49;
__m512 sfIm51 = sfIm49;
__m512 sfRe52 = sfRe49;
__m512 sfIm52 = sfIm49;
__m512 sfRe53 = sfRe49;
__m512 sfIm53 = sfIm49;
__m512 sfRe54 = sfRe49;
__m512 sfIm54 = sfIm49;
for (ptrdiff_t s8 = 0; s8 < 3; ++s8) {
__m512i wfLd13 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l7+128*s8);
__m512 wfRe13 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd13));
__m512 wfIm13 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd13, 1));
__m512i wfLd14 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l7+128*s8);
__m512 wfRe14 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd14));
__m512 wfIm14 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd14, 1));
__m512 dfRe13 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm13 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k24+384*s8);
sfRe43 = _mm512_fmadd_ps(wfRe13, dfRe13, sfRe43);
sfRe43 = _mm512_fmadd_ps(wfIm13, dfIm13, sfRe43);
sfIm43 = _mm512_fmadd_ps(wfRe13, dfIm13, sfIm43);
sfIm43 = _mm512_fnmadd_ps(wfIm13, dfRe13, sfIm43);
sfRe49 = _mm512_fmadd_ps(wfRe14, dfRe13, sfRe49);
sfRe49 = _mm512_fmadd_ps(wfIm14, dfIm13, sfRe49);
sfIm49 = _mm512_fmadd_ps(wfRe14, dfIm13, sfIm49);
sfIm49 = _mm512_fnmadd_ps(wfIm14, dfRe13, sfIm49);
dfRe13 = _mm512_shuffle_f32x4(dfRe13, dfRe13, 78);
dfIm13 = _mm512_shuffle_f32x4(dfIm13, dfIm13, 78);
sfRe44 = _mm512_fmadd_ps(wfRe13, dfRe13, sfRe44);
sfRe44 = _mm512_fmadd_ps(wfIm13, dfIm13, sfRe44);
sfIm44 = _mm512_fmadd_ps(wfRe13, dfIm13, sfIm44);
sfIm44 = _mm512_fnmadd_ps(wfIm13, dfRe13, sfIm44);
sfRe50 = _mm512_fmadd_ps(wfRe14, dfRe13, sfRe50);
sfRe50 = _mm512_fmadd_ps(wfIm14, dfIm13, sfRe50);
sfIm50 = _mm512_fmadd_ps(wfRe14, dfIm13, sfIm50);
sfIm50 = _mm512_fnmadd_ps(wfIm14, dfRe13, sfIm50);
__m512 dfRe14 = _mm512_loadu_ps(dfPtr3+128+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm14 = _mm512_loadu_ps(dfPtr3+192+407040*i8+101760*j4+1152*k24+384*s8);
sfRe45 = _mm512_fmadd_ps(wfRe13, dfRe14, sfRe45);
sfRe45 = _mm512_fmadd_ps(wfIm13, dfIm14, sfRe45);
sfIm45 = _mm512_fmadd_ps(wfRe13, dfIm14, sfIm45);
sfIm45 = _mm512_fnmadd_ps(wfIm13, dfRe14, sfIm45);
sfRe51 = _mm512_fmadd_ps(wfRe14, dfRe14, sfRe51);
sfRe51 = _mm512_fmadd_ps(wfIm14, dfIm14, sfRe51);
sfIm51 = _mm512_fmadd_ps(wfRe14, dfIm14, sfIm51);
sfIm51 = _mm512_fnmadd_ps(wfIm14, dfRe14, sfIm51);
dfRe14 = _mm512_shuffle_f32x4(dfRe14, dfRe14, 78);
dfIm14 = _mm512_shuffle_f32x4(dfIm14, dfIm14, 78);
sfRe46 = _mm512_fmadd_ps(wfRe13, dfRe14, sfRe46);
sfRe46 = _mm512_fmadd_ps(wfIm13, dfIm14, sfRe46);
sfIm46 = _mm512_fmadd_ps(wfRe13, dfIm14, sfIm46);
sfIm46 = _mm512_fnmadd_ps(wfIm13, dfRe14, sfIm46);
sfRe52 = _mm512_fmadd_ps(wfRe14, dfRe14, sfRe52);
sfRe52 = _mm512_fmadd_ps(wfIm14, dfIm14, sfRe52);
sfIm52 = _mm512_fmadd_ps(wfRe14, dfIm14, sfIm52);
sfIm52 = _mm512_fnmadd_ps(wfIm14, dfRe14, sfIm52);
__m512 dfRe15 = _mm512_loadu_ps(dfPtr3+256+407040*i8+101760*j4+1152*k24+384*s8);
__m512 dfIm15 = _mm512_loadu_ps(dfPtr3+320+407040*i8+101760*j4+1152*k24+384*s8);
sfRe47 = _mm512_fmadd_ps(wfRe13, dfRe15, sfRe47);
sfRe47 = _mm512_fmadd_ps(wfIm13, dfIm15, sfRe47);
sfIm47 = _mm512_fmadd_ps(wfRe13, dfIm15, sfIm47);
sfIm47 = _mm512_fnmadd_ps(wfIm13, dfRe15, sfIm47);
sfRe53 = _mm512_fmadd_ps(wfRe14, dfRe15, sfRe53);
sfRe53 = _mm512_fmadd_ps(wfIm14, dfIm15, sfRe53);
sfIm53 = _mm512_fmadd_ps(wfRe14, dfIm15, sfIm53);
sfIm53 = _mm512_fnmadd_ps(wfIm14, dfRe15, sfIm53);
dfRe15 = _mm512_shuffle_f32x4(dfRe15, dfRe15, 78);
dfIm15 = _mm512_shuffle_f32x4(dfIm15, dfIm15, 78);
sfRe48 = _mm512_fmadd_ps(wfRe13, dfRe15, sfRe48);
sfRe48 = _mm512_fmadd_ps(wfIm13, dfIm15, sfRe48);
sfIm48 = _mm512_fmadd_ps(wfRe13, dfIm15, sfIm48);
sfIm48 = _mm512_fnmadd_ps(wfIm13, dfRe15, sfIm48);
sfRe54 = _mm512_fmadd_ps(wfRe14, dfRe15, sfRe54);
sfRe54 = _mm512_fmadd_ps(wfIm14, dfIm15, sfRe54);
sfIm54 = _mm512_fmadd_ps(wfRe14, dfIm15, sfIm54);
sfIm54 = _mm512_fnmadd_ps(wfIm14, dfRe15, sfIm54);
}
sfRe43 = _mm512_add_ps(sfRe43, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm43 = _mm512_add_ps(sfIm43, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe44 = _mm512_add_ps(sfRe44, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm44 = _mm512_add_ps(sfIm44, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe45 = _mm512_add_ps(sfRe45, _mm512_loadu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm45 = _mm512_add_ps(sfIm45, _mm512_loadu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe46 = _mm512_add_ps(sfRe46, _mm512_loadu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm46 = _mm512_add_ps(sfIm46, _mm512_loadu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe47 = _mm512_add_ps(sfRe47, _mm512_loadu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm47 = _mm512_add_ps(sfIm47, _mm512_loadu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe48 = _mm512_add_ps(sfRe48, _mm512_loadu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm48 = _mm512_add_ps(sfIm48, _mm512_loadu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe49 = _mm512_add_ps(sfRe49, _mm512_loadu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm49 = _mm512_add_ps(sfIm49, _mm512_loadu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe50 = _mm512_add_ps(sfRe50, _mm512_loadu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm50 = _mm512_add_ps(sfIm50, _mm512_loadu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe51 = _mm512_add_ps(sfRe51, _mm512_loadu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm51 = _mm512_add_ps(sfIm51, _mm512_loadu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe52 = _mm512_add_ps(sfRe52, _mm512_loadu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm52 = _mm512_add_ps(sfIm52, _mm512_loadu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe53 = _mm512_add_ps(sfRe53, _mm512_loadu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm53 = _mm512_add_ps(sfIm53, _mm512_loadu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfRe54 = _mm512_add_ps(sfRe54, _mm512_loadu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k24+1536*l7));
sfIm54 = _mm512_add_ps(sfIm54, _mm512_loadu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k24+1536*l7));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe43);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm43);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe44);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm44);
_mm512_storeu_ps(sfPtr2+256+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe45);
_mm512_storeu_ps(sfPtr2+320+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm45);
_mm512_storeu_ps(sfPtr2+384+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe46);
_mm512_storeu_ps(sfPtr2+448+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm46);
_mm512_storeu_ps(sfPtr2+512+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe47);
_mm512_storeu_ps(sfPtr2+576+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm47);
_mm512_storeu_ps(sfPtr2+640+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe48);
_mm512_storeu_ps(sfPtr2+704+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm48);
_mm512_storeu_ps(sfPtr2+768+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe49);
_mm512_storeu_ps(sfPtr2+832+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm49);
_mm512_storeu_ps(sfPtr2+896+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe50);
_mm512_storeu_ps(sfPtr2+960+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm50);
_mm512_storeu_ps(sfPtr2+1024+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe51);
_mm512_storeu_ps(sfPtr2+1088+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm51);
_mm512_storeu_ps(sfPtr2+1152+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe52);
_mm512_storeu_ps(sfPtr2+1216+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm52);
_mm512_storeu_ps(sfPtr2+1280+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe53);
_mm512_storeu_ps(sfPtr2+1344+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm53);
_mm512_storeu_ps(sfPtr2+1408+8667136*i8+2166784*j4+24576*k24+1536*l7, sfRe54);
_mm512_storeu_ps(sfPtr2+1472+8667136*i8+2166784*j4+24576*k24+1536*l7, sfIm54);
}
if (k24 >= kk23) return;
}
ptrdiff_t l8 = 16*w20;
for (; l8 != 16; ++l8) {
__m512 sfRe55 = _mm512_setzero_ps();
__m512 sfIm55 = _mm512_setzero_ps();
__m512 sfRe56 = _mm512_setzero_ps();
__m512 sfIm56 = _mm512_setzero_ps();
(void)bfPtr3;
for (ptrdiff_t s9 = 0; s9 < 3; ++s9) {
__m512i wfLd15 = _mm512_loadu_si512(wfPtr3+0+24576*i8+6144*j4+384*l8+128*s9);
__m512 wfRe15 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd15));
__m512 wfIm15 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd15, 1));
__m512i wfLd16 = _mm512_loadu_si512(wfPtr3+64+24576*i8+6144*j4+384*l8+128*s9);
__m512 wfRe16 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd16));
__m512 wfIm16 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd16, 1));
__m512 dfRe16 = _mm512_loadu_ps(dfPtr3+0+407040*i8+101760*j4+1152*k24+128*s9);
__m512 dfIm16 = _mm512_loadu_ps(dfPtr3+64+407040*i8+101760*j4+1152*k24+128*s9);
sfRe55 = _mm512_fmadd_ps(wfRe15, dfRe16, sfRe55);
sfRe55 = _mm512_fmadd_ps(wfIm15, dfIm16, sfRe55);
sfIm55 = _mm512_fmadd_ps(wfRe15, dfIm16, sfIm55);
sfIm55 = _mm512_fnmadd_ps(wfIm15, dfRe16, sfIm55);
sfRe56 = _mm512_fmadd_ps(wfRe16, dfRe16, sfRe56);
sfRe56 = _mm512_fmadd_ps(wfIm16, dfIm16, sfRe56);
sfIm56 = _mm512_fmadd_ps(wfRe16, dfIm16, sfIm56);
sfIm56 = _mm512_fnmadd_ps(wfIm16, dfRe16, sfIm56);
}
sfRe55 = _mm512_add_ps(sfRe55, _mm512_loadu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+256*l8));
sfIm55 = _mm512_add_ps(sfIm55, _mm512_loadu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+256*l8));
sfRe56 = _mm512_add_ps(sfRe56, _mm512_loadu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+256*l8));
sfIm56 = _mm512_add_ps(sfIm56, _mm512_loadu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+256*l8));
_mm512_storeu_ps(sfPtr2+0+8667136*i8+2166784*j4+24576*k24+256*l8, sfRe55);
_mm512_storeu_ps(sfPtr2+64+8667136*i8+2166784*j4+24576*k24+256*l8, sfIm55);
_mm512_storeu_ps(sfPtr2+128+8667136*i8+2166784*j4+24576*k24+256*l8, sfRe56);
_mm512_storeu_ps(sfPtr2+192+8667136*i8+2166784*j4+24576*k24+256*l8, sfIm56);
}
}
}

/*
 * Issues the "produce sums" work to the thread team.
 *
 * For every (outer, inner) index pair -- outer takes the single value 0,
 * inner takes 0..3 -- a 4-dimensional task with hull extents {1, 14, 4, 1}
 * is built and handed to ResNeXt50ThreaderDo1 together with a three-slot
 * argument pack:
 *   slot 0: the tensor pointer table (tensors5)
 *   slot 1: the outer index, stored as a pointer-sized integer
 *   slot 2: the inner index, stored as a pointer-sized integer
 *
 * NOTE(review): the callee presumably casts slots 1 and 2 back to
 * ptrdiff_t; its prologue is outside the visible part of the file.
 */
static void ResNeXt50StriderProduceSums1(ResNeXt50ThreaderTeam1* team16, char** tensors5) {
    /* Extent of each of the four task dimensions. */
    static const int hullExtents[4] = {1, 14, 4, 1};

    void* argPack[3];
    argPack[0] = tensors5;

    for (ptrdiff_t outer = 0; outer < 1; ++outer) {
        argPack[1] = (void*)outer;

        for (ptrdiff_t inner = 0; inner < 4; ++inner) {
            argPack[2] = (void*)inner;

            /* A fresh task descriptor is filled in for every dispatch. */
            ResNeXt50ThreaderTask1 work;
            work.callee1 = ResNeXt50StriderProduceSums1Callee1;
            work.any1 = argPack;
            work.nd1 = 4;
            for (int axis = 0; axis < 4; ++axis) {
                work.hull1[axis] = hullExtents[axis];
            }

            ResNeXt50ThreaderDo1(team16, &work);
        }
    }
}

static void ResNeXt50StriderConsumeSums1Callee1(ResNeXt50ThreaderTask1* task10, int64_t* pt10) {
char** tensors8 = task10->any1;
ptrdiff_t w21 = 0;
ptrdiff_t d2 = pt10[1];
ptrdiff_t g5 = 0;
char*restrict sfPtr3 = tensors8[0];
char*restrict datPtr2 = tensors8[1];
ptrdiff_t i9 = 1*g5;
ptrdiff_t j5 = 2*d2;
ptrdiff_t last2 = j5+(d2 < 43 ? 1 : 2);
if (j5 < 4) {
ptrdiff_t rel4 = j5-0;
ptrdiff_t base4 = 0;
if (rel4 < 1) {
ptrdiff_t toH1 = base4+0;
ptrdiff_t toW1 = 0;
ptrdiff_t k25 = 16*w21;
for (; k25 != 16; ++k25) {
ptrdiff_t r2 = 0;
for (; r2 != 2; ++r2) {
ptrdiff_t t2 = 0;
__m512 sfRe57 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm57 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe61 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm61 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe58 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm58 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe62 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm62 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe59 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm59 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe63 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm63 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe60 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm60 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfRe64 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512 sfIm64 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k25+768*r2+256*t2);
__m512i ifft1 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2 = _mm512_permutexvar_ps(ifft1, sfRe57);
__m512 ifft93 = _mm512_permutexvar_ps(ifft1, sfRe61);
__m512i ifft3 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4 = _mm512_permutexvar_ps(ifft3, sfRe57);
__m512 ifft94 = _mm512_permutexvar_ps(ifft3, sfRe61);
__m512 ifft5 = _mm512_permutexvar_ps(ifft1, sfIm57);
__m512 ifft95 = _mm512_permutexvar_ps(ifft1, sfIm61);
__m512 ifft6 = _mm512_permutexvar_ps(ifft3, sfIm57);
__m512 ifft96 = _mm512_permutexvar_ps(ifft3, sfIm61);
__m512 ifft7 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft8 = _mm512_mask_fmadd_ps(ifft6, 65021, ifft7, ifft2);
__m512 ifft97 = _mm512_mask_fmadd_ps(ifft96, 65021, ifft7, ifft93);
__m512 ifft9 = _mm512_mask_fnmadd_ps(ifft5, 65021, ifft7, ifft4);
__m512 ifft98 = _mm512_mask_fnmadd_ps(ifft95, 65021, ifft7, ifft94);
__m512 ifft10 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft11 = _mm512_fmadd_ps(ifft8, ifft10, _mm512_shuffle_ps(ifft8, ifft8, 177));
__m512 ifft99 = _mm512_fmadd_ps(ifft97, ifft10, _mm512_shuffle_ps(ifft97, ifft97, 177));
__m512 ifft12 = _mm512_fmadd_ps(ifft9, ifft10, _mm512_shuffle_ps(ifft9, ifft9, 177));
__m512 ifft100 = _mm512_fmadd_ps(ifft98, ifft10, _mm512_shuffle_ps(ifft98, ifft98, 177));
__m512 ifft13 = _mm512_fmadd_ps(sfRe58, ifft10, _mm512_shuffle_ps(sfRe58, sfRe58, 177));
__m512 ifft101 = _mm512_fmadd_ps(sfRe62, ifft10, _mm512_shuffle_ps(sfRe62, sfRe62, 177));
__m512 ifft14 = _mm512_fmadd_ps(sfIm58, ifft10, _mm512_shuffle_ps(sfIm58, sfIm58, 177));
__m512 ifft102 = _mm512_fmadd_ps(sfIm62, ifft10, _mm512_shuffle_ps(sfIm62, sfIm62, 177));
__m512 ifft15 = _mm512_fmadd_ps(sfRe59, ifft10, _mm512_shuffle_ps(sfRe59, sfRe59, 177));
__m512 ifft103 = _mm512_fmadd_ps(sfRe63, ifft10, _mm512_shuffle_ps(sfRe63, sfRe63, 177));
__m512 ifft16 = _mm512_fmadd_ps(sfIm59, ifft10, _mm512_shuffle_ps(sfIm59, sfIm59, 177));
__m512 ifft104 = _mm512_fmadd_ps(sfIm63, ifft10, _mm512_shuffle_ps(sfIm63, sfIm63, 177));
__m512 ifft17 = _mm512_fmadd_ps(sfRe60, ifft10, _mm512_shuffle_ps(sfRe60, sfRe60, 177));
__m512 ifft105 = _mm512_fmadd_ps(sfRe64, ifft10, _mm512_shuffle_ps(sfRe64, sfRe64, 177));
__m512 ifft18 = _mm512_fmadd_ps(sfIm60, ifft10, _mm512_shuffle_ps(sfIm60, sfIm60, 177));
__m512 ifft106 = _mm512_fmadd_ps(sfIm64, ifft10, _mm512_shuffle_ps(sfIm64, sfIm64, 177));
__m512 ifft19 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft20 = _mm512_mul_ps(ifft11, ifft19);
__m512 ifft107 = _mm512_mul_ps(ifft99, ifft19);
__m512 ifft21 = _mm512_mul_ps(ifft12, ifft19);
__m512 ifft108 = _mm512_mul_ps(ifft100, ifft19);
__m512 ifft22 = _mm512_mul_ps(ifft13, ifft19);
__m512 ifft109 = _mm512_mul_ps(ifft101, ifft19);
__m512 ifft23 = _mm512_mul_ps(ifft14, ifft19);
__m512 ifft110 = _mm512_mul_ps(ifft102, ifft19);
__m512 ifft24 = _mm512_mul_ps(ifft15, ifft19);
__m512 ifft111 = _mm512_mul_ps(ifft103, ifft19);
__m512 ifft25 = _mm512_mul_ps(ifft16, ifft19);
__m512 ifft112 = _mm512_mul_ps(ifft104, ifft19);
__m512 ifft26 = _mm512_mul_ps(ifft17, ifft19);
__m512 ifft113 = _mm512_mul_ps(ifft105, ifft19);
__m512 ifft27 = _mm512_mul_ps(ifft18, ifft19);
__m512 ifft114 = _mm512_mul_ps(ifft106, ifft19);
__m512 ifft28 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft29 = _mm512_fnmadd_ps(ifft12, ifft28, ifft20);
__m512 ifft115 = _mm512_fnmadd_ps(ifft100, ifft28, ifft107);
__m512 ifft30 = _mm512_fmadd_ps(ifft11, ifft28, ifft21);
__m512 ifft116 = _mm512_fmadd_ps(ifft99, ifft28, ifft108);
__m512 ifft31 = _mm512_fnmadd_ps(ifft14, ifft28, ifft22);
__m512 ifft117 = _mm512_fnmadd_ps(ifft102, ifft28, ifft109);
__m512 ifft32 = _mm512_fmadd_ps(ifft13, ifft28, ifft23);
__m512 ifft118 = _mm512_fmadd_ps(ifft101, ifft28, ifft110);
__m512 ifft33 = _mm512_fnmadd_ps(ifft16, ifft28, ifft24);
__m512 ifft119 = _mm512_fnmadd_ps(ifft104, ifft28, ifft111);
__m512 ifft34 = _mm512_fmadd_ps(ifft15, ifft28, ifft25);
__m512 ifft120 = _mm512_fmadd_ps(ifft103, ifft28, ifft112);
__m512 ifft35 = _mm512_fnmadd_ps(ifft18, ifft28, ifft26);
__m512 ifft121 = _mm512_fnmadd_ps(ifft106, ifft28, ifft113);
__m512 ifft36 = _mm512_fmadd_ps(ifft17, ifft28, ifft27);
__m512 ifft122 = _mm512_fmadd_ps(ifft105, ifft28, ifft114);
__m512 ifft37 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft38 = _mm512_fmadd_ps(ifft29, ifft37, _mm512_shuffle_ps(ifft29, ifft29, 78));
__m512 ifft123 = _mm512_fmadd_ps(ifft115, ifft37, _mm512_shuffle_ps(ifft115, ifft115, 78));
__m512 ifft39 = _mm512_fmadd_ps(ifft30, ifft37, _mm512_shuffle_ps(ifft30, ifft30, 78));
__m512 ifft124 = _mm512_fmadd_ps(ifft116, ifft37, _mm512_shuffle_ps(ifft116, ifft116, 78));
__m512 ifft40 = _mm512_fmadd_ps(ifft31, ifft37, _mm512_shuffle_ps(ifft31, ifft31, 78));
__m512 ifft125 = _mm512_fmadd_ps(ifft117, ifft37, _mm512_shuffle_ps(ifft117, ifft117, 78));
__m512 ifft41 = _mm512_fmadd_ps(ifft32, ifft37, _mm512_shuffle_ps(ifft32, ifft32, 78));
__m512 ifft126 = _mm512_fmadd_ps(ifft118, ifft37, _mm512_shuffle_ps(ifft118, ifft118, 78));
__m512 ifft42 = _mm512_fmadd_ps(ifft33, ifft37, _mm512_shuffle_ps(ifft33, ifft33, 78));
__m512 ifft127 = _mm512_fmadd_ps(ifft119, ifft37, _mm512_shuffle_ps(ifft119, ifft119, 78));
__m512 ifft43 = _mm512_fmadd_ps(ifft34, ifft37, _mm512_shuffle_ps(ifft34, ifft34, 78));
__m512 ifft128 = _mm512_fmadd_ps(ifft120, ifft37, _mm512_shuffle_ps(ifft120, ifft120, 78));
__m512 ifft44 = _mm512_fmadd_ps(ifft35, ifft37, _mm512_shuffle_ps(ifft35, ifft35, 78));
__m512 ifft129 = _mm512_fmadd_ps(ifft121, ifft37, _mm512_shuffle_ps(ifft121, ifft121, 78));
__m512 ifft45 = _mm512_fmadd_ps(ifft36, ifft37, _mm512_shuffle_ps(ifft36, ifft36, 78));
__m512 ifft130 = _mm512_fmadd_ps(ifft122, ifft37, _mm512_shuffle_ps(ifft122, ifft122, 78));
__m512 ifft46 = _mm512_mask_sub_ps(ifft38, 49344, _mm512_setzero_ps(), ifft39);
__m512 ifft131 = _mm512_mask_sub_ps(ifft123, 49344, _mm512_setzero_ps(), ifft124);
__m512 ifft47 = _mm512_mask_mov_ps(ifft39, 49344, ifft38);
__m512 ifft132 = _mm512_mask_mov_ps(ifft124, 49344, ifft123);
__m512 ifft48 = _mm512_mask_sub_ps(ifft40, 49344, _mm512_setzero_ps(), ifft41);
__m512 ifft133 = _mm512_mask_sub_ps(ifft125, 49344, _mm512_setzero_ps(), ifft126);
__m512 ifft49 = _mm512_mask_mov_ps(ifft41, 49344, ifft40);
__m512 ifft134 = _mm512_mask_mov_ps(ifft126, 49344, ifft125);
__m512 ifft50 = _mm512_mask_sub_ps(ifft42, 49344, _mm512_setzero_ps(), ifft43);
__m512 ifft135 = _mm512_mask_sub_ps(ifft127, 49344, _mm512_setzero_ps(), ifft128);
__m512 ifft51 = _mm512_mask_mov_ps(ifft43, 49344, ifft42);
__m512 ifft136 = _mm512_mask_mov_ps(ifft128, 49344, ifft127);
__m512 ifft52 = _mm512_mask_sub_ps(ifft44, 49344, _mm512_setzero_ps(), ifft45);
__m512 ifft137 = _mm512_mask_sub_ps(ifft129, 49344, _mm512_setzero_ps(), ifft130);
__m512 ifft53 = _mm512_mask_mov_ps(ifft45, 49344, ifft44);
__m512 ifft138 = _mm512_mask_mov_ps(ifft130, 49344, ifft129);
__m512 ifft54 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft55 = _mm512_fmadd_ps(ifft46, ifft54, _mm512_shuffle_f32x4(ifft46, ifft46, 177));
__m512 ifft139 = _mm512_fmadd_ps(ifft131, ifft54, _mm512_shuffle_f32x4(ifft131, ifft131, 177));
__m512 ifft56 = _mm512_fmadd_ps(ifft47, ifft54, _mm512_shuffle_f32x4(ifft47, ifft47, 177));
__m512 ifft140 = _mm512_fmadd_ps(ifft132, ifft54, _mm512_shuffle_f32x4(ifft132, ifft132, 177));
__m512 ifft57 = _mm512_fmadd_ps(ifft48, ifft54, _mm512_shuffle_f32x4(ifft48, ifft48, 177));
__m512 ifft141 = _mm512_fmadd_ps(ifft133, ifft54, _mm512_shuffle_f32x4(ifft133, ifft133, 177));
__m512 ifft58 = _mm512_fmadd_ps(ifft49, ifft54, _mm512_shuffle_f32x4(ifft49, ifft49, 177));
__m512 ifft142 = _mm512_fmadd_ps(ifft134, ifft54, _mm512_shuffle_f32x4(ifft134, ifft134, 177));
__m512 ifft59 = _mm512_fmadd_ps(ifft50, ifft54, _mm512_shuffle_f32x4(ifft50, ifft50, 177));
__m512 ifft143 = _mm512_fmadd_ps(ifft135, ifft54, _mm512_shuffle_f32x4(ifft135, ifft135, 177));
__m512 ifft60 = _mm512_fnmsub_ps(ifft51, ifft54, _mm512_shuffle_f32x4(ifft51, ifft51, 177));
__m512 ifft144 = _mm512_fnmsub_ps(ifft136, ifft54, _mm512_shuffle_f32x4(ifft136, ifft136, 177));
__m512 ifft61 = _mm512_fmadd_ps(ifft52, ifft54, _mm512_shuffle_f32x4(ifft52, ifft52, 177));
__m512 ifft145 = _mm512_fmadd_ps(ifft137, ifft54, _mm512_shuffle_f32x4(ifft137, ifft137, 177));
__m512 ifft62 = _mm512_fmadd_ps(ifft53, ifft54, _mm512_shuffle_f32x4(ifft53, ifft53, 177));
__m512 ifft146 = _mm512_fmadd_ps(ifft138, ifft54, _mm512_shuffle_f32x4(ifft138, ifft138, 177));
__m512 ifft63 = _mm512_add_ps(ifft55, ifft56);
__m512 ifft147 = _mm512_add_ps(ifft139, ifft140);
__m512 ifft64 = _mm512_sub_ps(ifft55, ifft56);
__m512 ifft148 = _mm512_sub_ps(ifft139, ifft140);
__m512 ifft65 = _mm512_sub_ps(ifft57, ifft61);
__m512 ifft149 = _mm512_sub_ps(ifft141, ifft145);
__m512 ifft66 = _mm512_add_ps(ifft58, ifft62);
__m512 ifft150 = _mm512_add_ps(ifft142, ifft146);
__m512 ifft67 = _mm512_add_ps(ifft57, ifft61);
__m512 ifft151 = _mm512_add_ps(ifft141, ifft145);
__m512 ifft68 = _mm512_sub_ps(ifft58, ifft62);
__m512 ifft152 = _mm512_sub_ps(ifft142, ifft146);
__m512 ifft69 = _mm512_mul_ps(ifft59, _mm512_set1_ps(3.125e-02f));
__m512 ifft153 = _mm512_mul_ps(ifft143, _mm512_set1_ps(3.125e-02f));
__m512 ifft70 = _mm512_mul_ps(ifft60, _mm512_set1_ps(3.125e-02f));
__m512 ifft154 = _mm512_mul_ps(ifft144, _mm512_set1_ps(3.125e-02f));
__m512 ifft71 = _mm512_fmadd_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft155 = _mm512_fmadd_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft72 = _mm512_fmsub_ps(ifft63, _mm512_set1_ps(1.5625e-02f), ifft69);
__m512 ifft156 = _mm512_fmsub_ps(ifft147, _mm512_set1_ps(1.5625e-02f), ifft153);
__m512 ifft73 = _mm512_fmadd_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft157 = _mm512_fmadd_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft74 = _mm512_fmsub_ps(ifft64, _mm512_set1_ps(1.5625e-02f), ifft70);
__m512 ifft158 = _mm512_fmsub_ps(ifft148, _mm512_set1_ps(1.5625e-02f), ifft154);
__m512 ifft75 = _mm512_add_ps(ifft65, ifft66);
__m512 ifft159 = _mm512_add_ps(ifft149, ifft150);
__m512 ifft76 = _mm512_sub_ps(ifft65, ifft66);
__m512 ifft160 = _mm512_sub_ps(ifft149, ifft150);
__m512 ifft77 = _mm512_fnmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft161 = _mm512_fnmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft78 = _mm512_fmadd_ps(ifft75, _mm512_set1_ps(7.0710677e-01f), ifft67);
__m512 ifft162 = _mm512_fmadd_ps(ifft159, _mm512_set1_ps(7.0710677e-01f), ifft151);
__m512 ifft79 = _mm512_fmadd_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft163 = _mm512_fmadd_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft80 = _mm512_fmsub_ps(ifft76, _mm512_set1_ps(7.0710677e-01f), ifft68);
__m512 ifft164 = _mm512_fmsub_ps(ifft160, _mm512_set1_ps(7.0710677e-01f), ifft152);
__m512 ifft81 = _mm512_add_ps(ifft77, ifft78);
__m512 ifft165 = _mm512_add_ps(ifft161, ifft162);
__m512 ifft82 = _mm512_sub_ps(ifft77, ifft78);
__m512 ifft166 = _mm512_sub_ps(ifft161, ifft162);
__m512 ifft83 = _mm512_add_ps(ifft79, ifft80);
__m512 ifft167 = _mm512_add_ps(ifft163, ifft164);
__m512 ifft84 = _mm512_sub_ps(ifft79, ifft80);
__m512 ifft168 = _mm512_sub_ps(ifft163, ifft164);
__m512 ifft85 = _mm512_fmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft169 = _mm512_fmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft86 = _mm512_fnmadd_ps(ifft81, _mm512_set1_ps(1.5625e-02f), ifft71);
__m512 ifft170 = _mm512_fnmadd_ps(ifft165, _mm512_set1_ps(1.5625e-02f), ifft155);
__m512 ifft87 = _mm512_fmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft171 = _mm512_fmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft88 = _mm512_fnmadd_ps(ifft83, _mm512_set1_ps(1.5625e-02f), ifft73);
__m512 ifft172 = _mm512_fnmadd_ps(ifft167, _mm512_set1_ps(1.5625e-02f), ifft157);
__m512 ifft89 = _mm512_fnmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft173 = _mm512_fnmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft90 = _mm512_fmadd_ps(ifft84, _mm512_set1_ps(1.5625e-02f), ifft72);
__m512 ifft174 = _mm512_fmadd_ps(ifft168, _mm512_set1_ps(1.5625e-02f), ifft156);
__m512 ifft91 = _mm512_fmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft175 = _mm512_fmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 ifft92 = _mm512_fnmadd_ps(ifft82, _mm512_set1_ps(1.5625e-02f), ifft74);
__m512 ifft176 = _mm512_fnmadd_ps(ifft166, _mm512_set1_ps(1.5625e-02f), ifft158);
__m512 dat590 = ifft85;
__m512 dat595 = ifft169;
__m512 dat591 = ifft87;
__m512 dat596 = ifft171;
__m512 dat592 = ifft89;
__m512 dat597 = ifft173;
__m512 dat593 = ifft91;
__m512 dat598 = ifft175;
__m512 dat594 = ifft86;
__m512 dat599 = ifft170;
(void)ifft88;
(void)ifft172;
(void)ifft90;
(void)ifft174;
(void)ifft92;
(void)ifft176;
__m512i pm1 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack1 = _mm512_permutex2var_ps(dat590, pm1, dat595);
__m512i pm2 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack2 = _mm512_permutex2var_ps(dat590, pm2, dat595);
__m512 pack3 = _mm512_permutex2var_ps(dat591, pm1, dat596);
__m512 pack4 = _mm512_permutex2var_ps(dat591, pm2, dat596);
__m512 pack5 = _mm512_permutex2var_ps(dat592, pm1, dat597);
__m512 pack6 = _mm512_permutex2var_ps(dat592, pm2, dat597);
__m512 pack7 = _mm512_permutex2var_ps(dat593, pm1, dat598);
__m512 pack8 = _mm512_permutex2var_ps(dat593, pm2, dat598);
__m512 pack9 = _mm512_permutex2var_ps(dat594, pm1, dat599);
__m512 pack10 = _mm512_permutex2var_ps(dat594, pm2, dat599);
pack1 = _mm512_max_ps(_mm512_setzero_ps(), pack1);
pack2 = _mm512_max_ps(_mm512_setzero_ps(), pack2);
pack3 = _mm512_max_ps(_mm512_setzero_ps(), pack3);
pack4 = _mm512_max_ps(_mm512_setzero_ps(), pack4);
pack5 = _mm512_max_ps(_mm512_setzero_ps(), pack5);
pack6 = _mm512_max_ps(_mm512_setzero_ps(), pack6);
pack7 = _mm512_max_ps(_mm512_setzero_ps(), pack7);
pack8 = _mm512_max_ps(_mm512_setzero_ps(), pack8);
pack9 = _mm512_max_ps(_mm512_setzero_ps(), pack9);
pack10 = _mm512_max_ps(_mm512_setzero_ps(), pack10);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack1);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack2);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack3);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack4);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack5);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack6);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack7);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack8);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack9);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+0*t2, 1023, pack10);
// Two-iteration loop (t3 = 0, 1). Each pass loads sixteen 16-float vectors
// from sfPtr3, runs them through the "ifft" arithmetic below, clamps the
// results at zero and writes ten masked 10-lane vectors to datPtr2.
// Every step is emitted twice, back to back: once for the vectors numbered
// 65..68 (results ifft178..ifft268) and once for 69..72 (ifft269..ifft352).
// NOTE(review): the variable names suggest an inverse FFT of frequency-domain
// sums followed by ReLU; the offsets look like byte offsets (consecutive loads
// are 64 apart, 10-lane stores advance by 40). The pointer declarations and
// the meaning of i9/j5/k25/r2/toH1/toW1 are outside this view -- confirm.
ptrdiff_t t3 = 0;
for (; t3 < 2; ++t3) {
// Loads: four groups (base offsets 256, 2167040, 4333824, 6500608), each
// holding Re/Im for the first set and Re/Im for the second set, 64 apart.
__m512 sfRe65 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm65 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe69 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm69 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe66 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm66 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe70 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm70 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe67 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm67 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe71 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm71 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe68 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm68 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfRe72 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
__m512 sfIm72 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k25+768*r2+256*t3);
// Only the first group (65 / 69) gets this pre-processing: two lane
// permutations of Re and Im are combined with a masked fmadd/fnmadd.
// Mask 65021 (0xFDFD) leaves lanes 1 and 9 as the plain permuted value.
__m512i ifft177 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft178 = _mm512_permutexvar_ps(ifft177, sfRe65);
__m512 ifft269 = _mm512_permutexvar_ps(ifft177, sfRe69);
__m512i ifft179 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft180 = _mm512_permutexvar_ps(ifft179, sfRe65);
__m512 ifft270 = _mm512_permutexvar_ps(ifft179, sfRe69);
__m512 ifft181 = _mm512_permutexvar_ps(ifft177, sfIm65);
__m512 ifft271 = _mm512_permutexvar_ps(ifft177, sfIm69);
__m512 ifft182 = _mm512_permutexvar_ps(ifft179, sfIm65);
__m512 ifft272 = _mm512_permutexvar_ps(ifft179, sfIm69);
__m512 ifft183 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft184 = _mm512_mask_fmadd_ps(ifft182, 65021, ifft183, ifft178);
__m512 ifft273 = _mm512_mask_fmadd_ps(ifft272, 65021, ifft183, ifft269);
__m512 ifft185 = _mm512_mask_fnmadd_ps(ifft181, 65021, ifft183, ifft180);
__m512 ifft274 = _mm512_mask_fnmadd_ps(ifft271, 65021, ifft183, ifft270);
// Butterfly on adjacent lanes: shuffle 177 swaps each even/odd lane pair and
// the +1/-1 pattern yields (x0+x1, x0-x1) for every pair.
__m512 ifft186 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft187 = _mm512_fmadd_ps(ifft184, ifft186, _mm512_shuffle_ps(ifft184, ifft184, 177));
__m512 ifft275 = _mm512_fmadd_ps(ifft273, ifft186, _mm512_shuffle_ps(ifft273, ifft273, 177));
__m512 ifft188 = _mm512_fmadd_ps(ifft185, ifft186, _mm512_shuffle_ps(ifft185, ifft185, 177));
__m512 ifft276 = _mm512_fmadd_ps(ifft274, ifft186, _mm512_shuffle_ps(ifft274, ifft274, 177));
__m512 ifft189 = _mm512_fmadd_ps(sfRe66, ifft186, _mm512_shuffle_ps(sfRe66, sfRe66, 177));
__m512 ifft277 = _mm512_fmadd_ps(sfRe70, ifft186, _mm512_shuffle_ps(sfRe70, sfRe70, 177));
__m512 ifft190 = _mm512_fmadd_ps(sfIm66, ifft186, _mm512_shuffle_ps(sfIm66, sfIm66, 177));
__m512 ifft278 = _mm512_fmadd_ps(sfIm70, ifft186, _mm512_shuffle_ps(sfIm70, sfIm70, 177));
__m512 ifft191 = _mm512_fmadd_ps(sfRe67, ifft186, _mm512_shuffle_ps(sfRe67, sfRe67, 177));
__m512 ifft279 = _mm512_fmadd_ps(sfRe71, ifft186, _mm512_shuffle_ps(sfRe71, sfRe71, 177));
__m512 ifft192 = _mm512_fmadd_ps(sfIm67, ifft186, _mm512_shuffle_ps(sfIm67, sfIm67, 177));
__m512 ifft280 = _mm512_fmadd_ps(sfIm71, ifft186, _mm512_shuffle_ps(sfIm71, sfIm71, 177));
__m512 ifft193 = _mm512_fmadd_ps(sfRe68, ifft186, _mm512_shuffle_ps(sfRe68, sfRe68, 177));
__m512 ifft281 = _mm512_fmadd_ps(sfRe72, ifft186, _mm512_shuffle_ps(sfRe72, sfRe72, 177));
__m512 ifft194 = _mm512_fmadd_ps(sfIm68, ifft186, _mm512_shuffle_ps(sfIm68, sfIm68, 177));
__m512 ifft282 = _mm512_fmadd_ps(sfIm72, ifft186, _mm512_shuffle_ps(sfIm72, sfIm72, 177));
// Per-lane complex multiply of each (re, im) pair by (a + i*b):
// a comes from ifft195, b from ifft204 below;
// re' = re*a - im*b, im' = re*b + im*a.
__m512 ifft195 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft196 = _mm512_mul_ps(ifft187, ifft195);
__m512 ifft283 = _mm512_mul_ps(ifft275, ifft195);
__m512 ifft197 = _mm512_mul_ps(ifft188, ifft195);
__m512 ifft284 = _mm512_mul_ps(ifft276, ifft195);
__m512 ifft198 = _mm512_mul_ps(ifft189, ifft195);
__m512 ifft285 = _mm512_mul_ps(ifft277, ifft195);
__m512 ifft199 = _mm512_mul_ps(ifft190, ifft195);
__m512 ifft286 = _mm512_mul_ps(ifft278, ifft195);
__m512 ifft200 = _mm512_mul_ps(ifft191, ifft195);
__m512 ifft287 = _mm512_mul_ps(ifft279, ifft195);
__m512 ifft201 = _mm512_mul_ps(ifft192, ifft195);
__m512 ifft288 = _mm512_mul_ps(ifft280, ifft195);
__m512 ifft202 = _mm512_mul_ps(ifft193, ifft195);
__m512 ifft289 = _mm512_mul_ps(ifft281, ifft195);
__m512 ifft203 = _mm512_mul_ps(ifft194, ifft195);
__m512 ifft290 = _mm512_mul_ps(ifft282, ifft195);
__m512 ifft204 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft205 = _mm512_fnmadd_ps(ifft188, ifft204, ifft196);
__m512 ifft291 = _mm512_fnmadd_ps(ifft276, ifft204, ifft283);
__m512 ifft206 = _mm512_fmadd_ps(ifft187, ifft204, ifft197);
__m512 ifft292 = _mm512_fmadd_ps(ifft275, ifft204, ifft284);
__m512 ifft207 = _mm512_fnmadd_ps(ifft190, ifft204, ifft198);
__m512 ifft293 = _mm512_fnmadd_ps(ifft278, ifft204, ifft285);
__m512 ifft208 = _mm512_fmadd_ps(ifft189, ifft204, ifft199);
__m512 ifft294 = _mm512_fmadd_ps(ifft277, ifft204, ifft286);
__m512 ifft209 = _mm512_fnmadd_ps(ifft192, ifft204, ifft200);
__m512 ifft295 = _mm512_fnmadd_ps(ifft280, ifft204, ifft287);
__m512 ifft210 = _mm512_fmadd_ps(ifft191, ifft204, ifft201);
__m512 ifft296 = _mm512_fmadd_ps(ifft279, ifft204, ifft288);
__m512 ifft211 = _mm512_fnmadd_ps(ifft194, ifft204, ifft202);
__m512 ifft297 = _mm512_fnmadd_ps(ifft282, ifft204, ifft289);
__m512 ifft212 = _mm512_fmadd_ps(ifft193, ifft204, ifft203);
__m512 ifft298 = _mm512_fmadd_ps(ifft281, ifft204, ifft290);
// Butterfly at lane distance 2: shuffle 78 swaps the two 64-bit halves of
// every 128-bit group; the sign pattern adds in the low half, subtracts in the
// high half.
__m512 ifft213 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft214 = _mm512_fmadd_ps(ifft205, ifft213, _mm512_shuffle_ps(ifft205, ifft205, 78));
__m512 ifft299 = _mm512_fmadd_ps(ifft291, ifft213, _mm512_shuffle_ps(ifft291, ifft291, 78));
__m512 ifft215 = _mm512_fmadd_ps(ifft206, ifft213, _mm512_shuffle_ps(ifft206, ifft206, 78));
__m512 ifft300 = _mm512_fmadd_ps(ifft292, ifft213, _mm512_shuffle_ps(ifft292, ifft292, 78));
__m512 ifft216 = _mm512_fmadd_ps(ifft207, ifft213, _mm512_shuffle_ps(ifft207, ifft207, 78));
__m512 ifft301 = _mm512_fmadd_ps(ifft293, ifft213, _mm512_shuffle_ps(ifft293, ifft293, 78));
__m512 ifft217 = _mm512_fmadd_ps(ifft208, ifft213, _mm512_shuffle_ps(ifft208, ifft208, 78));
__m512 ifft302 = _mm512_fmadd_ps(ifft294, ifft213, _mm512_shuffle_ps(ifft294, ifft294, 78));
__m512 ifft218 = _mm512_fmadd_ps(ifft209, ifft213, _mm512_shuffle_ps(ifft209, ifft209, 78));
__m512 ifft303 = _mm512_fmadd_ps(ifft295, ifft213, _mm512_shuffle_ps(ifft295, ifft295, 78));
__m512 ifft219 = _mm512_fmadd_ps(ifft210, ifft213, _mm512_shuffle_ps(ifft210, ifft210, 78));
__m512 ifft304 = _mm512_fmadd_ps(ifft296, ifft213, _mm512_shuffle_ps(ifft296, ifft296, 78));
__m512 ifft220 = _mm512_fmadd_ps(ifft211, ifft213, _mm512_shuffle_ps(ifft211, ifft211, 78));
__m512 ifft305 = _mm512_fmadd_ps(ifft297, ifft213, _mm512_shuffle_ps(ifft297, ifft297, 78));
__m512 ifft221 = _mm512_fmadd_ps(ifft212, ifft213, _mm512_shuffle_ps(ifft212, ifft212, 78));
__m512 ifft306 = _mm512_fmadd_ps(ifft298, ifft213, _mm512_shuffle_ps(ifft298, ifft298, 78));
// In lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) replace (re, im) by (-im, re),
// i.e. multiply by i; all other lanes pass through unchanged.
__m512 ifft222 = _mm512_mask_sub_ps(ifft214, 49344, _mm512_setzero_ps(), ifft215);
__m512 ifft307 = _mm512_mask_sub_ps(ifft299, 49344, _mm512_setzero_ps(), ifft300);
__m512 ifft223 = _mm512_mask_mov_ps(ifft215, 49344, ifft214);
__m512 ifft308 = _mm512_mask_mov_ps(ifft300, 49344, ifft299);
__m512 ifft224 = _mm512_mask_sub_ps(ifft216, 49344, _mm512_setzero_ps(), ifft217);
__m512 ifft309 = _mm512_mask_sub_ps(ifft301, 49344, _mm512_setzero_ps(), ifft302);
__m512 ifft225 = _mm512_mask_mov_ps(ifft217, 49344, ifft216);
__m512 ifft310 = _mm512_mask_mov_ps(ifft302, 49344, ifft301);
__m512 ifft226 = _mm512_mask_sub_ps(ifft218, 49344, _mm512_setzero_ps(), ifft219);
__m512 ifft311 = _mm512_mask_sub_ps(ifft303, 49344, _mm512_setzero_ps(), ifft304);
__m512 ifft227 = _mm512_mask_mov_ps(ifft219, 49344, ifft218);
__m512 ifft312 = _mm512_mask_mov_ps(ifft304, 49344, ifft303);
__m512 ifft228 = _mm512_mask_sub_ps(ifft220, 49344, _mm512_setzero_ps(), ifft221);
__m512 ifft313 = _mm512_mask_sub_ps(ifft305, 49344, _mm512_setzero_ps(), ifft306);
__m512 ifft229 = _mm512_mask_mov_ps(ifft221, 49344, ifft220);
__m512 ifft314 = _mm512_mask_mov_ps(ifft306, 49344, ifft305);
// Butterfly at lane distance 4: shuffle_f32x4 177 swaps neighbouring 128-bit
// groups. Note that ifft236/ifft320 use fnmsub, so that one result is the
// negation of what the fmadd form would give.
__m512 ifft230 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft231 = _mm512_fmadd_ps(ifft222, ifft230, _mm512_shuffle_f32x4(ifft222, ifft222, 177));
__m512 ifft315 = _mm512_fmadd_ps(ifft307, ifft230, _mm512_shuffle_f32x4(ifft307, ifft307, 177));
__m512 ifft232 = _mm512_fmadd_ps(ifft223, ifft230, _mm512_shuffle_f32x4(ifft223, ifft223, 177));
__m512 ifft316 = _mm512_fmadd_ps(ifft308, ifft230, _mm512_shuffle_f32x4(ifft308, ifft308, 177));
__m512 ifft233 = _mm512_fmadd_ps(ifft224, ifft230, _mm512_shuffle_f32x4(ifft224, ifft224, 177));
__m512 ifft317 = _mm512_fmadd_ps(ifft309, ifft230, _mm512_shuffle_f32x4(ifft309, ifft309, 177));
__m512 ifft234 = _mm512_fmadd_ps(ifft225, ifft230, _mm512_shuffle_f32x4(ifft225, ifft225, 177));
__m512 ifft318 = _mm512_fmadd_ps(ifft310, ifft230, _mm512_shuffle_f32x4(ifft310, ifft310, 177));
__m512 ifft235 = _mm512_fmadd_ps(ifft226, ifft230, _mm512_shuffle_f32x4(ifft226, ifft226, 177));
__m512 ifft319 = _mm512_fmadd_ps(ifft311, ifft230, _mm512_shuffle_f32x4(ifft311, ifft311, 177));
__m512 ifft236 = _mm512_fnmsub_ps(ifft227, ifft230, _mm512_shuffle_f32x4(ifft227, ifft227, 177));
__m512 ifft320 = _mm512_fnmsub_ps(ifft312, ifft230, _mm512_shuffle_f32x4(ifft312, ifft312, 177));
__m512 ifft237 = _mm512_fmadd_ps(ifft228, ifft230, _mm512_shuffle_f32x4(ifft228, ifft228, 177));
__m512 ifft321 = _mm512_fmadd_ps(ifft313, ifft230, _mm512_shuffle_f32x4(ifft313, ifft313, 177));
__m512 ifft238 = _mm512_fmadd_ps(ifft229, ifft230, _mm512_shuffle_f32x4(ifft229, ifft229, 177));
__m512 ifft322 = _mm512_fmadd_ps(ifft314, ifft230, _mm512_shuffle_f32x4(ifft314, ifft314, 177));
// From here on the arithmetic combines whole vectors with each other
// (no more in-register shuffles). Scale factors are folded in:
// 1.5625e-02 = 1/64, 3.125e-02 = 1/32, 7.0710677e-01 ~ sqrt(1/2).
__m512 ifft239 = _mm512_add_ps(ifft231, ifft232);
__m512 ifft323 = _mm512_add_ps(ifft315, ifft316);
__m512 ifft240 = _mm512_sub_ps(ifft231, ifft232);
__m512 ifft324 = _mm512_sub_ps(ifft315, ifft316);
__m512 ifft241 = _mm512_sub_ps(ifft233, ifft237);
__m512 ifft325 = _mm512_sub_ps(ifft317, ifft321);
__m512 ifft242 = _mm512_add_ps(ifft234, ifft238);
__m512 ifft326 = _mm512_add_ps(ifft318, ifft322);
__m512 ifft243 = _mm512_add_ps(ifft233, ifft237);
__m512 ifft327 = _mm512_add_ps(ifft317, ifft321);
__m512 ifft244 = _mm512_sub_ps(ifft234, ifft238);
__m512 ifft328 = _mm512_sub_ps(ifft318, ifft322);
__m512 ifft245 = _mm512_mul_ps(ifft235, _mm512_set1_ps(3.125e-02f));
__m512 ifft329 = _mm512_mul_ps(ifft319, _mm512_set1_ps(3.125e-02f));
__m512 ifft246 = _mm512_mul_ps(ifft236, _mm512_set1_ps(3.125e-02f));
__m512 ifft330 = _mm512_mul_ps(ifft320, _mm512_set1_ps(3.125e-02f));
__m512 ifft247 = _mm512_fmadd_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft331 = _mm512_fmadd_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft248 = _mm512_fmsub_ps(ifft239, _mm512_set1_ps(1.5625e-02f), ifft245);
__m512 ifft332 = _mm512_fmsub_ps(ifft323, _mm512_set1_ps(1.5625e-02f), ifft329);
__m512 ifft249 = _mm512_fmadd_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft333 = _mm512_fmadd_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft250 = _mm512_fmsub_ps(ifft240, _mm512_set1_ps(1.5625e-02f), ifft246);
__m512 ifft334 = _mm512_fmsub_ps(ifft324, _mm512_set1_ps(1.5625e-02f), ifft330);
__m512 ifft251 = _mm512_add_ps(ifft241, ifft242);
__m512 ifft335 = _mm512_add_ps(ifft325, ifft326);
__m512 ifft252 = _mm512_sub_ps(ifft241, ifft242);
__m512 ifft336 = _mm512_sub_ps(ifft325, ifft326);
__m512 ifft253 = _mm512_fnmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft337 = _mm512_fnmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft254 = _mm512_fmadd_ps(ifft251, _mm512_set1_ps(7.0710677e-01f), ifft243);
__m512 ifft338 = _mm512_fmadd_ps(ifft335, _mm512_set1_ps(7.0710677e-01f), ifft327);
__m512 ifft255 = _mm512_fmadd_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft339 = _mm512_fmadd_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft256 = _mm512_fmsub_ps(ifft252, _mm512_set1_ps(7.0710677e-01f), ifft244);
__m512 ifft340 = _mm512_fmsub_ps(ifft336, _mm512_set1_ps(7.0710677e-01f), ifft328);
__m512 ifft257 = _mm512_add_ps(ifft253, ifft254);
__m512 ifft341 = _mm512_add_ps(ifft337, ifft338);
__m512 ifft258 = _mm512_sub_ps(ifft253, ifft254);
__m512 ifft342 = _mm512_sub_ps(ifft337, ifft338);
__m512 ifft259 = _mm512_add_ps(ifft255, ifft256);
__m512 ifft343 = _mm512_add_ps(ifft339, ifft340);
__m512 ifft260 = _mm512_sub_ps(ifft255, ifft256);
__m512 ifft344 = _mm512_sub_ps(ifft339, ifft340);
__m512 ifft261 = _mm512_fmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft345 = _mm512_fmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft262 = _mm512_fnmadd_ps(ifft257, _mm512_set1_ps(1.5625e-02f), ifft247);
__m512 ifft346 = _mm512_fnmadd_ps(ifft341, _mm512_set1_ps(1.5625e-02f), ifft331);
__m512 ifft263 = _mm512_fmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft347 = _mm512_fmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft264 = _mm512_fnmadd_ps(ifft259, _mm512_set1_ps(1.5625e-02f), ifft249);
__m512 ifft348 = _mm512_fnmadd_ps(ifft343, _mm512_set1_ps(1.5625e-02f), ifft333);
__m512 ifft265 = _mm512_fnmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft349 = _mm512_fnmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft266 = _mm512_fmadd_ps(ifft260, _mm512_set1_ps(1.5625e-02f), ifft248);
__m512 ifft350 = _mm512_fmadd_ps(ifft344, _mm512_set1_ps(1.5625e-02f), ifft332);
__m512 ifft267 = _mm512_fmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft351 = _mm512_fmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
__m512 ifft268 = _mm512_fnmadd_ps(ifft258, _mm512_set1_ps(1.5625e-02f), ifft250);
__m512 ifft352 = _mm512_fnmadd_ps(ifft342, _mm512_set1_ps(1.5625e-02f), ifft334);
// Five result vectors per set are kept (dat600..604 and dat605..609).
__m512 dat600 = ifft261;
__m512 dat605 = ifft345;
__m512 dat601 = ifft263;
__m512 dat606 = ifft347;
__m512 dat602 = ifft265;
__m512 dat607 = ifft349;
__m512 dat603 = ifft267;
__m512 dat608 = ifft351;
__m512 dat604 = ifft262;
__m512 dat609 = ifft346;
// The remaining three results per set are not used; the (void) casts mark
// them as intentionally discarded.
(void)ifft264;
(void)ifft348;
(void)ifft266;
(void)ifft350;
(void)ifft268;
(void)ifft352;
// Merge the two sets into 10-lane rows. With pm3, lanes 0-4 take elements
// 0-4 of the first operand and lanes 5-9 take elements 0-4 of the second.
// With pm4, lanes 0-4 take elements 8-12 of the second operand and lanes 5-9
// take elements 8-12 of the first. Lanes 10-15 are never stored.
__m512i pm3 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack11 = _mm512_permutex2var_ps(dat600, pm3, dat605);
__m512i pm4 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack12 = _mm512_permutex2var_ps(dat600, pm4, dat605);
__m512 pack13 = _mm512_permutex2var_ps(dat601, pm3, dat606);
__m512 pack14 = _mm512_permutex2var_ps(dat601, pm4, dat606);
__m512 pack15 = _mm512_permutex2var_ps(dat602, pm3, dat607);
__m512 pack16 = _mm512_permutex2var_ps(dat602, pm4, dat607);
__m512 pack17 = _mm512_permutex2var_ps(dat603, pm3, dat608);
__m512 pack18 = _mm512_permutex2var_ps(dat603, pm4, dat608);
__m512 pack19 = _mm512_permutex2var_ps(dat604, pm3, dat609);
__m512 pack20 = _mm512_permutex2var_ps(dat604, pm4, dat609);
// Clamp at zero (max with 0), i.e. the ReLU activation.
pack11 = _mm512_max_ps(_mm512_setzero_ps(), pack11);
pack12 = _mm512_max_ps(_mm512_setzero_ps(), pack12);
pack13 = _mm512_max_ps(_mm512_setzero_ps(), pack13);
pack14 = _mm512_max_ps(_mm512_setzero_ps(), pack14);
pack15 = _mm512_max_ps(_mm512_setzero_ps(), pack15);
pack16 = _mm512_max_ps(_mm512_setzero_ps(), pack16);
pack17 = _mm512_max_ps(_mm512_setzero_ps(), pack17);
pack18 = _mm512_max_ps(_mm512_setzero_ps(), pack18);
pack19 = _mm512_max_ps(_mm512_setzero_ps(), pack19);
pack20 = _mm512_max_ps(_mm512_setzero_ps(), pack20);
// Masked stores: mask 1023 (0x3FF) writes lanes 0-9 only. The pm3 rows go to
// base offsets 40, 488, 936, 1384, 1832 (448 apart, the same step as the
// 448*toH1 term); each matching pm4 row goes 50240 further on. The store
// position advances by 40 per t3 iteration.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack11);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack12);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack13);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack14);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack15);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack16);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack17);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack18);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack19);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k25+100480*r2+448*toH1+4*toW1+40*t3, 1023, pack20);
}
}
}
if (j5 >= last2) return;
++j5;
rel4 = 1;
}
if (rel4 < 3) {
ptrdiff_t toH2 = base4+0;
ptrdiff_t toW2 = 0+30*rel4;
ptrdiff_t jj11 = 2-rel4+j5;
for (; j5 <= jj11; toW2 += 30) {
ptrdiff_t k26 = 16*w21;
for (; k26 != 16; ++k26) {
ptrdiff_t r3 = 0;
for (; r3 != 2; ++r3) {
ptrdiff_t t4 = 0;
for (; t4 < 3; ++t4) {
__m512 sfRe73 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm73 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe77 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm77 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe74 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm74 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe78 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm78 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe75 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm75 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe79 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm79 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe76 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm76 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfRe80 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512 sfIm80 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k26+768*r3+256*t4);
__m512i ifft353 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft354 = _mm512_permutexvar_ps(ifft353, sfRe73);
__m512 ifft445 = _mm512_permutexvar_ps(ifft353, sfRe77);
__m512i ifft355 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft356 = _mm512_permutexvar_ps(ifft355, sfRe73);
__m512 ifft446 = _mm512_permutexvar_ps(ifft355, sfRe77);
__m512 ifft357 = _mm512_permutexvar_ps(ifft353, sfIm73);
__m512 ifft447 = _mm512_permutexvar_ps(ifft353, sfIm77);
__m512 ifft358 = _mm512_permutexvar_ps(ifft355, sfIm73);
__m512 ifft448 = _mm512_permutexvar_ps(ifft355, sfIm77);
__m512 ifft359 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft360 = _mm512_mask_fmadd_ps(ifft358, 65021, ifft359, ifft354);
__m512 ifft449 = _mm512_mask_fmadd_ps(ifft448, 65021, ifft359, ifft445);
__m512 ifft361 = _mm512_mask_fnmadd_ps(ifft357, 65021, ifft359, ifft356);
__m512 ifft450 = _mm512_mask_fnmadd_ps(ifft447, 65021, ifft359, ifft446);
__m512 ifft362 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft363 = _mm512_fmadd_ps(ifft360, ifft362, _mm512_shuffle_ps(ifft360, ifft360, 177));
__m512 ifft451 = _mm512_fmadd_ps(ifft449, ifft362, _mm512_shuffle_ps(ifft449, ifft449, 177));
__m512 ifft364 = _mm512_fmadd_ps(ifft361, ifft362, _mm512_shuffle_ps(ifft361, ifft361, 177));
__m512 ifft452 = _mm512_fmadd_ps(ifft450, ifft362, _mm512_shuffle_ps(ifft450, ifft450, 177));
__m512 ifft365 = _mm512_fmadd_ps(sfRe74, ifft362, _mm512_shuffle_ps(sfRe74, sfRe74, 177));
__m512 ifft453 = _mm512_fmadd_ps(sfRe78, ifft362, _mm512_shuffle_ps(sfRe78, sfRe78, 177));
__m512 ifft366 = _mm512_fmadd_ps(sfIm74, ifft362, _mm512_shuffle_ps(sfIm74, sfIm74, 177));
__m512 ifft454 = _mm512_fmadd_ps(sfIm78, ifft362, _mm512_shuffle_ps(sfIm78, sfIm78, 177));
__m512 ifft367 = _mm512_fmadd_ps(sfRe75, ifft362, _mm512_shuffle_ps(sfRe75, sfRe75, 177));
__m512 ifft455 = _mm512_fmadd_ps(sfRe79, ifft362, _mm512_shuffle_ps(sfRe79, sfRe79, 177));
__m512 ifft368 = _mm512_fmadd_ps(sfIm75, ifft362, _mm512_shuffle_ps(sfIm75, sfIm75, 177));
__m512 ifft456 = _mm512_fmadd_ps(sfIm79, ifft362, _mm512_shuffle_ps(sfIm79, sfIm79, 177));
__m512 ifft369 = _mm512_fmadd_ps(sfRe76, ifft362, _mm512_shuffle_ps(sfRe76, sfRe76, 177));
__m512 ifft457 = _mm512_fmadd_ps(sfRe80, ifft362, _mm512_shuffle_ps(sfRe80, sfRe80, 177));
__m512 ifft370 = _mm512_fmadd_ps(sfIm76, ifft362, _mm512_shuffle_ps(sfIm76, sfIm76, 177));
__m512 ifft458 = _mm512_fmadd_ps(sfIm80, ifft362, _mm512_shuffle_ps(sfIm80, sfIm80, 177));
__m512 ifft371 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft372 = _mm512_mul_ps(ifft363, ifft371);
__m512 ifft459 = _mm512_mul_ps(ifft451, ifft371);
__m512 ifft373 = _mm512_mul_ps(ifft364, ifft371);
__m512 ifft460 = _mm512_mul_ps(ifft452, ifft371);
__m512 ifft374 = _mm512_mul_ps(ifft365, ifft371);
__m512 ifft461 = _mm512_mul_ps(ifft453, ifft371);
__m512 ifft375 = _mm512_mul_ps(ifft366, ifft371);
__m512 ifft462 = _mm512_mul_ps(ifft454, ifft371);
__m512 ifft376 = _mm512_mul_ps(ifft367, ifft371);
__m512 ifft463 = _mm512_mul_ps(ifft455, ifft371);
__m512 ifft377 = _mm512_mul_ps(ifft368, ifft371);
__m512 ifft464 = _mm512_mul_ps(ifft456, ifft371);
__m512 ifft378 = _mm512_mul_ps(ifft369, ifft371);
__m512 ifft465 = _mm512_mul_ps(ifft457, ifft371);
__m512 ifft379 = _mm512_mul_ps(ifft370, ifft371);
__m512 ifft466 = _mm512_mul_ps(ifft458, ifft371);
__m512 ifft380 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft381 = _mm512_fnmadd_ps(ifft364, ifft380, ifft372);
__m512 ifft467 = _mm512_fnmadd_ps(ifft452, ifft380, ifft459);
__m512 ifft382 = _mm512_fmadd_ps(ifft363, ifft380, ifft373);
__m512 ifft468 = _mm512_fmadd_ps(ifft451, ifft380, ifft460);
__m512 ifft383 = _mm512_fnmadd_ps(ifft366, ifft380, ifft374);
__m512 ifft469 = _mm512_fnmadd_ps(ifft454, ifft380, ifft461);
__m512 ifft384 = _mm512_fmadd_ps(ifft365, ifft380, ifft375);
__m512 ifft470 = _mm512_fmadd_ps(ifft453, ifft380, ifft462);
__m512 ifft385 = _mm512_fnmadd_ps(ifft368, ifft380, ifft376);
__m512 ifft471 = _mm512_fnmadd_ps(ifft456, ifft380, ifft463);
__m512 ifft386 = _mm512_fmadd_ps(ifft367, ifft380, ifft377);
__m512 ifft472 = _mm512_fmadd_ps(ifft455, ifft380, ifft464);
__m512 ifft387 = _mm512_fnmadd_ps(ifft370, ifft380, ifft378);
__m512 ifft473 = _mm512_fnmadd_ps(ifft458, ifft380, ifft465);
__m512 ifft388 = _mm512_fmadd_ps(ifft369, ifft380, ifft379);
__m512 ifft474 = _mm512_fmadd_ps(ifft457, ifft380, ifft466);
__m512 ifft389 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft390 = _mm512_fmadd_ps(ifft381, ifft389, _mm512_shuffle_ps(ifft381, ifft381, 78));
__m512 ifft475 = _mm512_fmadd_ps(ifft467, ifft389, _mm512_shuffle_ps(ifft467, ifft467, 78));
__m512 ifft391 = _mm512_fmadd_ps(ifft382, ifft389, _mm512_shuffle_ps(ifft382, ifft382, 78));
__m512 ifft476 = _mm512_fmadd_ps(ifft468, ifft389, _mm512_shuffle_ps(ifft468, ifft468, 78));
__m512 ifft392 = _mm512_fmadd_ps(ifft383, ifft389, _mm512_shuffle_ps(ifft383, ifft383, 78));
__m512 ifft477 = _mm512_fmadd_ps(ifft469, ifft389, _mm512_shuffle_ps(ifft469, ifft469, 78));
__m512 ifft393 = _mm512_fmadd_ps(ifft384, ifft389, _mm512_shuffle_ps(ifft384, ifft384, 78));
__m512 ifft478 = _mm512_fmadd_ps(ifft470, ifft389, _mm512_shuffle_ps(ifft470, ifft470, 78));
__m512 ifft394 = _mm512_fmadd_ps(ifft385, ifft389, _mm512_shuffle_ps(ifft385, ifft385, 78));
__m512 ifft479 = _mm512_fmadd_ps(ifft471, ifft389, _mm512_shuffle_ps(ifft471, ifft471, 78));
__m512 ifft395 = _mm512_fmadd_ps(ifft386, ifft389, _mm512_shuffle_ps(ifft386, ifft386, 78));
__m512 ifft480 = _mm512_fmadd_ps(ifft472, ifft389, _mm512_shuffle_ps(ifft472, ifft472, 78));
__m512 ifft396 = _mm512_fmadd_ps(ifft387, ifft389, _mm512_shuffle_ps(ifft387, ifft387, 78));
__m512 ifft481 = _mm512_fmadd_ps(ifft473, ifft389, _mm512_shuffle_ps(ifft473, ifft473, 78));
__m512 ifft397 = _mm512_fmadd_ps(ifft388, ifft389, _mm512_shuffle_ps(ifft388, ifft388, 78));
__m512 ifft482 = _mm512_fmadd_ps(ifft474, ifft389, _mm512_shuffle_ps(ifft474, ifft474, 78));
__m512 ifft398 = _mm512_mask_sub_ps(ifft390, 49344, _mm512_setzero_ps(), ifft391);
__m512 ifft483 = _mm512_mask_sub_ps(ifft475, 49344, _mm512_setzero_ps(), ifft476);
__m512 ifft399 = _mm512_mask_mov_ps(ifft391, 49344, ifft390);
__m512 ifft484 = _mm512_mask_mov_ps(ifft476, 49344, ifft475);
__m512 ifft400 = _mm512_mask_sub_ps(ifft392, 49344, _mm512_setzero_ps(), ifft393);
__m512 ifft485 = _mm512_mask_sub_ps(ifft477, 49344, _mm512_setzero_ps(), ifft478);
__m512 ifft401 = _mm512_mask_mov_ps(ifft393, 49344, ifft392);
__m512 ifft486 = _mm512_mask_mov_ps(ifft478, 49344, ifft477);
__m512 ifft402 = _mm512_mask_sub_ps(ifft394, 49344, _mm512_setzero_ps(), ifft395);
__m512 ifft487 = _mm512_mask_sub_ps(ifft479, 49344, _mm512_setzero_ps(), ifft480);
__m512 ifft403 = _mm512_mask_mov_ps(ifft395, 49344, ifft394);
__m512 ifft488 = _mm512_mask_mov_ps(ifft480, 49344, ifft479);
__m512 ifft404 = _mm512_mask_sub_ps(ifft396, 49344, _mm512_setzero_ps(), ifft397);
__m512 ifft489 = _mm512_mask_sub_ps(ifft481, 49344, _mm512_setzero_ps(), ifft482);
__m512 ifft405 = _mm512_mask_mov_ps(ifft397, 49344, ifft396);
__m512 ifft490 = _mm512_mask_mov_ps(ifft482, 49344, ifft481);
__m512 ifft406 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft407 = _mm512_fmadd_ps(ifft398, ifft406, _mm512_shuffle_f32x4(ifft398, ifft398, 177));
__m512 ifft491 = _mm512_fmadd_ps(ifft483, ifft406, _mm512_shuffle_f32x4(ifft483, ifft483, 177));
__m512 ifft408 = _mm512_fmadd_ps(ifft399, ifft406, _mm512_shuffle_f32x4(ifft399, ifft399, 177));
__m512 ifft492 = _mm512_fmadd_ps(ifft484, ifft406, _mm512_shuffle_f32x4(ifft484, ifft484, 177));
__m512 ifft409 = _mm512_fmadd_ps(ifft400, ifft406, _mm512_shuffle_f32x4(ifft400, ifft400, 177));
__m512 ifft493 = _mm512_fmadd_ps(ifft485, ifft406, _mm512_shuffle_f32x4(ifft485, ifft485, 177));
__m512 ifft410 = _mm512_fmadd_ps(ifft401, ifft406, _mm512_shuffle_f32x4(ifft401, ifft401, 177));
__m512 ifft494 = _mm512_fmadd_ps(ifft486, ifft406, _mm512_shuffle_f32x4(ifft486, ifft486, 177));
__m512 ifft411 = _mm512_fmadd_ps(ifft402, ifft406, _mm512_shuffle_f32x4(ifft402, ifft402, 177));
__m512 ifft495 = _mm512_fmadd_ps(ifft487, ifft406, _mm512_shuffle_f32x4(ifft487, ifft487, 177));
__m512 ifft412 = _mm512_fnmsub_ps(ifft403, ifft406, _mm512_shuffle_f32x4(ifft403, ifft403, 177));
__m512 ifft496 = _mm512_fnmsub_ps(ifft488, ifft406, _mm512_shuffle_f32x4(ifft488, ifft488, 177));
__m512 ifft413 = _mm512_fmadd_ps(ifft404, ifft406, _mm512_shuffle_f32x4(ifft404, ifft404, 177));
__m512 ifft497 = _mm512_fmadd_ps(ifft489, ifft406, _mm512_shuffle_f32x4(ifft489, ifft489, 177));
__m512 ifft414 = _mm512_fmadd_ps(ifft405, ifft406, _mm512_shuffle_f32x4(ifft405, ifft405, 177));
__m512 ifft498 = _mm512_fmadd_ps(ifft490, ifft406, _mm512_shuffle_f32x4(ifft490, ifft490, 177));
__m512 ifft415 = _mm512_add_ps(ifft407, ifft408);
__m512 ifft499 = _mm512_add_ps(ifft491, ifft492);
__m512 ifft416 = _mm512_sub_ps(ifft407, ifft408);
__m512 ifft500 = _mm512_sub_ps(ifft491, ifft492);
__m512 ifft417 = _mm512_sub_ps(ifft409, ifft413);
__m512 ifft501 = _mm512_sub_ps(ifft493, ifft497);
__m512 ifft418 = _mm512_add_ps(ifft410, ifft414);
__m512 ifft502 = _mm512_add_ps(ifft494, ifft498);
__m512 ifft419 = _mm512_add_ps(ifft409, ifft413);
__m512 ifft503 = _mm512_add_ps(ifft493, ifft497);
__m512 ifft420 = _mm512_sub_ps(ifft410, ifft414);
__m512 ifft504 = _mm512_sub_ps(ifft494, ifft498);
__m512 ifft421 = _mm512_mul_ps(ifft411, _mm512_set1_ps(3.125e-02f));
__m512 ifft505 = _mm512_mul_ps(ifft495, _mm512_set1_ps(3.125e-02f));
__m512 ifft422 = _mm512_mul_ps(ifft412, _mm512_set1_ps(3.125e-02f));
__m512 ifft506 = _mm512_mul_ps(ifft496, _mm512_set1_ps(3.125e-02f));
__m512 ifft423 = _mm512_fmadd_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft507 = _mm512_fmadd_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft424 = _mm512_fmsub_ps(ifft415, _mm512_set1_ps(1.5625e-02f), ifft421);
__m512 ifft508 = _mm512_fmsub_ps(ifft499, _mm512_set1_ps(1.5625e-02f), ifft505);
__m512 ifft425 = _mm512_fmadd_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft509 = _mm512_fmadd_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft426 = _mm512_fmsub_ps(ifft416, _mm512_set1_ps(1.5625e-02f), ifft422);
__m512 ifft510 = _mm512_fmsub_ps(ifft500, _mm512_set1_ps(1.5625e-02f), ifft506);
__m512 ifft427 = _mm512_add_ps(ifft417, ifft418);
__m512 ifft511 = _mm512_add_ps(ifft501, ifft502);
__m512 ifft428 = _mm512_sub_ps(ifft417, ifft418);
__m512 ifft512 = _mm512_sub_ps(ifft501, ifft502);
__m512 ifft429 = _mm512_fnmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft513 = _mm512_fnmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft430 = _mm512_fmadd_ps(ifft427, _mm512_set1_ps(7.0710677e-01f), ifft419);
__m512 ifft514 = _mm512_fmadd_ps(ifft511, _mm512_set1_ps(7.0710677e-01f), ifft503);
__m512 ifft431 = _mm512_fmadd_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft515 = _mm512_fmadd_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft432 = _mm512_fmsub_ps(ifft428, _mm512_set1_ps(7.0710677e-01f), ifft420);
__m512 ifft516 = _mm512_fmsub_ps(ifft512, _mm512_set1_ps(7.0710677e-01f), ifft504);
__m512 ifft433 = _mm512_add_ps(ifft429, ifft430);
__m512 ifft517 = _mm512_add_ps(ifft513, ifft514);
__m512 ifft434 = _mm512_sub_ps(ifft429, ifft430);
__m512 ifft518 = _mm512_sub_ps(ifft513, ifft514);
__m512 ifft435 = _mm512_add_ps(ifft431, ifft432);
__m512 ifft519 = _mm512_add_ps(ifft515, ifft516);
__m512 ifft436 = _mm512_sub_ps(ifft431, ifft432);
__m512 ifft520 = _mm512_sub_ps(ifft515, ifft516);
__m512 ifft437 = _mm512_fmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft521 = _mm512_fmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft438 = _mm512_fnmadd_ps(ifft433, _mm512_set1_ps(1.5625e-02f), ifft423);
__m512 ifft522 = _mm512_fnmadd_ps(ifft517, _mm512_set1_ps(1.5625e-02f), ifft507);
__m512 ifft439 = _mm512_fmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft523 = _mm512_fmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft440 = _mm512_fnmadd_ps(ifft435, _mm512_set1_ps(1.5625e-02f), ifft425);
__m512 ifft524 = _mm512_fnmadd_ps(ifft519, _mm512_set1_ps(1.5625e-02f), ifft509);
__m512 ifft441 = _mm512_fnmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft525 = _mm512_fnmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft442 = _mm512_fmadd_ps(ifft436, _mm512_set1_ps(1.5625e-02f), ifft424);
__m512 ifft526 = _mm512_fmadd_ps(ifft520, _mm512_set1_ps(1.5625e-02f), ifft508);
__m512 ifft443 = _mm512_fmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft527 = _mm512_fmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 ifft444 = _mm512_fnmadd_ps(ifft434, _mm512_set1_ps(1.5625e-02f), ifft426);
__m512 ifft528 = _mm512_fnmadd_ps(ifft518, _mm512_set1_ps(1.5625e-02f), ifft510);
__m512 dat610 = ifft437;
__m512 dat615 = ifft521;
__m512 dat611 = ifft439;
__m512 dat616 = ifft523;
__m512 dat612 = ifft441;
__m512 dat617 = ifft525;
__m512 dat613 = ifft443;
__m512 dat618 = ifft527;
__m512 dat614 = ifft438;
__m512 dat619 = ifft522;
(void)ifft440;
(void)ifft524;
(void)ifft442;
(void)ifft526;
(void)ifft444;
(void)ifft528;
__m512i pm5 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack21 = _mm512_permutex2var_ps(dat610, pm5, dat615);
__m512i pm6 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack22 = _mm512_permutex2var_ps(dat610, pm6, dat615);
__m512 pack23 = _mm512_permutex2var_ps(dat611, pm5, dat616);
__m512 pack24 = _mm512_permutex2var_ps(dat611, pm6, dat616);
__m512 pack25 = _mm512_permutex2var_ps(dat612, pm5, dat617);
__m512 pack26 = _mm512_permutex2var_ps(dat612, pm6, dat617);
__m512 pack27 = _mm512_permutex2var_ps(dat613, pm5, dat618);
__m512 pack28 = _mm512_permutex2var_ps(dat613, pm6, dat618);
__m512 pack29 = _mm512_permutex2var_ps(dat614, pm5, dat619);
__m512 pack30 = _mm512_permutex2var_ps(dat614, pm6, dat619);
pack21 = _mm512_max_ps(_mm512_setzero_ps(), pack21);
pack22 = _mm512_max_ps(_mm512_setzero_ps(), pack22);
pack23 = _mm512_max_ps(_mm512_setzero_ps(), pack23);
pack24 = _mm512_max_ps(_mm512_setzero_ps(), pack24);
pack25 = _mm512_max_ps(_mm512_setzero_ps(), pack25);
pack26 = _mm512_max_ps(_mm512_setzero_ps(), pack26);
pack27 = _mm512_max_ps(_mm512_setzero_ps(), pack27);
pack28 = _mm512_max_ps(_mm512_setzero_ps(), pack28);
pack29 = _mm512_max_ps(_mm512_setzero_ps(), pack29);
pack30 = _mm512_max_ps(_mm512_setzero_ps(), pack30);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack21);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack22);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack23);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack24);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack25);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack26);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack27);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack28);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack29);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k26+100480*r3+448*toH2+4*toW2+40*t4, 1023, pack30);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel4 = 3;
}
ptrdiff_t toH3 = base4+0;
ptrdiff_t toW3 = 90;
ptrdiff_t k27 = 16*w21;
for (; k27 != 16; ++k27) {
ptrdiff_t r4 = 0;
for (; r4 != 2; ++r4) {
ptrdiff_t t5 = 0;
// Runs for t5 = 0 and t5 = 1. Each pass loads 16 vectors through sfPtr3,
// pushes them through the ifft* butterfly/twiddle/scale stages below
// (presumably an inverse FFT, judging by the names -- TODO confirm against
// the generator), packs pairs of results, clamps them at zero and issues ten
// masked stores through datPtr2.
for (; t5 < 2; ++t5) {
// Four source groups at base offsets 0, 2166784, 4333568 and 6500352. Each
// group supplies a Re/Im pair for two data sets (81..84 and 85..88), at
// +0/+64 and +128/+192 from the group base. The t5 term advances by 256.
__m512 sfRe81 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm81 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe85 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm85 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe82 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm82 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe86 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm86 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe83 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm83 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe87 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm87 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe84 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm84 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfRe88 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
__m512 sfIm88 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k27+768*r4+256*t5);
// Only the first group (sfRe81/sfIm81 and sfRe85/sfIm85) gets this extra
// step: two lane permutations of Re and Im are combined by masked FMAs.
// Mask 65021 is 0xFDFD, so lanes 1 and 9 are excluded and keep the first
// operand (the permuted Im value) unchanged.
__m512i ifft529 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft530 = _mm512_permutexvar_ps(ifft529, sfRe81);
__m512 ifft621 = _mm512_permutexvar_ps(ifft529, sfRe85);
__m512i ifft531 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft532 = _mm512_permutexvar_ps(ifft531, sfRe81);
__m512 ifft622 = _mm512_permutexvar_ps(ifft531, sfRe85);
__m512 ifft533 = _mm512_permutexvar_ps(ifft529, sfIm81);
__m512 ifft623 = _mm512_permutexvar_ps(ifft529, sfIm85);
__m512 ifft534 = _mm512_permutexvar_ps(ifft531, sfIm81);
__m512 ifft624 = _mm512_permutexvar_ps(ifft531, sfIm85);
__m512 ifft535 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft536 = _mm512_mask_fmadd_ps(ifft534, 65021, ifft535, ifft530);
__m512 ifft625 = _mm512_mask_fmadd_ps(ifft624, 65021, ifft535, ifft621);
__m512 ifft537 = _mm512_mask_fnmadd_ps(ifft533, 65021, ifft535, ifft532);
__m512 ifft626 = _mm512_mask_fnmadd_ps(ifft623, 65021, ifft535, ifft622);
// Butterfly on adjacent lanes: shuffle immediate 177 swaps lanes 0<->1 and
// 2<->3 inside every 128-bit chunk, and the +1/-1 vector turns
// x*sign + swapped into (x0 + x1, x0 - x1) per pair. Groups 2..4 enter the
// pipeline here directly from their loaded vectors.
__m512 ifft538 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft539 = _mm512_fmadd_ps(ifft536, ifft538, _mm512_shuffle_ps(ifft536, ifft536, 177));
__m512 ifft627 = _mm512_fmadd_ps(ifft625, ifft538, _mm512_shuffle_ps(ifft625, ifft625, 177));
__m512 ifft540 = _mm512_fmadd_ps(ifft537, ifft538, _mm512_shuffle_ps(ifft537, ifft537, 177));
__m512 ifft628 = _mm512_fmadd_ps(ifft626, ifft538, _mm512_shuffle_ps(ifft626, ifft626, 177));
__m512 ifft541 = _mm512_fmadd_ps(sfRe82, ifft538, _mm512_shuffle_ps(sfRe82, sfRe82, 177));
__m512 ifft629 = _mm512_fmadd_ps(sfRe86, ifft538, _mm512_shuffle_ps(sfRe86, sfRe86, 177));
__m512 ifft542 = _mm512_fmadd_ps(sfIm82, ifft538, _mm512_shuffle_ps(sfIm82, sfIm82, 177));
__m512 ifft630 = _mm512_fmadd_ps(sfIm86, ifft538, _mm512_shuffle_ps(sfIm86, sfIm86, 177));
__m512 ifft543 = _mm512_fmadd_ps(sfRe83, ifft538, _mm512_shuffle_ps(sfRe83, sfRe83, 177));
__m512 ifft631 = _mm512_fmadd_ps(sfRe87, ifft538, _mm512_shuffle_ps(sfRe87, sfRe87, 177));
__m512 ifft544 = _mm512_fmadd_ps(sfIm83, ifft538, _mm512_shuffle_ps(sfIm83, sfIm83, 177));
__m512 ifft632 = _mm512_fmadd_ps(sfIm87, ifft538, _mm512_shuffle_ps(sfIm87, sfIm87, 177));
__m512 ifft545 = _mm512_fmadd_ps(sfRe84, ifft538, _mm512_shuffle_ps(sfRe84, sfRe84, 177));
__m512 ifft633 = _mm512_fmadd_ps(sfRe88, ifft538, _mm512_shuffle_ps(sfRe88, sfRe88, 177));
__m512 ifft546 = _mm512_fmadd_ps(sfIm84, ifft538, _mm512_shuffle_ps(sfIm84, sfIm84, 177));
__m512 ifft634 = _mm512_fmadd_ps(sfIm88, ifft538, _mm512_shuffle_ps(sfIm88, sfIm88, 177));
// Per-lane multiply by the constant pair (ifft547, ifft556). Together with
// the fnmadd/fmadd lines that follow, each (x, y) vector pair becomes
// (x*ifft547 - y*ifft556, x*ifft556 + y*ifft547), which has the shape of a
// complex multiplication. 7.0710677e-01f is approximately sqrt(0.5).
__m512 ifft547 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft548 = _mm512_mul_ps(ifft539, ifft547);
__m512 ifft635 = _mm512_mul_ps(ifft627, ifft547);
__m512 ifft549 = _mm512_mul_ps(ifft540, ifft547);
__m512 ifft636 = _mm512_mul_ps(ifft628, ifft547);
__m512 ifft550 = _mm512_mul_ps(ifft541, ifft547);
__m512 ifft637 = _mm512_mul_ps(ifft629, ifft547);
__m512 ifft551 = _mm512_mul_ps(ifft542, ifft547);
__m512 ifft638 = _mm512_mul_ps(ifft630, ifft547);
__m512 ifft552 = _mm512_mul_ps(ifft543, ifft547);
__m512 ifft639 = _mm512_mul_ps(ifft631, ifft547);
__m512 ifft553 = _mm512_mul_ps(ifft544, ifft547);
__m512 ifft640 = _mm512_mul_ps(ifft632, ifft547);
__m512 ifft554 = _mm512_mul_ps(ifft545, ifft547);
__m512 ifft641 = _mm512_mul_ps(ifft633, ifft547);
__m512 ifft555 = _mm512_mul_ps(ifft546, ifft547);
__m512 ifft642 = _mm512_mul_ps(ifft634, ifft547);
__m512 ifft556 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft557 = _mm512_fnmadd_ps(ifft540, ifft556, ifft548);
__m512 ifft643 = _mm512_fnmadd_ps(ifft628, ifft556, ifft635);
__m512 ifft558 = _mm512_fmadd_ps(ifft539, ifft556, ifft549);
__m512 ifft644 = _mm512_fmadd_ps(ifft627, ifft556, ifft636);
__m512 ifft559 = _mm512_fnmadd_ps(ifft542, ifft556, ifft550);
__m512 ifft645 = _mm512_fnmadd_ps(ifft630, ifft556, ifft637);
__m512 ifft560 = _mm512_fmadd_ps(ifft541, ifft556, ifft551);
__m512 ifft646 = _mm512_fmadd_ps(ifft629, ifft556, ifft638);
__m512 ifft561 = _mm512_fnmadd_ps(ifft544, ifft556, ifft552);
__m512 ifft647 = _mm512_fnmadd_ps(ifft632, ifft556, ifft639);
__m512 ifft562 = _mm512_fmadd_ps(ifft543, ifft556, ifft553);
__m512 ifft648 = _mm512_fmadd_ps(ifft631, ifft556, ifft640);
__m512 ifft563 = _mm512_fnmadd_ps(ifft546, ifft556, ifft554);
__m512 ifft649 = _mm512_fnmadd_ps(ifft634, ifft556, ifft641);
__m512 ifft564 = _mm512_fmadd_ps(ifft545, ifft556, ifft555);
__m512 ifft650 = _mm512_fmadd_ps(ifft633, ifft556, ifft642);
// Butterfly at lane distance 2: shuffle immediate 78 swaps the two 64-bit
// halves of every 128-bit chunk; the sign vector yields sums in the low
// half and differences in the high half of each chunk.
__m512 ifft565 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft566 = _mm512_fmadd_ps(ifft557, ifft565, _mm512_shuffle_ps(ifft557, ifft557, 78));
__m512 ifft651 = _mm512_fmadd_ps(ifft643, ifft565, _mm512_shuffle_ps(ifft643, ifft643, 78));
__m512 ifft567 = _mm512_fmadd_ps(ifft558, ifft565, _mm512_shuffle_ps(ifft558, ifft558, 78));
__m512 ifft652 = _mm512_fmadd_ps(ifft644, ifft565, _mm512_shuffle_ps(ifft644, ifft644, 78));
__m512 ifft568 = _mm512_fmadd_ps(ifft559, ifft565, _mm512_shuffle_ps(ifft559, ifft559, 78));
__m512 ifft653 = _mm512_fmadd_ps(ifft645, ifft565, _mm512_shuffle_ps(ifft645, ifft645, 78));
__m512 ifft569 = _mm512_fmadd_ps(ifft560, ifft565, _mm512_shuffle_ps(ifft560, ifft560, 78));
__m512 ifft654 = _mm512_fmadd_ps(ifft646, ifft565, _mm512_shuffle_ps(ifft646, ifft646, 78));
__m512 ifft570 = _mm512_fmadd_ps(ifft561, ifft565, _mm512_shuffle_ps(ifft561, ifft561, 78));
__m512 ifft655 = _mm512_fmadd_ps(ifft647, ifft565, _mm512_shuffle_ps(ifft647, ifft647, 78));
__m512 ifft571 = _mm512_fmadd_ps(ifft562, ifft565, _mm512_shuffle_ps(ifft562, ifft562, 78));
__m512 ifft656 = _mm512_fmadd_ps(ifft648, ifft565, _mm512_shuffle_ps(ifft648, ifft648, 78));
__m512 ifft572 = _mm512_fmadd_ps(ifft563, ifft565, _mm512_shuffle_ps(ifft563, ifft563, 78));
__m512 ifft657 = _mm512_fmadd_ps(ifft649, ifft565, _mm512_shuffle_ps(ifft649, ifft649, 78));
__m512 ifft573 = _mm512_fmadd_ps(ifft564, ifft565, _mm512_shuffle_ps(ifft564, ifft564, 78));
__m512 ifft658 = _mm512_fmadd_ps(ifft650, ifft565, _mm512_shuffle_ps(ifft650, ifft650, 78));
// Mask 49344 is 0xC0C0 (lanes 6, 7, 14, 15). In those lanes only, each pair
// (x, y) is replaced by (-y, x); all other lanes pass through unchanged.
__m512 ifft574 = _mm512_mask_sub_ps(ifft566, 49344, _mm512_setzero_ps(), ifft567);
__m512 ifft659 = _mm512_mask_sub_ps(ifft651, 49344, _mm512_setzero_ps(), ifft652);
__m512 ifft575 = _mm512_mask_mov_ps(ifft567, 49344, ifft566);
__m512 ifft660 = _mm512_mask_mov_ps(ifft652, 49344, ifft651);
__m512 ifft576 = _mm512_mask_sub_ps(ifft568, 49344, _mm512_setzero_ps(), ifft569);
__m512 ifft661 = _mm512_mask_sub_ps(ifft653, 49344, _mm512_setzero_ps(), ifft654);
__m512 ifft577 = _mm512_mask_mov_ps(ifft569, 49344, ifft568);
__m512 ifft662 = _mm512_mask_mov_ps(ifft654, 49344, ifft653);
__m512 ifft578 = _mm512_mask_sub_ps(ifft570, 49344, _mm512_setzero_ps(), ifft571);
__m512 ifft663 = _mm512_mask_sub_ps(ifft655, 49344, _mm512_setzero_ps(), ifft656);
__m512 ifft579 = _mm512_mask_mov_ps(ifft571, 49344, ifft570);
__m512 ifft664 = _mm512_mask_mov_ps(ifft656, 49344, ifft655);
__m512 ifft580 = _mm512_mask_sub_ps(ifft572, 49344, _mm512_setzero_ps(), ifft573);
__m512 ifft665 = _mm512_mask_sub_ps(ifft657, 49344, _mm512_setzero_ps(), ifft658);
__m512 ifft581 = _mm512_mask_mov_ps(ifft573, 49344, ifft572);
__m512 ifft666 = _mm512_mask_mov_ps(ifft658, 49344, ifft657);
// Butterfly at lane distance 4: shuffle_f32x4 with immediate 177 swaps
// neighbouring 128-bit chunks. Note that ifft588/ifft672 use fnmsub, which
// produces the negation of what the fmadd form used by the others would give.
__m512 ifft582 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft583 = _mm512_fmadd_ps(ifft574, ifft582, _mm512_shuffle_f32x4(ifft574, ifft574, 177));
__m512 ifft667 = _mm512_fmadd_ps(ifft659, ifft582, _mm512_shuffle_f32x4(ifft659, ifft659, 177));
__m512 ifft584 = _mm512_fmadd_ps(ifft575, ifft582, _mm512_shuffle_f32x4(ifft575, ifft575, 177));
__m512 ifft668 = _mm512_fmadd_ps(ifft660, ifft582, _mm512_shuffle_f32x4(ifft660, ifft660, 177));
__m512 ifft585 = _mm512_fmadd_ps(ifft576, ifft582, _mm512_shuffle_f32x4(ifft576, ifft576, 177));
__m512 ifft669 = _mm512_fmadd_ps(ifft661, ifft582, _mm512_shuffle_f32x4(ifft661, ifft661, 177));
__m512 ifft586 = _mm512_fmadd_ps(ifft577, ifft582, _mm512_shuffle_f32x4(ifft577, ifft577, 177));
__m512 ifft670 = _mm512_fmadd_ps(ifft662, ifft582, _mm512_shuffle_f32x4(ifft662, ifft662, 177));
__m512 ifft587 = _mm512_fmadd_ps(ifft578, ifft582, _mm512_shuffle_f32x4(ifft578, ifft578, 177));
__m512 ifft671 = _mm512_fmadd_ps(ifft663, ifft582, _mm512_shuffle_f32x4(ifft663, ifft663, 177));
__m512 ifft588 = _mm512_fnmsub_ps(ifft579, ifft582, _mm512_shuffle_f32x4(ifft579, ifft579, 177));
__m512 ifft672 = _mm512_fnmsub_ps(ifft664, ifft582, _mm512_shuffle_f32x4(ifft664, ifft664, 177));
__m512 ifft589 = _mm512_fmadd_ps(ifft580, ifft582, _mm512_shuffle_f32x4(ifft580, ifft580, 177));
__m512 ifft673 = _mm512_fmadd_ps(ifft665, ifft582, _mm512_shuffle_f32x4(ifft665, ifft665, 177));
__m512 ifft590 = _mm512_fmadd_ps(ifft581, ifft582, _mm512_shuffle_f32x4(ifft581, ifft581, 177));
__m512 ifft674 = _mm512_fmadd_ps(ifft666, ifft582, _mm512_shuffle_f32x4(ifft666, ifft666, 177));
// From here on whole vectors are combined with each other (no more lane
// shuffles). Scaling is folded into the FMAs: 3.125e-02f is 1/32 and
// 1.5625e-02f is 1/64.
__m512 ifft591 = _mm512_add_ps(ifft583, ifft584);
__m512 ifft675 = _mm512_add_ps(ifft667, ifft668);
__m512 ifft592 = _mm512_sub_ps(ifft583, ifft584);
__m512 ifft676 = _mm512_sub_ps(ifft667, ifft668);
__m512 ifft593 = _mm512_sub_ps(ifft585, ifft589);
__m512 ifft677 = _mm512_sub_ps(ifft669, ifft673);
__m512 ifft594 = _mm512_add_ps(ifft586, ifft590);
__m512 ifft678 = _mm512_add_ps(ifft670, ifft674);
__m512 ifft595 = _mm512_add_ps(ifft585, ifft589);
__m512 ifft679 = _mm512_add_ps(ifft669, ifft673);
__m512 ifft596 = _mm512_sub_ps(ifft586, ifft590);
__m512 ifft680 = _mm512_sub_ps(ifft670, ifft674);
__m512 ifft597 = _mm512_mul_ps(ifft587, _mm512_set1_ps(3.125e-02f));
__m512 ifft681 = _mm512_mul_ps(ifft671, _mm512_set1_ps(3.125e-02f));
__m512 ifft598 = _mm512_mul_ps(ifft588, _mm512_set1_ps(3.125e-02f));
__m512 ifft682 = _mm512_mul_ps(ifft672, _mm512_set1_ps(3.125e-02f));
__m512 ifft599 = _mm512_fmadd_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft683 = _mm512_fmadd_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft600 = _mm512_fmsub_ps(ifft591, _mm512_set1_ps(1.5625e-02f), ifft597);
__m512 ifft684 = _mm512_fmsub_ps(ifft675, _mm512_set1_ps(1.5625e-02f), ifft681);
__m512 ifft601 = _mm512_fmadd_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft685 = _mm512_fmadd_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft602 = _mm512_fmsub_ps(ifft592, _mm512_set1_ps(1.5625e-02f), ifft598);
__m512 ifft686 = _mm512_fmsub_ps(ifft676, _mm512_set1_ps(1.5625e-02f), ifft682);
__m512 ifft603 = _mm512_add_ps(ifft593, ifft594);
__m512 ifft687 = _mm512_add_ps(ifft677, ifft678);
__m512 ifft604 = _mm512_sub_ps(ifft593, ifft594);
__m512 ifft688 = _mm512_sub_ps(ifft677, ifft678);
__m512 ifft605 = _mm512_fnmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft689 = _mm512_fnmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft606 = _mm512_fmadd_ps(ifft603, _mm512_set1_ps(7.0710677e-01f), ifft595);
__m512 ifft690 = _mm512_fmadd_ps(ifft687, _mm512_set1_ps(7.0710677e-01f), ifft679);
__m512 ifft607 = _mm512_fmadd_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft691 = _mm512_fmadd_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft608 = _mm512_fmsub_ps(ifft604, _mm512_set1_ps(7.0710677e-01f), ifft596);
__m512 ifft692 = _mm512_fmsub_ps(ifft688, _mm512_set1_ps(7.0710677e-01f), ifft680);
__m512 ifft609 = _mm512_add_ps(ifft605, ifft606);
__m512 ifft693 = _mm512_add_ps(ifft689, ifft690);
__m512 ifft610 = _mm512_sub_ps(ifft605, ifft606);
__m512 ifft694 = _mm512_sub_ps(ifft689, ifft690);
__m512 ifft611 = _mm512_add_ps(ifft607, ifft608);
__m512 ifft695 = _mm512_add_ps(ifft691, ifft692);
__m512 ifft612 = _mm512_sub_ps(ifft607, ifft608);
__m512 ifft696 = _mm512_sub_ps(ifft691, ifft692);
__m512 ifft613 = _mm512_fmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft697 = _mm512_fmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft614 = _mm512_fnmadd_ps(ifft609, _mm512_set1_ps(1.5625e-02f), ifft599);
__m512 ifft698 = _mm512_fnmadd_ps(ifft693, _mm512_set1_ps(1.5625e-02f), ifft683);
__m512 ifft615 = _mm512_fmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft699 = _mm512_fmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft616 = _mm512_fnmadd_ps(ifft611, _mm512_set1_ps(1.5625e-02f), ifft601);
__m512 ifft700 = _mm512_fnmadd_ps(ifft695, _mm512_set1_ps(1.5625e-02f), ifft685);
__m512 ifft617 = _mm512_fnmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft701 = _mm512_fnmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft618 = _mm512_fmadd_ps(ifft612, _mm512_set1_ps(1.5625e-02f), ifft600);
__m512 ifft702 = _mm512_fmadd_ps(ifft696, _mm512_set1_ps(1.5625e-02f), ifft684);
__m512 ifft619 = _mm512_fmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft703 = _mm512_fmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
__m512 ifft620 = _mm512_fnmadd_ps(ifft610, _mm512_set1_ps(1.5625e-02f), ifft602);
__m512 ifft704 = _mm512_fnmadd_ps(ifft694, _mm512_set1_ps(1.5625e-02f), ifft686);
// Five of the eight results per data set are kept: 613, 615, 617, 619, 614
// for the first set and 697, 699, 701, 703, 698 for the second.
__m512 dat620 = ifft613;
__m512 dat625 = ifft697;
__m512 dat621 = ifft615;
__m512 dat626 = ifft699;
__m512 dat622 = ifft617;
__m512 dat627 = ifft701;
__m512 dat623 = ifft619;
__m512 dat628 = ifft703;
__m512 dat624 = ifft614;
__m512 dat629 = ifft698;
// The other three results per set are not used in this loop; the void casts
// presumably exist to silence unused-variable warnings.
(void)ifft616;
(void)ifft700;
(void)ifft618;
(void)ifft702;
(void)ifft620;
(void)ifft704;
// Pack one vector from each data set into a single vector. Only lanes 0..9
// matter, because the stores below use mask 1023 (0x3FF).
// pm7: lanes 0..4 <- elements 0..4 of the first operand,
//      lanes 5..9 <- elements 0..4 of the second operand (indices 16..20).
// pm8: lanes 0..4 <- elements 8..12 of the second operand (indices 24..28),
//      lanes 5..9 <- elements 8..12 of the first operand.
__m512i pm7 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack31 = _mm512_permutex2var_ps(dat620, pm7, dat625);
__m512i pm8 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack32 = _mm512_permutex2var_ps(dat620, pm8, dat625);
__m512 pack33 = _mm512_permutex2var_ps(dat621, pm7, dat626);
__m512 pack34 = _mm512_permutex2var_ps(dat621, pm8, dat626);
__m512 pack35 = _mm512_permutex2var_ps(dat622, pm7, dat627);
__m512 pack36 = _mm512_permutex2var_ps(dat622, pm8, dat627);
__m512 pack37 = _mm512_permutex2var_ps(dat623, pm7, dat628);
__m512 pack38 = _mm512_permutex2var_ps(dat623, pm8, dat628);
__m512 pack39 = _mm512_permutex2var_ps(dat624, pm7, dat629);
__m512 pack40 = _mm512_permutex2var_ps(dat624, pm8, dat629);
// Clamp every packed value at zero (max with 0), i.e. a ReLU-shaped step.
pack31 = _mm512_max_ps(_mm512_setzero_ps(), pack31);
pack32 = _mm512_max_ps(_mm512_setzero_ps(), pack32);
pack33 = _mm512_max_ps(_mm512_setzero_ps(), pack33);
pack34 = _mm512_max_ps(_mm512_setzero_ps(), pack34);
pack35 = _mm512_max_ps(_mm512_setzero_ps(), pack35);
pack36 = _mm512_max_ps(_mm512_setzero_ps(), pack36);
pack37 = _mm512_max_ps(_mm512_setzero_ps(), pack37);
pack38 = _mm512_max_ps(_mm512_setzero_ps(), pack38);
pack39 = _mm512_max_ps(_mm512_setzero_ps(), pack39);
pack40 = _mm512_max_ps(_mm512_setzero_ps(), pack40);
// Ten masked stores of 10 lanes each. The pm7-packed vectors land at base
// offsets 0, 448, 896, 1344, 1792 (steps of 448, the same coefficient that
// multiplies toH3); the pm8-packed vectors land at the same offsets plus
// 50240. The t5 term shifts the destination by 40 per pass.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack31);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack32);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack33);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack34);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack35);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack36);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack37);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack38);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack39);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+40*t5, 1023, pack40);
}
ptrdiff_t t6 = 0;
__m512 sfRe89 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm89 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe93 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm93 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe90 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm90 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe94 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm94 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe91 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm91 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe95 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm95 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe92 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm92 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfRe96 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512 sfIm96 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k27+768*r4+256*t6);
__m512i ifft705 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft706 = _mm512_permutexvar_ps(ifft705, sfRe89);
__m512 ifft797 = _mm512_permutexvar_ps(ifft705, sfRe93);
__m512i ifft707 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft708 = _mm512_permutexvar_ps(ifft707, sfRe89);
__m512 ifft798 = _mm512_permutexvar_ps(ifft707, sfRe93);
__m512 ifft709 = _mm512_permutexvar_ps(ifft705, sfIm89);
__m512 ifft799 = _mm512_permutexvar_ps(ifft705, sfIm93);
__m512 ifft710 = _mm512_permutexvar_ps(ifft707, sfIm89);
__m512 ifft800 = _mm512_permutexvar_ps(ifft707, sfIm93);
__m512 ifft711 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft712 = _mm512_mask_fmadd_ps(ifft710, 65021, ifft711, ifft706);
__m512 ifft801 = _mm512_mask_fmadd_ps(ifft800, 65021, ifft711, ifft797);
__m512 ifft713 = _mm512_mask_fnmadd_ps(ifft709, 65021, ifft711, ifft708);
__m512 ifft802 = _mm512_mask_fnmadd_ps(ifft799, 65021, ifft711, ifft798);
__m512 ifft714 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft715 = _mm512_fmadd_ps(ifft712, ifft714, _mm512_shuffle_ps(ifft712, ifft712, 177));
__m512 ifft803 = _mm512_fmadd_ps(ifft801, ifft714, _mm512_shuffle_ps(ifft801, ifft801, 177));
__m512 ifft716 = _mm512_fmadd_ps(ifft713, ifft714, _mm512_shuffle_ps(ifft713, ifft713, 177));
__m512 ifft804 = _mm512_fmadd_ps(ifft802, ifft714, _mm512_shuffle_ps(ifft802, ifft802, 177));
__m512 ifft717 = _mm512_fmadd_ps(sfRe90, ifft714, _mm512_shuffle_ps(sfRe90, sfRe90, 177));
__m512 ifft805 = _mm512_fmadd_ps(sfRe94, ifft714, _mm512_shuffle_ps(sfRe94, sfRe94, 177));
__m512 ifft718 = _mm512_fmadd_ps(sfIm90, ifft714, _mm512_shuffle_ps(sfIm90, sfIm90, 177));
__m512 ifft806 = _mm512_fmadd_ps(sfIm94, ifft714, _mm512_shuffle_ps(sfIm94, sfIm94, 177));
__m512 ifft719 = _mm512_fmadd_ps(sfRe91, ifft714, _mm512_shuffle_ps(sfRe91, sfRe91, 177));
__m512 ifft807 = _mm512_fmadd_ps(sfRe95, ifft714, _mm512_shuffle_ps(sfRe95, sfRe95, 177));
__m512 ifft720 = _mm512_fmadd_ps(sfIm91, ifft714, _mm512_shuffle_ps(sfIm91, sfIm91, 177));
__m512 ifft808 = _mm512_fmadd_ps(sfIm95, ifft714, _mm512_shuffle_ps(sfIm95, sfIm95, 177));
__m512 ifft721 = _mm512_fmadd_ps(sfRe92, ifft714, _mm512_shuffle_ps(sfRe92, sfRe92, 177));
__m512 ifft809 = _mm512_fmadd_ps(sfRe96, ifft714, _mm512_shuffle_ps(sfRe96, sfRe96, 177));
__m512 ifft722 = _mm512_fmadd_ps(sfIm92, ifft714, _mm512_shuffle_ps(sfIm92, sfIm92, 177));
__m512 ifft810 = _mm512_fmadd_ps(sfIm96, ifft714, _mm512_shuffle_ps(sfIm96, sfIm96, 177));
__m512 ifft723 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft724 = _mm512_mul_ps(ifft715, ifft723);
__m512 ifft811 = _mm512_mul_ps(ifft803, ifft723);
__m512 ifft725 = _mm512_mul_ps(ifft716, ifft723);
__m512 ifft812 = _mm512_mul_ps(ifft804, ifft723);
__m512 ifft726 = _mm512_mul_ps(ifft717, ifft723);
__m512 ifft813 = _mm512_mul_ps(ifft805, ifft723);
__m512 ifft727 = _mm512_mul_ps(ifft718, ifft723);
__m512 ifft814 = _mm512_mul_ps(ifft806, ifft723);
__m512 ifft728 = _mm512_mul_ps(ifft719, ifft723);
__m512 ifft815 = _mm512_mul_ps(ifft807, ifft723);
__m512 ifft729 = _mm512_mul_ps(ifft720, ifft723);
__m512 ifft816 = _mm512_mul_ps(ifft808, ifft723);
__m512 ifft730 = _mm512_mul_ps(ifft721, ifft723);
__m512 ifft817 = _mm512_mul_ps(ifft809, ifft723);
__m512 ifft731 = _mm512_mul_ps(ifft722, ifft723);
__m512 ifft818 = _mm512_mul_ps(ifft810, ifft723);
__m512 ifft732 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft733 = _mm512_fnmadd_ps(ifft716, ifft732, ifft724);
__m512 ifft819 = _mm512_fnmadd_ps(ifft804, ifft732, ifft811);
__m512 ifft734 = _mm512_fmadd_ps(ifft715, ifft732, ifft725);
__m512 ifft820 = _mm512_fmadd_ps(ifft803, ifft732, ifft812);
__m512 ifft735 = _mm512_fnmadd_ps(ifft718, ifft732, ifft726);
__m512 ifft821 = _mm512_fnmadd_ps(ifft806, ifft732, ifft813);
__m512 ifft736 = _mm512_fmadd_ps(ifft717, ifft732, ifft727);
__m512 ifft822 = _mm512_fmadd_ps(ifft805, ifft732, ifft814);
__m512 ifft737 = _mm512_fnmadd_ps(ifft720, ifft732, ifft728);
__m512 ifft823 = _mm512_fnmadd_ps(ifft808, ifft732, ifft815);
__m512 ifft738 = _mm512_fmadd_ps(ifft719, ifft732, ifft729);
__m512 ifft824 = _mm512_fmadd_ps(ifft807, ifft732, ifft816);
__m512 ifft739 = _mm512_fnmadd_ps(ifft722, ifft732, ifft730);
__m512 ifft825 = _mm512_fnmadd_ps(ifft810, ifft732, ifft817);
__m512 ifft740 = _mm512_fmadd_ps(ifft721, ifft732, ifft731);
__m512 ifft826 = _mm512_fmadd_ps(ifft809, ifft732, ifft818);
__m512 ifft741 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft742 = _mm512_fmadd_ps(ifft733, ifft741, _mm512_shuffle_ps(ifft733, ifft733, 78));
__m512 ifft827 = _mm512_fmadd_ps(ifft819, ifft741, _mm512_shuffle_ps(ifft819, ifft819, 78));
__m512 ifft743 = _mm512_fmadd_ps(ifft734, ifft741, _mm512_shuffle_ps(ifft734, ifft734, 78));
__m512 ifft828 = _mm512_fmadd_ps(ifft820, ifft741, _mm512_shuffle_ps(ifft820, ifft820, 78));
__m512 ifft744 = _mm512_fmadd_ps(ifft735, ifft741, _mm512_shuffle_ps(ifft735, ifft735, 78));
__m512 ifft829 = _mm512_fmadd_ps(ifft821, ifft741, _mm512_shuffle_ps(ifft821, ifft821, 78));
__m512 ifft745 = _mm512_fmadd_ps(ifft736, ifft741, _mm512_shuffle_ps(ifft736, ifft736, 78));
__m512 ifft830 = _mm512_fmadd_ps(ifft822, ifft741, _mm512_shuffle_ps(ifft822, ifft822, 78));
__m512 ifft746 = _mm512_fmadd_ps(ifft737, ifft741, _mm512_shuffle_ps(ifft737, ifft737, 78));
__m512 ifft831 = _mm512_fmadd_ps(ifft823, ifft741, _mm512_shuffle_ps(ifft823, ifft823, 78));
__m512 ifft747 = _mm512_fmadd_ps(ifft738, ifft741, _mm512_shuffle_ps(ifft738, ifft738, 78));
__m512 ifft832 = _mm512_fmadd_ps(ifft824, ifft741, _mm512_shuffle_ps(ifft824, ifft824, 78));
__m512 ifft748 = _mm512_fmadd_ps(ifft739, ifft741, _mm512_shuffle_ps(ifft739, ifft739, 78));
__m512 ifft833 = _mm512_fmadd_ps(ifft825, ifft741, _mm512_shuffle_ps(ifft825, ifft825, 78));
__m512 ifft749 = _mm512_fmadd_ps(ifft740, ifft741, _mm512_shuffle_ps(ifft740, ifft740, 78));
__m512 ifft834 = _mm512_fmadd_ps(ifft826, ifft741, _mm512_shuffle_ps(ifft826, ifft826, 78));
__m512 ifft750 = _mm512_mask_sub_ps(ifft742, 49344, _mm512_setzero_ps(), ifft743);
__m512 ifft835 = _mm512_mask_sub_ps(ifft827, 49344, _mm512_setzero_ps(), ifft828);
__m512 ifft751 = _mm512_mask_mov_ps(ifft743, 49344, ifft742);
__m512 ifft836 = _mm512_mask_mov_ps(ifft828, 49344, ifft827);
__m512 ifft752 = _mm512_mask_sub_ps(ifft744, 49344, _mm512_setzero_ps(), ifft745);
__m512 ifft837 = _mm512_mask_sub_ps(ifft829, 49344, _mm512_setzero_ps(), ifft830);
__m512 ifft753 = _mm512_mask_mov_ps(ifft745, 49344, ifft744);
__m512 ifft838 = _mm512_mask_mov_ps(ifft830, 49344, ifft829);
__m512 ifft754 = _mm512_mask_sub_ps(ifft746, 49344, _mm512_setzero_ps(), ifft747);
__m512 ifft839 = _mm512_mask_sub_ps(ifft831, 49344, _mm512_setzero_ps(), ifft832);
__m512 ifft755 = _mm512_mask_mov_ps(ifft747, 49344, ifft746);
__m512 ifft840 = _mm512_mask_mov_ps(ifft832, 49344, ifft831);
__m512 ifft756 = _mm512_mask_sub_ps(ifft748, 49344, _mm512_setzero_ps(), ifft749);
__m512 ifft841 = _mm512_mask_sub_ps(ifft833, 49344, _mm512_setzero_ps(), ifft834);
__m512 ifft757 = _mm512_mask_mov_ps(ifft749, 49344, ifft748);
__m512 ifft842 = _mm512_mask_mov_ps(ifft834, 49344, ifft833);
__m512 ifft758 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft759 = _mm512_fmadd_ps(ifft750, ifft758, _mm512_shuffle_f32x4(ifft750, ifft750, 177));
__m512 ifft843 = _mm512_fmadd_ps(ifft835, ifft758, _mm512_shuffle_f32x4(ifft835, ifft835, 177));
__m512 ifft760 = _mm512_fmadd_ps(ifft751, ifft758, _mm512_shuffle_f32x4(ifft751, ifft751, 177));
__m512 ifft844 = _mm512_fmadd_ps(ifft836, ifft758, _mm512_shuffle_f32x4(ifft836, ifft836, 177));
__m512 ifft761 = _mm512_fmadd_ps(ifft752, ifft758, _mm512_shuffle_f32x4(ifft752, ifft752, 177));
__m512 ifft845 = _mm512_fmadd_ps(ifft837, ifft758, _mm512_shuffle_f32x4(ifft837, ifft837, 177));
__m512 ifft762 = _mm512_fmadd_ps(ifft753, ifft758, _mm512_shuffle_f32x4(ifft753, ifft753, 177));
__m512 ifft846 = _mm512_fmadd_ps(ifft838, ifft758, _mm512_shuffle_f32x4(ifft838, ifft838, 177));
__m512 ifft763 = _mm512_fmadd_ps(ifft754, ifft758, _mm512_shuffle_f32x4(ifft754, ifft754, 177));
__m512 ifft847 = _mm512_fmadd_ps(ifft839, ifft758, _mm512_shuffle_f32x4(ifft839, ifft839, 177));
__m512 ifft764 = _mm512_fnmsub_ps(ifft755, ifft758, _mm512_shuffle_f32x4(ifft755, ifft755, 177));
__m512 ifft848 = _mm512_fnmsub_ps(ifft840, ifft758, _mm512_shuffle_f32x4(ifft840, ifft840, 177));
__m512 ifft765 = _mm512_fmadd_ps(ifft756, ifft758, _mm512_shuffle_f32x4(ifft756, ifft756, 177));
__m512 ifft849 = _mm512_fmadd_ps(ifft841, ifft758, _mm512_shuffle_f32x4(ifft841, ifft841, 177));
__m512 ifft766 = _mm512_fmadd_ps(ifft757, ifft758, _mm512_shuffle_f32x4(ifft757, ifft757, 177));
__m512 ifft850 = _mm512_fmadd_ps(ifft842, ifft758, _mm512_shuffle_f32x4(ifft842, ifft842, 177));
__m512 ifft767 = _mm512_add_ps(ifft759, ifft760);
__m512 ifft851 = _mm512_add_ps(ifft843, ifft844);
__m512 ifft768 = _mm512_sub_ps(ifft759, ifft760);
__m512 ifft852 = _mm512_sub_ps(ifft843, ifft844);
__m512 ifft769 = _mm512_sub_ps(ifft761, ifft765);
__m512 ifft853 = _mm512_sub_ps(ifft845, ifft849);
__m512 ifft770 = _mm512_add_ps(ifft762, ifft766);
__m512 ifft854 = _mm512_add_ps(ifft846, ifft850);
__m512 ifft771 = _mm512_add_ps(ifft761, ifft765);
__m512 ifft855 = _mm512_add_ps(ifft845, ifft849);
__m512 ifft772 = _mm512_sub_ps(ifft762, ifft766);
__m512 ifft856 = _mm512_sub_ps(ifft846, ifft850);
__m512 ifft773 = _mm512_mul_ps(ifft763, _mm512_set1_ps(3.125e-02f));
__m512 ifft857 = _mm512_mul_ps(ifft847, _mm512_set1_ps(3.125e-02f));
__m512 ifft774 = _mm512_mul_ps(ifft764, _mm512_set1_ps(3.125e-02f));
__m512 ifft858 = _mm512_mul_ps(ifft848, _mm512_set1_ps(3.125e-02f));
__m512 ifft775 = _mm512_fmadd_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft859 = _mm512_fmadd_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft776 = _mm512_fmsub_ps(ifft767, _mm512_set1_ps(1.5625e-02f), ifft773);
__m512 ifft860 = _mm512_fmsub_ps(ifft851, _mm512_set1_ps(1.5625e-02f), ifft857);
__m512 ifft777 = _mm512_fmadd_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft861 = _mm512_fmadd_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft778 = _mm512_fmsub_ps(ifft768, _mm512_set1_ps(1.5625e-02f), ifft774);
__m512 ifft862 = _mm512_fmsub_ps(ifft852, _mm512_set1_ps(1.5625e-02f), ifft858);
__m512 ifft779 = _mm512_add_ps(ifft769, ifft770);
__m512 ifft863 = _mm512_add_ps(ifft853, ifft854);
__m512 ifft780 = _mm512_sub_ps(ifft769, ifft770);
__m512 ifft864 = _mm512_sub_ps(ifft853, ifft854);
__m512 ifft781 = _mm512_fnmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft865 = _mm512_fnmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft782 = _mm512_fmadd_ps(ifft779, _mm512_set1_ps(7.0710677e-01f), ifft771);
__m512 ifft866 = _mm512_fmadd_ps(ifft863, _mm512_set1_ps(7.0710677e-01f), ifft855);
__m512 ifft783 = _mm512_fmadd_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft867 = _mm512_fmadd_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft784 = _mm512_fmsub_ps(ifft780, _mm512_set1_ps(7.0710677e-01f), ifft772);
__m512 ifft868 = _mm512_fmsub_ps(ifft864, _mm512_set1_ps(7.0710677e-01f), ifft856);
__m512 ifft785 = _mm512_add_ps(ifft781, ifft782);
__m512 ifft869 = _mm512_add_ps(ifft865, ifft866);
__m512 ifft786 = _mm512_sub_ps(ifft781, ifft782);
__m512 ifft870 = _mm512_sub_ps(ifft865, ifft866);
__m512 ifft787 = _mm512_add_ps(ifft783, ifft784);
__m512 ifft871 = _mm512_add_ps(ifft867, ifft868);
__m512 ifft788 = _mm512_sub_ps(ifft783, ifft784);
__m512 ifft872 = _mm512_sub_ps(ifft867, ifft868);
__m512 ifft789 = _mm512_fmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft873 = _mm512_fmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft790 = _mm512_fnmadd_ps(ifft785, _mm512_set1_ps(1.5625e-02f), ifft775);
__m512 ifft874 = _mm512_fnmadd_ps(ifft869, _mm512_set1_ps(1.5625e-02f), ifft859);
__m512 ifft791 = _mm512_fmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft875 = _mm512_fmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft792 = _mm512_fnmadd_ps(ifft787, _mm512_set1_ps(1.5625e-02f), ifft777);
__m512 ifft876 = _mm512_fnmadd_ps(ifft871, _mm512_set1_ps(1.5625e-02f), ifft861);
__m512 ifft793 = _mm512_fnmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft877 = _mm512_fnmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft794 = _mm512_fmadd_ps(ifft788, _mm512_set1_ps(1.5625e-02f), ifft776);
__m512 ifft878 = _mm512_fmadd_ps(ifft872, _mm512_set1_ps(1.5625e-02f), ifft860);
__m512 ifft795 = _mm512_fmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft879 = _mm512_fmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
__m512 ifft796 = _mm512_fnmadd_ps(ifft786, _mm512_set1_ps(1.5625e-02f), ifft778);
__m512 ifft880 = _mm512_fnmadd_ps(ifft870, _mm512_set1_ps(1.5625e-02f), ifft862);
__m512 dat630 = ifft789;
__m512 dat635 = ifft873;
__m512 dat631 = ifft791;
__m512 dat636 = ifft875;
__m512 dat632 = ifft793;
__m512 dat637 = ifft877;
__m512 dat633 = ifft795;
__m512 dat638 = ifft879;
__m512 dat634 = ifft790;
__m512 dat639 = ifft874;
(void)ifft792;
(void)ifft876;
(void)ifft794;
(void)ifft878;
(void)ifft796;
(void)ifft880;
dat630 = _mm512_max_ps(_mm512_setzero_ps(), dat630);
dat635 = _mm512_max_ps(_mm512_setzero_ps(), dat635);
dat631 = _mm512_max_ps(_mm512_setzero_ps(), dat631);
dat636 = _mm512_max_ps(_mm512_setzero_ps(), dat636);
dat632 = _mm512_max_ps(_mm512_setzero_ps(), dat632);
dat637 = _mm512_max_ps(_mm512_setzero_ps(), dat637);
dat633 = _mm512_max_ps(_mm512_setzero_ps(), dat633);
dat638 = _mm512_max_ps(_mm512_setzero_ps(), dat638);
dat634 = _mm512_max_ps(_mm512_setzero_ps(), dat634);
dat639 = _mm512_max_ps(_mm512_setzero_ps(), dat639);
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat630);
_mm512_mask_storeu_ps(datPtr2+52088+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat630);
_mm512_mask_storeu_ps(datPtr2+1880+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat635);
_mm512_mask_storeu_ps(datPtr2+50288+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat635);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat631);
_mm512_mask_storeu_ps(datPtr2+52536+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat631);
_mm512_mask_storeu_ps(datPtr2+2328+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat636);
_mm512_mask_storeu_ps(datPtr2+50736+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat636);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat632);
_mm512_mask_storeu_ps(datPtr2+52984+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat632);
_mm512_mask_storeu_ps(datPtr2+2776+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat637);
_mm512_mask_storeu_ps(datPtr2+51184+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat637);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat633);
_mm512_mask_storeu_ps(datPtr2+53432+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat633);
_mm512_mask_storeu_ps(datPtr2+3224+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat638);
_mm512_mask_storeu_ps(datPtr2+51632+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat638);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 3, dat634);
_mm512_mask_storeu_ps(datPtr2+53880+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 7936, dat634);
_mm512_mask_storeu_ps(datPtr2+3672+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 31, dat639);
_mm512_mask_storeu_ps(datPtr2+52080+3215360*i9+200960*k27+100480*r4+448*toH3+4*toW3+0*t6, 768, dat639);
}
}
if (j5 >= last2) return;
++j5;
j5 = 4;
}
if (j5 < 84) {
ptrdiff_t rel5 = (size_t)(j5-4)%23;
ptrdiff_t base5 = 5+(size_t)(j5-4)/23*30;
for (; ; rel5 = 0, base5 += 30) {
if (rel5 < 11) {
if (rel5 < 4) {
// Case rel5 in {0, 1, 2}: handle consecutive j5 values until rel5 would reach 3.
// jj12 is the last j5 value served here; toW4 starts at 5+30*rel5 and advances
// by 30 for every j5 processed, while toH4 stays fixed at base5.
// NOTE(review): toH4/toW4 look like the output row/column of the tile (they are
// scaled by 448 and 4 in the store addresses below) -- confirm against the
// declarations of datPtr2, which are outside this view.
if (rel5 < 3) {
ptrdiff_t toH4 = base5+0;
ptrdiff_t toW4 = 5+30*rel5;
ptrdiff_t jj12 = 2-rel5+j5;
for (; j5 <= jj12; toW4 += 30) {
// k28 runs from 16*w21 up to (not including) 16; w21 is defined outside this view.
ptrdiff_t k28 = 16*w21;
for (; k28 != 16; ++k28) {
// Two r5 passes per k28, and three t7 passes per r5.
ptrdiff_t r5 = 0;
for (; r5 != 2; ++r5) {
ptrdiff_t t7 = 0;
for (; t7 < 3; ++t7) {
// Load sixteen 16-float vectors from sfPtr3. They come as four groups whose
// base offsets are 2166784 apart; inside each group four vectors sit at
// +0/+64/+128/+192 and are named Re/Im for two data sets (97..100 and 101..104).
__m512 sfRe97 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm97 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe101 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm101 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe98 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm98 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe102 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm102 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe99 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm99 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe103 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm103 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe100 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm100 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfRe104 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
__m512 sfIm104 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k28+768*r5+256*t7);
// Transform stage. The variables are named ifft*, so this is presumably an
// inverse FFT -- confirm against the generator. Two data sets are processed in
// lock-step with shared constants: ifft882..ifft972 derive from the 97..100
// inputs, and ifft973..ifft1056 repeat the same steps for the 101..104 inputs.
//
// Step 1: lane permutations of the first Re/Im pair, combined under mask 65021
// (0xFDFD: every lane except 1 and 9, which keep the permuted value unchanged).
__m512i ifft881 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft882 = _mm512_permutexvar_ps(ifft881, sfRe97);
__m512 ifft973 = _mm512_permutexvar_ps(ifft881, sfRe101);
__m512i ifft883 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft884 = _mm512_permutexvar_ps(ifft883, sfRe97);
__m512 ifft974 = _mm512_permutexvar_ps(ifft883, sfRe101);
__m512 ifft885 = _mm512_permutexvar_ps(ifft881, sfIm97);
__m512 ifft975 = _mm512_permutexvar_ps(ifft881, sfIm101);
__m512 ifft886 = _mm512_permutexvar_ps(ifft883, sfIm97);
__m512 ifft976 = _mm512_permutexvar_ps(ifft883, sfIm101);
__m512 ifft887 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft888 = _mm512_mask_fmadd_ps(ifft886, 65021, ifft887, ifft882);
__m512 ifft977 = _mm512_mask_fmadd_ps(ifft976, 65021, ifft887, ifft973);
__m512 ifft889 = _mm512_mask_fnmadd_ps(ifft885, 65021, ifft887, ifft884);
__m512 ifft978 = _mm512_mask_fnmadd_ps(ifft975, 65021, ifft887, ifft974);
// Step 2: sum/difference of adjacent lanes (shuffle 177 swaps neighbours, the
// +1/-1 vector selects add or subtract), applied to all eight Re/Im vectors.
__m512 ifft890 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft891 = _mm512_fmadd_ps(ifft888, ifft890, _mm512_shuffle_ps(ifft888, ifft888, 177));
__m512 ifft979 = _mm512_fmadd_ps(ifft977, ifft890, _mm512_shuffle_ps(ifft977, ifft977, 177));
__m512 ifft892 = _mm512_fmadd_ps(ifft889, ifft890, _mm512_shuffle_ps(ifft889, ifft889, 177));
__m512 ifft980 = _mm512_fmadd_ps(ifft978, ifft890, _mm512_shuffle_ps(ifft978, ifft978, 177));
__m512 ifft893 = _mm512_fmadd_ps(sfRe98, ifft890, _mm512_shuffle_ps(sfRe98, sfRe98, 177));
__m512 ifft981 = _mm512_fmadd_ps(sfRe102, ifft890, _mm512_shuffle_ps(sfRe102, sfRe102, 177));
__m512 ifft894 = _mm512_fmadd_ps(sfIm98, ifft890, _mm512_shuffle_ps(sfIm98, sfIm98, 177));
__m512 ifft982 = _mm512_fmadd_ps(sfIm102, ifft890, _mm512_shuffle_ps(sfIm102, sfIm102, 177));
__m512 ifft895 = _mm512_fmadd_ps(sfRe99, ifft890, _mm512_shuffle_ps(sfRe99, sfRe99, 177));
__m512 ifft983 = _mm512_fmadd_ps(sfRe103, ifft890, _mm512_shuffle_ps(sfRe103, sfRe103, 177));
__m512 ifft896 = _mm512_fmadd_ps(sfIm99, ifft890, _mm512_shuffle_ps(sfIm99, sfIm99, 177));
__m512 ifft984 = _mm512_fmadd_ps(sfIm103, ifft890, _mm512_shuffle_ps(sfIm103, sfIm103, 177));
__m512 ifft897 = _mm512_fmadd_ps(sfRe100, ifft890, _mm512_shuffle_ps(sfRe100, sfRe100, 177));
__m512 ifft985 = _mm512_fmadd_ps(sfRe104, ifft890, _mm512_shuffle_ps(sfRe104, sfRe104, 177));
__m512 ifft898 = _mm512_fmadd_ps(sfIm100, ifft890, _mm512_shuffle_ps(sfIm100, sfIm100, 177));
__m512 ifft986 = _mm512_fmadd_ps(sfIm104, ifft890, _mm512_shuffle_ps(sfIm104, sfIm104, 177));
// Step 3: per-lane multiply by the coefficients in ifft899, then mix each
// Re-derived vector with its Im-derived partner using the coefficients in
// ifft908 (values 0, 1 and +/-0.70710677). Presumably a twiddle-factor
// complex multiply -- the code itself only shows the multiply/fused-add pattern.
__m512 ifft899 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft900 = _mm512_mul_ps(ifft891, ifft899);
__m512 ifft987 = _mm512_mul_ps(ifft979, ifft899);
__m512 ifft901 = _mm512_mul_ps(ifft892, ifft899);
__m512 ifft988 = _mm512_mul_ps(ifft980, ifft899);
__m512 ifft902 = _mm512_mul_ps(ifft893, ifft899);
__m512 ifft989 = _mm512_mul_ps(ifft981, ifft899);
__m512 ifft903 = _mm512_mul_ps(ifft894, ifft899);
__m512 ifft990 = _mm512_mul_ps(ifft982, ifft899);
__m512 ifft904 = _mm512_mul_ps(ifft895, ifft899);
__m512 ifft991 = _mm512_mul_ps(ifft983, ifft899);
__m512 ifft905 = _mm512_mul_ps(ifft896, ifft899);
__m512 ifft992 = _mm512_mul_ps(ifft984, ifft899);
__m512 ifft906 = _mm512_mul_ps(ifft897, ifft899);
__m512 ifft993 = _mm512_mul_ps(ifft985, ifft899);
__m512 ifft907 = _mm512_mul_ps(ifft898, ifft899);
__m512 ifft994 = _mm512_mul_ps(ifft986, ifft899);
__m512 ifft908 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft909 = _mm512_fnmadd_ps(ifft892, ifft908, ifft900);
__m512 ifft995 = _mm512_fnmadd_ps(ifft980, ifft908, ifft987);
__m512 ifft910 = _mm512_fmadd_ps(ifft891, ifft908, ifft901);
__m512 ifft996 = _mm512_fmadd_ps(ifft979, ifft908, ifft988);
__m512 ifft911 = _mm512_fnmadd_ps(ifft894, ifft908, ifft902);
__m512 ifft997 = _mm512_fnmadd_ps(ifft982, ifft908, ifft989);
__m512 ifft912 = _mm512_fmadd_ps(ifft893, ifft908, ifft903);
__m512 ifft998 = _mm512_fmadd_ps(ifft981, ifft908, ifft990);
__m512 ifft913 = _mm512_fnmadd_ps(ifft896, ifft908, ifft904);
__m512 ifft999 = _mm512_fnmadd_ps(ifft984, ifft908, ifft991);
__m512 ifft914 = _mm512_fmadd_ps(ifft895, ifft908, ifft905);
__m512 ifft1000 = _mm512_fmadd_ps(ifft983, ifft908, ifft992);
__m512 ifft915 = _mm512_fnmadd_ps(ifft898, ifft908, ifft906);
__m512 ifft1001 = _mm512_fnmadd_ps(ifft986, ifft908, ifft993);
__m512 ifft916 = _mm512_fmadd_ps(ifft897, ifft908, ifft907);
__m512 ifft1002 = _mm512_fmadd_ps(ifft985, ifft908, ifft994);
// Step 4: sum/difference across lane pairs two apart (shuffle 78 swaps the two
// 64-bit halves of every 128-bit group).
__m512 ifft917 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft918 = _mm512_fmadd_ps(ifft909, ifft917, _mm512_shuffle_ps(ifft909, ifft909, 78));
__m512 ifft1003 = _mm512_fmadd_ps(ifft995, ifft917, _mm512_shuffle_ps(ifft995, ifft995, 78));
__m512 ifft919 = _mm512_fmadd_ps(ifft910, ifft917, _mm512_shuffle_ps(ifft910, ifft910, 78));
__m512 ifft1004 = _mm512_fmadd_ps(ifft996, ifft917, _mm512_shuffle_ps(ifft996, ifft996, 78));
__m512 ifft920 = _mm512_fmadd_ps(ifft911, ifft917, _mm512_shuffle_ps(ifft911, ifft911, 78));
__m512 ifft1005 = _mm512_fmadd_ps(ifft997, ifft917, _mm512_shuffle_ps(ifft997, ifft997, 78));
__m512 ifft921 = _mm512_fmadd_ps(ifft912, ifft917, _mm512_shuffle_ps(ifft912, ifft912, 78));
__m512 ifft1006 = _mm512_fmadd_ps(ifft998, ifft917, _mm512_shuffle_ps(ifft998, ifft998, 78));
__m512 ifft922 = _mm512_fmadd_ps(ifft913, ifft917, _mm512_shuffle_ps(ifft913, ifft913, 78));
__m512 ifft1007 = _mm512_fmadd_ps(ifft999, ifft917, _mm512_shuffle_ps(ifft999, ifft999, 78));
__m512 ifft923 = _mm512_fmadd_ps(ifft914, ifft917, _mm512_shuffle_ps(ifft914, ifft914, 78));
__m512 ifft1008 = _mm512_fmadd_ps(ifft1000, ifft917, _mm512_shuffle_ps(ifft1000, ifft1000, 78));
__m512 ifft924 = _mm512_fmadd_ps(ifft915, ifft917, _mm512_shuffle_ps(ifft915, ifft915, 78));
__m512 ifft1009 = _mm512_fmadd_ps(ifft1001, ifft917, _mm512_shuffle_ps(ifft1001, ifft1001, 78));
__m512 ifft925 = _mm512_fmadd_ps(ifft916, ifft917, _mm512_shuffle_ps(ifft916, ifft916, 78));
__m512 ifft1010 = _mm512_fmadd_ps(ifft1002, ifft917, _mm512_shuffle_ps(ifft1002, ifft1002, 78));
// Step 5: in lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) exchange each
// pair of partner vectors, negating the value moved into the first of the pair.
__m512 ifft926 = _mm512_mask_sub_ps(ifft918, 49344, _mm512_setzero_ps(), ifft919);
__m512 ifft1011 = _mm512_mask_sub_ps(ifft1003, 49344, _mm512_setzero_ps(), ifft1004);
__m512 ifft927 = _mm512_mask_mov_ps(ifft919, 49344, ifft918);
__m512 ifft1012 = _mm512_mask_mov_ps(ifft1004, 49344, ifft1003);
__m512 ifft928 = _mm512_mask_sub_ps(ifft920, 49344, _mm512_setzero_ps(), ifft921);
__m512 ifft1013 = _mm512_mask_sub_ps(ifft1005, 49344, _mm512_setzero_ps(), ifft1006);
__m512 ifft929 = _mm512_mask_mov_ps(ifft921, 49344, ifft920);
__m512 ifft1014 = _mm512_mask_mov_ps(ifft1006, 49344, ifft1005);
__m512 ifft930 = _mm512_mask_sub_ps(ifft922, 49344, _mm512_setzero_ps(), ifft923);
__m512 ifft1015 = _mm512_mask_sub_ps(ifft1007, 49344, _mm512_setzero_ps(), ifft1008);
__m512 ifft931 = _mm512_mask_mov_ps(ifft923, 49344, ifft922);
__m512 ifft1016 = _mm512_mask_mov_ps(ifft1008, 49344, ifft1007);
__m512 ifft932 = _mm512_mask_sub_ps(ifft924, 49344, _mm512_setzero_ps(), ifft925);
__m512 ifft1017 = _mm512_mask_sub_ps(ifft1009, 49344, _mm512_setzero_ps(), ifft1010);
__m512 ifft933 = _mm512_mask_mov_ps(ifft925, 49344, ifft924);
__m512 ifft1018 = _mm512_mask_mov_ps(ifft1010, 49344, ifft1009);
// Step 6: sum/difference between neighbouring 128-bit groups (shuffle_f32x4
// with 177 swaps groups 0<->1 and 2<->3). Note that ifft940/ifft1024 use
// fnmsub, i.e. the negated result, unlike the other fourteen fmadd lines.
__m512 ifft934 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft935 = _mm512_fmadd_ps(ifft926, ifft934, _mm512_shuffle_f32x4(ifft926, ifft926, 177));
__m512 ifft1019 = _mm512_fmadd_ps(ifft1011, ifft934, _mm512_shuffle_f32x4(ifft1011, ifft1011, 177));
__m512 ifft936 = _mm512_fmadd_ps(ifft927, ifft934, _mm512_shuffle_f32x4(ifft927, ifft927, 177));
__m512 ifft1020 = _mm512_fmadd_ps(ifft1012, ifft934, _mm512_shuffle_f32x4(ifft1012, ifft1012, 177));
__m512 ifft937 = _mm512_fmadd_ps(ifft928, ifft934, _mm512_shuffle_f32x4(ifft928, ifft928, 177));
__m512 ifft1021 = _mm512_fmadd_ps(ifft1013, ifft934, _mm512_shuffle_f32x4(ifft1013, ifft1013, 177));
__m512 ifft938 = _mm512_fmadd_ps(ifft929, ifft934, _mm512_shuffle_f32x4(ifft929, ifft929, 177));
__m512 ifft1022 = _mm512_fmadd_ps(ifft1014, ifft934, _mm512_shuffle_f32x4(ifft1014, ifft1014, 177));
__m512 ifft939 = _mm512_fmadd_ps(ifft930, ifft934, _mm512_shuffle_f32x4(ifft930, ifft930, 177));
__m512 ifft1023 = _mm512_fmadd_ps(ifft1015, ifft934, _mm512_shuffle_f32x4(ifft1015, ifft1015, 177));
__m512 ifft940 = _mm512_fnmsub_ps(ifft931, ifft934, _mm512_shuffle_f32x4(ifft931, ifft931, 177));
__m512 ifft1024 = _mm512_fnmsub_ps(ifft1016, ifft934, _mm512_shuffle_f32x4(ifft1016, ifft1016, 177));
__m512 ifft941 = _mm512_fmadd_ps(ifft932, ifft934, _mm512_shuffle_f32x4(ifft932, ifft932, 177));
__m512 ifft1025 = _mm512_fmadd_ps(ifft1017, ifft934, _mm512_shuffle_f32x4(ifft1017, ifft1017, 177));
__m512 ifft942 = _mm512_fmadd_ps(ifft933, ifft934, _mm512_shuffle_f32x4(ifft933, ifft933, 177));
__m512 ifft1026 = _mm512_fmadd_ps(ifft1018, ifft934, _mm512_shuffle_f32x4(ifft1018, ifft1018, 177));
// Step 7: combine the eight vectors of each set with one another (whole-vector
// adds/subtracts) and apply the scale factors: 1.5625e-02f is 1/64 and
// 3.125e-02f is 1/32.
__m512 ifft943 = _mm512_add_ps(ifft935, ifft936);
__m512 ifft1027 = _mm512_add_ps(ifft1019, ifft1020);
__m512 ifft944 = _mm512_sub_ps(ifft935, ifft936);
__m512 ifft1028 = _mm512_sub_ps(ifft1019, ifft1020);
__m512 ifft945 = _mm512_sub_ps(ifft937, ifft941);
__m512 ifft1029 = _mm512_sub_ps(ifft1021, ifft1025);
__m512 ifft946 = _mm512_add_ps(ifft938, ifft942);
__m512 ifft1030 = _mm512_add_ps(ifft1022, ifft1026);
__m512 ifft947 = _mm512_add_ps(ifft937, ifft941);
__m512 ifft1031 = _mm512_add_ps(ifft1021, ifft1025);
__m512 ifft948 = _mm512_sub_ps(ifft938, ifft942);
__m512 ifft1032 = _mm512_sub_ps(ifft1022, ifft1026);
__m512 ifft949 = _mm512_mul_ps(ifft939, _mm512_set1_ps(3.125e-02f));
__m512 ifft1033 = _mm512_mul_ps(ifft1023, _mm512_set1_ps(3.125e-02f));
__m512 ifft950 = _mm512_mul_ps(ifft940, _mm512_set1_ps(3.125e-02f));
__m512 ifft1034 = _mm512_mul_ps(ifft1024, _mm512_set1_ps(3.125e-02f));
__m512 ifft951 = _mm512_fmadd_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1035 = _mm512_fmadd_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft952 = _mm512_fmsub_ps(ifft943, _mm512_set1_ps(1.5625e-02f), ifft949);
__m512 ifft1036 = _mm512_fmsub_ps(ifft1027, _mm512_set1_ps(1.5625e-02f), ifft1033);
__m512 ifft953 = _mm512_fmadd_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1037 = _mm512_fmadd_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft954 = _mm512_fmsub_ps(ifft944, _mm512_set1_ps(1.5625e-02f), ifft950);
__m512 ifft1038 = _mm512_fmsub_ps(ifft1028, _mm512_set1_ps(1.5625e-02f), ifft1034);
__m512 ifft955 = _mm512_add_ps(ifft945, ifft946);
__m512 ifft1039 = _mm512_add_ps(ifft1029, ifft1030);
__m512 ifft956 = _mm512_sub_ps(ifft945, ifft946);
__m512 ifft1040 = _mm512_sub_ps(ifft1029, ifft1030);
__m512 ifft957 = _mm512_fnmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1041 = _mm512_fnmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft958 = _mm512_fmadd_ps(ifft955, _mm512_set1_ps(7.0710677e-01f), ifft947);
__m512 ifft1042 = _mm512_fmadd_ps(ifft1039, _mm512_set1_ps(7.0710677e-01f), ifft1031);
__m512 ifft959 = _mm512_fmadd_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1043 = _mm512_fmadd_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft960 = _mm512_fmsub_ps(ifft956, _mm512_set1_ps(7.0710677e-01f), ifft948);
__m512 ifft1044 = _mm512_fmsub_ps(ifft1040, _mm512_set1_ps(7.0710677e-01f), ifft1032);
__m512 ifft961 = _mm512_add_ps(ifft957, ifft958);
__m512 ifft1045 = _mm512_add_ps(ifft1041, ifft1042);
__m512 ifft962 = _mm512_sub_ps(ifft957, ifft958);
__m512 ifft1046 = _mm512_sub_ps(ifft1041, ifft1042);
__m512 ifft963 = _mm512_add_ps(ifft959, ifft960);
__m512 ifft1047 = _mm512_add_ps(ifft1043, ifft1044);
__m512 ifft964 = _mm512_sub_ps(ifft959, ifft960);
__m512 ifft1048 = _mm512_sub_ps(ifft1043, ifft1044);
__m512 ifft965 = _mm512_fmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1049 = _mm512_fmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft966 = _mm512_fnmadd_ps(ifft961, _mm512_set1_ps(1.5625e-02f), ifft951);
__m512 ifft1050 = _mm512_fnmadd_ps(ifft1045, _mm512_set1_ps(1.5625e-02f), ifft1035);
__m512 ifft967 = _mm512_fmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1051 = _mm512_fmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft968 = _mm512_fnmadd_ps(ifft963, _mm512_set1_ps(1.5625e-02f), ifft953);
__m512 ifft1052 = _mm512_fnmadd_ps(ifft1047, _mm512_set1_ps(1.5625e-02f), ifft1037);
__m512 ifft969 = _mm512_fnmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1053 = _mm512_fnmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft970 = _mm512_fmadd_ps(ifft964, _mm512_set1_ps(1.5625e-02f), ifft952);
__m512 ifft1054 = _mm512_fmadd_ps(ifft1048, _mm512_set1_ps(1.5625e-02f), ifft1036);
__m512 ifft971 = _mm512_fmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1055 = _mm512_fmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
__m512 ifft972 = _mm512_fnmadd_ps(ifft962, _mm512_set1_ps(1.5625e-02f), ifft954);
__m512 ifft1056 = _mm512_fnmadd_ps(ifft1046, _mm512_set1_ps(1.5625e-02f), ifft1038);
// Keep five of the eight result vectors per set (in the order 965, 967, 969,
// 971, 966 and 1049, 1051, 1053, 1055, 1050); the other three per set are
// computed but unused, and the (void) casts silence unused-variable warnings.
__m512 dat640 = ifft965;
__m512 dat645 = ifft1049;
__m512 dat641 = ifft967;
__m512 dat646 = ifft1051;
__m512 dat642 = ifft969;
__m512 dat647 = ifft1053;
__m512 dat643 = ifft971;
__m512 dat648 = ifft1055;
__m512 dat644 = ifft966;
__m512 dat649 = ifft1050;
(void)ifft968;
(void)ifft1052;
(void)ifft970;
(void)ifft1054;
(void)ifft972;
(void)ifft1056;
// Pack the two sets into 10-lane rows. With (first, second) = (dat64x, dat64y):
//   pm9  -> lanes 0-4 = first[0..4],   lanes 5-9 = second[0..4]
//   pm10 -> lanes 0-4 = second[8..12], lanes 5-9 = first[8..12]
// Lanes 10-15 hold repeated values and are never stored (store mask is 1023).
__m512i pm9 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack41 = _mm512_permutex2var_ps(dat640, pm9, dat645);
__m512i pm10 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack42 = _mm512_permutex2var_ps(dat640, pm10, dat645);
__m512 pack43 = _mm512_permutex2var_ps(dat641, pm9, dat646);
__m512 pack44 = _mm512_permutex2var_ps(dat641, pm10, dat646);
__m512 pack45 = _mm512_permutex2var_ps(dat642, pm9, dat647);
__m512 pack46 = _mm512_permutex2var_ps(dat642, pm10, dat647);
__m512 pack47 = _mm512_permutex2var_ps(dat643, pm9, dat648);
__m512 pack48 = _mm512_permutex2var_ps(dat643, pm10, dat648);
__m512 pack49 = _mm512_permutex2var_ps(dat644, pm9, dat649);
__m512 pack50 = _mm512_permutex2var_ps(dat644, pm10, dat649);
// ReLU: clamp every lane to be >= 0.
pack41 = _mm512_max_ps(_mm512_setzero_ps(), pack41);
pack42 = _mm512_max_ps(_mm512_setzero_ps(), pack42);
pack43 = _mm512_max_ps(_mm512_setzero_ps(), pack43);
pack44 = _mm512_max_ps(_mm512_setzero_ps(), pack44);
pack45 = _mm512_max_ps(_mm512_setzero_ps(), pack45);
pack46 = _mm512_max_ps(_mm512_setzero_ps(), pack46);
pack47 = _mm512_max_ps(_mm512_setzero_ps(), pack47);
pack48 = _mm512_max_ps(_mm512_setzero_ps(), pack48);
pack49 = _mm512_max_ps(_mm512_setzero_ps(), pack49);
pack50 = _mm512_max_ps(_mm512_setzero_ps(), pack50);
// Store the low 10 lanes (mask 1023 = 0x3FF) of each packed row. The pm9 rows
// go to offsets 0, 448, ..., 1792 and the pm10 rows to the same offsets plus
// 50240; each t7 pass shifts the destination by 40.
// NOTE(review): offset units depend on datPtr2's pointer type, which is declared
// outside this view -- presumably bytes (40 = 10 floats) -- confirm.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack41);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack42);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack43);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack44);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack45);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack46);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack47);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack48);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack49);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k28+100480*r5+448*toH4+4*toW4+40*t7, 1023, pack50);
}
}
}
// This j5 is finished: leave the enclosing function if it was the last one
// assigned (last2), otherwise move on to the next j5.
if (j5 >= last2) return;
++j5;
}
// All rel5 < 3 positions are consumed; continue with the rel5 == 3 case that
// follows this block.
rel5 = 3;
}
ptrdiff_t toH5 = base5+0;
ptrdiff_t toW5 = 95;
ptrdiff_t k29 = 16*w21;
for (; k29 != 16; ++k29) {
ptrdiff_t r6 = 0;
for (; r6 != 2; ++r6) {
ptrdiff_t t8 = 0;
__m512 sfRe105 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm105 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe109 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm109 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe106 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm106 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe110 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm110 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe107 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm107 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe111 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm111 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe108 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm108 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfRe112 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512 sfIm112 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k29+768*r6+256*t8);
__m512i ifft1057 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1058 = _mm512_permutexvar_ps(ifft1057, sfRe105);
__m512 ifft1149 = _mm512_permutexvar_ps(ifft1057, sfRe109);
__m512i ifft1059 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1060 = _mm512_permutexvar_ps(ifft1059, sfRe105);
__m512 ifft1150 = _mm512_permutexvar_ps(ifft1059, sfRe109);
__m512 ifft1061 = _mm512_permutexvar_ps(ifft1057, sfIm105);
__m512 ifft1151 = _mm512_permutexvar_ps(ifft1057, sfIm109);
__m512 ifft1062 = _mm512_permutexvar_ps(ifft1059, sfIm105);
__m512 ifft1152 = _mm512_permutexvar_ps(ifft1059, sfIm109);
__m512 ifft1063 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1064 = _mm512_mask_fmadd_ps(ifft1062, 65021, ifft1063, ifft1058);
__m512 ifft1153 = _mm512_mask_fmadd_ps(ifft1152, 65021, ifft1063, ifft1149);
__m512 ifft1065 = _mm512_mask_fnmadd_ps(ifft1061, 65021, ifft1063, ifft1060);
__m512 ifft1154 = _mm512_mask_fnmadd_ps(ifft1151, 65021, ifft1063, ifft1150);
__m512 ifft1066 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1067 = _mm512_fmadd_ps(ifft1064, ifft1066, _mm512_shuffle_ps(ifft1064, ifft1064, 177));
__m512 ifft1155 = _mm512_fmadd_ps(ifft1153, ifft1066, _mm512_shuffle_ps(ifft1153, ifft1153, 177));
__m512 ifft1068 = _mm512_fmadd_ps(ifft1065, ifft1066, _mm512_shuffle_ps(ifft1065, ifft1065, 177));
__m512 ifft1156 = _mm512_fmadd_ps(ifft1154, ifft1066, _mm512_shuffle_ps(ifft1154, ifft1154, 177));
__m512 ifft1069 = _mm512_fmadd_ps(sfRe106, ifft1066, _mm512_shuffle_ps(sfRe106, sfRe106, 177));
__m512 ifft1157 = _mm512_fmadd_ps(sfRe110, ifft1066, _mm512_shuffle_ps(sfRe110, sfRe110, 177));
__m512 ifft1070 = _mm512_fmadd_ps(sfIm106, ifft1066, _mm512_shuffle_ps(sfIm106, sfIm106, 177));
__m512 ifft1158 = _mm512_fmadd_ps(sfIm110, ifft1066, _mm512_shuffle_ps(sfIm110, sfIm110, 177));
__m512 ifft1071 = _mm512_fmadd_ps(sfRe107, ifft1066, _mm512_shuffle_ps(sfRe107, sfRe107, 177));
__m512 ifft1159 = _mm512_fmadd_ps(sfRe111, ifft1066, _mm512_shuffle_ps(sfRe111, sfRe111, 177));
__m512 ifft1072 = _mm512_fmadd_ps(sfIm107, ifft1066, _mm512_shuffle_ps(sfIm107, sfIm107, 177));
__m512 ifft1160 = _mm512_fmadd_ps(sfIm111, ifft1066, _mm512_shuffle_ps(sfIm111, sfIm111, 177));
__m512 ifft1073 = _mm512_fmadd_ps(sfRe108, ifft1066, _mm512_shuffle_ps(sfRe108, sfRe108, 177));
__m512 ifft1161 = _mm512_fmadd_ps(sfRe112, ifft1066, _mm512_shuffle_ps(sfRe112, sfRe112, 177));
__m512 ifft1074 = _mm512_fmadd_ps(sfIm108, ifft1066, _mm512_shuffle_ps(sfIm108, sfIm108, 177));
__m512 ifft1162 = _mm512_fmadd_ps(sfIm112, ifft1066, _mm512_shuffle_ps(sfIm112, sfIm112, 177));
__m512 ifft1075 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1076 = _mm512_mul_ps(ifft1067, ifft1075);
__m512 ifft1163 = _mm512_mul_ps(ifft1155, ifft1075);
__m512 ifft1077 = _mm512_mul_ps(ifft1068, ifft1075);
__m512 ifft1164 = _mm512_mul_ps(ifft1156, ifft1075);
__m512 ifft1078 = _mm512_mul_ps(ifft1069, ifft1075);
__m512 ifft1165 = _mm512_mul_ps(ifft1157, ifft1075);
__m512 ifft1079 = _mm512_mul_ps(ifft1070, ifft1075);
__m512 ifft1166 = _mm512_mul_ps(ifft1158, ifft1075);
__m512 ifft1080 = _mm512_mul_ps(ifft1071, ifft1075);
__m512 ifft1167 = _mm512_mul_ps(ifft1159, ifft1075);
__m512 ifft1081 = _mm512_mul_ps(ifft1072, ifft1075);
__m512 ifft1168 = _mm512_mul_ps(ifft1160, ifft1075);
__m512 ifft1082 = _mm512_mul_ps(ifft1073, ifft1075);
__m512 ifft1169 = _mm512_mul_ps(ifft1161, ifft1075);
__m512 ifft1083 = _mm512_mul_ps(ifft1074, ifft1075);
__m512 ifft1170 = _mm512_mul_ps(ifft1162, ifft1075);
__m512 ifft1084 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1085 = _mm512_fnmadd_ps(ifft1068, ifft1084, ifft1076);
__m512 ifft1171 = _mm512_fnmadd_ps(ifft1156, ifft1084, ifft1163);
__m512 ifft1086 = _mm512_fmadd_ps(ifft1067, ifft1084, ifft1077);
__m512 ifft1172 = _mm512_fmadd_ps(ifft1155, ifft1084, ifft1164);
__m512 ifft1087 = _mm512_fnmadd_ps(ifft1070, ifft1084, ifft1078);
__m512 ifft1173 = _mm512_fnmadd_ps(ifft1158, ifft1084, ifft1165);
__m512 ifft1088 = _mm512_fmadd_ps(ifft1069, ifft1084, ifft1079);
__m512 ifft1174 = _mm512_fmadd_ps(ifft1157, ifft1084, ifft1166);
__m512 ifft1089 = _mm512_fnmadd_ps(ifft1072, ifft1084, ifft1080);
__m512 ifft1175 = _mm512_fnmadd_ps(ifft1160, ifft1084, ifft1167);
__m512 ifft1090 = _mm512_fmadd_ps(ifft1071, ifft1084, ifft1081);
__m512 ifft1176 = _mm512_fmadd_ps(ifft1159, ifft1084, ifft1168);
__m512 ifft1091 = _mm512_fnmadd_ps(ifft1074, ifft1084, ifft1082);
__m512 ifft1177 = _mm512_fnmadd_ps(ifft1162, ifft1084, ifft1169);
__m512 ifft1092 = _mm512_fmadd_ps(ifft1073, ifft1084, ifft1083);
__m512 ifft1178 = _mm512_fmadd_ps(ifft1161, ifft1084, ifft1170);
__m512 ifft1093 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1094 = _mm512_fmadd_ps(ifft1085, ifft1093, _mm512_shuffle_ps(ifft1085, ifft1085, 78));
__m512 ifft1179 = _mm512_fmadd_ps(ifft1171, ifft1093, _mm512_shuffle_ps(ifft1171, ifft1171, 78));
__m512 ifft1095 = _mm512_fmadd_ps(ifft1086, ifft1093, _mm512_shuffle_ps(ifft1086, ifft1086, 78));
__m512 ifft1180 = _mm512_fmadd_ps(ifft1172, ifft1093, _mm512_shuffle_ps(ifft1172, ifft1172, 78));
__m512 ifft1096 = _mm512_fmadd_ps(ifft1087, ifft1093, _mm512_shuffle_ps(ifft1087, ifft1087, 78));
__m512 ifft1181 = _mm512_fmadd_ps(ifft1173, ifft1093, _mm512_shuffle_ps(ifft1173, ifft1173, 78));
__m512 ifft1097 = _mm512_fmadd_ps(ifft1088, ifft1093, _mm512_shuffle_ps(ifft1088, ifft1088, 78));
__m512 ifft1182 = _mm512_fmadd_ps(ifft1174, ifft1093, _mm512_shuffle_ps(ifft1174, ifft1174, 78));
__m512 ifft1098 = _mm512_fmadd_ps(ifft1089, ifft1093, _mm512_shuffle_ps(ifft1089, ifft1089, 78));
__m512 ifft1183 = _mm512_fmadd_ps(ifft1175, ifft1093, _mm512_shuffle_ps(ifft1175, ifft1175, 78));
__m512 ifft1099 = _mm512_fmadd_ps(ifft1090, ifft1093, _mm512_shuffle_ps(ifft1090, ifft1090, 78));
__m512 ifft1184 = _mm512_fmadd_ps(ifft1176, ifft1093, _mm512_shuffle_ps(ifft1176, ifft1176, 78));
__m512 ifft1100 = _mm512_fmadd_ps(ifft1091, ifft1093, _mm512_shuffle_ps(ifft1091, ifft1091, 78));
__m512 ifft1185 = _mm512_fmadd_ps(ifft1177, ifft1093, _mm512_shuffle_ps(ifft1177, ifft1177, 78));
__m512 ifft1101 = _mm512_fmadd_ps(ifft1092, ifft1093, _mm512_shuffle_ps(ifft1092, ifft1092, 78));
__m512 ifft1186 = _mm512_fmadd_ps(ifft1178, ifft1093, _mm512_shuffle_ps(ifft1178, ifft1178, 78));
__m512 ifft1102 = _mm512_mask_sub_ps(ifft1094, 49344, _mm512_setzero_ps(), ifft1095);
__m512 ifft1187 = _mm512_mask_sub_ps(ifft1179, 49344, _mm512_setzero_ps(), ifft1180);
__m512 ifft1103 = _mm512_mask_mov_ps(ifft1095, 49344, ifft1094);
__m512 ifft1188 = _mm512_mask_mov_ps(ifft1180, 49344, ifft1179);
__m512 ifft1104 = _mm512_mask_sub_ps(ifft1096, 49344, _mm512_setzero_ps(), ifft1097);
__m512 ifft1189 = _mm512_mask_sub_ps(ifft1181, 49344, _mm512_setzero_ps(), ifft1182);
__m512 ifft1105 = _mm512_mask_mov_ps(ifft1097, 49344, ifft1096);
__m512 ifft1190 = _mm512_mask_mov_ps(ifft1182, 49344, ifft1181);
__m512 ifft1106 = _mm512_mask_sub_ps(ifft1098, 49344, _mm512_setzero_ps(), ifft1099);
__m512 ifft1191 = _mm512_mask_sub_ps(ifft1183, 49344, _mm512_setzero_ps(), ifft1184);
__m512 ifft1107 = _mm512_mask_mov_ps(ifft1099, 49344, ifft1098);
__m512 ifft1192 = _mm512_mask_mov_ps(ifft1184, 49344, ifft1183);
__m512 ifft1108 = _mm512_mask_sub_ps(ifft1100, 49344, _mm512_setzero_ps(), ifft1101);
__m512 ifft1193 = _mm512_mask_sub_ps(ifft1185, 49344, _mm512_setzero_ps(), ifft1186);
__m512 ifft1109 = _mm512_mask_mov_ps(ifft1101, 49344, ifft1100);
__m512 ifft1194 = _mm512_mask_mov_ps(ifft1186, 49344, ifft1185);
__m512 ifft1110 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1111 = _mm512_fmadd_ps(ifft1102, ifft1110, _mm512_shuffle_f32x4(ifft1102, ifft1102, 177));
__m512 ifft1195 = _mm512_fmadd_ps(ifft1187, ifft1110, _mm512_shuffle_f32x4(ifft1187, ifft1187, 177));
__m512 ifft1112 = _mm512_fmadd_ps(ifft1103, ifft1110, _mm512_shuffle_f32x4(ifft1103, ifft1103, 177));
__m512 ifft1196 = _mm512_fmadd_ps(ifft1188, ifft1110, _mm512_shuffle_f32x4(ifft1188, ifft1188, 177));
__m512 ifft1113 = _mm512_fmadd_ps(ifft1104, ifft1110, _mm512_shuffle_f32x4(ifft1104, ifft1104, 177));
__m512 ifft1197 = _mm512_fmadd_ps(ifft1189, ifft1110, _mm512_shuffle_f32x4(ifft1189, ifft1189, 177));
__m512 ifft1114 = _mm512_fmadd_ps(ifft1105, ifft1110, _mm512_shuffle_f32x4(ifft1105, ifft1105, 177));
__m512 ifft1198 = _mm512_fmadd_ps(ifft1190, ifft1110, _mm512_shuffle_f32x4(ifft1190, ifft1190, 177));
__m512 ifft1115 = _mm512_fmadd_ps(ifft1106, ifft1110, _mm512_shuffle_f32x4(ifft1106, ifft1106, 177));
__m512 ifft1199 = _mm512_fmadd_ps(ifft1191, ifft1110, _mm512_shuffle_f32x4(ifft1191, ifft1191, 177));
__m512 ifft1116 = _mm512_fnmsub_ps(ifft1107, ifft1110, _mm512_shuffle_f32x4(ifft1107, ifft1107, 177));
__m512 ifft1200 = _mm512_fnmsub_ps(ifft1192, ifft1110, _mm512_shuffle_f32x4(ifft1192, ifft1192, 177));
__m512 ifft1117 = _mm512_fmadd_ps(ifft1108, ifft1110, _mm512_shuffle_f32x4(ifft1108, ifft1108, 177));
__m512 ifft1201 = _mm512_fmadd_ps(ifft1193, ifft1110, _mm512_shuffle_f32x4(ifft1193, ifft1193, 177));
__m512 ifft1118 = _mm512_fmadd_ps(ifft1109, ifft1110, _mm512_shuffle_f32x4(ifft1109, ifft1109, 177));
__m512 ifft1202 = _mm512_fmadd_ps(ifft1194, ifft1110, _mm512_shuffle_f32x4(ifft1194, ifft1194, 177));
__m512 ifft1119 = _mm512_add_ps(ifft1111, ifft1112);
__m512 ifft1203 = _mm512_add_ps(ifft1195, ifft1196);
__m512 ifft1120 = _mm512_sub_ps(ifft1111, ifft1112);
__m512 ifft1204 = _mm512_sub_ps(ifft1195, ifft1196);
__m512 ifft1121 = _mm512_sub_ps(ifft1113, ifft1117);
__m512 ifft1205 = _mm512_sub_ps(ifft1197, ifft1201);
__m512 ifft1122 = _mm512_add_ps(ifft1114, ifft1118);
__m512 ifft1206 = _mm512_add_ps(ifft1198, ifft1202);
__m512 ifft1123 = _mm512_add_ps(ifft1113, ifft1117);
__m512 ifft1207 = _mm512_add_ps(ifft1197, ifft1201);
__m512 ifft1124 = _mm512_sub_ps(ifft1114, ifft1118);
__m512 ifft1208 = _mm512_sub_ps(ifft1198, ifft1202);
__m512 ifft1125 = _mm512_mul_ps(ifft1115, _mm512_set1_ps(3.125e-02f));
__m512 ifft1209 = _mm512_mul_ps(ifft1199, _mm512_set1_ps(3.125e-02f));
__m512 ifft1126 = _mm512_mul_ps(ifft1116, _mm512_set1_ps(3.125e-02f));
__m512 ifft1210 = _mm512_mul_ps(ifft1200, _mm512_set1_ps(3.125e-02f));
__m512 ifft1127 = _mm512_fmadd_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1211 = _mm512_fmadd_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1128 = _mm512_fmsub_ps(ifft1119, _mm512_set1_ps(1.5625e-02f), ifft1125);
__m512 ifft1212 = _mm512_fmsub_ps(ifft1203, _mm512_set1_ps(1.5625e-02f), ifft1209);
__m512 ifft1129 = _mm512_fmadd_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1213 = _mm512_fmadd_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
__m512 ifft1130 = _mm512_fmsub_ps(ifft1120, _mm512_set1_ps(1.5625e-02f), ifft1126);
__m512 ifft1214 = _mm512_fmsub_ps(ifft1204, _mm512_set1_ps(1.5625e-02f), ifft1210);
__m512 ifft1131 = _mm512_add_ps(ifft1121, ifft1122);
__m512 ifft1215 = _mm512_add_ps(ifft1205, ifft1206);
__m512 ifft1132 = _mm512_sub_ps(ifft1121, ifft1122);
__m512 ifft1216 = _mm512_sub_ps(ifft1205, ifft1206);
__m512 ifft1133 = _mm512_fnmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1217 = _mm512_fnmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1134 = _mm512_fmadd_ps(ifft1131, _mm512_set1_ps(7.0710677e-01f), ifft1123);
__m512 ifft1218 = _mm512_fmadd_ps(ifft1215, _mm512_set1_ps(7.0710677e-01f), ifft1207);
__m512 ifft1135 = _mm512_fmadd_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1219 = _mm512_fmadd_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1136 = _mm512_fmsub_ps(ifft1132, _mm512_set1_ps(7.0710677e-01f), ifft1124);
__m512 ifft1220 = _mm512_fmsub_ps(ifft1216, _mm512_set1_ps(7.0710677e-01f), ifft1208);
__m512 ifft1137 = _mm512_add_ps(ifft1133, ifft1134);
__m512 ifft1221 = _mm512_add_ps(ifft1217, ifft1218);
__m512 ifft1138 = _mm512_sub_ps(ifft1133, ifft1134);
__m512 ifft1222 = _mm512_sub_ps(ifft1217, ifft1218);
__m512 ifft1139 = _mm512_add_ps(ifft1135, ifft1136);
__m512 ifft1223 = _mm512_add_ps(ifft1219, ifft1220);
__m512 ifft1140 = _mm512_sub_ps(ifft1135, ifft1136);
__m512 ifft1224 = _mm512_sub_ps(ifft1219, ifft1220);
__m512 ifft1141 = _mm512_fmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1225 = _mm512_fmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1142 = _mm512_fnmadd_ps(ifft1137, _mm512_set1_ps(1.5625e-02f), ifft1127);
__m512 ifft1226 = _mm512_fnmadd_ps(ifft1221, _mm512_set1_ps(1.5625e-02f), ifft1211);
__m512 ifft1143 = _mm512_fmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1227 = _mm512_fmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1144 = _mm512_fnmadd_ps(ifft1139, _mm512_set1_ps(1.5625e-02f), ifft1129);
__m512 ifft1228 = _mm512_fnmadd_ps(ifft1223, _mm512_set1_ps(1.5625e-02f), ifft1213);
__m512 ifft1145 = _mm512_fnmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1229 = _mm512_fnmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1146 = _mm512_fmadd_ps(ifft1140, _mm512_set1_ps(1.5625e-02f), ifft1128);
__m512 ifft1230 = _mm512_fmadd_ps(ifft1224, _mm512_set1_ps(1.5625e-02f), ifft1212);
__m512 ifft1147 = _mm512_fmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1231 = _mm512_fmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
__m512 ifft1148 = _mm512_fnmadd_ps(ifft1138, _mm512_set1_ps(1.5625e-02f), ifft1130);
__m512 ifft1232 = _mm512_fnmadd_ps(ifft1222, _mm512_set1_ps(1.5625e-02f), ifft1214);
__m512 dat650 = ifft1141;
__m512 dat655 = ifft1225;
__m512 dat651 = ifft1143;
__m512 dat656 = ifft1227;
__m512 dat652 = ifft1145;
__m512 dat657 = ifft1229;
__m512 dat653 = ifft1147;
__m512 dat658 = ifft1231;
__m512 dat654 = ifft1142;
__m512 dat659 = ifft1226;
(void)ifft1144;
(void)ifft1228;
(void)ifft1146;
(void)ifft1230;
(void)ifft1148;
(void)ifft1232;
__m512i pm11 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack51 = _mm512_permutex2var_ps(dat650, pm11, dat655);
__m512i pm12 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack52 = _mm512_permutex2var_ps(dat650, pm12, dat655);
__m512 pack53 = _mm512_permutex2var_ps(dat651, pm11, dat656);
__m512 pack54 = _mm512_permutex2var_ps(dat651, pm12, dat656);
__m512 pack55 = _mm512_permutex2var_ps(dat652, pm11, dat657);
__m512 pack56 = _mm512_permutex2var_ps(dat652, pm12, dat657);
__m512 pack57 = _mm512_permutex2var_ps(dat653, pm11, dat658);
__m512 pack58 = _mm512_permutex2var_ps(dat653, pm12, dat658);
__m512 pack59 = _mm512_permutex2var_ps(dat654, pm11, dat659);
__m512 pack60 = _mm512_permutex2var_ps(dat654, pm12, dat659);
pack51 = _mm512_max_ps(_mm512_setzero_ps(), pack51);
pack52 = _mm512_max_ps(_mm512_setzero_ps(), pack52);
pack53 = _mm512_max_ps(_mm512_setzero_ps(), pack53);
pack54 = _mm512_max_ps(_mm512_setzero_ps(), pack54);
pack55 = _mm512_max_ps(_mm512_setzero_ps(), pack55);
pack56 = _mm512_max_ps(_mm512_setzero_ps(), pack56);
pack57 = _mm512_max_ps(_mm512_setzero_ps(), pack57);
pack58 = _mm512_max_ps(_mm512_setzero_ps(), pack58);
pack59 = _mm512_max_ps(_mm512_setzero_ps(), pack59);
pack60 = _mm512_max_ps(_mm512_setzero_ps(), pack60);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack51);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack52);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack53);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack54);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack55);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack56);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack57);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack58);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack59);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t8, 1023, pack60);
// Load stage for the segment indexed by t9: sixteen unaligned 512-bit loads
// from sfPtr3.
// t9 is fixed at 0 here, so every 256*t9 term contributes nothing.
// The loads form four groups whose base offsets are 256, 2167040, 4333824 and
// 6500608 (2166784 apart). Inside a group the four loads are 64 apart and
// alternate Re, Im, Re, Im; the first Re/Im pair feeds the "first" vector set
// (sfRe113..sfRe116) and the second pair feeds the "second" set
// (sfRe117..sfRe120).
// NOTE(review): the Re/Im names suggest real and imaginary parts of
// frequency-domain data, and a step of 64 would be one __m512 if sfPtr3 is a
// byte pointer -- sfPtr3 is declared outside this excerpt, confirm.
ptrdiff_t t9 = 0;
__m512 sfRe113 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm113 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe117 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm117 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe114 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm114 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe118 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm118 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe115 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm115 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe119 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm119 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe116 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm116 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfRe120 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
__m512 sfIm120 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k29+768*r6+256*t9);
// Pre-processing of the first Re/Im pair of each vector set (sfRe113/sfIm113
// and sfRe117/sfIm117 only; the other loaded vectors skip this step).
// Each input is permuted twice: ifft1233 gathers source lanes
// {0,1,2,2,4,6,6,4} per 8-lane half and ifft1235 gathers {1,0,3,3,5,7,7,5}
// (indices for the upper half are the same plus 8).
// NOTE(review): this looks like the untangling step of a real-data inverse
// FFT (Hermitian symmetry) -- confirm against the generator.
__m512i ifft1233 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1234 = _mm512_permutexvar_ps(ifft1233, sfRe113);
__m512 ifft1325 = _mm512_permutexvar_ps(ifft1233, sfRe117);
__m512i ifft1235 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1236 = _mm512_permutexvar_ps(ifft1235, sfRe113);
__m512 ifft1326 = _mm512_permutexvar_ps(ifft1235, sfRe117);
__m512 ifft1237 = _mm512_permutexvar_ps(ifft1233, sfIm113);
__m512 ifft1327 = _mm512_permutexvar_ps(ifft1233, sfIm117);
__m512 ifft1238 = _mm512_permutexvar_ps(ifft1235, sfIm113);
__m512 ifft1328 = _mm512_permutexvar_ps(ifft1235, sfIm117);
// Per-lane coefficients, lane 0 first: {0, 0, -1, 1, -1, 1, -1, 1} repeated
// for both 8-lane halves.
__m512 ifft1239 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 is 0xFDFD: every lane except 1 and 9 is computed; lanes 1 and 9
// pass the first operand through unchanged.
// fmadd lanes:  permuted Im (ifft1235 order) * coeff + permuted Re (ifft1233 order).
__m512 ifft1240 = _mm512_mask_fmadd_ps(ifft1238, 65021, ifft1239, ifft1234);
__m512 ifft1329 = _mm512_mask_fmadd_ps(ifft1328, 65021, ifft1239, ifft1325);
// fnmadd lanes: -(permuted Im (ifft1233 order) * coeff) + permuted Re (ifft1235 order).
__m512 ifft1241 = _mm512_mask_fnmadd_ps(ifft1237, 65021, ifft1239, ifft1236);
__m512 ifft1330 = _mm512_mask_fnmadd_ps(ifft1327, 65021, ifft1239, ifft1326);
// Butterfly between adjacent lanes, applied to all sixteen vectors.
// _mm512_shuffle_ps(x, x, 177) swaps the two floats of every adjacent pair,
// and ifft1242 is +1 in even lanes and -1 in odd lanes, so each result holds
// x[2n] + x[2n+1] in the even lane and x[2n] - x[2n+1] in the odd lane.
// The first pair of each set uses the pre-processed values (ifft1240,
// ifft1241, ifft1329, ifft1330); the remaining vectors are used as loaded.
__m512 ifft1242 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1243 = _mm512_fmadd_ps(ifft1240, ifft1242, _mm512_shuffle_ps(ifft1240, ifft1240, 177));
__m512 ifft1331 = _mm512_fmadd_ps(ifft1329, ifft1242, _mm512_shuffle_ps(ifft1329, ifft1329, 177));
__m512 ifft1244 = _mm512_fmadd_ps(ifft1241, ifft1242, _mm512_shuffle_ps(ifft1241, ifft1241, 177));
__m512 ifft1332 = _mm512_fmadd_ps(ifft1330, ifft1242, _mm512_shuffle_ps(ifft1330, ifft1330, 177));
__m512 ifft1245 = _mm512_fmadd_ps(sfRe114, ifft1242, _mm512_shuffle_ps(sfRe114, sfRe114, 177));
__m512 ifft1333 = _mm512_fmadd_ps(sfRe118, ifft1242, _mm512_shuffle_ps(sfRe118, sfRe118, 177));
__m512 ifft1246 = _mm512_fmadd_ps(sfIm114, ifft1242, _mm512_shuffle_ps(sfIm114, sfIm114, 177));
__m512 ifft1334 = _mm512_fmadd_ps(sfIm118, ifft1242, _mm512_shuffle_ps(sfIm118, sfIm118, 177));
__m512 ifft1247 = _mm512_fmadd_ps(sfRe115, ifft1242, _mm512_shuffle_ps(sfRe115, sfRe115, 177));
__m512 ifft1335 = _mm512_fmadd_ps(sfRe119, ifft1242, _mm512_shuffle_ps(sfRe119, sfRe119, 177));
__m512 ifft1248 = _mm512_fmadd_ps(sfIm115, ifft1242, _mm512_shuffle_ps(sfIm115, sfIm115, 177));
__m512 ifft1336 = _mm512_fmadd_ps(sfIm119, ifft1242, _mm512_shuffle_ps(sfIm119, sfIm119, 177));
__m512 ifft1249 = _mm512_fmadd_ps(sfRe116, ifft1242, _mm512_shuffle_ps(sfRe116, sfRe116, 177));
__m512 ifft1337 = _mm512_fmadd_ps(sfRe120, ifft1242, _mm512_shuffle_ps(sfRe120, sfRe120, 177));
__m512 ifft1250 = _mm512_fmadd_ps(sfIm116, ifft1242, _mm512_shuffle_ps(sfIm116, sfIm116, 177));
__m512 ifft1338 = _mm512_fmadd_ps(sfIm120, ifft1242, _mm512_shuffle_ps(sfIm120, sfIm120, 177));
// Per-lane complex multiplication by constants, done in two passes.
// Vectors are treated as (re, im) pairs: (ifft1243, ifft1244),
// (ifft1245, ifft1246), (ifft1247, ifft1248), (ifft1249, ifft1250) and the
// matching pairs of the second set. With c1 = ifft1251 and c2 = ifft1260:
//   new_re = re*c1 - im*c2
//   new_im = im*c1 + re*c2
// c1 per 8-lane half, lane 0 first: {1, 1, 1, 0, 1, 0.70710677, 1, -0.70710677}.
// Pass 1: multiply every vector by c1.
__m512 ifft1251 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1252 = _mm512_mul_ps(ifft1243, ifft1251);
__m512 ifft1339 = _mm512_mul_ps(ifft1331, ifft1251);
__m512 ifft1253 = _mm512_mul_ps(ifft1244, ifft1251);
__m512 ifft1340 = _mm512_mul_ps(ifft1332, ifft1251);
__m512 ifft1254 = _mm512_mul_ps(ifft1245, ifft1251);
__m512 ifft1341 = _mm512_mul_ps(ifft1333, ifft1251);
__m512 ifft1255 = _mm512_mul_ps(ifft1246, ifft1251);
__m512 ifft1342 = _mm512_mul_ps(ifft1334, ifft1251);
__m512 ifft1256 = _mm512_mul_ps(ifft1247, ifft1251);
__m512 ifft1343 = _mm512_mul_ps(ifft1335, ifft1251);
__m512 ifft1257 = _mm512_mul_ps(ifft1248, ifft1251);
__m512 ifft1344 = _mm512_mul_ps(ifft1336, ifft1251);
__m512 ifft1258 = _mm512_mul_ps(ifft1249, ifft1251);
__m512 ifft1345 = _mm512_mul_ps(ifft1337, ifft1251);
__m512 ifft1259 = _mm512_mul_ps(ifft1250, ifft1251);
__m512 ifft1346 = _mm512_mul_ps(ifft1338, ifft1251);
// c2 per 8-lane half, lane 0 first: {0, 0, 0, 1, 0, 0.70710677, 0, 0.70710677}.
// Pass 2: fnmadd subtracts im*c2 from re*c1; fmadd adds re*c2 to im*c1.
__m512 ifft1260 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1261 = _mm512_fnmadd_ps(ifft1244, ifft1260, ifft1252);
__m512 ifft1347 = _mm512_fnmadd_ps(ifft1332, ifft1260, ifft1339);
__m512 ifft1262 = _mm512_fmadd_ps(ifft1243, ifft1260, ifft1253);
__m512 ifft1348 = _mm512_fmadd_ps(ifft1331, ifft1260, ifft1340);
__m512 ifft1263 = _mm512_fnmadd_ps(ifft1246, ifft1260, ifft1254);
__m512 ifft1349 = _mm512_fnmadd_ps(ifft1334, ifft1260, ifft1341);
__m512 ifft1264 = _mm512_fmadd_ps(ifft1245, ifft1260, ifft1255);
__m512 ifft1350 = _mm512_fmadd_ps(ifft1333, ifft1260, ifft1342);
__m512 ifft1265 = _mm512_fnmadd_ps(ifft1248, ifft1260, ifft1256);
__m512 ifft1351 = _mm512_fnmadd_ps(ifft1336, ifft1260, ifft1343);
__m512 ifft1266 = _mm512_fmadd_ps(ifft1247, ifft1260, ifft1257);
__m512 ifft1352 = _mm512_fmadd_ps(ifft1335, ifft1260, ifft1344);
__m512 ifft1267 = _mm512_fnmadd_ps(ifft1250, ifft1260, ifft1258);
__m512 ifft1353 = _mm512_fnmadd_ps(ifft1338, ifft1260, ifft1345);
__m512 ifft1268 = _mm512_fmadd_ps(ifft1249, ifft1260, ifft1259);
__m512 ifft1354 = _mm512_fmadd_ps(ifft1337, ifft1260, ifft1346);
// Butterfly between lanes two apart, applied to all sixteen vectors.
// _mm512_shuffle_ps(x, x, 78) swaps the lower and upper float pair inside
// every group of four lanes, and ifft1269 is +1 in lanes 0-1 and -1 in lanes
// 2-3 of each group, so lanes 0-1 receive x[n] + x[n+2] and lanes 2-3 receive
// x[n-2] - x[n].
__m512 ifft1269 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1270 = _mm512_fmadd_ps(ifft1261, ifft1269, _mm512_shuffle_ps(ifft1261, ifft1261, 78));
__m512 ifft1355 = _mm512_fmadd_ps(ifft1347, ifft1269, _mm512_shuffle_ps(ifft1347, ifft1347, 78));
__m512 ifft1271 = _mm512_fmadd_ps(ifft1262, ifft1269, _mm512_shuffle_ps(ifft1262, ifft1262, 78));
__m512 ifft1356 = _mm512_fmadd_ps(ifft1348, ifft1269, _mm512_shuffle_ps(ifft1348, ifft1348, 78));
__m512 ifft1272 = _mm512_fmadd_ps(ifft1263, ifft1269, _mm512_shuffle_ps(ifft1263, ifft1263, 78));
__m512 ifft1357 = _mm512_fmadd_ps(ifft1349, ifft1269, _mm512_shuffle_ps(ifft1349, ifft1349, 78));
__m512 ifft1273 = _mm512_fmadd_ps(ifft1264, ifft1269, _mm512_shuffle_ps(ifft1264, ifft1264, 78));
__m512 ifft1358 = _mm512_fmadd_ps(ifft1350, ifft1269, _mm512_shuffle_ps(ifft1350, ifft1350, 78));
__m512 ifft1274 = _mm512_fmadd_ps(ifft1265, ifft1269, _mm512_shuffle_ps(ifft1265, ifft1265, 78));
__m512 ifft1359 = _mm512_fmadd_ps(ifft1351, ifft1269, _mm512_shuffle_ps(ifft1351, ifft1351, 78));
__m512 ifft1275 = _mm512_fmadd_ps(ifft1266, ifft1269, _mm512_shuffle_ps(ifft1266, ifft1266, 78));
__m512 ifft1360 = _mm512_fmadd_ps(ifft1352, ifft1269, _mm512_shuffle_ps(ifft1352, ifft1352, 78));
__m512 ifft1276 = _mm512_fmadd_ps(ifft1267, ifft1269, _mm512_shuffle_ps(ifft1267, ifft1267, 78));
__m512 ifft1361 = _mm512_fmadd_ps(ifft1353, ifft1269, _mm512_shuffle_ps(ifft1353, ifft1353, 78));
__m512 ifft1277 = _mm512_fmadd_ps(ifft1268, ifft1269, _mm512_shuffle_ps(ifft1268, ifft1268, 78));
__m512 ifft1362 = _mm512_fmadd_ps(ifft1354, ifft1269, _mm512_shuffle_ps(ifft1354, ifft1354, 78));
// Masked negate-and-swap between the two vectors of each (re, im) pair.
// Mask 49344 is 0xC0C0, i.e. lanes 6, 7, 14 and 15. In those lanes only:
//   - the mask_sub result takes 0 - im (the negated partner vector),
//   - the mask_mov result takes re (the other vector of the pair).
// All other lanes keep their original values.
// NOTE(review): in the selected lanes this maps (re, im) to (-im, re), which
// is a multiplication by i -- presumably a twiddle factor folded into this
// step; confirm against the generator.
__m512 ifft1278 = _mm512_mask_sub_ps(ifft1270, 49344, _mm512_setzero_ps(), ifft1271);
__m512 ifft1363 = _mm512_mask_sub_ps(ifft1355, 49344, _mm512_setzero_ps(), ifft1356);
__m512 ifft1279 = _mm512_mask_mov_ps(ifft1271, 49344, ifft1270);
__m512 ifft1364 = _mm512_mask_mov_ps(ifft1356, 49344, ifft1355);
__m512 ifft1280 = _mm512_mask_sub_ps(ifft1272, 49344, _mm512_setzero_ps(), ifft1273);
__m512 ifft1365 = _mm512_mask_sub_ps(ifft1357, 49344, _mm512_setzero_ps(), ifft1358);
__m512 ifft1281 = _mm512_mask_mov_ps(ifft1273, 49344, ifft1272);
__m512 ifft1366 = _mm512_mask_mov_ps(ifft1358, 49344, ifft1357);
__m512 ifft1282 = _mm512_mask_sub_ps(ifft1274, 49344, _mm512_setzero_ps(), ifft1275);
__m512 ifft1367 = _mm512_mask_sub_ps(ifft1359, 49344, _mm512_setzero_ps(), ifft1360);
__m512 ifft1283 = _mm512_mask_mov_ps(ifft1275, 49344, ifft1274);
__m512 ifft1368 = _mm512_mask_mov_ps(ifft1360, 49344, ifft1359);
__m512 ifft1284 = _mm512_mask_sub_ps(ifft1276, 49344, _mm512_setzero_ps(), ifft1277);
__m512 ifft1369 = _mm512_mask_sub_ps(ifft1361, 49344, _mm512_setzero_ps(), ifft1362);
__m512 ifft1285 = _mm512_mask_mov_ps(ifft1277, 49344, ifft1276);
__m512 ifft1370 = _mm512_mask_mov_ps(ifft1362, 49344, ifft1361);
// Butterfly between lanes four apart, applied to all sixteen vectors.
// _mm512_shuffle_f32x4(x, x, 177) swaps each pair of neighbouring 128-bit
// quarters, and ifft1286 is +1 in lanes 0-3 / 8-11 and -1 in lanes 4-7 /
// 12-15, so the +1 lanes receive x[n] + x[n+4] and the -1 lanes receive
// x[n-4] - x[n].
// Exception: ifft1292 and ifft1376 use fnmsub, which yields the negation of
// what fmadd would produce for the same operands.
__m512 ifft1286 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1287 = _mm512_fmadd_ps(ifft1278, ifft1286, _mm512_shuffle_f32x4(ifft1278, ifft1278, 177));
__m512 ifft1371 = _mm512_fmadd_ps(ifft1363, ifft1286, _mm512_shuffle_f32x4(ifft1363, ifft1363, 177));
__m512 ifft1288 = _mm512_fmadd_ps(ifft1279, ifft1286, _mm512_shuffle_f32x4(ifft1279, ifft1279, 177));
__m512 ifft1372 = _mm512_fmadd_ps(ifft1364, ifft1286, _mm512_shuffle_f32x4(ifft1364, ifft1364, 177));
__m512 ifft1289 = _mm512_fmadd_ps(ifft1280, ifft1286, _mm512_shuffle_f32x4(ifft1280, ifft1280, 177));
__m512 ifft1373 = _mm512_fmadd_ps(ifft1365, ifft1286, _mm512_shuffle_f32x4(ifft1365, ifft1365, 177));
__m512 ifft1290 = _mm512_fmadd_ps(ifft1281, ifft1286, _mm512_shuffle_f32x4(ifft1281, ifft1281, 177));
__m512 ifft1374 = _mm512_fmadd_ps(ifft1366, ifft1286, _mm512_shuffle_f32x4(ifft1366, ifft1366, 177));
__m512 ifft1291 = _mm512_fmadd_ps(ifft1282, ifft1286, _mm512_shuffle_f32x4(ifft1282, ifft1282, 177));
__m512 ifft1375 = _mm512_fmadd_ps(ifft1367, ifft1286, _mm512_shuffle_f32x4(ifft1367, ifft1367, 177));
__m512 ifft1292 = _mm512_fnmsub_ps(ifft1283, ifft1286, _mm512_shuffle_f32x4(ifft1283, ifft1283, 177));
__m512 ifft1376 = _mm512_fnmsub_ps(ifft1368, ifft1286, _mm512_shuffle_f32x4(ifft1368, ifft1368, 177));
__m512 ifft1293 = _mm512_fmadd_ps(ifft1284, ifft1286, _mm512_shuffle_f32x4(ifft1284, ifft1284, 177));
__m512 ifft1377 = _mm512_fmadd_ps(ifft1369, ifft1286, _mm512_shuffle_f32x4(ifft1369, ifft1369, 177));
__m512 ifft1294 = _mm512_fmadd_ps(ifft1285, ifft1286, _mm512_shuffle_f32x4(ifft1285, ifft1285, 177));
__m512 ifft1378 = _mm512_fmadd_ps(ifft1370, ifft1286, _mm512_shuffle_f32x4(ifft1370, ifft1370, 177));
// Cross-register combination: from here on the arithmetic mixes whole
// vectors with each other (no lane shuffles), once for the first set
// (ifft1287..ifft1294) and once for the second set (ifft1371..ifft1378).
// Scaling is folded into the multiply-adds: 1.5625e-02 is 1/64 and
// 3.125e-02 is 1/32; 7.0710677e-01 is approximately 1/sqrt(2).
// NOTE(review): presumably this is the transform along the other tile axis
// together with the inverse-FFT normalisation -- confirm against the generator.
//
// Step 1: pairwise sums and differences of the eight vectors in each set.
__m512 ifft1295 = _mm512_add_ps(ifft1287, ifft1288);
__m512 ifft1379 = _mm512_add_ps(ifft1371, ifft1372);
__m512 ifft1296 = _mm512_sub_ps(ifft1287, ifft1288);
__m512 ifft1380 = _mm512_sub_ps(ifft1371, ifft1372);
__m512 ifft1297 = _mm512_sub_ps(ifft1289, ifft1293);
__m512 ifft1381 = _mm512_sub_ps(ifft1373, ifft1377);
__m512 ifft1298 = _mm512_add_ps(ifft1290, ifft1294);
__m512 ifft1382 = _mm512_add_ps(ifft1374, ifft1378);
__m512 ifft1299 = _mm512_add_ps(ifft1289, ifft1293);
__m512 ifft1383 = _mm512_add_ps(ifft1373, ifft1377);
__m512 ifft1300 = _mm512_sub_ps(ifft1290, ifft1294);
__m512 ifft1384 = _mm512_sub_ps(ifft1374, ifft1378);
// Step 2: pre-scale the two vectors that are not summed/differenced in step 1
// by 1/32.
__m512 ifft1301 = _mm512_mul_ps(ifft1291, _mm512_set1_ps(3.125e-02f));
__m512 ifft1385 = _mm512_mul_ps(ifft1375, _mm512_set1_ps(3.125e-02f));
__m512 ifft1302 = _mm512_mul_ps(ifft1292, _mm512_set1_ps(3.125e-02f));
__m512 ifft1386 = _mm512_mul_ps(ifft1376, _mm512_set1_ps(3.125e-02f));
// Step 3: (sum or difference)/64 plus or minus the pre-scaled vectors.
__m512 ifft1303 = _mm512_fmadd_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1387 = _mm512_fmadd_ps(ifft1379, _mm512_set1_ps(1.5625e-02f), ifft1385);
__m512 ifft1304 = _mm512_fmsub_ps(ifft1295, _mm512_set1_ps(1.5625e-02f), ifft1301);
__m512 ifft1388 = _mm512_fmsub_ps(ifft1379, _mm512_set1_ps(1.5625e-02f), ifft1385);
__m512 ifft1305 = _mm512_fmadd_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1389 = _mm512_fmadd_ps(ifft1380, _mm512_set1_ps(1.5625e-02f), ifft1386);
__m512 ifft1306 = _mm512_fmsub_ps(ifft1296, _mm512_set1_ps(1.5625e-02f), ifft1302);
__m512 ifft1390 = _mm512_fmsub_ps(ifft1380, _mm512_set1_ps(1.5625e-02f), ifft1386);
// Step 4: combine the remaining sums/differences, weighting by ~1/sqrt(2).
__m512 ifft1307 = _mm512_add_ps(ifft1297, ifft1298);
__m512 ifft1391 = _mm512_add_ps(ifft1381, ifft1382);
__m512 ifft1308 = _mm512_sub_ps(ifft1297, ifft1298);
__m512 ifft1392 = _mm512_sub_ps(ifft1381, ifft1382);
__m512 ifft1309 = _mm512_fnmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1393 = _mm512_fnmadd_ps(ifft1391, _mm512_set1_ps(7.0710677e-01f), ifft1383);
__m512 ifft1310 = _mm512_fmadd_ps(ifft1307, _mm512_set1_ps(7.0710677e-01f), ifft1299);
__m512 ifft1394 = _mm512_fmadd_ps(ifft1391, _mm512_set1_ps(7.0710677e-01f), ifft1383);
__m512 ifft1311 = _mm512_fmadd_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1395 = _mm512_fmadd_ps(ifft1392, _mm512_set1_ps(7.0710677e-01f), ifft1384);
__m512 ifft1312 = _mm512_fmsub_ps(ifft1308, _mm512_set1_ps(7.0710677e-01f), ifft1300);
__m512 ifft1396 = _mm512_fmsub_ps(ifft1392, _mm512_set1_ps(7.0710677e-01f), ifft1384);
__m512 ifft1313 = _mm512_add_ps(ifft1309, ifft1310);
__m512 ifft1397 = _mm512_add_ps(ifft1393, ifft1394);
__m512 ifft1314 = _mm512_sub_ps(ifft1309, ifft1310);
__m512 ifft1398 = _mm512_sub_ps(ifft1393, ifft1394);
__m512 ifft1315 = _mm512_add_ps(ifft1311, ifft1312);
__m512 ifft1399 = _mm512_add_ps(ifft1395, ifft1396);
__m512 ifft1316 = _mm512_sub_ps(ifft1311, ifft1312);
__m512 ifft1400 = _mm512_sub_ps(ifft1395, ifft1396);
// Step 5: final eight results per set: step-3 values plus or minus the
// step-4 values scaled by 1/64.
__m512 ifft1317 = _mm512_fmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1401 = _mm512_fmadd_ps(ifft1397, _mm512_set1_ps(1.5625e-02f), ifft1387);
__m512 ifft1318 = _mm512_fnmadd_ps(ifft1313, _mm512_set1_ps(1.5625e-02f), ifft1303);
__m512 ifft1402 = _mm512_fnmadd_ps(ifft1397, _mm512_set1_ps(1.5625e-02f), ifft1387);
__m512 ifft1319 = _mm512_fmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1403 = _mm512_fmadd_ps(ifft1399, _mm512_set1_ps(1.5625e-02f), ifft1389);
__m512 ifft1320 = _mm512_fnmadd_ps(ifft1315, _mm512_set1_ps(1.5625e-02f), ifft1305);
__m512 ifft1404 = _mm512_fnmadd_ps(ifft1399, _mm512_set1_ps(1.5625e-02f), ifft1389);
__m512 ifft1321 = _mm512_fnmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1405 = _mm512_fnmadd_ps(ifft1400, _mm512_set1_ps(1.5625e-02f), ifft1388);
__m512 ifft1322 = _mm512_fmadd_ps(ifft1316, _mm512_set1_ps(1.5625e-02f), ifft1304);
__m512 ifft1406 = _mm512_fmadd_ps(ifft1400, _mm512_set1_ps(1.5625e-02f), ifft1388);
__m512 ifft1323 = _mm512_fmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 ifft1407 = _mm512_fmadd_ps(ifft1398, _mm512_set1_ps(1.5625e-02f), ifft1390);
__m512 ifft1324 = _mm512_fnmadd_ps(ifft1314, _mm512_set1_ps(1.5625e-02f), ifft1306);
__m512 ifft1408 = _mm512_fnmadd_ps(ifft1398, _mm512_set1_ps(1.5625e-02f), ifft1390);
// Select the transform results that are carried forward.
// Five of the eight results per set are kept: dat660..dat664 come from the
// first set (ifft1317, 1319, 1321, 1323, 1318) and dat665..dat669 from the
// second set (ifft1401, 1403, 1405, 1407, 1402).
__m512 dat660 = ifft1317;
__m512 dat665 = ifft1401;
__m512 dat661 = ifft1319;
__m512 dat666 = ifft1403;
__m512 dat662 = ifft1321;
__m512 dat667 = ifft1405;
__m512 dat663 = ifft1323;
__m512 dat668 = ifft1407;
__m512 dat664 = ifft1318;
__m512 dat669 = ifft1402;
// The remaining three results per set are computed above but not used here;
// the (void) casts mark them as intentionally unused.
(void)ifft1320;
(void)ifft1404;
(void)ifft1322;
(void)ifft1406;
(void)ifft1324;
(void)ifft1408;
// Pack, clamp at zero and store the results for the segment indexed by t9.
//
// Packing: each permutex2var merges one vector of the first set (first
// operand, indices 0-15) with the matching vector of the second set (second
// operand, indices 16-31).
//   pm13: lanes 0-4 <- first[0..4], lanes 5-9 <- second[0..4],
//         lanes 10-14 <- first[0..4], lane 15 <- second[0].
//   pm14: lanes 0-4 <- second[8..12], lanes 5-9 <- first[8..12],
//         lanes 10-14 <- second[8..12], lane 15 <- first[8].
__m512i pm13 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack61 = _mm512_permutex2var_ps(dat660, pm13, dat665);
__m512i pm14 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack62 = _mm512_permutex2var_ps(dat660, pm14, dat665);
__m512 pack63 = _mm512_permutex2var_ps(dat661, pm13, dat666);
__m512 pack64 = _mm512_permutex2var_ps(dat661, pm14, dat666);
__m512 pack65 = _mm512_permutex2var_ps(dat662, pm13, dat667);
__m512 pack66 = _mm512_permutex2var_ps(dat662, pm14, dat667);
__m512 pack67 = _mm512_permutex2var_ps(dat663, pm13, dat668);
__m512 pack68 = _mm512_permutex2var_ps(dat663, pm14, dat668);
__m512 pack69 = _mm512_permutex2var_ps(dat664, pm13, dat669);
__m512 pack70 = _mm512_permutex2var_ps(dat664, pm14, dat669);
// Element-wise max against zero.
// NOTE(review): presumably the fused ReLU activation from the network graph
// -- confirm which layer this function implements.
pack61 = _mm512_max_ps(_mm512_setzero_ps(), pack61);
pack62 = _mm512_max_ps(_mm512_setzero_ps(), pack62);
pack63 = _mm512_max_ps(_mm512_setzero_ps(), pack63);
pack64 = _mm512_max_ps(_mm512_setzero_ps(), pack64);
pack65 = _mm512_max_ps(_mm512_setzero_ps(), pack65);
pack66 = _mm512_max_ps(_mm512_setzero_ps(), pack66);
pack67 = _mm512_max_ps(_mm512_setzero_ps(), pack67);
pack68 = _mm512_max_ps(_mm512_setzero_ps(), pack68);
pack69 = _mm512_max_ps(_mm512_setzero_ps(), pack69);
pack70 = _mm512_max_ps(_mm512_setzero_ps(), pack70);
// Masked stores: mask 127 writes only lanes 0-6 of each packed vector, so
// lanes 7-15 of the pack results never reach memory.
// pm13-packed vectors go to constant offsets 40, 488, 936, 1384, 1832
// (448 apart); each pm14-packed partner goes 50240 further on.
// NOTE(review): datPtr2 is declared outside this excerpt; the offsets look
// like byte offsets (448 = one output row, 50240 = one channel) -- confirm.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack61);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack62);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack63);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack64);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack65);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack66);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack67);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack68);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack69);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+40*t9, 127, pack70);
// Fixed-index segment: t10 is always 0 here, so every 256*t10 term adds nothing.
ptrdiff_t t10 = 0;
// Load 16 vectors from sfPtr3. There are four groups at constant offsets
// 512, 2167296, 4334080 and 6500864 (each 2166784 past the previous one);
// inside a group the order is Re, Im, Re, Im at +0, +64, +128, +192.
// sfRe/sfIm 121..124 take the first Re/Im pair of each group, 125..128 the second.
// NOTE(review): the 64-unit spacing equals sizeof(__m512) in bytes, so sfPtr3 is
// presumably a byte pointer; its declaration is outside this fragment — confirm.
__m512 sfRe121 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm121 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe125 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm125 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe122 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm122 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe126 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm126 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe123 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm123 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe127 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm127 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe124 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm124 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfRe128 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
__m512 sfIm128 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k29+768*r6+256*t10);
// Butterfly network over the 16 vectors sfRe/sfIm 121..128. Two independent
// streams are interleaved statement by statement: 121..124 (results
// ifft1493..ifft1500) and 125..128 (results ifft1577..ifft1584).
// NOTE(review): the sfRe/sfIm and ifft* names suggest an inverse FFT of
// frequency-domain data; the transform size and layout cannot be confirmed
// from this fragment alone.
//
// Stage 1: only the first-group vectors (121 and 125) are lane-permuted.
// ifft1409 lanes, low to high: 0,1,2,2,4,6,6,4 | 8,9,10,10,12,14,14,12.
__m512i ifft1409 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1410 = _mm512_permutexvar_ps(ifft1409, sfRe121);
__m512 ifft1501 = _mm512_permutexvar_ps(ifft1409, sfRe125);
// ifft1411 lanes, low to high: 1,0,3,3,5,7,7,5 | 9,8,11,11,13,15,15,13.
__m512i ifft1411 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1412 = _mm512_permutexvar_ps(ifft1411, sfRe121);
__m512 ifft1502 = _mm512_permutexvar_ps(ifft1411, sfRe125);
__m512 ifft1413 = _mm512_permutexvar_ps(ifft1409, sfIm121);
__m512 ifft1503 = _mm512_permutexvar_ps(ifft1409, sfIm125);
__m512 ifft1414 = _mm512_permutexvar_ps(ifft1411, sfIm121);
__m512 ifft1504 = _mm512_permutexvar_ps(ifft1411, sfIm125);
// Mask 65021 = 0xFDFD: every lane except 1 and 9 is recomputed as
// (+/-)a*sign + c; lanes 1 and 9 keep the first operand unchanged.
__m512 ifft1415 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1416 = _mm512_mask_fmadd_ps(ifft1414, 65021, ifft1415, ifft1410);
__m512 ifft1505 = _mm512_mask_fmadd_ps(ifft1504, 65021, ifft1415, ifft1501);
__m512 ifft1417 = _mm512_mask_fnmadd_ps(ifft1413, 65021, ifft1415, ifft1412);
__m512 ifft1506 = _mm512_mask_fnmadd_ps(ifft1503, 65021, ifft1415, ifft1502);
// Stage 2: butterfly between adjacent lanes. Shuffle immediate 177 (0xB1) swaps
// each even/odd lane pair; with signs (+1, -1) the even lane becomes x0 + x1
// and the odd lane becomes x0 - x1.
__m512 ifft1418 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1419 = _mm512_fmadd_ps(ifft1416, ifft1418, _mm512_shuffle_ps(ifft1416, ifft1416, 177));
__m512 ifft1507 = _mm512_fmadd_ps(ifft1505, ifft1418, _mm512_shuffle_ps(ifft1505, ifft1505, 177));
__m512 ifft1420 = _mm512_fmadd_ps(ifft1417, ifft1418, _mm512_shuffle_ps(ifft1417, ifft1417, 177));
__m512 ifft1508 = _mm512_fmadd_ps(ifft1506, ifft1418, _mm512_shuffle_ps(ifft1506, ifft1506, 177));
__m512 ifft1421 = _mm512_fmadd_ps(sfRe122, ifft1418, _mm512_shuffle_ps(sfRe122, sfRe122, 177));
__m512 ifft1509 = _mm512_fmadd_ps(sfRe126, ifft1418, _mm512_shuffle_ps(sfRe126, sfRe126, 177));
__m512 ifft1422 = _mm512_fmadd_ps(sfIm122, ifft1418, _mm512_shuffle_ps(sfIm122, sfIm122, 177));
__m512 ifft1510 = _mm512_fmadd_ps(sfIm126, ifft1418, _mm512_shuffle_ps(sfIm126, sfIm126, 177));
__m512 ifft1423 = _mm512_fmadd_ps(sfRe123, ifft1418, _mm512_shuffle_ps(sfRe123, sfRe123, 177));
__m512 ifft1511 = _mm512_fmadd_ps(sfRe127, ifft1418, _mm512_shuffle_ps(sfRe127, sfRe127, 177));
__m512 ifft1424 = _mm512_fmadd_ps(sfIm123, ifft1418, _mm512_shuffle_ps(sfIm123, sfIm123, 177));
__m512 ifft1512 = _mm512_fmadd_ps(sfIm127, ifft1418, _mm512_shuffle_ps(sfIm127, sfIm127, 177));
__m512 ifft1425 = _mm512_fmadd_ps(sfRe124, ifft1418, _mm512_shuffle_ps(sfRe124, sfRe124, 177));
__m512 ifft1513 = _mm512_fmadd_ps(sfRe128, ifft1418, _mm512_shuffle_ps(sfRe128, sfRe128, 177));
__m512 ifft1426 = _mm512_fmadd_ps(sfIm124, ifft1418, _mm512_shuffle_ps(sfIm124, sfIm124, 177));
__m512 ifft1514 = _mm512_fmadd_ps(sfIm128, ifft1418, _mm512_shuffle_ps(sfIm128, sfIm128, 177));
// Stage 3: per-lane factors. With c1 = ifft1427 and c2 = ifft1436, each (x, y)
// pair becomes (x*c1 - y*c2, x*c2 + y*c1). Only lanes 3, 5, 7 (and 11, 13, 15)
// have c2 != 0; every other lane has c1 = 1, c2 = 0 and passes through.
__m512 ifft1427 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1428 = _mm512_mul_ps(ifft1419, ifft1427);
__m512 ifft1515 = _mm512_mul_ps(ifft1507, ifft1427);
__m512 ifft1429 = _mm512_mul_ps(ifft1420, ifft1427);
__m512 ifft1516 = _mm512_mul_ps(ifft1508, ifft1427);
__m512 ifft1430 = _mm512_mul_ps(ifft1421, ifft1427);
__m512 ifft1517 = _mm512_mul_ps(ifft1509, ifft1427);
__m512 ifft1431 = _mm512_mul_ps(ifft1422, ifft1427);
__m512 ifft1518 = _mm512_mul_ps(ifft1510, ifft1427);
__m512 ifft1432 = _mm512_mul_ps(ifft1423, ifft1427);
__m512 ifft1519 = _mm512_mul_ps(ifft1511, ifft1427);
__m512 ifft1433 = _mm512_mul_ps(ifft1424, ifft1427);
__m512 ifft1520 = _mm512_mul_ps(ifft1512, ifft1427);
__m512 ifft1434 = _mm512_mul_ps(ifft1425, ifft1427);
__m512 ifft1521 = _mm512_mul_ps(ifft1513, ifft1427);
__m512 ifft1435 = _mm512_mul_ps(ifft1426, ifft1427);
__m512 ifft1522 = _mm512_mul_ps(ifft1514, ifft1427);
__m512 ifft1436 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1437 = _mm512_fnmadd_ps(ifft1420, ifft1436, ifft1428);
__m512 ifft1523 = _mm512_fnmadd_ps(ifft1508, ifft1436, ifft1515);
__m512 ifft1438 = _mm512_fmadd_ps(ifft1419, ifft1436, ifft1429);
__m512 ifft1524 = _mm512_fmadd_ps(ifft1507, ifft1436, ifft1516);
__m512 ifft1439 = _mm512_fnmadd_ps(ifft1422, ifft1436, ifft1430);
__m512 ifft1525 = _mm512_fnmadd_ps(ifft1510, ifft1436, ifft1517);
__m512 ifft1440 = _mm512_fmadd_ps(ifft1421, ifft1436, ifft1431);
__m512 ifft1526 = _mm512_fmadd_ps(ifft1509, ifft1436, ifft1518);
__m512 ifft1441 = _mm512_fnmadd_ps(ifft1424, ifft1436, ifft1432);
__m512 ifft1527 = _mm512_fnmadd_ps(ifft1512, ifft1436, ifft1519);
__m512 ifft1442 = _mm512_fmadd_ps(ifft1423, ifft1436, ifft1433);
__m512 ifft1528 = _mm512_fmadd_ps(ifft1511, ifft1436, ifft1520);
__m512 ifft1443 = _mm512_fnmadd_ps(ifft1426, ifft1436, ifft1434);
__m512 ifft1529 = _mm512_fnmadd_ps(ifft1514, ifft1436, ifft1521);
__m512 ifft1444 = _mm512_fmadd_ps(ifft1425, ifft1436, ifft1435);
__m512 ifft1530 = _mm512_fmadd_ps(ifft1513, ifft1436, ifft1522);
// Stage 4: butterfly at lane distance 2. Shuffle immediate 78 (0x4E) swaps the
// two 64-bit halves of every 128-bit lane; signs are (+1, +1, -1, -1).
__m512 ifft1445 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1446 = _mm512_fmadd_ps(ifft1437, ifft1445, _mm512_shuffle_ps(ifft1437, ifft1437, 78));
__m512 ifft1531 = _mm512_fmadd_ps(ifft1523, ifft1445, _mm512_shuffle_ps(ifft1523, ifft1523, 78));
__m512 ifft1447 = _mm512_fmadd_ps(ifft1438, ifft1445, _mm512_shuffle_ps(ifft1438, ifft1438, 78));
__m512 ifft1532 = _mm512_fmadd_ps(ifft1524, ifft1445, _mm512_shuffle_ps(ifft1524, ifft1524, 78));
__m512 ifft1448 = _mm512_fmadd_ps(ifft1439, ifft1445, _mm512_shuffle_ps(ifft1439, ifft1439, 78));
__m512 ifft1533 = _mm512_fmadd_ps(ifft1525, ifft1445, _mm512_shuffle_ps(ifft1525, ifft1525, 78));
__m512 ifft1449 = _mm512_fmadd_ps(ifft1440, ifft1445, _mm512_shuffle_ps(ifft1440, ifft1440, 78));
__m512 ifft1534 = _mm512_fmadd_ps(ifft1526, ifft1445, _mm512_shuffle_ps(ifft1526, ifft1526, 78));
__m512 ifft1450 = _mm512_fmadd_ps(ifft1441, ifft1445, _mm512_shuffle_ps(ifft1441, ifft1441, 78));
__m512 ifft1535 = _mm512_fmadd_ps(ifft1527, ifft1445, _mm512_shuffle_ps(ifft1527, ifft1527, 78));
__m512 ifft1451 = _mm512_fmadd_ps(ifft1442, ifft1445, _mm512_shuffle_ps(ifft1442, ifft1442, 78));
__m512 ifft1536 = _mm512_fmadd_ps(ifft1528, ifft1445, _mm512_shuffle_ps(ifft1528, ifft1528, 78));
__m512 ifft1452 = _mm512_fmadd_ps(ifft1443, ifft1445, _mm512_shuffle_ps(ifft1443, ifft1443, 78));
__m512 ifft1537 = _mm512_fmadd_ps(ifft1529, ifft1445, _mm512_shuffle_ps(ifft1529, ifft1529, 78));
__m512 ifft1453 = _mm512_fmadd_ps(ifft1444, ifft1445, _mm512_shuffle_ps(ifft1444, ifft1444, 78));
__m512 ifft1538 = _mm512_fmadd_ps(ifft1530, ifft1445, _mm512_shuffle_ps(ifft1530, ifft1530, 78));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14, 15. In those lanes each (x, y)
// pair becomes (-y, x); all other lanes are left as they were.
__m512 ifft1454 = _mm512_mask_sub_ps(ifft1446, 49344, _mm512_setzero_ps(), ifft1447);
__m512 ifft1539 = _mm512_mask_sub_ps(ifft1531, 49344, _mm512_setzero_ps(), ifft1532);
__m512 ifft1455 = _mm512_mask_mov_ps(ifft1447, 49344, ifft1446);
__m512 ifft1540 = _mm512_mask_mov_ps(ifft1532, 49344, ifft1531);
__m512 ifft1456 = _mm512_mask_sub_ps(ifft1448, 49344, _mm512_setzero_ps(), ifft1449);
__m512 ifft1541 = _mm512_mask_sub_ps(ifft1533, 49344, _mm512_setzero_ps(), ifft1534);
__m512 ifft1457 = _mm512_mask_mov_ps(ifft1449, 49344, ifft1448);
__m512 ifft1542 = _mm512_mask_mov_ps(ifft1534, 49344, ifft1533);
__m512 ifft1458 = _mm512_mask_sub_ps(ifft1450, 49344, _mm512_setzero_ps(), ifft1451);
__m512 ifft1543 = _mm512_mask_sub_ps(ifft1535, 49344, _mm512_setzero_ps(), ifft1536);
__m512 ifft1459 = _mm512_mask_mov_ps(ifft1451, 49344, ifft1450);
__m512 ifft1544 = _mm512_mask_mov_ps(ifft1536, 49344, ifft1535);
__m512 ifft1460 = _mm512_mask_sub_ps(ifft1452, 49344, _mm512_setzero_ps(), ifft1453);
__m512 ifft1545 = _mm512_mask_sub_ps(ifft1537, 49344, _mm512_setzero_ps(), ifft1538);
__m512 ifft1461 = _mm512_mask_mov_ps(ifft1453, 49344, ifft1452);
__m512 ifft1546 = _mm512_mask_mov_ps(ifft1538, 49344, ifft1537);
// Stage 5: butterfly at lane distance 4. _mm512_shuffle_f32x4 with immediate 177
// swaps adjacent 128-bit lanes; signs are +1 for lanes 0..3 and -1 for lanes 4..7.
// ifft1468 / ifft1552 use fnmsub, so that one vector per stream is the negation
// of what the fmadd form used for the others would give.
__m512 ifft1462 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1463 = _mm512_fmadd_ps(ifft1454, ifft1462, _mm512_shuffle_f32x4(ifft1454, ifft1454, 177));
__m512 ifft1547 = _mm512_fmadd_ps(ifft1539, ifft1462, _mm512_shuffle_f32x4(ifft1539, ifft1539, 177));
__m512 ifft1464 = _mm512_fmadd_ps(ifft1455, ifft1462, _mm512_shuffle_f32x4(ifft1455, ifft1455, 177));
__m512 ifft1548 = _mm512_fmadd_ps(ifft1540, ifft1462, _mm512_shuffle_f32x4(ifft1540, ifft1540, 177));
__m512 ifft1465 = _mm512_fmadd_ps(ifft1456, ifft1462, _mm512_shuffle_f32x4(ifft1456, ifft1456, 177));
__m512 ifft1549 = _mm512_fmadd_ps(ifft1541, ifft1462, _mm512_shuffle_f32x4(ifft1541, ifft1541, 177));
__m512 ifft1466 = _mm512_fmadd_ps(ifft1457, ifft1462, _mm512_shuffle_f32x4(ifft1457, ifft1457, 177));
__m512 ifft1550 = _mm512_fmadd_ps(ifft1542, ifft1462, _mm512_shuffle_f32x4(ifft1542, ifft1542, 177));
__m512 ifft1467 = _mm512_fmadd_ps(ifft1458, ifft1462, _mm512_shuffle_f32x4(ifft1458, ifft1458, 177));
__m512 ifft1551 = _mm512_fmadd_ps(ifft1543, ifft1462, _mm512_shuffle_f32x4(ifft1543, ifft1543, 177));
__m512 ifft1468 = _mm512_fnmsub_ps(ifft1459, ifft1462, _mm512_shuffle_f32x4(ifft1459, ifft1459, 177));
__m512 ifft1552 = _mm512_fnmsub_ps(ifft1544, ifft1462, _mm512_shuffle_f32x4(ifft1544, ifft1544, 177));
__m512 ifft1469 = _mm512_fmadd_ps(ifft1460, ifft1462, _mm512_shuffle_f32x4(ifft1460, ifft1460, 177));
__m512 ifft1553 = _mm512_fmadd_ps(ifft1545, ifft1462, _mm512_shuffle_f32x4(ifft1545, ifft1545, 177));
__m512 ifft1470 = _mm512_fmadd_ps(ifft1461, ifft1462, _mm512_shuffle_f32x4(ifft1461, ifft1461, 177));
__m512 ifft1554 = _mm512_fmadd_ps(ifft1546, ifft1462, _mm512_shuffle_f32x4(ifft1546, ifft1546, 177));
// Stage 6: combine the eight vectors of each stream with whole-vector adds and
// subtracts. The scale factors 3.125e-02 (1/32) and 1.5625e-02 (1/64) and the
// 7.0710677e-01 factor are folded into the fused multiply-adds.
__m512 ifft1471 = _mm512_add_ps(ifft1463, ifft1464);
__m512 ifft1555 = _mm512_add_ps(ifft1547, ifft1548);
__m512 ifft1472 = _mm512_sub_ps(ifft1463, ifft1464);
__m512 ifft1556 = _mm512_sub_ps(ifft1547, ifft1548);
__m512 ifft1473 = _mm512_sub_ps(ifft1465, ifft1469);
__m512 ifft1557 = _mm512_sub_ps(ifft1549, ifft1553);
__m512 ifft1474 = _mm512_add_ps(ifft1466, ifft1470);
__m512 ifft1558 = _mm512_add_ps(ifft1550, ifft1554);
__m512 ifft1475 = _mm512_add_ps(ifft1465, ifft1469);
__m512 ifft1559 = _mm512_add_ps(ifft1549, ifft1553);
__m512 ifft1476 = _mm512_sub_ps(ifft1466, ifft1470);
__m512 ifft1560 = _mm512_sub_ps(ifft1550, ifft1554);
__m512 ifft1477 = _mm512_mul_ps(ifft1467, _mm512_set1_ps(3.125e-02f));
__m512 ifft1561 = _mm512_mul_ps(ifft1551, _mm512_set1_ps(3.125e-02f));
__m512 ifft1478 = _mm512_mul_ps(ifft1468, _mm512_set1_ps(3.125e-02f));
__m512 ifft1562 = _mm512_mul_ps(ifft1552, _mm512_set1_ps(3.125e-02f));
__m512 ifft1479 = _mm512_fmadd_ps(ifft1471, _mm512_set1_ps(1.5625e-02f), ifft1477);
__m512 ifft1563 = _mm512_fmadd_ps(ifft1555, _mm512_set1_ps(1.5625e-02f), ifft1561);
__m512 ifft1480 = _mm512_fmsub_ps(ifft1471, _mm512_set1_ps(1.5625e-02f), ifft1477);
__m512 ifft1564 = _mm512_fmsub_ps(ifft1555, _mm512_set1_ps(1.5625e-02f), ifft1561);
__m512 ifft1481 = _mm512_fmadd_ps(ifft1472, _mm512_set1_ps(1.5625e-02f), ifft1478);
__m512 ifft1565 = _mm512_fmadd_ps(ifft1556, _mm512_set1_ps(1.5625e-02f), ifft1562);
__m512 ifft1482 = _mm512_fmsub_ps(ifft1472, _mm512_set1_ps(1.5625e-02f), ifft1478);
__m512 ifft1566 = _mm512_fmsub_ps(ifft1556, _mm512_set1_ps(1.5625e-02f), ifft1562);
__m512 ifft1483 = _mm512_add_ps(ifft1473, ifft1474);
__m512 ifft1567 = _mm512_add_ps(ifft1557, ifft1558);
__m512 ifft1484 = _mm512_sub_ps(ifft1473, ifft1474);
__m512 ifft1568 = _mm512_sub_ps(ifft1557, ifft1558);
__m512 ifft1485 = _mm512_fnmadd_ps(ifft1483, _mm512_set1_ps(7.0710677e-01f), ifft1475);
__m512 ifft1569 = _mm512_fnmadd_ps(ifft1567, _mm512_set1_ps(7.0710677e-01f), ifft1559);
__m512 ifft1486 = _mm512_fmadd_ps(ifft1483, _mm512_set1_ps(7.0710677e-01f), ifft1475);
__m512 ifft1570 = _mm512_fmadd_ps(ifft1567, _mm512_set1_ps(7.0710677e-01f), ifft1559);
__m512 ifft1487 = _mm512_fmadd_ps(ifft1484, _mm512_set1_ps(7.0710677e-01f), ifft1476);
__m512 ifft1571 = _mm512_fmadd_ps(ifft1568, _mm512_set1_ps(7.0710677e-01f), ifft1560);
__m512 ifft1488 = _mm512_fmsub_ps(ifft1484, _mm512_set1_ps(7.0710677e-01f), ifft1476);
__m512 ifft1572 = _mm512_fmsub_ps(ifft1568, _mm512_set1_ps(7.0710677e-01f), ifft1560);
__m512 ifft1489 = _mm512_add_ps(ifft1485, ifft1486);
__m512 ifft1573 = _mm512_add_ps(ifft1569, ifft1570);
__m512 ifft1490 = _mm512_sub_ps(ifft1485, ifft1486);
__m512 ifft1574 = _mm512_sub_ps(ifft1569, ifft1570);
__m512 ifft1491 = _mm512_add_ps(ifft1487, ifft1488);
__m512 ifft1575 = _mm512_add_ps(ifft1571, ifft1572);
__m512 ifft1492 = _mm512_sub_ps(ifft1487, ifft1488);
__m512 ifft1576 = _mm512_sub_ps(ifft1571, ifft1572);
__m512 ifft1493 = _mm512_fmadd_ps(ifft1489, _mm512_set1_ps(1.5625e-02f), ifft1479);
__m512 ifft1577 = _mm512_fmadd_ps(ifft1573, _mm512_set1_ps(1.5625e-02f), ifft1563);
__m512 ifft1494 = _mm512_fnmadd_ps(ifft1489, _mm512_set1_ps(1.5625e-02f), ifft1479);
__m512 ifft1578 = _mm512_fnmadd_ps(ifft1573, _mm512_set1_ps(1.5625e-02f), ifft1563);
__m512 ifft1495 = _mm512_fmadd_ps(ifft1491, _mm512_set1_ps(1.5625e-02f), ifft1481);
__m512 ifft1579 = _mm512_fmadd_ps(ifft1575, _mm512_set1_ps(1.5625e-02f), ifft1565);
__m512 ifft1496 = _mm512_fnmadd_ps(ifft1491, _mm512_set1_ps(1.5625e-02f), ifft1481);
__m512 ifft1580 = _mm512_fnmadd_ps(ifft1575, _mm512_set1_ps(1.5625e-02f), ifft1565);
__m512 ifft1497 = _mm512_fnmadd_ps(ifft1492, _mm512_set1_ps(1.5625e-02f), ifft1480);
__m512 ifft1581 = _mm512_fnmadd_ps(ifft1576, _mm512_set1_ps(1.5625e-02f), ifft1564);
__m512 ifft1498 = _mm512_fmadd_ps(ifft1492, _mm512_set1_ps(1.5625e-02f), ifft1480);
__m512 ifft1582 = _mm512_fmadd_ps(ifft1576, _mm512_set1_ps(1.5625e-02f), ifft1564);
__m512 ifft1499 = _mm512_fmadd_ps(ifft1490, _mm512_set1_ps(1.5625e-02f), ifft1482);
__m512 ifft1583 = _mm512_fmadd_ps(ifft1574, _mm512_set1_ps(1.5625e-02f), ifft1566);
__m512 ifft1500 = _mm512_fnmadd_ps(ifft1490, _mm512_set1_ps(1.5625e-02f), ifft1482);
__m512 ifft1584 = _mm512_fnmadd_ps(ifft1574, _mm512_set1_ps(1.5625e-02f), ifft1566);
// Keep five of the eight result vectors per stream: dat670..dat674 from the
// first stream and dat675..dat679 from the second.
__m512 dat670 = ifft1493;
__m512 dat675 = ifft1577;
__m512 dat671 = ifft1495;
__m512 dat676 = ifft1579;
__m512 dat672 = ifft1497;
__m512 dat677 = ifft1581;
__m512 dat673 = ifft1499;
__m512 dat678 = ifft1583;
__m512 dat674 = ifft1494;
__m512 dat679 = ifft1578;
// The remaining three results per stream are not used in this segment; the
// void casts mark them as intentionally discarded.
(void)ifft1496;
(void)ifft1580;
(void)ifft1498;
(void)ifft1582;
(void)ifft1500;
(void)ifft1584;
// Pack / ReLU / store phase for the t10 segment (inputs dat670..dat679).
// pm15, lanes low to high: 0..4 pick lanes 0..4 of the first operand,
// 5..9 pick indices 16..20 (lanes 0..4 of the second operand); lanes 10..15
// repeat the pattern and are never stored (store mask below is 1023).
__m512i pm15 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack71 = _mm512_permutex2var_ps(dat670, pm15, dat675);
// pm16, lanes low to high: 0..4 pick indices 24..28 (lanes 8..12 of the second
// operand), 5..9 pick lanes 8..12 of the first operand.
__m512i pm16 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack72 = _mm512_permutex2var_ps(dat670, pm16, dat675);
__m512 pack73 = _mm512_permutex2var_ps(dat671, pm15, dat676);
__m512 pack74 = _mm512_permutex2var_ps(dat671, pm16, dat676);
__m512 pack75 = _mm512_permutex2var_ps(dat672, pm15, dat677);
__m512 pack76 = _mm512_permutex2var_ps(dat672, pm16, dat677);
__m512 pack77 = _mm512_permutex2var_ps(dat673, pm15, dat678);
__m512 pack78 = _mm512_permutex2var_ps(dat673, pm16, dat678);
__m512 pack79 = _mm512_permutex2var_ps(dat674, pm15, dat679);
__m512 pack80 = _mm512_permutex2var_ps(dat674, pm16, dat679);
// ReLU: clamp every lane to max(0, x).
pack71 = _mm512_max_ps(_mm512_setzero_ps(), pack71);
pack72 = _mm512_max_ps(_mm512_setzero_ps(), pack72);
pack73 = _mm512_max_ps(_mm512_setzero_ps(), pack73);
pack74 = _mm512_max_ps(_mm512_setzero_ps(), pack74);
pack75 = _mm512_max_ps(_mm512_setzero_ps(), pack75);
pack76 = _mm512_max_ps(_mm512_setzero_ps(), pack76);
pack77 = _mm512_max_ps(_mm512_setzero_ps(), pack77);
pack78 = _mm512_max_ps(_mm512_setzero_ps(), pack78);
pack79 = _mm512_max_ps(_mm512_setzero_ps(), pack79);
pack80 = _mm512_max_ps(_mm512_setzero_ps(), pack80);
// Masked stores to datPtr2: mask 1023 writes only the low 10 lanes.
// Odd-numbered packs go to constant offsets 1860, 2308, 2756, 3204, 3652
// (448 apart); even-numbered packs go to the same offsets plus 50240.
// The 0*t10 term is always zero.
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack71);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack72);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack73);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack74);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack75);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack76);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack77);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack78);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack79);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k29+100480*r6+448*toH5+4*toW5+0*t10, 1023, pack80);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 4;
}
if (rel5 < 7) {
ptrdiff_t toH6 = base5+5;
ptrdiff_t toW6 = -110+30*rel5;
ptrdiff_t jj13 = 6-rel5+j5;
for (; j5 <= jj13; toW6 += 30) {
ptrdiff_t k30 = 16*w21;
for (; k30 != 16; ++k30) {
ptrdiff_t r7 = 0;
for (; r7 != 2; ++r7) {
ptrdiff_t t11 = 0;
for (; t11 < 3; ++t11) {
__m512 sfRe129 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm129 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe133 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm133 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe130 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm130 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe134 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm134 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe131 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm131 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe135 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm135 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe132 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm132 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfRe136 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512 sfIm136 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k30+768*r7+256*t11);
__m512i ifft1585 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1586 = _mm512_permutexvar_ps(ifft1585, sfRe129);
__m512 ifft1677 = _mm512_permutexvar_ps(ifft1585, sfRe133);
__m512i ifft1587 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1588 = _mm512_permutexvar_ps(ifft1587, sfRe129);
__m512 ifft1678 = _mm512_permutexvar_ps(ifft1587, sfRe133);
__m512 ifft1589 = _mm512_permutexvar_ps(ifft1585, sfIm129);
__m512 ifft1679 = _mm512_permutexvar_ps(ifft1585, sfIm133);
__m512 ifft1590 = _mm512_permutexvar_ps(ifft1587, sfIm129);
__m512 ifft1680 = _mm512_permutexvar_ps(ifft1587, sfIm133);
__m512 ifft1591 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1592 = _mm512_mask_fmadd_ps(ifft1590, 65021, ifft1591, ifft1586);
__m512 ifft1681 = _mm512_mask_fmadd_ps(ifft1680, 65021, ifft1591, ifft1677);
__m512 ifft1593 = _mm512_mask_fnmadd_ps(ifft1589, 65021, ifft1591, ifft1588);
__m512 ifft1682 = _mm512_mask_fnmadd_ps(ifft1679, 65021, ifft1591, ifft1678);
__m512 ifft1594 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1595 = _mm512_fmadd_ps(ifft1592, ifft1594, _mm512_shuffle_ps(ifft1592, ifft1592, 177));
__m512 ifft1683 = _mm512_fmadd_ps(ifft1681, ifft1594, _mm512_shuffle_ps(ifft1681, ifft1681, 177));
__m512 ifft1596 = _mm512_fmadd_ps(ifft1593, ifft1594, _mm512_shuffle_ps(ifft1593, ifft1593, 177));
__m512 ifft1684 = _mm512_fmadd_ps(ifft1682, ifft1594, _mm512_shuffle_ps(ifft1682, ifft1682, 177));
__m512 ifft1597 = _mm512_fmadd_ps(sfRe130, ifft1594, _mm512_shuffle_ps(sfRe130, sfRe130, 177));
__m512 ifft1685 = _mm512_fmadd_ps(sfRe134, ifft1594, _mm512_shuffle_ps(sfRe134, sfRe134, 177));
__m512 ifft1598 = _mm512_fmadd_ps(sfIm130, ifft1594, _mm512_shuffle_ps(sfIm130, sfIm130, 177));
__m512 ifft1686 = _mm512_fmadd_ps(sfIm134, ifft1594, _mm512_shuffle_ps(sfIm134, sfIm134, 177));
__m512 ifft1599 = _mm512_fmadd_ps(sfRe131, ifft1594, _mm512_shuffle_ps(sfRe131, sfRe131, 177));
__m512 ifft1687 = _mm512_fmadd_ps(sfRe135, ifft1594, _mm512_shuffle_ps(sfRe135, sfRe135, 177));
__m512 ifft1600 = _mm512_fmadd_ps(sfIm131, ifft1594, _mm512_shuffle_ps(sfIm131, sfIm131, 177));
__m512 ifft1688 = _mm512_fmadd_ps(sfIm135, ifft1594, _mm512_shuffle_ps(sfIm135, sfIm135, 177));
__m512 ifft1601 = _mm512_fmadd_ps(sfRe132, ifft1594, _mm512_shuffle_ps(sfRe132, sfRe132, 177));
__m512 ifft1689 = _mm512_fmadd_ps(sfRe136, ifft1594, _mm512_shuffle_ps(sfRe136, sfRe136, 177));
__m512 ifft1602 = _mm512_fmadd_ps(sfIm132, ifft1594, _mm512_shuffle_ps(sfIm132, sfIm132, 177));
__m512 ifft1690 = _mm512_fmadd_ps(sfIm136, ifft1594, _mm512_shuffle_ps(sfIm136, sfIm136, 177));
__m512 ifft1603 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1604 = _mm512_mul_ps(ifft1595, ifft1603);
__m512 ifft1691 = _mm512_mul_ps(ifft1683, ifft1603);
__m512 ifft1605 = _mm512_mul_ps(ifft1596, ifft1603);
__m512 ifft1692 = _mm512_mul_ps(ifft1684, ifft1603);
__m512 ifft1606 = _mm512_mul_ps(ifft1597, ifft1603);
__m512 ifft1693 = _mm512_mul_ps(ifft1685, ifft1603);
__m512 ifft1607 = _mm512_mul_ps(ifft1598, ifft1603);
__m512 ifft1694 = _mm512_mul_ps(ifft1686, ifft1603);
__m512 ifft1608 = _mm512_mul_ps(ifft1599, ifft1603);
__m512 ifft1695 = _mm512_mul_ps(ifft1687, ifft1603);
__m512 ifft1609 = _mm512_mul_ps(ifft1600, ifft1603);
__m512 ifft1696 = _mm512_mul_ps(ifft1688, ifft1603);
__m512 ifft1610 = _mm512_mul_ps(ifft1601, ifft1603);
__m512 ifft1697 = _mm512_mul_ps(ifft1689, ifft1603);
__m512 ifft1611 = _mm512_mul_ps(ifft1602, ifft1603);
__m512 ifft1698 = _mm512_mul_ps(ifft1690, ifft1603);
__m512 ifft1612 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1613 = _mm512_fnmadd_ps(ifft1596, ifft1612, ifft1604);
__m512 ifft1699 = _mm512_fnmadd_ps(ifft1684, ifft1612, ifft1691);
__m512 ifft1614 = _mm512_fmadd_ps(ifft1595, ifft1612, ifft1605);
__m512 ifft1700 = _mm512_fmadd_ps(ifft1683, ifft1612, ifft1692);
__m512 ifft1615 = _mm512_fnmadd_ps(ifft1598, ifft1612, ifft1606);
__m512 ifft1701 = _mm512_fnmadd_ps(ifft1686, ifft1612, ifft1693);
__m512 ifft1616 = _mm512_fmadd_ps(ifft1597, ifft1612, ifft1607);
__m512 ifft1702 = _mm512_fmadd_ps(ifft1685, ifft1612, ifft1694);
__m512 ifft1617 = _mm512_fnmadd_ps(ifft1600, ifft1612, ifft1608);
__m512 ifft1703 = _mm512_fnmadd_ps(ifft1688, ifft1612, ifft1695);
__m512 ifft1618 = _mm512_fmadd_ps(ifft1599, ifft1612, ifft1609);
__m512 ifft1704 = _mm512_fmadd_ps(ifft1687, ifft1612, ifft1696);
__m512 ifft1619 = _mm512_fnmadd_ps(ifft1602, ifft1612, ifft1610);
__m512 ifft1705 = _mm512_fnmadd_ps(ifft1690, ifft1612, ifft1697);
__m512 ifft1620 = _mm512_fmadd_ps(ifft1601, ifft1612, ifft1611);
__m512 ifft1706 = _mm512_fmadd_ps(ifft1689, ifft1612, ifft1698);
__m512 ifft1621 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1622 = _mm512_fmadd_ps(ifft1613, ifft1621, _mm512_shuffle_ps(ifft1613, ifft1613, 78));
__m512 ifft1707 = _mm512_fmadd_ps(ifft1699, ifft1621, _mm512_shuffle_ps(ifft1699, ifft1699, 78));
__m512 ifft1623 = _mm512_fmadd_ps(ifft1614, ifft1621, _mm512_shuffle_ps(ifft1614, ifft1614, 78));
__m512 ifft1708 = _mm512_fmadd_ps(ifft1700, ifft1621, _mm512_shuffle_ps(ifft1700, ifft1700, 78));
__m512 ifft1624 = _mm512_fmadd_ps(ifft1615, ifft1621, _mm512_shuffle_ps(ifft1615, ifft1615, 78));
__m512 ifft1709 = _mm512_fmadd_ps(ifft1701, ifft1621, _mm512_shuffle_ps(ifft1701, ifft1701, 78));
__m512 ifft1625 = _mm512_fmadd_ps(ifft1616, ifft1621, _mm512_shuffle_ps(ifft1616, ifft1616, 78));
__m512 ifft1710 = _mm512_fmadd_ps(ifft1702, ifft1621, _mm512_shuffle_ps(ifft1702, ifft1702, 78));
__m512 ifft1626 = _mm512_fmadd_ps(ifft1617, ifft1621, _mm512_shuffle_ps(ifft1617, ifft1617, 78));
__m512 ifft1711 = _mm512_fmadd_ps(ifft1703, ifft1621, _mm512_shuffle_ps(ifft1703, ifft1703, 78));
__m512 ifft1627 = _mm512_fmadd_ps(ifft1618, ifft1621, _mm512_shuffle_ps(ifft1618, ifft1618, 78));
__m512 ifft1712 = _mm512_fmadd_ps(ifft1704, ifft1621, _mm512_shuffle_ps(ifft1704, ifft1704, 78));
__m512 ifft1628 = _mm512_fmadd_ps(ifft1619, ifft1621, _mm512_shuffle_ps(ifft1619, ifft1619, 78));
__m512 ifft1713 = _mm512_fmadd_ps(ifft1705, ifft1621, _mm512_shuffle_ps(ifft1705, ifft1705, 78));
__m512 ifft1629 = _mm512_fmadd_ps(ifft1620, ifft1621, _mm512_shuffle_ps(ifft1620, ifft1620, 78));
__m512 ifft1714 = _mm512_fmadd_ps(ifft1706, ifft1621, _mm512_shuffle_ps(ifft1706, ifft1706, 78));
__m512 ifft1630 = _mm512_mask_sub_ps(ifft1622, 49344, _mm512_setzero_ps(), ifft1623);
__m512 ifft1715 = _mm512_mask_sub_ps(ifft1707, 49344, _mm512_setzero_ps(), ifft1708);
__m512 ifft1631 = _mm512_mask_mov_ps(ifft1623, 49344, ifft1622);
__m512 ifft1716 = _mm512_mask_mov_ps(ifft1708, 49344, ifft1707);
__m512 ifft1632 = _mm512_mask_sub_ps(ifft1624, 49344, _mm512_setzero_ps(), ifft1625);
__m512 ifft1717 = _mm512_mask_sub_ps(ifft1709, 49344, _mm512_setzero_ps(), ifft1710);
__m512 ifft1633 = _mm512_mask_mov_ps(ifft1625, 49344, ifft1624);
__m512 ifft1718 = _mm512_mask_mov_ps(ifft1710, 49344, ifft1709);
__m512 ifft1634 = _mm512_mask_sub_ps(ifft1626, 49344, _mm512_setzero_ps(), ifft1627);
__m512 ifft1719 = _mm512_mask_sub_ps(ifft1711, 49344, _mm512_setzero_ps(), ifft1712);
__m512 ifft1635 = _mm512_mask_mov_ps(ifft1627, 49344, ifft1626);
__m512 ifft1720 = _mm512_mask_mov_ps(ifft1712, 49344, ifft1711);
__m512 ifft1636 = _mm512_mask_sub_ps(ifft1628, 49344, _mm512_setzero_ps(), ifft1629);
__m512 ifft1721 = _mm512_mask_sub_ps(ifft1713, 49344, _mm512_setzero_ps(), ifft1714);
__m512 ifft1637 = _mm512_mask_mov_ps(ifft1629, 49344, ifft1628);
__m512 ifft1722 = _mm512_mask_mov_ps(ifft1714, 49344, ifft1713);
__m512 ifft1638 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1639 = _mm512_fmadd_ps(ifft1630, ifft1638, _mm512_shuffle_f32x4(ifft1630, ifft1630, 177));
__m512 ifft1723 = _mm512_fmadd_ps(ifft1715, ifft1638, _mm512_shuffle_f32x4(ifft1715, ifft1715, 177));
__m512 ifft1640 = _mm512_fmadd_ps(ifft1631, ifft1638, _mm512_shuffle_f32x4(ifft1631, ifft1631, 177));
__m512 ifft1724 = _mm512_fmadd_ps(ifft1716, ifft1638, _mm512_shuffle_f32x4(ifft1716, ifft1716, 177));
__m512 ifft1641 = _mm512_fmadd_ps(ifft1632, ifft1638, _mm512_shuffle_f32x4(ifft1632, ifft1632, 177));
__m512 ifft1725 = _mm512_fmadd_ps(ifft1717, ifft1638, _mm512_shuffle_f32x4(ifft1717, ifft1717, 177));
__m512 ifft1642 = _mm512_fmadd_ps(ifft1633, ifft1638, _mm512_shuffle_f32x4(ifft1633, ifft1633, 177));
__m512 ifft1726 = _mm512_fmadd_ps(ifft1718, ifft1638, _mm512_shuffle_f32x4(ifft1718, ifft1718, 177));
__m512 ifft1643 = _mm512_fmadd_ps(ifft1634, ifft1638, _mm512_shuffle_f32x4(ifft1634, ifft1634, 177));
__m512 ifft1727 = _mm512_fmadd_ps(ifft1719, ifft1638, _mm512_shuffle_f32x4(ifft1719, ifft1719, 177));
__m512 ifft1644 = _mm512_fnmsub_ps(ifft1635, ifft1638, _mm512_shuffle_f32x4(ifft1635, ifft1635, 177));
__m512 ifft1728 = _mm512_fnmsub_ps(ifft1720, ifft1638, _mm512_shuffle_f32x4(ifft1720, ifft1720, 177));
__m512 ifft1645 = _mm512_fmadd_ps(ifft1636, ifft1638, _mm512_shuffle_f32x4(ifft1636, ifft1636, 177));
__m512 ifft1729 = _mm512_fmadd_ps(ifft1721, ifft1638, _mm512_shuffle_f32x4(ifft1721, ifft1721, 177));
__m512 ifft1646 = _mm512_fmadd_ps(ifft1637, ifft1638, _mm512_shuffle_f32x4(ifft1637, ifft1637, 177));
__m512 ifft1730 = _mm512_fmadd_ps(ifft1722, ifft1638, _mm512_shuffle_f32x4(ifft1722, ifft1722, 177));
__m512 ifft1647 = _mm512_add_ps(ifft1639, ifft1640);
__m512 ifft1731 = _mm512_add_ps(ifft1723, ifft1724);
__m512 ifft1648 = _mm512_sub_ps(ifft1639, ifft1640);
__m512 ifft1732 = _mm512_sub_ps(ifft1723, ifft1724);
__m512 ifft1649 = _mm512_sub_ps(ifft1641, ifft1645);
__m512 ifft1733 = _mm512_sub_ps(ifft1725, ifft1729);
__m512 ifft1650 = _mm512_add_ps(ifft1642, ifft1646);
__m512 ifft1734 = _mm512_add_ps(ifft1726, ifft1730);
__m512 ifft1651 = _mm512_add_ps(ifft1641, ifft1645);
__m512 ifft1735 = _mm512_add_ps(ifft1725, ifft1729);
__m512 ifft1652 = _mm512_sub_ps(ifft1642, ifft1646);
__m512 ifft1736 = _mm512_sub_ps(ifft1726, ifft1730);
__m512 ifft1653 = _mm512_mul_ps(ifft1643, _mm512_set1_ps(3.125e-02f));
__m512 ifft1737 = _mm512_mul_ps(ifft1727, _mm512_set1_ps(3.125e-02f));
__m512 ifft1654 = _mm512_mul_ps(ifft1644, _mm512_set1_ps(3.125e-02f));
__m512 ifft1738 = _mm512_mul_ps(ifft1728, _mm512_set1_ps(3.125e-02f));
__m512 ifft1655 = _mm512_fmadd_ps(ifft1647, _mm512_set1_ps(1.5625e-02f), ifft1653);
__m512 ifft1739 = _mm512_fmadd_ps(ifft1731, _mm512_set1_ps(1.5625e-02f), ifft1737);
__m512 ifft1656 = _mm512_fmsub_ps(ifft1647, _mm512_set1_ps(1.5625e-02f), ifft1653);
__m512 ifft1740 = _mm512_fmsub_ps(ifft1731, _mm512_set1_ps(1.5625e-02f), ifft1737);
__m512 ifft1657 = _mm512_fmadd_ps(ifft1648, _mm512_set1_ps(1.5625e-02f), ifft1654);
__m512 ifft1741 = _mm512_fmadd_ps(ifft1732, _mm512_set1_ps(1.5625e-02f), ifft1738);
__m512 ifft1658 = _mm512_fmsub_ps(ifft1648, _mm512_set1_ps(1.5625e-02f), ifft1654);
__m512 ifft1742 = _mm512_fmsub_ps(ifft1732, _mm512_set1_ps(1.5625e-02f), ifft1738);
__m512 ifft1659 = _mm512_add_ps(ifft1649, ifft1650);
__m512 ifft1743 = _mm512_add_ps(ifft1733, ifft1734);
__m512 ifft1660 = _mm512_sub_ps(ifft1649, ifft1650);
__m512 ifft1744 = _mm512_sub_ps(ifft1733, ifft1734);
__m512 ifft1661 = _mm512_fnmadd_ps(ifft1659, _mm512_set1_ps(7.0710677e-01f), ifft1651);
__m512 ifft1745 = _mm512_fnmadd_ps(ifft1743, _mm512_set1_ps(7.0710677e-01f), ifft1735);
__m512 ifft1662 = _mm512_fmadd_ps(ifft1659, _mm512_set1_ps(7.0710677e-01f), ifft1651);
__m512 ifft1746 = _mm512_fmadd_ps(ifft1743, _mm512_set1_ps(7.0710677e-01f), ifft1735);
__m512 ifft1663 = _mm512_fmadd_ps(ifft1660, _mm512_set1_ps(7.0710677e-01f), ifft1652);
__m512 ifft1747 = _mm512_fmadd_ps(ifft1744, _mm512_set1_ps(7.0710677e-01f), ifft1736);
__m512 ifft1664 = _mm512_fmsub_ps(ifft1660, _mm512_set1_ps(7.0710677e-01f), ifft1652);
__m512 ifft1748 = _mm512_fmsub_ps(ifft1744, _mm512_set1_ps(7.0710677e-01f), ifft1736);
__m512 ifft1665 = _mm512_add_ps(ifft1661, ifft1662);
__m512 ifft1749 = _mm512_add_ps(ifft1745, ifft1746);
__m512 ifft1666 = _mm512_sub_ps(ifft1661, ifft1662);
__m512 ifft1750 = _mm512_sub_ps(ifft1745, ifft1746);
__m512 ifft1667 = _mm512_add_ps(ifft1663, ifft1664);
__m512 ifft1751 = _mm512_add_ps(ifft1747, ifft1748);
__m512 ifft1668 = _mm512_sub_ps(ifft1663, ifft1664);
__m512 ifft1752 = _mm512_sub_ps(ifft1747, ifft1748);
__m512 ifft1669 = _mm512_fmadd_ps(ifft1665, _mm512_set1_ps(1.5625e-02f), ifft1655);
__m512 ifft1753 = _mm512_fmadd_ps(ifft1749, _mm512_set1_ps(1.5625e-02f), ifft1739);
__m512 ifft1670 = _mm512_fnmadd_ps(ifft1665, _mm512_set1_ps(1.5625e-02f), ifft1655);
__m512 ifft1754 = _mm512_fnmadd_ps(ifft1749, _mm512_set1_ps(1.5625e-02f), ifft1739);
__m512 ifft1671 = _mm512_fmadd_ps(ifft1667, _mm512_set1_ps(1.5625e-02f), ifft1657);
__m512 ifft1755 = _mm512_fmadd_ps(ifft1751, _mm512_set1_ps(1.5625e-02f), ifft1741);
__m512 ifft1672 = _mm512_fnmadd_ps(ifft1667, _mm512_set1_ps(1.5625e-02f), ifft1657);
__m512 ifft1756 = _mm512_fnmadd_ps(ifft1751, _mm512_set1_ps(1.5625e-02f), ifft1741);
__m512 ifft1673 = _mm512_fnmadd_ps(ifft1668, _mm512_set1_ps(1.5625e-02f), ifft1656);
__m512 ifft1757 = _mm512_fnmadd_ps(ifft1752, _mm512_set1_ps(1.5625e-02f), ifft1740);
__m512 ifft1674 = _mm512_fmadd_ps(ifft1668, _mm512_set1_ps(1.5625e-02f), ifft1656);
__m512 ifft1758 = _mm512_fmadd_ps(ifft1752, _mm512_set1_ps(1.5625e-02f), ifft1740);
__m512 ifft1675 = _mm512_fmadd_ps(ifft1666, _mm512_set1_ps(1.5625e-02f), ifft1658);
__m512 ifft1759 = _mm512_fmadd_ps(ifft1750, _mm512_set1_ps(1.5625e-02f), ifft1742);
__m512 ifft1676 = _mm512_fnmadd_ps(ifft1666, _mm512_set1_ps(1.5625e-02f), ifft1658);
__m512 ifft1760 = _mm512_fnmadd_ps(ifft1750, _mm512_set1_ps(1.5625e-02f), ifft1742);
__m512 dat680 = ifft1669;
__m512 dat685 = ifft1753;
__m512 dat681 = ifft1671;
__m512 dat686 = ifft1755;
__m512 dat682 = ifft1673;
__m512 dat687 = ifft1757;
__m512 dat683 = ifft1675;
__m512 dat688 = ifft1759;
__m512 dat684 = ifft1670;
__m512 dat689 = ifft1754;
(void)ifft1672;
(void)ifft1756;
(void)ifft1674;
(void)ifft1758;
(void)ifft1676;
(void)ifft1760;
__m512i pm17 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack81 = _mm512_permutex2var_ps(dat680, pm17, dat685);
__m512i pm18 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack82 = _mm512_permutex2var_ps(dat680, pm18, dat685);
__m512 pack83 = _mm512_permutex2var_ps(dat681, pm17, dat686);
__m512 pack84 = _mm512_permutex2var_ps(dat681, pm18, dat686);
__m512 pack85 = _mm512_permutex2var_ps(dat682, pm17, dat687);
__m512 pack86 = _mm512_permutex2var_ps(dat682, pm18, dat687);
__m512 pack87 = _mm512_permutex2var_ps(dat683, pm17, dat688);
__m512 pack88 = _mm512_permutex2var_ps(dat683, pm18, dat688);
__m512 pack89 = _mm512_permutex2var_ps(dat684, pm17, dat689);
__m512 pack90 = _mm512_permutex2var_ps(dat684, pm18, dat689);
pack81 = _mm512_max_ps(_mm512_setzero_ps(), pack81);
pack82 = _mm512_max_ps(_mm512_setzero_ps(), pack82);
pack83 = _mm512_max_ps(_mm512_setzero_ps(), pack83);
pack84 = _mm512_max_ps(_mm512_setzero_ps(), pack84);
pack85 = _mm512_max_ps(_mm512_setzero_ps(), pack85);
pack86 = _mm512_max_ps(_mm512_setzero_ps(), pack86);
pack87 = _mm512_max_ps(_mm512_setzero_ps(), pack87);
pack88 = _mm512_max_ps(_mm512_setzero_ps(), pack88);
pack89 = _mm512_max_ps(_mm512_setzero_ps(), pack89);
pack90 = _mm512_max_ps(_mm512_setzero_ps(), pack90);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack81);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack82);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack83);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack84);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack85);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack86);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack87);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack88);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack89);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k30+100480*r7+448*toH6+4*toW6+40*t11, 1023, pack90);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel5 = 7;
}
if (rel5 < 8) {
ptrdiff_t toH7 = base5+5;
ptrdiff_t toW7 = 100;
ptrdiff_t k31 = 16*w21;
for (; k31 != 16; ++k31) {
ptrdiff_t r8 = 0;
for (; r8 != 2; ++r8) {
// ---------------------------------------------------------------------------
// One unrolled unit (t12 == 0) for the current (i9, j5, k31, r8):
//   1. load 16 vectors from sfPtr3 (Re/Im pairs),
//   2. run a multi-stage butterfly transform on them (variables are named
//      "ifft", so presumably an inverse FFT -- NOTE(review): confirm against
//      the generator),
//   3. interleave the results of the two register sets, clamp at zero,
//   4. write 10 lanes per vector to datPtr2 with masked stores.
// Two independent register sets are processed in lock-step: the one loaded
// as sf*137..140 and the one loaded as sf*141..144.
// ---------------------------------------------------------------------------
ptrdiff_t t12 = 0;
// Loads. The four groups below start at offsets 0, 2166784, 4333568 and
// 6500352, i.e. at successive quarters of the 8667136 stride applied to i9.
// Within a group the four vectors sit 64 apart (one __m512 if sfPtr3 is a
// byte pointer -- its declaration is outside this view, TODO confirm).
__m512 sfRe137 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm137 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe141 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm141 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe138 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm138 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe142 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm142 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe139 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm139 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe143 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm143 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe140 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm140 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfRe144 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
__m512 sfIm144 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k31+768*r8+256*t12);
// Pre-processing of the first Re/Im pair of each set only (137 and 141):
// two lane permutations of Re and Im are combined with a +/-1 (or 0) weight
// vector. Mask 65021 == 0xFDFD selects every lane except 1 and 9; in those
// two lanes the masked FMA passes its first operand through unchanged.
__m512i ifft1761 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1762 = _mm512_permutexvar_ps(ifft1761, sfRe137);
__m512 ifft1853 = _mm512_permutexvar_ps(ifft1761, sfRe141);
__m512i ifft1763 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1764 = _mm512_permutexvar_ps(ifft1763, sfRe137);
__m512 ifft1854 = _mm512_permutexvar_ps(ifft1763, sfRe141);
__m512 ifft1765 = _mm512_permutexvar_ps(ifft1761, sfIm137);
__m512 ifft1855 = _mm512_permutexvar_ps(ifft1761, sfIm141);
__m512 ifft1766 = _mm512_permutexvar_ps(ifft1763, sfIm137);
__m512 ifft1856 = _mm512_permutexvar_ps(ifft1763, sfIm141);
__m512 ifft1767 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1768 = _mm512_mask_fmadd_ps(ifft1766, 65021, ifft1767, ifft1762);
__m512 ifft1857 = _mm512_mask_fmadd_ps(ifft1856, 65021, ifft1767, ifft1853);
__m512 ifft1769 = _mm512_mask_fnmadd_ps(ifft1765, 65021, ifft1767, ifft1764);
__m512 ifft1858 = _mm512_mask_fnmadd_ps(ifft1855, 65021, ifft1767, ifft1854);
// Butterfly between adjacent lanes. shuffle_ps imm 177 (0xB1) swaps lanes
// (0,1) and (2,3) of every 128-bit block; with signs +1 on even lanes and
// -1 on odd lanes the FMA yields even = x[even]+x[odd], odd = x[even]-x[odd].
__m512 ifft1770 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1771 = _mm512_fmadd_ps(ifft1768, ifft1770, _mm512_shuffle_ps(ifft1768, ifft1768, 177));
__m512 ifft1859 = _mm512_fmadd_ps(ifft1857, ifft1770, _mm512_shuffle_ps(ifft1857, ifft1857, 177));
__m512 ifft1772 = _mm512_fmadd_ps(ifft1769, ifft1770, _mm512_shuffle_ps(ifft1769, ifft1769, 177));
__m512 ifft1860 = _mm512_fmadd_ps(ifft1858, ifft1770, _mm512_shuffle_ps(ifft1858, ifft1858, 177));
__m512 ifft1773 = _mm512_fmadd_ps(sfRe138, ifft1770, _mm512_shuffle_ps(sfRe138, sfRe138, 177));
__m512 ifft1861 = _mm512_fmadd_ps(sfRe142, ifft1770, _mm512_shuffle_ps(sfRe142, sfRe142, 177));
__m512 ifft1774 = _mm512_fmadd_ps(sfIm138, ifft1770, _mm512_shuffle_ps(sfIm138, sfIm138, 177));
__m512 ifft1862 = _mm512_fmadd_ps(sfIm142, ifft1770, _mm512_shuffle_ps(sfIm142, sfIm142, 177));
__m512 ifft1775 = _mm512_fmadd_ps(sfRe139, ifft1770, _mm512_shuffle_ps(sfRe139, sfRe139, 177));
__m512 ifft1863 = _mm512_fmadd_ps(sfRe143, ifft1770, _mm512_shuffle_ps(sfRe143, sfRe143, 177));
__m512 ifft1776 = _mm512_fmadd_ps(sfIm139, ifft1770, _mm512_shuffle_ps(sfIm139, sfIm139, 177));
__m512 ifft1864 = _mm512_fmadd_ps(sfIm143, ifft1770, _mm512_shuffle_ps(sfIm143, sfIm143, 177));
__m512 ifft1777 = _mm512_fmadd_ps(sfRe140, ifft1770, _mm512_shuffle_ps(sfRe140, sfRe140, 177));
__m512 ifft1865 = _mm512_fmadd_ps(sfRe144, ifft1770, _mm512_shuffle_ps(sfRe144, sfRe144, 177));
__m512 ifft1778 = _mm512_fmadd_ps(sfIm140, ifft1770, _mm512_shuffle_ps(sfIm140, sfIm140, 177));
__m512 ifft1866 = _mm512_fmadd_ps(sfIm144, ifft1770, _mm512_shuffle_ps(sfIm144, sfIm144, 177));
// Per-lane complex scaling of each (a, b) vector pair by (c, s), where
// c = ifft1779 and s = ifft1788:  a' = a*c - b*s,  b' = a*s + b*c.
// The products with c are formed first; the s terms are folded in below.
__m512 ifft1779 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1780 = _mm512_mul_ps(ifft1771, ifft1779);
__m512 ifft1867 = _mm512_mul_ps(ifft1859, ifft1779);
__m512 ifft1781 = _mm512_mul_ps(ifft1772, ifft1779);
__m512 ifft1868 = _mm512_mul_ps(ifft1860, ifft1779);
__m512 ifft1782 = _mm512_mul_ps(ifft1773, ifft1779);
__m512 ifft1869 = _mm512_mul_ps(ifft1861, ifft1779);
__m512 ifft1783 = _mm512_mul_ps(ifft1774, ifft1779);
__m512 ifft1870 = _mm512_mul_ps(ifft1862, ifft1779);
__m512 ifft1784 = _mm512_mul_ps(ifft1775, ifft1779);
__m512 ifft1871 = _mm512_mul_ps(ifft1863, ifft1779);
__m512 ifft1785 = _mm512_mul_ps(ifft1776, ifft1779);
__m512 ifft1872 = _mm512_mul_ps(ifft1864, ifft1779);
__m512 ifft1786 = _mm512_mul_ps(ifft1777, ifft1779);
__m512 ifft1873 = _mm512_mul_ps(ifft1865, ifft1779);
__m512 ifft1787 = _mm512_mul_ps(ifft1778, ifft1779);
__m512 ifft1874 = _mm512_mul_ps(ifft1866, ifft1779);
__m512 ifft1788 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1789 = _mm512_fnmadd_ps(ifft1772, ifft1788, ifft1780);
__m512 ifft1875 = _mm512_fnmadd_ps(ifft1860, ifft1788, ifft1867);
__m512 ifft1790 = _mm512_fmadd_ps(ifft1771, ifft1788, ifft1781);
__m512 ifft1876 = _mm512_fmadd_ps(ifft1859, ifft1788, ifft1868);
__m512 ifft1791 = _mm512_fnmadd_ps(ifft1774, ifft1788, ifft1782);
__m512 ifft1877 = _mm512_fnmadd_ps(ifft1862, ifft1788, ifft1869);
__m512 ifft1792 = _mm512_fmadd_ps(ifft1773, ifft1788, ifft1783);
__m512 ifft1878 = _mm512_fmadd_ps(ifft1861, ifft1788, ifft1870);
__m512 ifft1793 = _mm512_fnmadd_ps(ifft1776, ifft1788, ifft1784);
__m512 ifft1879 = _mm512_fnmadd_ps(ifft1864, ifft1788, ifft1871);
__m512 ifft1794 = _mm512_fmadd_ps(ifft1775, ifft1788, ifft1785);
__m512 ifft1880 = _mm512_fmadd_ps(ifft1863, ifft1788, ifft1872);
__m512 ifft1795 = _mm512_fnmadd_ps(ifft1778, ifft1788, ifft1786);
__m512 ifft1881 = _mm512_fnmadd_ps(ifft1866, ifft1788, ifft1873);
__m512 ifft1796 = _mm512_fmadd_ps(ifft1777, ifft1788, ifft1787);
__m512 ifft1882 = _mm512_fmadd_ps(ifft1865, ifft1788, ifft1874);
// Butterfly at a distance of two lanes. shuffle_ps imm 78 (0x4E) swaps the
// low and high 64-bit halves of every 128-bit block; signs are +1 on lanes
// 0-1 and -1 on lanes 2-3 of each block.
__m512 ifft1797 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1798 = _mm512_fmadd_ps(ifft1789, ifft1797, _mm512_shuffle_ps(ifft1789, ifft1789, 78));
__m512 ifft1883 = _mm512_fmadd_ps(ifft1875, ifft1797, _mm512_shuffle_ps(ifft1875, ifft1875, 78));
__m512 ifft1799 = _mm512_fmadd_ps(ifft1790, ifft1797, _mm512_shuffle_ps(ifft1790, ifft1790, 78));
__m512 ifft1884 = _mm512_fmadd_ps(ifft1876, ifft1797, _mm512_shuffle_ps(ifft1876, ifft1876, 78));
__m512 ifft1800 = _mm512_fmadd_ps(ifft1791, ifft1797, _mm512_shuffle_ps(ifft1791, ifft1791, 78));
__m512 ifft1885 = _mm512_fmadd_ps(ifft1877, ifft1797, _mm512_shuffle_ps(ifft1877, ifft1877, 78));
__m512 ifft1801 = _mm512_fmadd_ps(ifft1792, ifft1797, _mm512_shuffle_ps(ifft1792, ifft1792, 78));
__m512 ifft1886 = _mm512_fmadd_ps(ifft1878, ifft1797, _mm512_shuffle_ps(ifft1878, ifft1878, 78));
__m512 ifft1802 = _mm512_fmadd_ps(ifft1793, ifft1797, _mm512_shuffle_ps(ifft1793, ifft1793, 78));
__m512 ifft1887 = _mm512_fmadd_ps(ifft1879, ifft1797, _mm512_shuffle_ps(ifft1879, ifft1879, 78));
__m512 ifft1803 = _mm512_fmadd_ps(ifft1794, ifft1797, _mm512_shuffle_ps(ifft1794, ifft1794, 78));
__m512 ifft1888 = _mm512_fmadd_ps(ifft1880, ifft1797, _mm512_shuffle_ps(ifft1880, ifft1880, 78));
__m512 ifft1804 = _mm512_fmadd_ps(ifft1795, ifft1797, _mm512_shuffle_ps(ifft1795, ifft1795, 78));
__m512 ifft1889 = _mm512_fmadd_ps(ifft1881, ifft1797, _mm512_shuffle_ps(ifft1881, ifft1881, 78));
__m512 ifft1805 = _mm512_fmadd_ps(ifft1796, ifft1797, _mm512_shuffle_ps(ifft1796, ifft1796, 78));
__m512 ifft1890 = _mm512_fmadd_ps(ifft1882, ifft1797, _mm512_shuffle_ps(ifft1882, ifft1882, 78));
// In lanes 6, 7, 14 and 15 only (mask 49344 == 0xC0C0) each (a, b) pair is
// replaced by (-b, a); all other lanes pass through unchanged.
__m512 ifft1806 = _mm512_mask_sub_ps(ifft1798, 49344, _mm512_setzero_ps(), ifft1799);
__m512 ifft1891 = _mm512_mask_sub_ps(ifft1883, 49344, _mm512_setzero_ps(), ifft1884);
__m512 ifft1807 = _mm512_mask_mov_ps(ifft1799, 49344, ifft1798);
__m512 ifft1892 = _mm512_mask_mov_ps(ifft1884, 49344, ifft1883);
__m512 ifft1808 = _mm512_mask_sub_ps(ifft1800, 49344, _mm512_setzero_ps(), ifft1801);
__m512 ifft1893 = _mm512_mask_sub_ps(ifft1885, 49344, _mm512_setzero_ps(), ifft1886);
__m512 ifft1809 = _mm512_mask_mov_ps(ifft1801, 49344, ifft1800);
__m512 ifft1894 = _mm512_mask_mov_ps(ifft1886, 49344, ifft1885);
__m512 ifft1810 = _mm512_mask_sub_ps(ifft1802, 49344, _mm512_setzero_ps(), ifft1803);
__m512 ifft1895 = _mm512_mask_sub_ps(ifft1887, 49344, _mm512_setzero_ps(), ifft1888);
__m512 ifft1811 = _mm512_mask_mov_ps(ifft1803, 49344, ifft1802);
__m512 ifft1896 = _mm512_mask_mov_ps(ifft1888, 49344, ifft1887);
__m512 ifft1812 = _mm512_mask_sub_ps(ifft1804, 49344, _mm512_setzero_ps(), ifft1805);
__m512 ifft1897 = _mm512_mask_sub_ps(ifft1889, 49344, _mm512_setzero_ps(), ifft1890);
__m512 ifft1813 = _mm512_mask_mov_ps(ifft1805, 49344, ifft1804);
__m512 ifft1898 = _mm512_mask_mov_ps(ifft1890, 49344, ifft1889);
// Butterfly at a distance of four lanes. shuffle_f32x4 imm 177 swaps
// adjacent 128-bit blocks; signs are +1 on the lower block of each pair and
// -1 on the upper block.
__m512 ifft1814 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1815 = _mm512_fmadd_ps(ifft1806, ifft1814, _mm512_shuffle_f32x4(ifft1806, ifft1806, 177));
__m512 ifft1899 = _mm512_fmadd_ps(ifft1891, ifft1814, _mm512_shuffle_f32x4(ifft1891, ifft1891, 177));
__m512 ifft1816 = _mm512_fmadd_ps(ifft1807, ifft1814, _mm512_shuffle_f32x4(ifft1807, ifft1807, 177));
__m512 ifft1900 = _mm512_fmadd_ps(ifft1892, ifft1814, _mm512_shuffle_f32x4(ifft1892, ifft1892, 177));
__m512 ifft1817 = _mm512_fmadd_ps(ifft1808, ifft1814, _mm512_shuffle_f32x4(ifft1808, ifft1808, 177));
__m512 ifft1901 = _mm512_fmadd_ps(ifft1893, ifft1814, _mm512_shuffle_f32x4(ifft1893, ifft1893, 177));
__m512 ifft1818 = _mm512_fmadd_ps(ifft1809, ifft1814, _mm512_shuffle_f32x4(ifft1809, ifft1809, 177));
__m512 ifft1902 = _mm512_fmadd_ps(ifft1894, ifft1814, _mm512_shuffle_f32x4(ifft1894, ifft1894, 177));
__m512 ifft1819 = _mm512_fmadd_ps(ifft1810, ifft1814, _mm512_shuffle_f32x4(ifft1810, ifft1810, 177));
__m512 ifft1903 = _mm512_fmadd_ps(ifft1895, ifft1814, _mm512_shuffle_f32x4(ifft1895, ifft1895, 177));
// Note: the next two use fnmsub, i.e. -(x*sign) - swapped, which is the
// negation of the butterfly computed for the other vectors. The same
// pattern recurs in the sibling unrolled units, so it is deliberate.
__m512 ifft1820 = _mm512_fnmsub_ps(ifft1811, ifft1814, _mm512_shuffle_f32x4(ifft1811, ifft1811, 177));
__m512 ifft1904 = _mm512_fnmsub_ps(ifft1896, ifft1814, _mm512_shuffle_f32x4(ifft1896, ifft1896, 177));
__m512 ifft1821 = _mm512_fmadd_ps(ifft1812, ifft1814, _mm512_shuffle_f32x4(ifft1812, ifft1812, 177));
__m512 ifft1905 = _mm512_fmadd_ps(ifft1897, ifft1814, _mm512_shuffle_f32x4(ifft1897, ifft1897, 177));
__m512 ifft1822 = _mm512_fmadd_ps(ifft1813, ifft1814, _mm512_shuffle_f32x4(ifft1813, ifft1813, 177));
__m512 ifft1906 = _mm512_fmadd_ps(ifft1898, ifft1814, _mm512_shuffle_f32x4(ifft1898, ifft1898, 177));
// Cross-register stage: the eight vectors of each set are combined with
// whole-vector add/sub butterflies. Scaling is folded in here through the
// constants 3.125e-02 (1/32), 1.5625e-02 (1/64) and 7.0710677e-01 (~sqrt(1/2)).
__m512 ifft1823 = _mm512_add_ps(ifft1815, ifft1816);
__m512 ifft1907 = _mm512_add_ps(ifft1899, ifft1900);
__m512 ifft1824 = _mm512_sub_ps(ifft1815, ifft1816);
__m512 ifft1908 = _mm512_sub_ps(ifft1899, ifft1900);
__m512 ifft1825 = _mm512_sub_ps(ifft1817, ifft1821);
__m512 ifft1909 = _mm512_sub_ps(ifft1901, ifft1905);
__m512 ifft1826 = _mm512_add_ps(ifft1818, ifft1822);
__m512 ifft1910 = _mm512_add_ps(ifft1902, ifft1906);
__m512 ifft1827 = _mm512_add_ps(ifft1817, ifft1821);
__m512 ifft1911 = _mm512_add_ps(ifft1901, ifft1905);
__m512 ifft1828 = _mm512_sub_ps(ifft1818, ifft1822);
__m512 ifft1912 = _mm512_sub_ps(ifft1902, ifft1906);
__m512 ifft1829 = _mm512_mul_ps(ifft1819, _mm512_set1_ps(3.125e-02f));
__m512 ifft1913 = _mm512_mul_ps(ifft1903, _mm512_set1_ps(3.125e-02f));
__m512 ifft1830 = _mm512_mul_ps(ifft1820, _mm512_set1_ps(3.125e-02f));
__m512 ifft1914 = _mm512_mul_ps(ifft1904, _mm512_set1_ps(3.125e-02f));
__m512 ifft1831 = _mm512_fmadd_ps(ifft1823, _mm512_set1_ps(1.5625e-02f), ifft1829);
__m512 ifft1915 = _mm512_fmadd_ps(ifft1907, _mm512_set1_ps(1.5625e-02f), ifft1913);
__m512 ifft1832 = _mm512_fmsub_ps(ifft1823, _mm512_set1_ps(1.5625e-02f), ifft1829);
__m512 ifft1916 = _mm512_fmsub_ps(ifft1907, _mm512_set1_ps(1.5625e-02f), ifft1913);
__m512 ifft1833 = _mm512_fmadd_ps(ifft1824, _mm512_set1_ps(1.5625e-02f), ifft1830);
__m512 ifft1917 = _mm512_fmadd_ps(ifft1908, _mm512_set1_ps(1.5625e-02f), ifft1914);
__m512 ifft1834 = _mm512_fmsub_ps(ifft1824, _mm512_set1_ps(1.5625e-02f), ifft1830);
__m512 ifft1918 = _mm512_fmsub_ps(ifft1908, _mm512_set1_ps(1.5625e-02f), ifft1914);
__m512 ifft1835 = _mm512_add_ps(ifft1825, ifft1826);
__m512 ifft1919 = _mm512_add_ps(ifft1909, ifft1910);
__m512 ifft1836 = _mm512_sub_ps(ifft1825, ifft1826);
__m512 ifft1920 = _mm512_sub_ps(ifft1909, ifft1910);
__m512 ifft1837 = _mm512_fnmadd_ps(ifft1835, _mm512_set1_ps(7.0710677e-01f), ifft1827);
__m512 ifft1921 = _mm512_fnmadd_ps(ifft1919, _mm512_set1_ps(7.0710677e-01f), ifft1911);
__m512 ifft1838 = _mm512_fmadd_ps(ifft1835, _mm512_set1_ps(7.0710677e-01f), ifft1827);
__m512 ifft1922 = _mm512_fmadd_ps(ifft1919, _mm512_set1_ps(7.0710677e-01f), ifft1911);
__m512 ifft1839 = _mm512_fmadd_ps(ifft1836, _mm512_set1_ps(7.0710677e-01f), ifft1828);
__m512 ifft1923 = _mm512_fmadd_ps(ifft1920, _mm512_set1_ps(7.0710677e-01f), ifft1912);
__m512 ifft1840 = _mm512_fmsub_ps(ifft1836, _mm512_set1_ps(7.0710677e-01f), ifft1828);
__m512 ifft1924 = _mm512_fmsub_ps(ifft1920, _mm512_set1_ps(7.0710677e-01f), ifft1912);
__m512 ifft1841 = _mm512_add_ps(ifft1837, ifft1838);
__m512 ifft1925 = _mm512_add_ps(ifft1921, ifft1922);
__m512 ifft1842 = _mm512_sub_ps(ifft1837, ifft1838);
__m512 ifft1926 = _mm512_sub_ps(ifft1921, ifft1922);
__m512 ifft1843 = _mm512_add_ps(ifft1839, ifft1840);
__m512 ifft1927 = _mm512_add_ps(ifft1923, ifft1924);
__m512 ifft1844 = _mm512_sub_ps(ifft1839, ifft1840);
__m512 ifft1928 = _mm512_sub_ps(ifft1923, ifft1924);
__m512 ifft1845 = _mm512_fmadd_ps(ifft1841, _mm512_set1_ps(1.5625e-02f), ifft1831);
__m512 ifft1929 = _mm512_fmadd_ps(ifft1925, _mm512_set1_ps(1.5625e-02f), ifft1915);
__m512 ifft1846 = _mm512_fnmadd_ps(ifft1841, _mm512_set1_ps(1.5625e-02f), ifft1831);
__m512 ifft1930 = _mm512_fnmadd_ps(ifft1925, _mm512_set1_ps(1.5625e-02f), ifft1915);
__m512 ifft1847 = _mm512_fmadd_ps(ifft1843, _mm512_set1_ps(1.5625e-02f), ifft1833);
__m512 ifft1931 = _mm512_fmadd_ps(ifft1927, _mm512_set1_ps(1.5625e-02f), ifft1917);
__m512 ifft1848 = _mm512_fnmadd_ps(ifft1843, _mm512_set1_ps(1.5625e-02f), ifft1833);
__m512 ifft1932 = _mm512_fnmadd_ps(ifft1927, _mm512_set1_ps(1.5625e-02f), ifft1917);
__m512 ifft1849 = _mm512_fnmadd_ps(ifft1844, _mm512_set1_ps(1.5625e-02f), ifft1832);
__m512 ifft1933 = _mm512_fnmadd_ps(ifft1928, _mm512_set1_ps(1.5625e-02f), ifft1916);
__m512 ifft1850 = _mm512_fmadd_ps(ifft1844, _mm512_set1_ps(1.5625e-02f), ifft1832);
__m512 ifft1934 = _mm512_fmadd_ps(ifft1928, _mm512_set1_ps(1.5625e-02f), ifft1916);
__m512 ifft1851 = _mm512_fmadd_ps(ifft1842, _mm512_set1_ps(1.5625e-02f), ifft1834);
__m512 ifft1935 = _mm512_fmadd_ps(ifft1926, _mm512_set1_ps(1.5625e-02f), ifft1918);
__m512 ifft1852 = _mm512_fnmadd_ps(ifft1842, _mm512_set1_ps(1.5625e-02f), ifft1834);
__m512 ifft1936 = _mm512_fnmadd_ps(ifft1926, _mm512_set1_ps(1.5625e-02f), ifft1918);
// Only five of the eight result vectors per set are kept. The remaining
// three are cast to void, which marks them as intentionally unused.
__m512 dat690 = ifft1845;
__m512 dat695 = ifft1929;
__m512 dat691 = ifft1847;
__m512 dat696 = ifft1931;
__m512 dat692 = ifft1849;
__m512 dat697 = ifft1933;
__m512 dat693 = ifft1851;
__m512 dat698 = ifft1935;
__m512 dat694 = ifft1846;
__m512 dat699 = ifft1930;
(void)ifft1848;
(void)ifft1932;
(void)ifft1850;
(void)ifft1934;
(void)ifft1852;
(void)ifft1936;
// Interleave the two sets (a = first set, b = second set; index bit 4
// selects b). Only lanes 0-9 are stored below:
//   pm19 -> lanes 0-4 = a[0..4],  lanes 5-9 = b[0..4]
//   pm20 -> lanes 0-4 = b[8..12], lanes 5-9 = a[8..12]
__m512i pm19 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack91 = _mm512_permutex2var_ps(dat690, pm19, dat695);
__m512i pm20 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack92 = _mm512_permutex2var_ps(dat690, pm20, dat695);
__m512 pack93 = _mm512_permutex2var_ps(dat691, pm19, dat696);
__m512 pack94 = _mm512_permutex2var_ps(dat691, pm20, dat696);
__m512 pack95 = _mm512_permutex2var_ps(dat692, pm19, dat697);
__m512 pack96 = _mm512_permutex2var_ps(dat692, pm20, dat697);
__m512 pack97 = _mm512_permutex2var_ps(dat693, pm19, dat698);
__m512 pack98 = _mm512_permutex2var_ps(dat693, pm20, dat698);
__m512 pack99 = _mm512_permutex2var_ps(dat694, pm19, dat699);
__m512 pack100 = _mm512_permutex2var_ps(dat694, pm20, dat699);
// Clamp at zero: max(0, x) per lane (ReLU).
pack91 = _mm512_max_ps(_mm512_setzero_ps(), pack91);
pack92 = _mm512_max_ps(_mm512_setzero_ps(), pack92);
pack93 = _mm512_max_ps(_mm512_setzero_ps(), pack93);
pack94 = _mm512_max_ps(_mm512_setzero_ps(), pack94);
pack95 = _mm512_max_ps(_mm512_setzero_ps(), pack95);
pack96 = _mm512_max_ps(_mm512_setzero_ps(), pack96);
pack97 = _mm512_max_ps(_mm512_setzero_ps(), pack97);
pack98 = _mm512_max_ps(_mm512_setzero_ps(), pack98);
pack99 = _mm512_max_ps(_mm512_setzero_ps(), pack99);
pack100 = _mm512_max_ps(_mm512_setzero_ps(), pack100);
// Masked stores: mask 1023 (0x3FF) writes lanes 0-9 only. The pm19-derived
// vectors go to offsets 0, 448, 896, 1344, 1792 and the pm20-derived ones
// to the same five offsets + 50240 (half of the 100480 stride applied to
// r8). NOTE(review): 448 looks like a row pitch in bytes (112 floats) if
// datPtr2 is a byte pointer -- its declaration is outside this view.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack91);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack92);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack93);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack94);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack95);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack96);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack97);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack98);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack99);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t12, 1023, pack100);
ptrdiff_t t13 = 0;
__m512 sfRe145 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm145 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe149 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm149 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe146 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm146 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe150 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm150 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe147 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm147 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe151 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm151 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe148 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm148 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfRe152 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512 sfIm152 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k31+768*r8+256*t13);
__m512i ifft1937 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft1938 = _mm512_permutexvar_ps(ifft1937, sfRe145);
__m512 ifft2029 = _mm512_permutexvar_ps(ifft1937, sfRe149);
__m512i ifft1939 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft1940 = _mm512_permutexvar_ps(ifft1939, sfRe145);
__m512 ifft2030 = _mm512_permutexvar_ps(ifft1939, sfRe149);
__m512 ifft1941 = _mm512_permutexvar_ps(ifft1937, sfIm145);
__m512 ifft2031 = _mm512_permutexvar_ps(ifft1937, sfIm149);
__m512 ifft1942 = _mm512_permutexvar_ps(ifft1939, sfIm145);
__m512 ifft2032 = _mm512_permutexvar_ps(ifft1939, sfIm149);
__m512 ifft1943 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft1944 = _mm512_mask_fmadd_ps(ifft1942, 65021, ifft1943, ifft1938);
__m512 ifft2033 = _mm512_mask_fmadd_ps(ifft2032, 65021, ifft1943, ifft2029);
__m512 ifft1945 = _mm512_mask_fnmadd_ps(ifft1941, 65021, ifft1943, ifft1940);
__m512 ifft2034 = _mm512_mask_fnmadd_ps(ifft2031, 65021, ifft1943, ifft2030);
__m512 ifft1946 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft1947 = _mm512_fmadd_ps(ifft1944, ifft1946, _mm512_shuffle_ps(ifft1944, ifft1944, 177));
__m512 ifft2035 = _mm512_fmadd_ps(ifft2033, ifft1946, _mm512_shuffle_ps(ifft2033, ifft2033, 177));
__m512 ifft1948 = _mm512_fmadd_ps(ifft1945, ifft1946, _mm512_shuffle_ps(ifft1945, ifft1945, 177));
__m512 ifft2036 = _mm512_fmadd_ps(ifft2034, ifft1946, _mm512_shuffle_ps(ifft2034, ifft2034, 177));
__m512 ifft1949 = _mm512_fmadd_ps(sfRe146, ifft1946, _mm512_shuffle_ps(sfRe146, sfRe146, 177));
__m512 ifft2037 = _mm512_fmadd_ps(sfRe150, ifft1946, _mm512_shuffle_ps(sfRe150, sfRe150, 177));
__m512 ifft1950 = _mm512_fmadd_ps(sfIm146, ifft1946, _mm512_shuffle_ps(sfIm146, sfIm146, 177));
__m512 ifft2038 = _mm512_fmadd_ps(sfIm150, ifft1946, _mm512_shuffle_ps(sfIm150, sfIm150, 177));
__m512 ifft1951 = _mm512_fmadd_ps(sfRe147, ifft1946, _mm512_shuffle_ps(sfRe147, sfRe147, 177));
__m512 ifft2039 = _mm512_fmadd_ps(sfRe151, ifft1946, _mm512_shuffle_ps(sfRe151, sfRe151, 177));
__m512 ifft1952 = _mm512_fmadd_ps(sfIm147, ifft1946, _mm512_shuffle_ps(sfIm147, sfIm147, 177));
__m512 ifft2040 = _mm512_fmadd_ps(sfIm151, ifft1946, _mm512_shuffle_ps(sfIm151, sfIm151, 177));
__m512 ifft1953 = _mm512_fmadd_ps(sfRe148, ifft1946, _mm512_shuffle_ps(sfRe148, sfRe148, 177));
__m512 ifft2041 = _mm512_fmadd_ps(sfRe152, ifft1946, _mm512_shuffle_ps(sfRe152, sfRe152, 177));
__m512 ifft1954 = _mm512_fmadd_ps(sfIm148, ifft1946, _mm512_shuffle_ps(sfIm148, sfIm148, 177));
__m512 ifft2042 = _mm512_fmadd_ps(sfIm152, ifft1946, _mm512_shuffle_ps(sfIm152, sfIm152, 177));
__m512 ifft1955 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft1956 = _mm512_mul_ps(ifft1947, ifft1955);
__m512 ifft2043 = _mm512_mul_ps(ifft2035, ifft1955);
__m512 ifft1957 = _mm512_mul_ps(ifft1948, ifft1955);
__m512 ifft2044 = _mm512_mul_ps(ifft2036, ifft1955);
__m512 ifft1958 = _mm512_mul_ps(ifft1949, ifft1955);
__m512 ifft2045 = _mm512_mul_ps(ifft2037, ifft1955);
__m512 ifft1959 = _mm512_mul_ps(ifft1950, ifft1955);
__m512 ifft2046 = _mm512_mul_ps(ifft2038, ifft1955);
__m512 ifft1960 = _mm512_mul_ps(ifft1951, ifft1955);
__m512 ifft2047 = _mm512_mul_ps(ifft2039, ifft1955);
__m512 ifft1961 = _mm512_mul_ps(ifft1952, ifft1955);
__m512 ifft2048 = _mm512_mul_ps(ifft2040, ifft1955);
__m512 ifft1962 = _mm512_mul_ps(ifft1953, ifft1955);
__m512 ifft2049 = _mm512_mul_ps(ifft2041, ifft1955);
__m512 ifft1963 = _mm512_mul_ps(ifft1954, ifft1955);
__m512 ifft2050 = _mm512_mul_ps(ifft2042, ifft1955);
__m512 ifft1964 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft1965 = _mm512_fnmadd_ps(ifft1948, ifft1964, ifft1956);
__m512 ifft2051 = _mm512_fnmadd_ps(ifft2036, ifft1964, ifft2043);
__m512 ifft1966 = _mm512_fmadd_ps(ifft1947, ifft1964, ifft1957);
__m512 ifft2052 = _mm512_fmadd_ps(ifft2035, ifft1964, ifft2044);
__m512 ifft1967 = _mm512_fnmadd_ps(ifft1950, ifft1964, ifft1958);
__m512 ifft2053 = _mm512_fnmadd_ps(ifft2038, ifft1964, ifft2045);
__m512 ifft1968 = _mm512_fmadd_ps(ifft1949, ifft1964, ifft1959);
__m512 ifft2054 = _mm512_fmadd_ps(ifft2037, ifft1964, ifft2046);
__m512 ifft1969 = _mm512_fnmadd_ps(ifft1952, ifft1964, ifft1960);
__m512 ifft2055 = _mm512_fnmadd_ps(ifft2040, ifft1964, ifft2047);
__m512 ifft1970 = _mm512_fmadd_ps(ifft1951, ifft1964, ifft1961);
__m512 ifft2056 = _mm512_fmadd_ps(ifft2039, ifft1964, ifft2048);
__m512 ifft1971 = _mm512_fnmadd_ps(ifft1954, ifft1964, ifft1962);
__m512 ifft2057 = _mm512_fnmadd_ps(ifft2042, ifft1964, ifft2049);
__m512 ifft1972 = _mm512_fmadd_ps(ifft1953, ifft1964, ifft1963);
__m512 ifft2058 = _mm512_fmadd_ps(ifft2041, ifft1964, ifft2050);
__m512 ifft1973 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft1974 = _mm512_fmadd_ps(ifft1965, ifft1973, _mm512_shuffle_ps(ifft1965, ifft1965, 78));
__m512 ifft2059 = _mm512_fmadd_ps(ifft2051, ifft1973, _mm512_shuffle_ps(ifft2051, ifft2051, 78));
__m512 ifft1975 = _mm512_fmadd_ps(ifft1966, ifft1973, _mm512_shuffle_ps(ifft1966, ifft1966, 78));
__m512 ifft2060 = _mm512_fmadd_ps(ifft2052, ifft1973, _mm512_shuffle_ps(ifft2052, ifft2052, 78));
__m512 ifft1976 = _mm512_fmadd_ps(ifft1967, ifft1973, _mm512_shuffle_ps(ifft1967, ifft1967, 78));
__m512 ifft2061 = _mm512_fmadd_ps(ifft2053, ifft1973, _mm512_shuffle_ps(ifft2053, ifft2053, 78));
__m512 ifft1977 = _mm512_fmadd_ps(ifft1968, ifft1973, _mm512_shuffle_ps(ifft1968, ifft1968, 78));
__m512 ifft2062 = _mm512_fmadd_ps(ifft2054, ifft1973, _mm512_shuffle_ps(ifft2054, ifft2054, 78));
__m512 ifft1978 = _mm512_fmadd_ps(ifft1969, ifft1973, _mm512_shuffle_ps(ifft1969, ifft1969, 78));
__m512 ifft2063 = _mm512_fmadd_ps(ifft2055, ifft1973, _mm512_shuffle_ps(ifft2055, ifft2055, 78));
__m512 ifft1979 = _mm512_fmadd_ps(ifft1970, ifft1973, _mm512_shuffle_ps(ifft1970, ifft1970, 78));
__m512 ifft2064 = _mm512_fmadd_ps(ifft2056, ifft1973, _mm512_shuffle_ps(ifft2056, ifft2056, 78));
__m512 ifft1980 = _mm512_fmadd_ps(ifft1971, ifft1973, _mm512_shuffle_ps(ifft1971, ifft1971, 78));
__m512 ifft2065 = _mm512_fmadd_ps(ifft2057, ifft1973, _mm512_shuffle_ps(ifft2057, ifft2057, 78));
__m512 ifft1981 = _mm512_fmadd_ps(ifft1972, ifft1973, _mm512_shuffle_ps(ifft1972, ifft1972, 78));
__m512 ifft2066 = _mm512_fmadd_ps(ifft2058, ifft1973, _mm512_shuffle_ps(ifft2058, ifft2058, 78));
__m512 ifft1982 = _mm512_mask_sub_ps(ifft1974, 49344, _mm512_setzero_ps(), ifft1975);
__m512 ifft2067 = _mm512_mask_sub_ps(ifft2059, 49344, _mm512_setzero_ps(), ifft2060);
__m512 ifft1983 = _mm512_mask_mov_ps(ifft1975, 49344, ifft1974);
__m512 ifft2068 = _mm512_mask_mov_ps(ifft2060, 49344, ifft2059);
__m512 ifft1984 = _mm512_mask_sub_ps(ifft1976, 49344, _mm512_setzero_ps(), ifft1977);
__m512 ifft2069 = _mm512_mask_sub_ps(ifft2061, 49344, _mm512_setzero_ps(), ifft2062);
__m512 ifft1985 = _mm512_mask_mov_ps(ifft1977, 49344, ifft1976);
__m512 ifft2070 = _mm512_mask_mov_ps(ifft2062, 49344, ifft2061);
__m512 ifft1986 = _mm512_mask_sub_ps(ifft1978, 49344, _mm512_setzero_ps(), ifft1979);
__m512 ifft2071 = _mm512_mask_sub_ps(ifft2063, 49344, _mm512_setzero_ps(), ifft2064);
__m512 ifft1987 = _mm512_mask_mov_ps(ifft1979, 49344, ifft1978);
__m512 ifft2072 = _mm512_mask_mov_ps(ifft2064, 49344, ifft2063);
__m512 ifft1988 = _mm512_mask_sub_ps(ifft1980, 49344, _mm512_setzero_ps(), ifft1981);
__m512 ifft2073 = _mm512_mask_sub_ps(ifft2065, 49344, _mm512_setzero_ps(), ifft2066);
__m512 ifft1989 = _mm512_mask_mov_ps(ifft1981, 49344, ifft1980);
__m512 ifft2074 = _mm512_mask_mov_ps(ifft2066, 49344, ifft2065);
__m512 ifft1990 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft1991 = _mm512_fmadd_ps(ifft1982, ifft1990, _mm512_shuffle_f32x4(ifft1982, ifft1982, 177));
__m512 ifft2075 = _mm512_fmadd_ps(ifft2067, ifft1990, _mm512_shuffle_f32x4(ifft2067, ifft2067, 177));
__m512 ifft1992 = _mm512_fmadd_ps(ifft1983, ifft1990, _mm512_shuffle_f32x4(ifft1983, ifft1983, 177));
__m512 ifft2076 = _mm512_fmadd_ps(ifft2068, ifft1990, _mm512_shuffle_f32x4(ifft2068, ifft2068, 177));
__m512 ifft1993 = _mm512_fmadd_ps(ifft1984, ifft1990, _mm512_shuffle_f32x4(ifft1984, ifft1984, 177));
__m512 ifft2077 = _mm512_fmadd_ps(ifft2069, ifft1990, _mm512_shuffle_f32x4(ifft2069, ifft2069, 177));
__m512 ifft1994 = _mm512_fmadd_ps(ifft1985, ifft1990, _mm512_shuffle_f32x4(ifft1985, ifft1985, 177));
__m512 ifft2078 = _mm512_fmadd_ps(ifft2070, ifft1990, _mm512_shuffle_f32x4(ifft2070, ifft2070, 177));
__m512 ifft1995 = _mm512_fmadd_ps(ifft1986, ifft1990, _mm512_shuffle_f32x4(ifft1986, ifft1986, 177));
__m512 ifft2079 = _mm512_fmadd_ps(ifft2071, ifft1990, _mm512_shuffle_f32x4(ifft2071, ifft2071, 177));
__m512 ifft1996 = _mm512_fnmsub_ps(ifft1987, ifft1990, _mm512_shuffle_f32x4(ifft1987, ifft1987, 177));
__m512 ifft2080 = _mm512_fnmsub_ps(ifft2072, ifft1990, _mm512_shuffle_f32x4(ifft2072, ifft2072, 177));
__m512 ifft1997 = _mm512_fmadd_ps(ifft1988, ifft1990, _mm512_shuffle_f32x4(ifft1988, ifft1988, 177));
__m512 ifft2081 = _mm512_fmadd_ps(ifft2073, ifft1990, _mm512_shuffle_f32x4(ifft2073, ifft2073, 177));
__m512 ifft1998 = _mm512_fmadd_ps(ifft1989, ifft1990, _mm512_shuffle_f32x4(ifft1989, ifft1989, 177));
__m512 ifft2082 = _mm512_fmadd_ps(ifft2074, ifft1990, _mm512_shuffle_f32x4(ifft2074, ifft2074, 177));
__m512 ifft1999 = _mm512_add_ps(ifft1991, ifft1992);
__m512 ifft2083 = _mm512_add_ps(ifft2075, ifft2076);
__m512 ifft2000 = _mm512_sub_ps(ifft1991, ifft1992);
__m512 ifft2084 = _mm512_sub_ps(ifft2075, ifft2076);
__m512 ifft2001 = _mm512_sub_ps(ifft1993, ifft1997);
__m512 ifft2085 = _mm512_sub_ps(ifft2077, ifft2081);
__m512 ifft2002 = _mm512_add_ps(ifft1994, ifft1998);
__m512 ifft2086 = _mm512_add_ps(ifft2078, ifft2082);
__m512 ifft2003 = _mm512_add_ps(ifft1993, ifft1997);
__m512 ifft2087 = _mm512_add_ps(ifft2077, ifft2081);
__m512 ifft2004 = _mm512_sub_ps(ifft1994, ifft1998);
__m512 ifft2088 = _mm512_sub_ps(ifft2078, ifft2082);
__m512 ifft2005 = _mm512_mul_ps(ifft1995, _mm512_set1_ps(3.125e-02f));
__m512 ifft2089 = _mm512_mul_ps(ifft2079, _mm512_set1_ps(3.125e-02f));
__m512 ifft2006 = _mm512_mul_ps(ifft1996, _mm512_set1_ps(3.125e-02f));
__m512 ifft2090 = _mm512_mul_ps(ifft2080, _mm512_set1_ps(3.125e-02f));
__m512 ifft2007 = _mm512_fmadd_ps(ifft1999, _mm512_set1_ps(1.5625e-02f), ifft2005);
__m512 ifft2091 = _mm512_fmadd_ps(ifft2083, _mm512_set1_ps(1.5625e-02f), ifft2089);
__m512 ifft2008 = _mm512_fmsub_ps(ifft1999, _mm512_set1_ps(1.5625e-02f), ifft2005);
__m512 ifft2092 = _mm512_fmsub_ps(ifft2083, _mm512_set1_ps(1.5625e-02f), ifft2089);
__m512 ifft2009 = _mm512_fmadd_ps(ifft2000, _mm512_set1_ps(1.5625e-02f), ifft2006);
__m512 ifft2093 = _mm512_fmadd_ps(ifft2084, _mm512_set1_ps(1.5625e-02f), ifft2090);
__m512 ifft2010 = _mm512_fmsub_ps(ifft2000, _mm512_set1_ps(1.5625e-02f), ifft2006);
__m512 ifft2094 = _mm512_fmsub_ps(ifft2084, _mm512_set1_ps(1.5625e-02f), ifft2090);
__m512 ifft2011 = _mm512_add_ps(ifft2001, ifft2002);
__m512 ifft2095 = _mm512_add_ps(ifft2085, ifft2086);
__m512 ifft2012 = _mm512_sub_ps(ifft2001, ifft2002);
__m512 ifft2096 = _mm512_sub_ps(ifft2085, ifft2086);
__m512 ifft2013 = _mm512_fnmadd_ps(ifft2011, _mm512_set1_ps(7.0710677e-01f), ifft2003);
__m512 ifft2097 = _mm512_fnmadd_ps(ifft2095, _mm512_set1_ps(7.0710677e-01f), ifft2087);
__m512 ifft2014 = _mm512_fmadd_ps(ifft2011, _mm512_set1_ps(7.0710677e-01f), ifft2003);
__m512 ifft2098 = _mm512_fmadd_ps(ifft2095, _mm512_set1_ps(7.0710677e-01f), ifft2087);
__m512 ifft2015 = _mm512_fmadd_ps(ifft2012, _mm512_set1_ps(7.0710677e-01f), ifft2004);
__m512 ifft2099 = _mm512_fmadd_ps(ifft2096, _mm512_set1_ps(7.0710677e-01f), ifft2088);
__m512 ifft2016 = _mm512_fmsub_ps(ifft2012, _mm512_set1_ps(7.0710677e-01f), ifft2004);
__m512 ifft2100 = _mm512_fmsub_ps(ifft2096, _mm512_set1_ps(7.0710677e-01f), ifft2088);
__m512 ifft2017 = _mm512_add_ps(ifft2013, ifft2014);
__m512 ifft2101 = _mm512_add_ps(ifft2097, ifft2098);
__m512 ifft2018 = _mm512_sub_ps(ifft2013, ifft2014);
__m512 ifft2102 = _mm512_sub_ps(ifft2097, ifft2098);
__m512 ifft2019 = _mm512_add_ps(ifft2015, ifft2016);
__m512 ifft2103 = _mm512_add_ps(ifft2099, ifft2100);
__m512 ifft2020 = _mm512_sub_ps(ifft2015, ifft2016);
__m512 ifft2104 = _mm512_sub_ps(ifft2099, ifft2100);
__m512 ifft2021 = _mm512_fmadd_ps(ifft2017, _mm512_set1_ps(1.5625e-02f), ifft2007);
__m512 ifft2105 = _mm512_fmadd_ps(ifft2101, _mm512_set1_ps(1.5625e-02f), ifft2091);
__m512 ifft2022 = _mm512_fnmadd_ps(ifft2017, _mm512_set1_ps(1.5625e-02f), ifft2007);
__m512 ifft2106 = _mm512_fnmadd_ps(ifft2101, _mm512_set1_ps(1.5625e-02f), ifft2091);
__m512 ifft2023 = _mm512_fmadd_ps(ifft2019, _mm512_set1_ps(1.5625e-02f), ifft2009);
__m512 ifft2107 = _mm512_fmadd_ps(ifft2103, _mm512_set1_ps(1.5625e-02f), ifft2093);
__m512 ifft2024 = _mm512_fnmadd_ps(ifft2019, _mm512_set1_ps(1.5625e-02f), ifft2009);
__m512 ifft2108 = _mm512_fnmadd_ps(ifft2103, _mm512_set1_ps(1.5625e-02f), ifft2093);
__m512 ifft2025 = _mm512_fnmadd_ps(ifft2020, _mm512_set1_ps(1.5625e-02f), ifft2008);
__m512 ifft2109 = _mm512_fnmadd_ps(ifft2104, _mm512_set1_ps(1.5625e-02f), ifft2092);
__m512 ifft2026 = _mm512_fmadd_ps(ifft2020, _mm512_set1_ps(1.5625e-02f), ifft2008);
__m512 ifft2110 = _mm512_fmadd_ps(ifft2104, _mm512_set1_ps(1.5625e-02f), ifft2092);
__m512 ifft2027 = _mm512_fmadd_ps(ifft2018, _mm512_set1_ps(1.5625e-02f), ifft2010);
__m512 ifft2111 = _mm512_fmadd_ps(ifft2102, _mm512_set1_ps(1.5625e-02f), ifft2094);
__m512 ifft2028 = _mm512_fnmadd_ps(ifft2018, _mm512_set1_ps(1.5625e-02f), ifft2010);
__m512 ifft2112 = _mm512_fnmadd_ps(ifft2102, _mm512_set1_ps(1.5625e-02f), ifft2094);
// Output stage for this tile.
// Two transform chains were computed side by side above: ifft2021..ifft2028
// and ifft2105..ifft2112.  Five results of each chain are kept as output
// vectors (dat700..dat704 from the first, dat705..dat709 from the second).
__m512 dat700 = ifft2021;
__m512 dat705 = ifft2105;
__m512 dat701 = ifft2023;
__m512 dat706 = ifft2107;
__m512 dat702 = ifft2025;
__m512 dat707 = ifft2109;
__m512 dat703 = ifft2027;
__m512 dat708 = ifft2111;
__m512 dat704 = ifft2022;
__m512 dat709 = ifft2106;
// The other three results of each chain are not stored.  The void casts have
// no effect at run time; presumably they only silence unused-variable warnings.
(void)ifft2024;
(void)ifft2108;
(void)ifft2026;
(void)ifft2110;
(void)ifft2028;
(void)ifft2112;
// ReLU: max(0, x) in every lane.
dat700 = _mm512_max_ps(_mm512_setzero_ps(), dat700);
dat705 = _mm512_max_ps(_mm512_setzero_ps(), dat705);
dat701 = _mm512_max_ps(_mm512_setzero_ps(), dat701);
dat706 = _mm512_max_ps(_mm512_setzero_ps(), dat706);
dat702 = _mm512_max_ps(_mm512_setzero_ps(), dat702);
dat707 = _mm512_max_ps(_mm512_setzero_ps(), dat707);
dat703 = _mm512_max_ps(_mm512_setzero_ps(), dat703);
dat708 = _mm512_max_ps(_mm512_setzero_ps(), dat708);
dat704 = _mm512_max_ps(_mm512_setzero_ps(), dat704);
dat709 = _mm512_max_ps(_mm512_setzero_ps(), dat709);
// Masked unaligned stores to datPtr2: only the lanes whose mask bit is set are
// written.  Masks: 3 = lanes 0-1, 31 = lanes 0-4, 768 = lanes 8-9,
// 7936 = lanes 8-12.  Each vector is stored twice, once for its low lanes and
// once for its lanes 8 and up, at two widely separated constant offsets.
// Successive vectors of a chain step the constant offset by 448, the same
// factor that multiplies toH7.  The t13 term is multiplied by 0 and therefore
// does not affect the address.
// NOTE(review): offsets look like byte offsets (4*toW7, 448*toH7) -- confirm
// against the declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat700);
_mm512_mask_storeu_ps(datPtr2+52048+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat700);
_mm512_mask_storeu_ps(datPtr2+1840+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat705);
_mm512_mask_storeu_ps(datPtr2+50248+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat705);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat701);
_mm512_mask_storeu_ps(datPtr2+52496+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat701);
_mm512_mask_storeu_ps(datPtr2+2288+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat706);
_mm512_mask_storeu_ps(datPtr2+50696+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat706);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat702);
_mm512_mask_storeu_ps(datPtr2+52944+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat702);
_mm512_mask_storeu_ps(datPtr2+2736+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat707);
_mm512_mask_storeu_ps(datPtr2+51144+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat707);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat703);
_mm512_mask_storeu_ps(datPtr2+53392+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat703);
_mm512_mask_storeu_ps(datPtr2+3184+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat708);
_mm512_mask_storeu_ps(datPtr2+51592+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat708);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 3, dat704);
_mm512_mask_storeu_ps(datPtr2+53840+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 7936, dat704);
_mm512_mask_storeu_ps(datPtr2+3632+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 31, dat709);
_mm512_mask_storeu_ps(datPtr2+52040+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+0*t13, 768, dat709);
// Single pass with t14 fixed at 0 (no loop around it), so the 256*t14 term
// below is always zero.
// NOTE(review): the constant base 512 equals 2*256, so this looks like a peeled
// final step of a 256-stride index -- confirm against the generator.
ptrdiff_t t14 = 0;
// Load 16 vectors from sfPtr3, unaligned.  They form four groups whose constant
// offsets start at 512, 2167296, 4334080 and 6500864 (2166784 apart).  Within a
// group the four loads are 64 apart and are named Re, Im, Re, Im: the first
// Re/Im pair (153..156) feeds one transform chain and the second pair
// (157..160) feeds the other.
// NOTE(review): going by the names these are real and imaginary parts, and the
// 64 spacing suggests byte offsets -- confirm against the declaration of sfPtr3.
__m512 sfRe153 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm153 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe157 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm157 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe154 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm154 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe158 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm158 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe155 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm155 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe159 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm159 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe156 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm156 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfRe160 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
__m512 sfIm160 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k31+768*r8+256*t14);
// First stages of the transform, applied to two chains in lockstep
// (chain one: sf*153..156, chain two: sf*157..160).
//
// Step 1: special handling of the first Re/Im pair only.  Both vectors are
// rearranged with two fixed lane-index patterns and then combined:
//   lanes other than 1 and 9 (mask 65021 = 0xFDFD):
//     ifft2120 = ifft2118*ifft2119 + ifft2114
//     ifft2121 = ifft2116 - ifft2117*ifft2119
//   lanes 1 and 9 keep ifft2118 / ifft2117 unchanged.
// NOTE(review): this looks like unpacking a packed real-input spectrum before
// the complex inverse transform -- confirm against the generator.
__m512i ifft2113 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2114 = _mm512_permutexvar_ps(ifft2113, sfRe153);
__m512 ifft2205 = _mm512_permutexvar_ps(ifft2113, sfRe157);
__m512i ifft2115 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2116 = _mm512_permutexvar_ps(ifft2115, sfRe153);
__m512 ifft2206 = _mm512_permutexvar_ps(ifft2115, sfRe157);
__m512 ifft2117 = _mm512_permutexvar_ps(ifft2113, sfIm153);
__m512 ifft2207 = _mm512_permutexvar_ps(ifft2113, sfIm157);
__m512 ifft2118 = _mm512_permutexvar_ps(ifft2115, sfIm153);
__m512 ifft2208 = _mm512_permutexvar_ps(ifft2115, sfIm157);
__m512 ifft2119 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2120 = _mm512_mask_fmadd_ps(ifft2118, 65021, ifft2119, ifft2114);
__m512 ifft2209 = _mm512_mask_fmadd_ps(ifft2208, 65021, ifft2119, ifft2205);
__m512 ifft2121 = _mm512_mask_fnmadd_ps(ifft2117, 65021, ifft2119, ifft2116);
__m512 ifft2210 = _mm512_mask_fnmadd_ps(ifft2207, 65021, ifft2119, ifft2206);
// Step 2: butterfly between adjacent lanes.  Shuffle immediate 177 swaps each
// even/odd lane pair; with the +1/-1 pattern in ifft2122, even lanes become
// x[i] + x[i+1] and odd lanes become x[i-1] - x[i].
__m512 ifft2122 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2123 = _mm512_fmadd_ps(ifft2120, ifft2122, _mm512_shuffle_ps(ifft2120, ifft2120, 177));
__m512 ifft2211 = _mm512_fmadd_ps(ifft2209, ifft2122, _mm512_shuffle_ps(ifft2209, ifft2209, 177));
__m512 ifft2124 = _mm512_fmadd_ps(ifft2121, ifft2122, _mm512_shuffle_ps(ifft2121, ifft2121, 177));
__m512 ifft2212 = _mm512_fmadd_ps(ifft2210, ifft2122, _mm512_shuffle_ps(ifft2210, ifft2210, 177));
__m512 ifft2125 = _mm512_fmadd_ps(sfRe154, ifft2122, _mm512_shuffle_ps(sfRe154, sfRe154, 177));
__m512 ifft2213 = _mm512_fmadd_ps(sfRe158, ifft2122, _mm512_shuffle_ps(sfRe158, sfRe158, 177));
__m512 ifft2126 = _mm512_fmadd_ps(sfIm154, ifft2122, _mm512_shuffle_ps(sfIm154, sfIm154, 177));
__m512 ifft2214 = _mm512_fmadd_ps(sfIm158, ifft2122, _mm512_shuffle_ps(sfIm158, sfIm158, 177));
__m512 ifft2127 = _mm512_fmadd_ps(sfRe155, ifft2122, _mm512_shuffle_ps(sfRe155, sfRe155, 177));
__m512 ifft2215 = _mm512_fmadd_ps(sfRe159, ifft2122, _mm512_shuffle_ps(sfRe159, sfRe159, 177));
__m512 ifft2128 = _mm512_fmadd_ps(sfIm155, ifft2122, _mm512_shuffle_ps(sfIm155, sfIm155, 177));
__m512 ifft2216 = _mm512_fmadd_ps(sfIm159, ifft2122, _mm512_shuffle_ps(sfIm159, sfIm159, 177));
__m512 ifft2129 = _mm512_fmadd_ps(sfRe156, ifft2122, _mm512_shuffle_ps(sfRe156, sfRe156, 177));
__m512 ifft2217 = _mm512_fmadd_ps(sfRe160, ifft2122, _mm512_shuffle_ps(sfRe160, sfRe160, 177));
__m512 ifft2130 = _mm512_fmadd_ps(sfIm156, ifft2122, _mm512_shuffle_ps(sfIm156, sfIm156, 177));
__m512 ifft2218 = _mm512_fmadd_ps(sfIm160, ifft2122, _mm512_shuffle_ps(sfIm160, sfIm160, 177));
// Step 3: per-lane rotation of every (re, im) pair, done in two halves.
// First half: multiply each vector by c (ifft2131).
// Per 8-lane half, from lane 0 up: c = 1, 1, 1, 0, 1, 0.70710677, 1, -0.70710677.
__m512 ifft2131 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2132 = _mm512_mul_ps(ifft2123, ifft2131);
__m512 ifft2219 = _mm512_mul_ps(ifft2211, ifft2131);
__m512 ifft2133 = _mm512_mul_ps(ifft2124, ifft2131);
__m512 ifft2220 = _mm512_mul_ps(ifft2212, ifft2131);
__m512 ifft2134 = _mm512_mul_ps(ifft2125, ifft2131);
__m512 ifft2221 = _mm512_mul_ps(ifft2213, ifft2131);
__m512 ifft2135 = _mm512_mul_ps(ifft2126, ifft2131);
__m512 ifft2222 = _mm512_mul_ps(ifft2214, ifft2131);
__m512 ifft2136 = _mm512_mul_ps(ifft2127, ifft2131);
__m512 ifft2223 = _mm512_mul_ps(ifft2215, ifft2131);
__m512 ifft2137 = _mm512_mul_ps(ifft2128, ifft2131);
__m512 ifft2224 = _mm512_mul_ps(ifft2216, ifft2131);
__m512 ifft2138 = _mm512_mul_ps(ifft2129, ifft2131);
__m512 ifft2225 = _mm512_mul_ps(ifft2217, ifft2131);
__m512 ifft2139 = _mm512_mul_ps(ifft2130, ifft2131);
__m512 ifft2226 = _mm512_mul_ps(ifft2218, ifft2131);
// Second half: add the cross terms with s (ifft2140), giving
//   new_re = re*c - im*s   (fnmadd)
//   new_im = im*c + re*s   (fmadd)
// Per 8-lane half, from lane 0 up: s = 0, 0, 0, 1, 0, 0.70710677, 0, 0.70710677.
// Every (c, s) pair has unit length.
// NOTE(review): presumably these are the twiddle factors of the lane-wise transform.
__m512 ifft2140 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2141 = _mm512_fnmadd_ps(ifft2124, ifft2140, ifft2132);
__m512 ifft2227 = _mm512_fnmadd_ps(ifft2212, ifft2140, ifft2219);
__m512 ifft2142 = _mm512_fmadd_ps(ifft2123, ifft2140, ifft2133);
__m512 ifft2228 = _mm512_fmadd_ps(ifft2211, ifft2140, ifft2220);
__m512 ifft2143 = _mm512_fnmadd_ps(ifft2126, ifft2140, ifft2134);
__m512 ifft2229 = _mm512_fnmadd_ps(ifft2214, ifft2140, ifft2221);
__m512 ifft2144 = _mm512_fmadd_ps(ifft2125, ifft2140, ifft2135);
__m512 ifft2230 = _mm512_fmadd_ps(ifft2213, ifft2140, ifft2222);
__m512 ifft2145 = _mm512_fnmadd_ps(ifft2128, ifft2140, ifft2136);
__m512 ifft2231 = _mm512_fnmadd_ps(ifft2216, ifft2140, ifft2223);
__m512 ifft2146 = _mm512_fmadd_ps(ifft2127, ifft2140, ifft2137);
__m512 ifft2232 = _mm512_fmadd_ps(ifft2215, ifft2140, ifft2224);
__m512 ifft2147 = _mm512_fnmadd_ps(ifft2130, ifft2140, ifft2138);
__m512 ifft2233 = _mm512_fnmadd_ps(ifft2218, ifft2140, ifft2225);
__m512 ifft2148 = _mm512_fmadd_ps(ifft2129, ifft2140, ifft2139);
__m512 ifft2234 = _mm512_fmadd_ps(ifft2217, ifft2140, ifft2226);
// Butterfly at lane distance 2.  Shuffle immediate 78 swaps the two 2-lane
// halves of every 4-lane group; with the sign pattern in ifft2149, lanes 0-1 of
// each group become x[i] + x[i+2] and lanes 2-3 become x[i-2] - x[i].
__m512 ifft2149 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2150 = _mm512_fmadd_ps(ifft2141, ifft2149, _mm512_shuffle_ps(ifft2141, ifft2141, 78));
__m512 ifft2235 = _mm512_fmadd_ps(ifft2227, ifft2149, _mm512_shuffle_ps(ifft2227, ifft2227, 78));
__m512 ifft2151 = _mm512_fmadd_ps(ifft2142, ifft2149, _mm512_shuffle_ps(ifft2142, ifft2142, 78));
__m512 ifft2236 = _mm512_fmadd_ps(ifft2228, ifft2149, _mm512_shuffle_ps(ifft2228, ifft2228, 78));
__m512 ifft2152 = _mm512_fmadd_ps(ifft2143, ifft2149, _mm512_shuffle_ps(ifft2143, ifft2143, 78));
__m512 ifft2237 = _mm512_fmadd_ps(ifft2229, ifft2149, _mm512_shuffle_ps(ifft2229, ifft2229, 78));
__m512 ifft2153 = _mm512_fmadd_ps(ifft2144, ifft2149, _mm512_shuffle_ps(ifft2144, ifft2144, 78));
__m512 ifft2238 = _mm512_fmadd_ps(ifft2230, ifft2149, _mm512_shuffle_ps(ifft2230, ifft2230, 78));
__m512 ifft2154 = _mm512_fmadd_ps(ifft2145, ifft2149, _mm512_shuffle_ps(ifft2145, ifft2145, 78));
__m512 ifft2239 = _mm512_fmadd_ps(ifft2231, ifft2149, _mm512_shuffle_ps(ifft2231, ifft2231, 78));
__m512 ifft2155 = _mm512_fmadd_ps(ifft2146, ifft2149, _mm512_shuffle_ps(ifft2146, ifft2146, 78));
__m512 ifft2240 = _mm512_fmadd_ps(ifft2232, ifft2149, _mm512_shuffle_ps(ifft2232, ifft2232, 78));
__m512 ifft2156 = _mm512_fmadd_ps(ifft2147, ifft2149, _mm512_shuffle_ps(ifft2147, ifft2147, 78));
__m512 ifft2241 = _mm512_fmadd_ps(ifft2233, ifft2149, _mm512_shuffle_ps(ifft2233, ifft2233, 78));
__m512 ifft2157 = _mm512_fmadd_ps(ifft2148, ifft2149, _mm512_shuffle_ps(ifft2148, ifft2148, 78));
__m512 ifft2242 = _mm512_fmadd_ps(ifft2234, ifft2149, _mm512_shuffle_ps(ifft2234, ifft2234, 78));
// In lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0), replace each (re, im)
// pair by (-im, re), i.e. multiply by i.  All other lanes pass through
// unchanged.  The mask_sub produces the new real part, the mask_mov the new
// imaginary part.
__m512 ifft2158 = _mm512_mask_sub_ps(ifft2150, 49344, _mm512_setzero_ps(), ifft2151);
__m512 ifft2243 = _mm512_mask_sub_ps(ifft2235, 49344, _mm512_setzero_ps(), ifft2236);
__m512 ifft2159 = _mm512_mask_mov_ps(ifft2151, 49344, ifft2150);
__m512 ifft2244 = _mm512_mask_mov_ps(ifft2236, 49344, ifft2235);
__m512 ifft2160 = _mm512_mask_sub_ps(ifft2152, 49344, _mm512_setzero_ps(), ifft2153);
__m512 ifft2245 = _mm512_mask_sub_ps(ifft2237, 49344, _mm512_setzero_ps(), ifft2238);
__m512 ifft2161 = _mm512_mask_mov_ps(ifft2153, 49344, ifft2152);
__m512 ifft2246 = _mm512_mask_mov_ps(ifft2238, 49344, ifft2237);
__m512 ifft2162 = _mm512_mask_sub_ps(ifft2154, 49344, _mm512_setzero_ps(), ifft2155);
__m512 ifft2247 = _mm512_mask_sub_ps(ifft2239, 49344, _mm512_setzero_ps(), ifft2240);
__m512 ifft2163 = _mm512_mask_mov_ps(ifft2155, 49344, ifft2154);
__m512 ifft2248 = _mm512_mask_mov_ps(ifft2240, 49344, ifft2239);
__m512 ifft2164 = _mm512_mask_sub_ps(ifft2156, 49344, _mm512_setzero_ps(), ifft2157);
__m512 ifft2249 = _mm512_mask_sub_ps(ifft2241, 49344, _mm512_setzero_ps(), ifft2242);
__m512 ifft2165 = _mm512_mask_mov_ps(ifft2157, 49344, ifft2156);
__m512 ifft2250 = _mm512_mask_mov_ps(ifft2242, 49344, ifft2241);
// Butterfly at lane distance 4.  _mm512_shuffle_f32x4 with immediate 177 swaps
// neighbouring 128-bit (4-lane) groups; ifft2166 is +1 in lanes 0-3 and 8-11
// and -1 in lanes 4-7 and 12-15, so the low group of each 8-lane half becomes
// a sum and the high group a difference.
// ifft2172 and ifft2256 use fnmsub instead of fmadd, which gives the negated
// result -(x*sign) - swapped(x).
__m512 ifft2166 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2167 = _mm512_fmadd_ps(ifft2158, ifft2166, _mm512_shuffle_f32x4(ifft2158, ifft2158, 177));
__m512 ifft2251 = _mm512_fmadd_ps(ifft2243, ifft2166, _mm512_shuffle_f32x4(ifft2243, ifft2243, 177));
__m512 ifft2168 = _mm512_fmadd_ps(ifft2159, ifft2166, _mm512_shuffle_f32x4(ifft2159, ifft2159, 177));
__m512 ifft2252 = _mm512_fmadd_ps(ifft2244, ifft2166, _mm512_shuffle_f32x4(ifft2244, ifft2244, 177));
__m512 ifft2169 = _mm512_fmadd_ps(ifft2160, ifft2166, _mm512_shuffle_f32x4(ifft2160, ifft2160, 177));
__m512 ifft2253 = _mm512_fmadd_ps(ifft2245, ifft2166, _mm512_shuffle_f32x4(ifft2245, ifft2245, 177));
__m512 ifft2170 = _mm512_fmadd_ps(ifft2161, ifft2166, _mm512_shuffle_f32x4(ifft2161, ifft2161, 177));
__m512 ifft2254 = _mm512_fmadd_ps(ifft2246, ifft2166, _mm512_shuffle_f32x4(ifft2246, ifft2246, 177));
__m512 ifft2171 = _mm512_fmadd_ps(ifft2162, ifft2166, _mm512_shuffle_f32x4(ifft2162, ifft2162, 177));
__m512 ifft2255 = _mm512_fmadd_ps(ifft2247, ifft2166, _mm512_shuffle_f32x4(ifft2247, ifft2247, 177));
__m512 ifft2172 = _mm512_fnmsub_ps(ifft2163, ifft2166, _mm512_shuffle_f32x4(ifft2163, ifft2163, 177));
__m512 ifft2256 = _mm512_fnmsub_ps(ifft2248, ifft2166, _mm512_shuffle_f32x4(ifft2248, ifft2248, 177));
__m512 ifft2173 = _mm512_fmadd_ps(ifft2164, ifft2166, _mm512_shuffle_f32x4(ifft2164, ifft2164, 177));
__m512 ifft2257 = _mm512_fmadd_ps(ifft2249, ifft2166, _mm512_shuffle_f32x4(ifft2249, ifft2249, 177));
__m512 ifft2174 = _mm512_fmadd_ps(ifft2165, ifft2166, _mm512_shuffle_f32x4(ifft2165, ifft2165, 177));
__m512 ifft2258 = _mm512_fmadd_ps(ifft2250, ifft2166, _mm512_shuffle_f32x4(ifft2250, ifft2250, 177));
// From here on there are no lane shuffles: every operation combines whole
// registers lane by lane.  The eight intermediate vectors of each chain
// (ifft2167..ifft2174 and ifft2251..ifft2258) are reduced to eight results
// (ifft2197..ifft2204 and ifft2281..ifft2288) by sums and differences.
//
// First level: pairwise sums and differences.
__m512 ifft2175 = _mm512_add_ps(ifft2167, ifft2168);
__m512 ifft2259 = _mm512_add_ps(ifft2251, ifft2252);
__m512 ifft2176 = _mm512_sub_ps(ifft2167, ifft2168);
__m512 ifft2260 = _mm512_sub_ps(ifft2251, ifft2252);
__m512 ifft2177 = _mm512_sub_ps(ifft2169, ifft2173);
__m512 ifft2261 = _mm512_sub_ps(ifft2253, ifft2257);
__m512 ifft2178 = _mm512_add_ps(ifft2170, ifft2174);
__m512 ifft2262 = _mm512_add_ps(ifft2254, ifft2258);
__m512 ifft2179 = _mm512_add_ps(ifft2169, ifft2173);
__m512 ifft2263 = _mm512_add_ps(ifft2253, ifft2257);
__m512 ifft2180 = _mm512_sub_ps(ifft2170, ifft2174);
__m512 ifft2264 = _mm512_sub_ps(ifft2254, ifft2258);
// Scaling is folded in here: 3.125e-02 = 1/32 and 1.5625e-02 = 1/64.
// NOTE(review): presumably the inverse-transform normalisation -- confirm.
__m512 ifft2181 = _mm512_mul_ps(ifft2171, _mm512_set1_ps(3.125e-02f));
__m512 ifft2265 = _mm512_mul_ps(ifft2255, _mm512_set1_ps(3.125e-02f));
__m512 ifft2182 = _mm512_mul_ps(ifft2172, _mm512_set1_ps(3.125e-02f));
__m512 ifft2266 = _mm512_mul_ps(ifft2256, _mm512_set1_ps(3.125e-02f));
__m512 ifft2183 = _mm512_fmadd_ps(ifft2175, _mm512_set1_ps(1.5625e-02f), ifft2181);
__m512 ifft2267 = _mm512_fmadd_ps(ifft2259, _mm512_set1_ps(1.5625e-02f), ifft2265);
__m512 ifft2184 = _mm512_fmsub_ps(ifft2175, _mm512_set1_ps(1.5625e-02f), ifft2181);
__m512 ifft2268 = _mm512_fmsub_ps(ifft2259, _mm512_set1_ps(1.5625e-02f), ifft2265);
__m512 ifft2185 = _mm512_fmadd_ps(ifft2176, _mm512_set1_ps(1.5625e-02f), ifft2182);
__m512 ifft2269 = _mm512_fmadd_ps(ifft2260, _mm512_set1_ps(1.5625e-02f), ifft2266);
__m512 ifft2186 = _mm512_fmsub_ps(ifft2176, _mm512_set1_ps(1.5625e-02f), ifft2182);
__m512 ifft2270 = _mm512_fmsub_ps(ifft2260, _mm512_set1_ps(1.5625e-02f), ifft2266);
// Terms weighted by 0.70710677 (approximately 1/sqrt(2)).
__m512 ifft2187 = _mm512_add_ps(ifft2177, ifft2178);
__m512 ifft2271 = _mm512_add_ps(ifft2261, ifft2262);
__m512 ifft2188 = _mm512_sub_ps(ifft2177, ifft2178);
__m512 ifft2272 = _mm512_sub_ps(ifft2261, ifft2262);
__m512 ifft2189 = _mm512_fnmadd_ps(ifft2187, _mm512_set1_ps(7.0710677e-01f), ifft2179);
__m512 ifft2273 = _mm512_fnmadd_ps(ifft2271, _mm512_set1_ps(7.0710677e-01f), ifft2263);
__m512 ifft2190 = _mm512_fmadd_ps(ifft2187, _mm512_set1_ps(7.0710677e-01f), ifft2179);
__m512 ifft2274 = _mm512_fmadd_ps(ifft2271, _mm512_set1_ps(7.0710677e-01f), ifft2263);
__m512 ifft2191 = _mm512_fmadd_ps(ifft2188, _mm512_set1_ps(7.0710677e-01f), ifft2180);
__m512 ifft2275 = _mm512_fmadd_ps(ifft2272, _mm512_set1_ps(7.0710677e-01f), ifft2264);
__m512 ifft2192 = _mm512_fmsub_ps(ifft2188, _mm512_set1_ps(7.0710677e-01f), ifft2180);
__m512 ifft2276 = _mm512_fmsub_ps(ifft2272, _mm512_set1_ps(7.0710677e-01f), ifft2264);
__m512 ifft2193 = _mm512_add_ps(ifft2189, ifft2190);
__m512 ifft2277 = _mm512_add_ps(ifft2273, ifft2274);
__m512 ifft2194 = _mm512_sub_ps(ifft2189, ifft2190);
__m512 ifft2278 = _mm512_sub_ps(ifft2273, ifft2274);
__m512 ifft2195 = _mm512_add_ps(ifft2191, ifft2192);
__m512 ifft2279 = _mm512_add_ps(ifft2275, ifft2276);
__m512 ifft2196 = _mm512_sub_ps(ifft2191, ifft2192);
__m512 ifft2280 = _mm512_sub_ps(ifft2275, ifft2276);
// Final combination: each result is (scaled partial) +/- (1/64) * (partial).
__m512 ifft2197 = _mm512_fmadd_ps(ifft2193, _mm512_set1_ps(1.5625e-02f), ifft2183);
__m512 ifft2281 = _mm512_fmadd_ps(ifft2277, _mm512_set1_ps(1.5625e-02f), ifft2267);
__m512 ifft2198 = _mm512_fnmadd_ps(ifft2193, _mm512_set1_ps(1.5625e-02f), ifft2183);
__m512 ifft2282 = _mm512_fnmadd_ps(ifft2277, _mm512_set1_ps(1.5625e-02f), ifft2267);
__m512 ifft2199 = _mm512_fmadd_ps(ifft2195, _mm512_set1_ps(1.5625e-02f), ifft2185);
__m512 ifft2283 = _mm512_fmadd_ps(ifft2279, _mm512_set1_ps(1.5625e-02f), ifft2269);
__m512 ifft2200 = _mm512_fnmadd_ps(ifft2195, _mm512_set1_ps(1.5625e-02f), ifft2185);
__m512 ifft2284 = _mm512_fnmadd_ps(ifft2279, _mm512_set1_ps(1.5625e-02f), ifft2269);
__m512 ifft2201 = _mm512_fnmadd_ps(ifft2196, _mm512_set1_ps(1.5625e-02f), ifft2184);
__m512 ifft2285 = _mm512_fnmadd_ps(ifft2280, _mm512_set1_ps(1.5625e-02f), ifft2268);
__m512 ifft2202 = _mm512_fmadd_ps(ifft2196, _mm512_set1_ps(1.5625e-02f), ifft2184);
__m512 ifft2286 = _mm512_fmadd_ps(ifft2280, _mm512_set1_ps(1.5625e-02f), ifft2268);
__m512 ifft2203 = _mm512_fmadd_ps(ifft2194, _mm512_set1_ps(1.5625e-02f), ifft2186);
__m512 ifft2287 = _mm512_fmadd_ps(ifft2278, _mm512_set1_ps(1.5625e-02f), ifft2270);
__m512 ifft2204 = _mm512_fnmadd_ps(ifft2194, _mm512_set1_ps(1.5625e-02f), ifft2186);
__m512 ifft2288 = _mm512_fnmadd_ps(ifft2278, _mm512_set1_ps(1.5625e-02f), ifft2270);
// Output stage for the t14 pass.
// Keep five of the eight results of each chain (dat710..dat714 from the first
// chain, dat715..dat719 from the second).
__m512 dat710 = ifft2197;
__m512 dat715 = ifft2281;
__m512 dat711 = ifft2199;
__m512 dat716 = ifft2283;
__m512 dat712 = ifft2201;
__m512 dat717 = ifft2285;
__m512 dat713 = ifft2203;
__m512 dat718 = ifft2287;
__m512 dat714 = ifft2198;
__m512 dat719 = ifft2282;
// The remaining results are not stored.  The void casts have no run-time
// effect; presumably they only silence unused-variable warnings.
(void)ifft2200;
(void)ifft2284;
(void)ifft2202;
(void)ifft2286;
(void)ifft2204;
(void)ifft2288;
// Merge matching vectors of the two chains so each store below writes one
// contiguous run of 10 lanes.  In a two-source permute, indices 0-15 pick from
// the first operand and 16-31 from the second.
//   pm21: low 10 lanes = first operand lanes 0-4, then second operand lanes 0-4.
//   pm22: low 10 lanes = second operand lanes 8-12, then first operand lanes 8-12.
// Lanes 10-15 of the packed vectors are filled as well but never stored.
__m512i pm21 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack101 = _mm512_permutex2var_ps(dat710, pm21, dat715);
__m512i pm22 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack102 = _mm512_permutex2var_ps(dat710, pm22, dat715);
__m512 pack103 = _mm512_permutex2var_ps(dat711, pm21, dat716);
__m512 pack104 = _mm512_permutex2var_ps(dat711, pm22, dat716);
__m512 pack105 = _mm512_permutex2var_ps(dat712, pm21, dat717);
__m512 pack106 = _mm512_permutex2var_ps(dat712, pm22, dat717);
__m512 pack107 = _mm512_permutex2var_ps(dat713, pm21, dat718);
__m512 pack108 = _mm512_permutex2var_ps(dat713, pm22, dat718);
__m512 pack109 = _mm512_permutex2var_ps(dat714, pm21, dat719);
__m512 pack110 = _mm512_permutex2var_ps(dat714, pm22, dat719);
// ReLU: max(0, x) in every lane.
pack101 = _mm512_max_ps(_mm512_setzero_ps(), pack101);
pack102 = _mm512_max_ps(_mm512_setzero_ps(), pack102);
pack103 = _mm512_max_ps(_mm512_setzero_ps(), pack103);
pack104 = _mm512_max_ps(_mm512_setzero_ps(), pack104);
pack105 = _mm512_max_ps(_mm512_setzero_ps(), pack105);
pack106 = _mm512_max_ps(_mm512_setzero_ps(), pack106);
pack107 = _mm512_max_ps(_mm512_setzero_ps(), pack107);
pack108 = _mm512_max_ps(_mm512_setzero_ps(), pack108);
pack109 = _mm512_max_ps(_mm512_setzero_ps(), pack109);
pack110 = _mm512_max_ps(_mm512_setzero_ps(), pack110);
// Store lanes 0-9 of each packed vector (mask 1023 = 0x3FF) to datPtr2.  The
// pm21 packs go to constant offsets 1860, 2308, ... and the pm22 packs to
// 52100, 52548, ...; both sequences step by 448, the factor that multiplies
// toH7.  t14 is assigned 0 before this pass, so the 40*t14 term contributes
// nothing.
// NOTE(review): offsets look like byte offsets -- confirm against the
// declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack101);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack102);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack103);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack104);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack105);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack106);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack107);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack108);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack109);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k31+100480*r8+448*toH7+4*toW7+40*t14, 1023, pack110);
// The two braces below close scopes opened before this point.
// NOTE(review): presumably the r8 and k31 loops of the preceding section -- confirm.
}
}
// Return once j5 has reached last2; otherwise advance j5 and set rel5 to 8
// before leaving the enclosing scope.
if (j5 >= last2) return;
++j5;
rel5 = 8;
}
// Next section: toH8 is base5+10.  toW8 starts at -225+30*rel5 and advances by
// 30 per pass of the outer loop, which runs while j5 <= jj14, where jj14 is
// 10-rel5 plus the value of j5 on entry.  The loop header does not modify j5;
// the loop body extends beyond this excerpt, so where j5 advances cannot be
// seen here.
ptrdiff_t toH8 = base5+10;
ptrdiff_t toW8 = -225+30*rel5;
ptrdiff_t jj14 = 10-rel5+j5;
for (; j5 <= jj14; toW8 += 30) {
// k32 runs from 16*w21 up to (but not including) 16.
ptrdiff_t k32 = 16*w21;
for (; k32 != 16; ++k32) {
// r9 takes the values 0 and 1.
ptrdiff_t r9 = 0;
for (; r9 != 2; ++r9) {
// t15 takes the values 0, 1 and 2.
ptrdiff_t t15 = 0;
for (; t15 < 3; ++t15) {
// Load 16 vectors from sfPtr3, unaligned, for this (i9, j5, k32, r9, t15) step.
// They form four groups whose constant offsets start at 0, 2166784, 4333568
// and 6500352 (2166784 apart).  Within a group the four loads are 64 apart and
// are named Re, Im, Re, Im: the first Re/Im pair (161..164) feeds one transform
// chain and the second pair (165..168) feeds the other.  t15 steps the address
// by 256.
// NOTE(review): going by the names these are real and imaginary parts, and the
// 64 spacing suggests byte offsets -- confirm against the declaration of sfPtr3.
__m512 sfRe161 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm161 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe165 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm165 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe162 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm162 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe166 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm166 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe163 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm163 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe167 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm167 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe164 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm164 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfRe168 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512 sfIm168 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k32+768*r9+256*t15);
__m512i ifft2289 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2290 = _mm512_permutexvar_ps(ifft2289, sfRe161);
__m512 ifft2381 = _mm512_permutexvar_ps(ifft2289, sfRe165);
__m512i ifft2291 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2292 = _mm512_permutexvar_ps(ifft2291, sfRe161);
__m512 ifft2382 = _mm512_permutexvar_ps(ifft2291, sfRe165);
__m512 ifft2293 = _mm512_permutexvar_ps(ifft2289, sfIm161);
__m512 ifft2383 = _mm512_permutexvar_ps(ifft2289, sfIm165);
__m512 ifft2294 = _mm512_permutexvar_ps(ifft2291, sfIm161);
__m512 ifft2384 = _mm512_permutexvar_ps(ifft2291, sfIm165);
__m512 ifft2295 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2296 = _mm512_mask_fmadd_ps(ifft2294, 65021, ifft2295, ifft2290);
__m512 ifft2385 = _mm512_mask_fmadd_ps(ifft2384, 65021, ifft2295, ifft2381);
__m512 ifft2297 = _mm512_mask_fnmadd_ps(ifft2293, 65021, ifft2295, ifft2292);
__m512 ifft2386 = _mm512_mask_fnmadd_ps(ifft2383, 65021, ifft2295, ifft2382);
__m512 ifft2298 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2299 = _mm512_fmadd_ps(ifft2296, ifft2298, _mm512_shuffle_ps(ifft2296, ifft2296, 177));
__m512 ifft2387 = _mm512_fmadd_ps(ifft2385, ifft2298, _mm512_shuffle_ps(ifft2385, ifft2385, 177));
__m512 ifft2300 = _mm512_fmadd_ps(ifft2297, ifft2298, _mm512_shuffle_ps(ifft2297, ifft2297, 177));
__m512 ifft2388 = _mm512_fmadd_ps(ifft2386, ifft2298, _mm512_shuffle_ps(ifft2386, ifft2386, 177));
__m512 ifft2301 = _mm512_fmadd_ps(sfRe162, ifft2298, _mm512_shuffle_ps(sfRe162, sfRe162, 177));
__m512 ifft2389 = _mm512_fmadd_ps(sfRe166, ifft2298, _mm512_shuffle_ps(sfRe166, sfRe166, 177));
__m512 ifft2302 = _mm512_fmadd_ps(sfIm162, ifft2298, _mm512_shuffle_ps(sfIm162, sfIm162, 177));
__m512 ifft2390 = _mm512_fmadd_ps(sfIm166, ifft2298, _mm512_shuffle_ps(sfIm166, sfIm166, 177));
__m512 ifft2303 = _mm512_fmadd_ps(sfRe163, ifft2298, _mm512_shuffle_ps(sfRe163, sfRe163, 177));
__m512 ifft2391 = _mm512_fmadd_ps(sfRe167, ifft2298, _mm512_shuffle_ps(sfRe167, sfRe167, 177));
__m512 ifft2304 = _mm512_fmadd_ps(sfIm163, ifft2298, _mm512_shuffle_ps(sfIm163, sfIm163, 177));
__m512 ifft2392 = _mm512_fmadd_ps(sfIm167, ifft2298, _mm512_shuffle_ps(sfIm167, sfIm167, 177));
__m512 ifft2305 = _mm512_fmadd_ps(sfRe164, ifft2298, _mm512_shuffle_ps(sfRe164, sfRe164, 177));
__m512 ifft2393 = _mm512_fmadd_ps(sfRe168, ifft2298, _mm512_shuffle_ps(sfRe168, sfRe168, 177));
__m512 ifft2306 = _mm512_fmadd_ps(sfIm164, ifft2298, _mm512_shuffle_ps(sfIm164, sfIm164, 177));
__m512 ifft2394 = _mm512_fmadd_ps(sfIm168, ifft2298, _mm512_shuffle_ps(sfIm168, sfIm168, 177));
__m512 ifft2307 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2308 = _mm512_mul_ps(ifft2299, ifft2307);
__m512 ifft2395 = _mm512_mul_ps(ifft2387, ifft2307);
__m512 ifft2309 = _mm512_mul_ps(ifft2300, ifft2307);
__m512 ifft2396 = _mm512_mul_ps(ifft2388, ifft2307);
__m512 ifft2310 = _mm512_mul_ps(ifft2301, ifft2307);
__m512 ifft2397 = _mm512_mul_ps(ifft2389, ifft2307);
__m512 ifft2311 = _mm512_mul_ps(ifft2302, ifft2307);
__m512 ifft2398 = _mm512_mul_ps(ifft2390, ifft2307);
__m512 ifft2312 = _mm512_mul_ps(ifft2303, ifft2307);
__m512 ifft2399 = _mm512_mul_ps(ifft2391, ifft2307);
__m512 ifft2313 = _mm512_mul_ps(ifft2304, ifft2307);
__m512 ifft2400 = _mm512_mul_ps(ifft2392, ifft2307);
__m512 ifft2314 = _mm512_mul_ps(ifft2305, ifft2307);
__m512 ifft2401 = _mm512_mul_ps(ifft2393, ifft2307);
__m512 ifft2315 = _mm512_mul_ps(ifft2306, ifft2307);
__m512 ifft2402 = _mm512_mul_ps(ifft2394, ifft2307);
__m512 ifft2316 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2317 = _mm512_fnmadd_ps(ifft2300, ifft2316, ifft2308);
__m512 ifft2403 = _mm512_fnmadd_ps(ifft2388, ifft2316, ifft2395);
__m512 ifft2318 = _mm512_fmadd_ps(ifft2299, ifft2316, ifft2309);
__m512 ifft2404 = _mm512_fmadd_ps(ifft2387, ifft2316, ifft2396);
__m512 ifft2319 = _mm512_fnmadd_ps(ifft2302, ifft2316, ifft2310);
__m512 ifft2405 = _mm512_fnmadd_ps(ifft2390, ifft2316, ifft2397);
__m512 ifft2320 = _mm512_fmadd_ps(ifft2301, ifft2316, ifft2311);
__m512 ifft2406 = _mm512_fmadd_ps(ifft2389, ifft2316, ifft2398);
__m512 ifft2321 = _mm512_fnmadd_ps(ifft2304, ifft2316, ifft2312);
__m512 ifft2407 = _mm512_fnmadd_ps(ifft2392, ifft2316, ifft2399);
__m512 ifft2322 = _mm512_fmadd_ps(ifft2303, ifft2316, ifft2313);
__m512 ifft2408 = _mm512_fmadd_ps(ifft2391, ifft2316, ifft2400);
__m512 ifft2323 = _mm512_fnmadd_ps(ifft2306, ifft2316, ifft2314);
__m512 ifft2409 = _mm512_fnmadd_ps(ifft2394, ifft2316, ifft2401);
__m512 ifft2324 = _mm512_fmadd_ps(ifft2305, ifft2316, ifft2315);
__m512 ifft2410 = _mm512_fmadd_ps(ifft2393, ifft2316, ifft2402);
__m512 ifft2325 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2326 = _mm512_fmadd_ps(ifft2317, ifft2325, _mm512_shuffle_ps(ifft2317, ifft2317, 78));
__m512 ifft2411 = _mm512_fmadd_ps(ifft2403, ifft2325, _mm512_shuffle_ps(ifft2403, ifft2403, 78));
__m512 ifft2327 = _mm512_fmadd_ps(ifft2318, ifft2325, _mm512_shuffle_ps(ifft2318, ifft2318, 78));
__m512 ifft2412 = _mm512_fmadd_ps(ifft2404, ifft2325, _mm512_shuffle_ps(ifft2404, ifft2404, 78));
__m512 ifft2328 = _mm512_fmadd_ps(ifft2319, ifft2325, _mm512_shuffle_ps(ifft2319, ifft2319, 78));
__m512 ifft2413 = _mm512_fmadd_ps(ifft2405, ifft2325, _mm512_shuffle_ps(ifft2405, ifft2405, 78));
__m512 ifft2329 = _mm512_fmadd_ps(ifft2320, ifft2325, _mm512_shuffle_ps(ifft2320, ifft2320, 78));
__m512 ifft2414 = _mm512_fmadd_ps(ifft2406, ifft2325, _mm512_shuffle_ps(ifft2406, ifft2406, 78));
__m512 ifft2330 = _mm512_fmadd_ps(ifft2321, ifft2325, _mm512_shuffle_ps(ifft2321, ifft2321, 78));
__m512 ifft2415 = _mm512_fmadd_ps(ifft2407, ifft2325, _mm512_shuffle_ps(ifft2407, ifft2407, 78));
__m512 ifft2331 = _mm512_fmadd_ps(ifft2322, ifft2325, _mm512_shuffle_ps(ifft2322, ifft2322, 78));
__m512 ifft2416 = _mm512_fmadd_ps(ifft2408, ifft2325, _mm512_shuffle_ps(ifft2408, ifft2408, 78));
__m512 ifft2332 = _mm512_fmadd_ps(ifft2323, ifft2325, _mm512_shuffle_ps(ifft2323, ifft2323, 78));
__m512 ifft2417 = _mm512_fmadd_ps(ifft2409, ifft2325, _mm512_shuffle_ps(ifft2409, ifft2409, 78));
__m512 ifft2333 = _mm512_fmadd_ps(ifft2324, ifft2325, _mm512_shuffle_ps(ifft2324, ifft2324, 78));
__m512 ifft2418 = _mm512_fmadd_ps(ifft2410, ifft2325, _mm512_shuffle_ps(ifft2410, ifft2410, 78));
__m512 ifft2334 = _mm512_mask_sub_ps(ifft2326, 49344, _mm512_setzero_ps(), ifft2327);
__m512 ifft2419 = _mm512_mask_sub_ps(ifft2411, 49344, _mm512_setzero_ps(), ifft2412);
__m512 ifft2335 = _mm512_mask_mov_ps(ifft2327, 49344, ifft2326);
__m512 ifft2420 = _mm512_mask_mov_ps(ifft2412, 49344, ifft2411);
__m512 ifft2336 = _mm512_mask_sub_ps(ifft2328, 49344, _mm512_setzero_ps(), ifft2329);
__m512 ifft2421 = _mm512_mask_sub_ps(ifft2413, 49344, _mm512_setzero_ps(), ifft2414);
__m512 ifft2337 = _mm512_mask_mov_ps(ifft2329, 49344, ifft2328);
__m512 ifft2422 = _mm512_mask_mov_ps(ifft2414, 49344, ifft2413);
__m512 ifft2338 = _mm512_mask_sub_ps(ifft2330, 49344, _mm512_setzero_ps(), ifft2331);
__m512 ifft2423 = _mm512_mask_sub_ps(ifft2415, 49344, _mm512_setzero_ps(), ifft2416);
__m512 ifft2339 = _mm512_mask_mov_ps(ifft2331, 49344, ifft2330);
__m512 ifft2424 = _mm512_mask_mov_ps(ifft2416, 49344, ifft2415);
__m512 ifft2340 = _mm512_mask_sub_ps(ifft2332, 49344, _mm512_setzero_ps(), ifft2333);
__m512 ifft2425 = _mm512_mask_sub_ps(ifft2417, 49344, _mm512_setzero_ps(), ifft2418);
__m512 ifft2341 = _mm512_mask_mov_ps(ifft2333, 49344, ifft2332);
__m512 ifft2426 = _mm512_mask_mov_ps(ifft2418, 49344, ifft2417);
__m512 ifft2342 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2343 = _mm512_fmadd_ps(ifft2334, ifft2342, _mm512_shuffle_f32x4(ifft2334, ifft2334, 177));
__m512 ifft2427 = _mm512_fmadd_ps(ifft2419, ifft2342, _mm512_shuffle_f32x4(ifft2419, ifft2419, 177));
__m512 ifft2344 = _mm512_fmadd_ps(ifft2335, ifft2342, _mm512_shuffle_f32x4(ifft2335, ifft2335, 177));
__m512 ifft2428 = _mm512_fmadd_ps(ifft2420, ifft2342, _mm512_shuffle_f32x4(ifft2420, ifft2420, 177));
__m512 ifft2345 = _mm512_fmadd_ps(ifft2336, ifft2342, _mm512_shuffle_f32x4(ifft2336, ifft2336, 177));
__m512 ifft2429 = _mm512_fmadd_ps(ifft2421, ifft2342, _mm512_shuffle_f32x4(ifft2421, ifft2421, 177));
__m512 ifft2346 = _mm512_fmadd_ps(ifft2337, ifft2342, _mm512_shuffle_f32x4(ifft2337, ifft2337, 177));
__m512 ifft2430 = _mm512_fmadd_ps(ifft2422, ifft2342, _mm512_shuffle_f32x4(ifft2422, ifft2422, 177));
__m512 ifft2347 = _mm512_fmadd_ps(ifft2338, ifft2342, _mm512_shuffle_f32x4(ifft2338, ifft2338, 177));
__m512 ifft2431 = _mm512_fmadd_ps(ifft2423, ifft2342, _mm512_shuffle_f32x4(ifft2423, ifft2423, 177));
__m512 ifft2348 = _mm512_fnmsub_ps(ifft2339, ifft2342, _mm512_shuffle_f32x4(ifft2339, ifft2339, 177));
__m512 ifft2432 = _mm512_fnmsub_ps(ifft2424, ifft2342, _mm512_shuffle_f32x4(ifft2424, ifft2424, 177));
__m512 ifft2349 = _mm512_fmadd_ps(ifft2340, ifft2342, _mm512_shuffle_f32x4(ifft2340, ifft2340, 177));
__m512 ifft2433 = _mm512_fmadd_ps(ifft2425, ifft2342, _mm512_shuffle_f32x4(ifft2425, ifft2425, 177));
__m512 ifft2350 = _mm512_fmadd_ps(ifft2341, ifft2342, _mm512_shuffle_f32x4(ifft2341, ifft2341, 177));
__m512 ifft2434 = _mm512_fmadd_ps(ifft2426, ifft2342, _mm512_shuffle_f32x4(ifft2426, ifft2426, 177));
__m512 ifft2351 = _mm512_add_ps(ifft2343, ifft2344);
__m512 ifft2435 = _mm512_add_ps(ifft2427, ifft2428);
__m512 ifft2352 = _mm512_sub_ps(ifft2343, ifft2344);
__m512 ifft2436 = _mm512_sub_ps(ifft2427, ifft2428);
__m512 ifft2353 = _mm512_sub_ps(ifft2345, ifft2349);
__m512 ifft2437 = _mm512_sub_ps(ifft2429, ifft2433);
__m512 ifft2354 = _mm512_add_ps(ifft2346, ifft2350);
__m512 ifft2438 = _mm512_add_ps(ifft2430, ifft2434);
__m512 ifft2355 = _mm512_add_ps(ifft2345, ifft2349);
__m512 ifft2439 = _mm512_add_ps(ifft2429, ifft2433);
__m512 ifft2356 = _mm512_sub_ps(ifft2346, ifft2350);
__m512 ifft2440 = _mm512_sub_ps(ifft2430, ifft2434);
__m512 ifft2357 = _mm512_mul_ps(ifft2347, _mm512_set1_ps(3.125e-02f));
__m512 ifft2441 = _mm512_mul_ps(ifft2431, _mm512_set1_ps(3.125e-02f));
__m512 ifft2358 = _mm512_mul_ps(ifft2348, _mm512_set1_ps(3.125e-02f));
__m512 ifft2442 = _mm512_mul_ps(ifft2432, _mm512_set1_ps(3.125e-02f));
__m512 ifft2359 = _mm512_fmadd_ps(ifft2351, _mm512_set1_ps(1.5625e-02f), ifft2357);
__m512 ifft2443 = _mm512_fmadd_ps(ifft2435, _mm512_set1_ps(1.5625e-02f), ifft2441);
__m512 ifft2360 = _mm512_fmsub_ps(ifft2351, _mm512_set1_ps(1.5625e-02f), ifft2357);
__m512 ifft2444 = _mm512_fmsub_ps(ifft2435, _mm512_set1_ps(1.5625e-02f), ifft2441);
__m512 ifft2361 = _mm512_fmadd_ps(ifft2352, _mm512_set1_ps(1.5625e-02f), ifft2358);
__m512 ifft2445 = _mm512_fmadd_ps(ifft2436, _mm512_set1_ps(1.5625e-02f), ifft2442);
__m512 ifft2362 = _mm512_fmsub_ps(ifft2352, _mm512_set1_ps(1.5625e-02f), ifft2358);
__m512 ifft2446 = _mm512_fmsub_ps(ifft2436, _mm512_set1_ps(1.5625e-02f), ifft2442);
__m512 ifft2363 = _mm512_add_ps(ifft2353, ifft2354);
__m512 ifft2447 = _mm512_add_ps(ifft2437, ifft2438);
__m512 ifft2364 = _mm512_sub_ps(ifft2353, ifft2354);
__m512 ifft2448 = _mm512_sub_ps(ifft2437, ifft2438);
__m512 ifft2365 = _mm512_fnmadd_ps(ifft2363, _mm512_set1_ps(7.0710677e-01f), ifft2355);
__m512 ifft2449 = _mm512_fnmadd_ps(ifft2447, _mm512_set1_ps(7.0710677e-01f), ifft2439);
__m512 ifft2366 = _mm512_fmadd_ps(ifft2363, _mm512_set1_ps(7.0710677e-01f), ifft2355);
__m512 ifft2450 = _mm512_fmadd_ps(ifft2447, _mm512_set1_ps(7.0710677e-01f), ifft2439);
__m512 ifft2367 = _mm512_fmadd_ps(ifft2364, _mm512_set1_ps(7.0710677e-01f), ifft2356);
__m512 ifft2451 = _mm512_fmadd_ps(ifft2448, _mm512_set1_ps(7.0710677e-01f), ifft2440);
__m512 ifft2368 = _mm512_fmsub_ps(ifft2364, _mm512_set1_ps(7.0710677e-01f), ifft2356);
__m512 ifft2452 = _mm512_fmsub_ps(ifft2448, _mm512_set1_ps(7.0710677e-01f), ifft2440);
__m512 ifft2369 = _mm512_add_ps(ifft2365, ifft2366);
__m512 ifft2453 = _mm512_add_ps(ifft2449, ifft2450);
__m512 ifft2370 = _mm512_sub_ps(ifft2365, ifft2366);
__m512 ifft2454 = _mm512_sub_ps(ifft2449, ifft2450);
__m512 ifft2371 = _mm512_add_ps(ifft2367, ifft2368);
__m512 ifft2455 = _mm512_add_ps(ifft2451, ifft2452);
__m512 ifft2372 = _mm512_sub_ps(ifft2367, ifft2368);
__m512 ifft2456 = _mm512_sub_ps(ifft2451, ifft2452);
__m512 ifft2373 = _mm512_fmadd_ps(ifft2369, _mm512_set1_ps(1.5625e-02f), ifft2359);
__m512 ifft2457 = _mm512_fmadd_ps(ifft2453, _mm512_set1_ps(1.5625e-02f), ifft2443);
__m512 ifft2374 = _mm512_fnmadd_ps(ifft2369, _mm512_set1_ps(1.5625e-02f), ifft2359);
__m512 ifft2458 = _mm512_fnmadd_ps(ifft2453, _mm512_set1_ps(1.5625e-02f), ifft2443);
__m512 ifft2375 = _mm512_fmadd_ps(ifft2371, _mm512_set1_ps(1.5625e-02f), ifft2361);
__m512 ifft2459 = _mm512_fmadd_ps(ifft2455, _mm512_set1_ps(1.5625e-02f), ifft2445);
__m512 ifft2376 = _mm512_fnmadd_ps(ifft2371, _mm512_set1_ps(1.5625e-02f), ifft2361);
__m512 ifft2460 = _mm512_fnmadd_ps(ifft2455, _mm512_set1_ps(1.5625e-02f), ifft2445);
__m512 ifft2377 = _mm512_fnmadd_ps(ifft2372, _mm512_set1_ps(1.5625e-02f), ifft2360);
__m512 ifft2461 = _mm512_fnmadd_ps(ifft2456, _mm512_set1_ps(1.5625e-02f), ifft2444);
__m512 ifft2378 = _mm512_fmadd_ps(ifft2372, _mm512_set1_ps(1.5625e-02f), ifft2360);
__m512 ifft2462 = _mm512_fmadd_ps(ifft2456, _mm512_set1_ps(1.5625e-02f), ifft2444);
__m512 ifft2379 = _mm512_fmadd_ps(ifft2370, _mm512_set1_ps(1.5625e-02f), ifft2362);
__m512 ifft2463 = _mm512_fmadd_ps(ifft2454, _mm512_set1_ps(1.5625e-02f), ifft2446);
__m512 ifft2380 = _mm512_fnmadd_ps(ifft2370, _mm512_set1_ps(1.5625e-02f), ifft2362);
__m512 ifft2464 = _mm512_fnmadd_ps(ifft2454, _mm512_set1_ps(1.5625e-02f), ifft2446);
// Output stage for this tile: pick ten of the sixteen transform results,
// repack them, clamp at zero and write them out with masked stores.
// dat720..dat724 alias the results of one chain, dat725..dat729 the
// corresponding results of the other chain.
__m512 dat720 = ifft2373;
__m512 dat725 = ifft2457;
__m512 dat721 = ifft2375;
__m512 dat726 = ifft2459;
__m512 dat722 = ifft2377;
__m512 dat727 = ifft2461;
__m512 dat723 = ifft2379;
__m512 dat728 = ifft2463;
__m512 dat724 = ifft2374;
__m512 dat729 = ifft2458;
// The remaining six results are not used below; the (void) casts only
// reference them (presumably to avoid unused-variable warnings).
(void)ifft2376;
(void)ifft2460;
(void)ifft2378;
(void)ifft2462;
(void)ifft2380;
(void)ifft2464;
// Index tables for _mm512_permutex2var_ps(a, idx, b): bit 4 of each index
// selects the source (0 = a, 1 = b), bits 0-3 select the element.
// _mm512_set_epi32 lists lanes from 15 down to 0.
// pm23: lanes 0-4 <- a[0..4], lanes 5-9 <- b[0..4] (lanes 10-15 repeat).
__m512i pm23 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack111 = _mm512_permutex2var_ps(dat720, pm23, dat725);
// pm24: lanes 0-4 <- b[8..12], lanes 5-9 <- a[8..12] (lanes 10-15 repeat).
__m512i pm24 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack112 = _mm512_permutex2var_ps(dat720, pm24, dat725);
__m512 pack113 = _mm512_permutex2var_ps(dat721, pm23, dat726);
__m512 pack114 = _mm512_permutex2var_ps(dat721, pm24, dat726);
__m512 pack115 = _mm512_permutex2var_ps(dat722, pm23, dat727);
__m512 pack116 = _mm512_permutex2var_ps(dat722, pm24, dat727);
__m512 pack117 = _mm512_permutex2var_ps(dat723, pm23, dat728);
__m512 pack118 = _mm512_permutex2var_ps(dat723, pm24, dat728);
__m512 pack119 = _mm512_permutex2var_ps(dat724, pm23, dat729);
__m512 pack120 = _mm512_permutex2var_ps(dat724, pm24, dat729);
// Elementwise max with zero (ReLU-style clamp) on every packed vector.
pack111 = _mm512_max_ps(_mm512_setzero_ps(), pack111);
pack112 = _mm512_max_ps(_mm512_setzero_ps(), pack112);
pack113 = _mm512_max_ps(_mm512_setzero_ps(), pack113);
pack114 = _mm512_max_ps(_mm512_setzero_ps(), pack114);
pack115 = _mm512_max_ps(_mm512_setzero_ps(), pack115);
pack116 = _mm512_max_ps(_mm512_setzero_ps(), pack116);
pack117 = _mm512_max_ps(_mm512_setzero_ps(), pack117);
pack118 = _mm512_max_ps(_mm512_setzero_ps(), pack118);
pack119 = _mm512_max_ps(_mm512_setzero_ps(), pack119);
pack120 = _mm512_max_ps(_mm512_setzero_ps(), pack120);
// Mask 1023 (0x3FF) writes only lanes 0-9 of each vector.
// The pm23 packs go to constant offsets 0, 448, 896, 1344, 1792 (steps of
// 448, the same coefficient that multiplies toH8); each pm24 pack goes
// 50240 further, which is half of the 100480 coefficient of r9.
// NOTE(review): the 4*toW8 and 40*t15 terms look like byte offsets of
// 4-byte floats (40 = 10 lanes * 4) -- confirm against datPtr2's declaration.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack111);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack112);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack113);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack114);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack115);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack116);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack117);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack118);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack119);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k32+100480*r9+448*toH8+4*toW8+40*t15, 1023, pack120);
}
}
}
if (j5 >= last2) return;
++j5;
}
if (j5 >= 84) break;
rel5 = 11;
}
if (rel5 < 16) {
if (rel5 < 12) {
ptrdiff_t toH9 = base5+10;
ptrdiff_t toW9 = 105;
ptrdiff_t k33 = 16*w21;
for (; k33 != 16; ++k33) {
ptrdiff_t r10 = 0;
for (; r10 != 2; ++r10) {
// Load the sixteen input vectors for this tile from sfPtr3.
// t16 is fixed at 0 here, so the 256*t16 term contributes nothing.
ptrdiff_t t16 = 0;
// Four groups at constant offsets 0, 2166784, 4333568, 6500352 (multiples of
// 2166784; four of them equal the 8667136 coefficient of i9). Each group
// holds four vectors at +0, +64, +128, +192, named Re, Im, Re, Im.
// NOTE(review): the 64-unit step matches one 64-byte __m512 if sfPtr3 is a
// byte pointer, and the Re/Im names suggest real/imaginary parts -- confirm
// both against the declaration and the producer of this buffer.
__m512 sfRe169 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm169 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe173 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm173 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe170 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm170 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe174 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm174 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe171 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm171 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe175 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm175 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe172 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm172 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfRe176 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
__m512 sfIm176 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k33+768*r10+256*t16);
// First step on the first Re/Im pair of each chain (sfRe169/sfIm169 and
// sfRe173/sfIm173): rearrange lanes with two index tables, then combine the
// rearranged vectors with a masked multiply-add.
// Every statement appears twice, once per chain.
// _mm512_set_epi32 lists lanes from 15 down to 0.
__m512i ifft2465 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2466 = _mm512_permutexvar_ps(ifft2465, sfRe169);
__m512 ifft2557 = _mm512_permutexvar_ps(ifft2465, sfRe173);
__m512i ifft2467 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2468 = _mm512_permutexvar_ps(ifft2467, sfRe169);
__m512 ifft2558 = _mm512_permutexvar_ps(ifft2467, sfRe173);
__m512 ifft2469 = _mm512_permutexvar_ps(ifft2465, sfIm169);
__m512 ifft2559 = _mm512_permutexvar_ps(ifft2465, sfIm173);
__m512 ifft2470 = _mm512_permutexvar_ps(ifft2467, sfIm169);
__m512 ifft2560 = _mm512_permutexvar_ps(ifft2467, sfIm173);
// Per-lane coefficients (lanes 15..0): 0 in lanes 0, 1, 8, 9; -1 in the
// other even lanes; +1 in the other odd lanes.
__m512 ifft2471 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 (0xFDFD) has every bit set except bits 1 and 9: those two lanes
// keep the first operand unchanged; all other lanes get a*coef + c
// (fmadd) or -(a*coef) + c (fnmadd).
__m512 ifft2472 = _mm512_mask_fmadd_ps(ifft2470, 65021, ifft2471, ifft2466);
__m512 ifft2561 = _mm512_mask_fmadd_ps(ifft2560, 65021, ifft2471, ifft2557);
__m512 ifft2473 = _mm512_mask_fnmadd_ps(ifft2469, 65021, ifft2471, ifft2468);
__m512 ifft2562 = _mm512_mask_fnmadd_ps(ifft2559, 65021, ifft2471, ifft2558);
// Sum/difference of adjacent lane pairs, applied to all sixteen vectors.
// The sign vector is +1 in even lanes and -1 in odd lanes, and
// _mm512_shuffle_ps(x, x, 177) swaps the two elements of every adjacent
// pair, so each result holds x[2k] + x[2k+1] in lane 2k and
// x[2k] - x[2k+1] in lane 2k+1.
__m512 ifft2474 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2475 = _mm512_fmadd_ps(ifft2472, ifft2474, _mm512_shuffle_ps(ifft2472, ifft2472, 177));
__m512 ifft2563 = _mm512_fmadd_ps(ifft2561, ifft2474, _mm512_shuffle_ps(ifft2561, ifft2561, 177));
__m512 ifft2476 = _mm512_fmadd_ps(ifft2473, ifft2474, _mm512_shuffle_ps(ifft2473, ifft2473, 177));
__m512 ifft2564 = _mm512_fmadd_ps(ifft2562, ifft2474, _mm512_shuffle_ps(ifft2562, ifft2562, 177));
// The remaining six Re/Im pairs per chain enter here directly from the
// loads, without the permute/masked-FMA step applied to the first pair.
__m512 ifft2477 = _mm512_fmadd_ps(sfRe170, ifft2474, _mm512_shuffle_ps(sfRe170, sfRe170, 177));
__m512 ifft2565 = _mm512_fmadd_ps(sfRe174, ifft2474, _mm512_shuffle_ps(sfRe174, sfRe174, 177));
__m512 ifft2478 = _mm512_fmadd_ps(sfIm170, ifft2474, _mm512_shuffle_ps(sfIm170, sfIm170, 177));
__m512 ifft2566 = _mm512_fmadd_ps(sfIm174, ifft2474, _mm512_shuffle_ps(sfIm174, sfIm174, 177));
__m512 ifft2479 = _mm512_fmadd_ps(sfRe171, ifft2474, _mm512_shuffle_ps(sfRe171, sfRe171, 177));
__m512 ifft2567 = _mm512_fmadd_ps(sfRe175, ifft2474, _mm512_shuffle_ps(sfRe175, sfRe175, 177));
__m512 ifft2480 = _mm512_fmadd_ps(sfIm171, ifft2474, _mm512_shuffle_ps(sfIm171, sfIm171, 177));
__m512 ifft2568 = _mm512_fmadd_ps(sfIm175, ifft2474, _mm512_shuffle_ps(sfIm175, sfIm175, 177));
__m512 ifft2481 = _mm512_fmadd_ps(sfRe172, ifft2474, _mm512_shuffle_ps(sfRe172, sfRe172, 177));
__m512 ifft2569 = _mm512_fmadd_ps(sfRe176, ifft2474, _mm512_shuffle_ps(sfRe176, sfRe176, 177));
__m512 ifft2482 = _mm512_fmadd_ps(sfIm172, ifft2474, _mm512_shuffle_ps(sfIm172, sfIm172, 177));
__m512 ifft2570 = _mm512_fmadd_ps(sfIm176, ifft2474, _mm512_shuffle_ps(sfIm176, sfIm176, 177));
// Per-lane rotation of each (p, q) vector pair by constant coefficients:
//   p' = p*c - q*s,   q' = p*s + q*c
// with c = ifft2483 and s = ifft2492. The products with c are formed first;
// the s terms are folded in afterwards with fnmadd/fmadd.
// c per lane (0..7, repeated for 8..15): 1, 1, 1, 0, 1, 0.7071, 1, -0.7071.
__m512 ifft2483 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2484 = _mm512_mul_ps(ifft2475, ifft2483);
__m512 ifft2571 = _mm512_mul_ps(ifft2563, ifft2483);
__m512 ifft2485 = _mm512_mul_ps(ifft2476, ifft2483);
__m512 ifft2572 = _mm512_mul_ps(ifft2564, ifft2483);
__m512 ifft2486 = _mm512_mul_ps(ifft2477, ifft2483);
__m512 ifft2573 = _mm512_mul_ps(ifft2565, ifft2483);
__m512 ifft2487 = _mm512_mul_ps(ifft2478, ifft2483);
__m512 ifft2574 = _mm512_mul_ps(ifft2566, ifft2483);
__m512 ifft2488 = _mm512_mul_ps(ifft2479, ifft2483);
__m512 ifft2575 = _mm512_mul_ps(ifft2567, ifft2483);
__m512 ifft2489 = _mm512_mul_ps(ifft2480, ifft2483);
__m512 ifft2576 = _mm512_mul_ps(ifft2568, ifft2483);
__m512 ifft2490 = _mm512_mul_ps(ifft2481, ifft2483);
__m512 ifft2577 = _mm512_mul_ps(ifft2569, ifft2483);
__m512 ifft2491 = _mm512_mul_ps(ifft2482, ifft2483);
__m512 ifft2578 = _mm512_mul_ps(ifft2570, ifft2483);
// s per lane (0..7, repeated for 8..15): 0, 0, 0, 1, 0, 0.7071, 0, 0.7071.
__m512 ifft2492 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
// fnmadd(q, s, p*c) = p*c - q*s;  fmadd(p, s, q*c) = p*s + q*c.
__m512 ifft2493 = _mm512_fnmadd_ps(ifft2476, ifft2492, ifft2484);
__m512 ifft2579 = _mm512_fnmadd_ps(ifft2564, ifft2492, ifft2571);
__m512 ifft2494 = _mm512_fmadd_ps(ifft2475, ifft2492, ifft2485);
__m512 ifft2580 = _mm512_fmadd_ps(ifft2563, ifft2492, ifft2572);
__m512 ifft2495 = _mm512_fnmadd_ps(ifft2478, ifft2492, ifft2486);
__m512 ifft2581 = _mm512_fnmadd_ps(ifft2566, ifft2492, ifft2573);
__m512 ifft2496 = _mm512_fmadd_ps(ifft2477, ifft2492, ifft2487);
__m512 ifft2582 = _mm512_fmadd_ps(ifft2565, ifft2492, ifft2574);
__m512 ifft2497 = _mm512_fnmadd_ps(ifft2480, ifft2492, ifft2488);
__m512 ifft2583 = _mm512_fnmadd_ps(ifft2568, ifft2492, ifft2575);
__m512 ifft2498 = _mm512_fmadd_ps(ifft2479, ifft2492, ifft2489);
__m512 ifft2584 = _mm512_fmadd_ps(ifft2567, ifft2492, ifft2576);
__m512 ifft2499 = _mm512_fnmadd_ps(ifft2482, ifft2492, ifft2490);
__m512 ifft2585 = _mm512_fnmadd_ps(ifft2570, ifft2492, ifft2577);
__m512 ifft2500 = _mm512_fmadd_ps(ifft2481, ifft2492, ifft2491);
__m512 ifft2586 = _mm512_fmadd_ps(ifft2569, ifft2492, ifft2578);
// Sum/difference at a distance of two lanes inside every group of four.
// The sign vector is +1 in lanes 0-1 and -1 in lanes 2-3 of each group, and
// _mm512_shuffle_ps(x, x, 78) swaps the low and high halves of each
// 128-bit group, so lanes 0-1 receive x[0..1] + x[2..3] and lanes 2-3
// receive x[0..1] - x[2..3].
__m512 ifft2501 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2502 = _mm512_fmadd_ps(ifft2493, ifft2501, _mm512_shuffle_ps(ifft2493, ifft2493, 78));
__m512 ifft2587 = _mm512_fmadd_ps(ifft2579, ifft2501, _mm512_shuffle_ps(ifft2579, ifft2579, 78));
__m512 ifft2503 = _mm512_fmadd_ps(ifft2494, ifft2501, _mm512_shuffle_ps(ifft2494, ifft2494, 78));
__m512 ifft2588 = _mm512_fmadd_ps(ifft2580, ifft2501, _mm512_shuffle_ps(ifft2580, ifft2580, 78));
__m512 ifft2504 = _mm512_fmadd_ps(ifft2495, ifft2501, _mm512_shuffle_ps(ifft2495, ifft2495, 78));
__m512 ifft2589 = _mm512_fmadd_ps(ifft2581, ifft2501, _mm512_shuffle_ps(ifft2581, ifft2581, 78));
__m512 ifft2505 = _mm512_fmadd_ps(ifft2496, ifft2501, _mm512_shuffle_ps(ifft2496, ifft2496, 78));
__m512 ifft2590 = _mm512_fmadd_ps(ifft2582, ifft2501, _mm512_shuffle_ps(ifft2582, ifft2582, 78));
__m512 ifft2506 = _mm512_fmadd_ps(ifft2497, ifft2501, _mm512_shuffle_ps(ifft2497, ifft2497, 78));
__m512 ifft2591 = _mm512_fmadd_ps(ifft2583, ifft2501, _mm512_shuffle_ps(ifft2583, ifft2583, 78));
__m512 ifft2507 = _mm512_fmadd_ps(ifft2498, ifft2501, _mm512_shuffle_ps(ifft2498, ifft2498, 78));
__m512 ifft2592 = _mm512_fmadd_ps(ifft2584, ifft2501, _mm512_shuffle_ps(ifft2584, ifft2584, 78));
__m512 ifft2508 = _mm512_fmadd_ps(ifft2499, ifft2501, _mm512_shuffle_ps(ifft2499, ifft2499, 78));
__m512 ifft2593 = _mm512_fmadd_ps(ifft2585, ifft2501, _mm512_shuffle_ps(ifft2585, ifft2585, 78));
__m512 ifft2509 = _mm512_fmadd_ps(ifft2500, ifft2501, _mm512_shuffle_ps(ifft2500, ifft2500, 78));
__m512 ifft2594 = _mm512_fmadd_ps(ifft2586, ifft2501, _mm512_shuffle_ps(ifft2586, ifft2586, 78));
// Masked negate-and-swap on each (p, q) vector pair.
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes the pair
// becomes (0 - q, p); every other lane passes through unchanged:
//   mask_sub(p, k, 0, q) -> -q in masked lanes, p elsewhere
//   mask_mov(q, k, p)    ->  p in masked lanes, q elsewhere
__m512 ifft2510 = _mm512_mask_sub_ps(ifft2502, 49344, _mm512_setzero_ps(), ifft2503);
__m512 ifft2595 = _mm512_mask_sub_ps(ifft2587, 49344, _mm512_setzero_ps(), ifft2588);
__m512 ifft2511 = _mm512_mask_mov_ps(ifft2503, 49344, ifft2502);
__m512 ifft2596 = _mm512_mask_mov_ps(ifft2588, 49344, ifft2587);
__m512 ifft2512 = _mm512_mask_sub_ps(ifft2504, 49344, _mm512_setzero_ps(), ifft2505);
__m512 ifft2597 = _mm512_mask_sub_ps(ifft2589, 49344, _mm512_setzero_ps(), ifft2590);
__m512 ifft2513 = _mm512_mask_mov_ps(ifft2505, 49344, ifft2504);
__m512 ifft2598 = _mm512_mask_mov_ps(ifft2590, 49344, ifft2589);
__m512 ifft2514 = _mm512_mask_sub_ps(ifft2506, 49344, _mm512_setzero_ps(), ifft2507);
__m512 ifft2599 = _mm512_mask_sub_ps(ifft2591, 49344, _mm512_setzero_ps(), ifft2592);
__m512 ifft2515 = _mm512_mask_mov_ps(ifft2507, 49344, ifft2506);
__m512 ifft2600 = _mm512_mask_mov_ps(ifft2592, 49344, ifft2591);
__m512 ifft2516 = _mm512_mask_sub_ps(ifft2508, 49344, _mm512_setzero_ps(), ifft2509);
__m512 ifft2601 = _mm512_mask_sub_ps(ifft2593, 49344, _mm512_setzero_ps(), ifft2594);
__m512 ifft2517 = _mm512_mask_mov_ps(ifft2509, 49344, ifft2508);
__m512 ifft2602 = _mm512_mask_mov_ps(ifft2594, 49344, ifft2593);
// Sum/difference between neighbouring 128-bit groups.
// The sign vector is +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15, and
// _mm512_shuffle_f32x4(x, x, 177) swaps each pair of adjacent 128-bit
// groups, so within every 256-bit half the low group receives low + high
// and the high group receives low - high.
__m512 ifft2518 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2519 = _mm512_fmadd_ps(ifft2510, ifft2518, _mm512_shuffle_f32x4(ifft2510, ifft2510, 177));
__m512 ifft2603 = _mm512_fmadd_ps(ifft2595, ifft2518, _mm512_shuffle_f32x4(ifft2595, ifft2595, 177));
__m512 ifft2520 = _mm512_fmadd_ps(ifft2511, ifft2518, _mm512_shuffle_f32x4(ifft2511, ifft2511, 177));
__m512 ifft2604 = _mm512_fmadd_ps(ifft2596, ifft2518, _mm512_shuffle_f32x4(ifft2596, ifft2596, 177));
__m512 ifft2521 = _mm512_fmadd_ps(ifft2512, ifft2518, _mm512_shuffle_f32x4(ifft2512, ifft2512, 177));
__m512 ifft2605 = _mm512_fmadd_ps(ifft2597, ifft2518, _mm512_shuffle_f32x4(ifft2597, ifft2597, 177));
__m512 ifft2522 = _mm512_fmadd_ps(ifft2513, ifft2518, _mm512_shuffle_f32x4(ifft2513, ifft2513, 177));
__m512 ifft2606 = _mm512_fmadd_ps(ifft2598, ifft2518, _mm512_shuffle_f32x4(ifft2598, ifft2598, 177));
__m512 ifft2523 = _mm512_fmadd_ps(ifft2514, ifft2518, _mm512_shuffle_f32x4(ifft2514, ifft2514, 177));
__m512 ifft2607 = _mm512_fmadd_ps(ifft2599, ifft2518, _mm512_shuffle_f32x4(ifft2599, ifft2599, 177));
// These two use fnmsub, i.e. -(x*sign) - swapped: the same combination as
// the fmadd lines but with the whole result negated.
__m512 ifft2524 = _mm512_fnmsub_ps(ifft2515, ifft2518, _mm512_shuffle_f32x4(ifft2515, ifft2515, 177));
__m512 ifft2608 = _mm512_fnmsub_ps(ifft2600, ifft2518, _mm512_shuffle_f32x4(ifft2600, ifft2600, 177));
__m512 ifft2525 = _mm512_fmadd_ps(ifft2516, ifft2518, _mm512_shuffle_f32x4(ifft2516, ifft2516, 177));
__m512 ifft2609 = _mm512_fmadd_ps(ifft2601, ifft2518, _mm512_shuffle_f32x4(ifft2601, ifft2601, 177));
__m512 ifft2526 = _mm512_fmadd_ps(ifft2517, ifft2518, _mm512_shuffle_f32x4(ifft2517, ifft2517, 177));
__m512 ifft2610 = _mm512_fmadd_ps(ifft2602, ifft2518, _mm512_shuffle_f32x4(ifft2602, ifft2602, 177));
// Cross-vector combination stage: the eight vectors of each chain are
// combined with whole-vector adds/subtracts and scaled, yielding eight
// output vectors per chain. Every statement appears twice, once per chain.
// Constants: 3.125e-02 = 1/32, 1.5625e-02 = 1/64, 7.0710677e-01 ~ 1/sqrt(2).
// Intrinsic reminders: fmadd = a*b+c, fmsub = a*b-c, fnmadd = -(a*b)+c.
//
// Pairwise sums and differences of the incoming vectors.
__m512 ifft2527 = _mm512_add_ps(ifft2519, ifft2520);
__m512 ifft2611 = _mm512_add_ps(ifft2603, ifft2604);
__m512 ifft2528 = _mm512_sub_ps(ifft2519, ifft2520);
__m512 ifft2612 = _mm512_sub_ps(ifft2603, ifft2604);
__m512 ifft2529 = _mm512_sub_ps(ifft2521, ifft2525);
__m512 ifft2613 = _mm512_sub_ps(ifft2605, ifft2609);
__m512 ifft2530 = _mm512_add_ps(ifft2522, ifft2526);
__m512 ifft2614 = _mm512_add_ps(ifft2606, ifft2610);
__m512 ifft2531 = _mm512_add_ps(ifft2521, ifft2525);
__m512 ifft2615 = _mm512_add_ps(ifft2605, ifft2609);
__m512 ifft2532 = _mm512_sub_ps(ifft2522, ifft2526);
__m512 ifft2616 = _mm512_sub_ps(ifft2606, ifft2610);
// These two inputs are scaled by 1/32 up front; the sums above are scaled
// by 1/64 as they are merged with them.
__m512 ifft2533 = _mm512_mul_ps(ifft2523, _mm512_set1_ps(3.125e-02f));
__m512 ifft2617 = _mm512_mul_ps(ifft2607, _mm512_set1_ps(3.125e-02f));
__m512 ifft2534 = _mm512_mul_ps(ifft2524, _mm512_set1_ps(3.125e-02f));
__m512 ifft2618 = _mm512_mul_ps(ifft2608, _mm512_set1_ps(3.125e-02f));
__m512 ifft2535 = _mm512_fmadd_ps(ifft2527, _mm512_set1_ps(1.5625e-02f), ifft2533);
__m512 ifft2619 = _mm512_fmadd_ps(ifft2611, _mm512_set1_ps(1.5625e-02f), ifft2617);
__m512 ifft2536 = _mm512_fmsub_ps(ifft2527, _mm512_set1_ps(1.5625e-02f), ifft2533);
__m512 ifft2620 = _mm512_fmsub_ps(ifft2611, _mm512_set1_ps(1.5625e-02f), ifft2617);
__m512 ifft2537 = _mm512_fmadd_ps(ifft2528, _mm512_set1_ps(1.5625e-02f), ifft2534);
__m512 ifft2621 = _mm512_fmadd_ps(ifft2612, _mm512_set1_ps(1.5625e-02f), ifft2618);
__m512 ifft2538 = _mm512_fmsub_ps(ifft2528, _mm512_set1_ps(1.5625e-02f), ifft2534);
__m512 ifft2622 = _mm512_fmsub_ps(ifft2612, _mm512_set1_ps(1.5625e-02f), ifft2618);
// Sum/difference of the two mixed terms, then weighted by ~1/sqrt(2) and
// combined with the remaining sum/difference vectors.
__m512 ifft2539 = _mm512_add_ps(ifft2529, ifft2530);
__m512 ifft2623 = _mm512_add_ps(ifft2613, ifft2614);
__m512 ifft2540 = _mm512_sub_ps(ifft2529, ifft2530);
__m512 ifft2624 = _mm512_sub_ps(ifft2613, ifft2614);
__m512 ifft2541 = _mm512_fnmadd_ps(ifft2539, _mm512_set1_ps(7.0710677e-01f), ifft2531);
__m512 ifft2625 = _mm512_fnmadd_ps(ifft2623, _mm512_set1_ps(7.0710677e-01f), ifft2615);
__m512 ifft2542 = _mm512_fmadd_ps(ifft2539, _mm512_set1_ps(7.0710677e-01f), ifft2531);
__m512 ifft2626 = _mm512_fmadd_ps(ifft2623, _mm512_set1_ps(7.0710677e-01f), ifft2615);
__m512 ifft2543 = _mm512_fmadd_ps(ifft2540, _mm512_set1_ps(7.0710677e-01f), ifft2532);
__m512 ifft2627 = _mm512_fmadd_ps(ifft2624, _mm512_set1_ps(7.0710677e-01f), ifft2616);
__m512 ifft2544 = _mm512_fmsub_ps(ifft2540, _mm512_set1_ps(7.0710677e-01f), ifft2532);
__m512 ifft2628 = _mm512_fmsub_ps(ifft2624, _mm512_set1_ps(7.0710677e-01f), ifft2616);
// Another round of sums and differences on the weighted terms.
__m512 ifft2545 = _mm512_add_ps(ifft2541, ifft2542);
__m512 ifft2629 = _mm512_add_ps(ifft2625, ifft2626);
__m512 ifft2546 = _mm512_sub_ps(ifft2541, ifft2542);
__m512 ifft2630 = _mm512_sub_ps(ifft2625, ifft2626);
__m512 ifft2547 = _mm512_add_ps(ifft2543, ifft2544);
__m512 ifft2631 = _mm512_add_ps(ifft2627, ifft2628);
__m512 ifft2548 = _mm512_sub_ps(ifft2543, ifft2544);
__m512 ifft2632 = _mm512_sub_ps(ifft2627, ifft2628);
// Final merge: each output is (already scaled term) +/- (term * 1/64).
__m512 ifft2549 = _mm512_fmadd_ps(ifft2545, _mm512_set1_ps(1.5625e-02f), ifft2535);
__m512 ifft2633 = _mm512_fmadd_ps(ifft2629, _mm512_set1_ps(1.5625e-02f), ifft2619);
__m512 ifft2550 = _mm512_fnmadd_ps(ifft2545, _mm512_set1_ps(1.5625e-02f), ifft2535);
__m512 ifft2634 = _mm512_fnmadd_ps(ifft2629, _mm512_set1_ps(1.5625e-02f), ifft2619);
__m512 ifft2551 = _mm512_fmadd_ps(ifft2547, _mm512_set1_ps(1.5625e-02f), ifft2537);
__m512 ifft2635 = _mm512_fmadd_ps(ifft2631, _mm512_set1_ps(1.5625e-02f), ifft2621);
__m512 ifft2552 = _mm512_fnmadd_ps(ifft2547, _mm512_set1_ps(1.5625e-02f), ifft2537);
__m512 ifft2636 = _mm512_fnmadd_ps(ifft2631, _mm512_set1_ps(1.5625e-02f), ifft2621);
__m512 ifft2553 = _mm512_fnmadd_ps(ifft2548, _mm512_set1_ps(1.5625e-02f), ifft2536);
__m512 ifft2637 = _mm512_fnmadd_ps(ifft2632, _mm512_set1_ps(1.5625e-02f), ifft2620);
__m512 ifft2554 = _mm512_fmadd_ps(ifft2548, _mm512_set1_ps(1.5625e-02f), ifft2536);
__m512 ifft2638 = _mm512_fmadd_ps(ifft2632, _mm512_set1_ps(1.5625e-02f), ifft2620);
__m512 ifft2555 = _mm512_fmadd_ps(ifft2546, _mm512_set1_ps(1.5625e-02f), ifft2538);
__m512 ifft2639 = _mm512_fmadd_ps(ifft2630, _mm512_set1_ps(1.5625e-02f), ifft2622);
__m512 ifft2556 = _mm512_fnmadd_ps(ifft2546, _mm512_set1_ps(1.5625e-02f), ifft2538);
__m512 ifft2640 = _mm512_fnmadd_ps(ifft2630, _mm512_set1_ps(1.5625e-02f), ifft2622);
// Output stage for this tile: pick ten of the sixteen transform results,
// repack them, clamp at zero and write them out with masked stores.
// dat730..dat734 alias the results of one chain, dat735..dat739 the
// corresponding results of the other chain.
__m512 dat730 = ifft2549;
__m512 dat735 = ifft2633;
__m512 dat731 = ifft2551;
__m512 dat736 = ifft2635;
__m512 dat732 = ifft2553;
__m512 dat737 = ifft2637;
__m512 dat733 = ifft2555;
__m512 dat738 = ifft2639;
__m512 dat734 = ifft2550;
__m512 dat739 = ifft2634;
// The remaining six results are not used below; the (void) casts only
// reference them (presumably to avoid unused-variable warnings).
(void)ifft2552;
(void)ifft2636;
(void)ifft2554;
(void)ifft2638;
(void)ifft2556;
(void)ifft2640;
// Index tables for _mm512_permutex2var_ps(a, idx, b): bit 4 of each index
// selects the source (0 = a, 1 = b), bits 0-3 select the element.
// _mm512_set_epi32 lists lanes from 15 down to 0.
// pm25: lanes 0-4 <- a[0..4], lanes 5-9 <- b[0..4] (lanes 10-15 repeat).
__m512i pm25 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack121 = _mm512_permutex2var_ps(dat730, pm25, dat735);
// pm26: lanes 0-4 <- b[8..12], lanes 5-9 <- a[8..12] (lanes 10-15 repeat).
__m512i pm26 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack122 = _mm512_permutex2var_ps(dat730, pm26, dat735);
__m512 pack123 = _mm512_permutex2var_ps(dat731, pm25, dat736);
__m512 pack124 = _mm512_permutex2var_ps(dat731, pm26, dat736);
__m512 pack125 = _mm512_permutex2var_ps(dat732, pm25, dat737);
__m512 pack126 = _mm512_permutex2var_ps(dat732, pm26, dat737);
__m512 pack127 = _mm512_permutex2var_ps(dat733, pm25, dat738);
__m512 pack128 = _mm512_permutex2var_ps(dat733, pm26, dat738);
__m512 pack129 = _mm512_permutex2var_ps(dat734, pm25, dat739);
__m512 pack130 = _mm512_permutex2var_ps(dat734, pm26, dat739);
// Elementwise max with zero (ReLU-style clamp) on every packed vector.
pack121 = _mm512_max_ps(_mm512_setzero_ps(), pack121);
pack122 = _mm512_max_ps(_mm512_setzero_ps(), pack122);
pack123 = _mm512_max_ps(_mm512_setzero_ps(), pack123);
pack124 = _mm512_max_ps(_mm512_setzero_ps(), pack124);
pack125 = _mm512_max_ps(_mm512_setzero_ps(), pack125);
pack126 = _mm512_max_ps(_mm512_setzero_ps(), pack126);
pack127 = _mm512_max_ps(_mm512_setzero_ps(), pack127);
pack128 = _mm512_max_ps(_mm512_setzero_ps(), pack128);
pack129 = _mm512_max_ps(_mm512_setzero_ps(), pack129);
pack130 = _mm512_max_ps(_mm512_setzero_ps(), pack130);
// Mask 127 (0x7F) writes only lanes 0-6 of each vector, so only the first
// two lanes taken from the second source reach memory.
// The pm25 packs go to constant offsets 0, 448, 896, 1344, 1792 (steps of
// 448, the same coefficient that multiplies toH9); each pm26 pack goes
// 50240 further, which is half of the 100480 coefficient of r10.
// The 0*t16 term contributes nothing.
// NOTE(review): 4*toW9 looks like a byte offset of 4-byte floats -- confirm
// against datPtr2's declaration.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack121);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack122);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack123);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack124);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack125);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack126);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack127);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack128);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack129);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t16, 127, pack130);
// Load the sixteen input vectors for the next tile from sfPtr3.
// t17 is fixed at 0 here, so the 256*t17 term contributes nothing.
ptrdiff_t t17 = 0;
// Four groups at constant offsets 256, 2167040, 4333824, 6500608, i.e. 256
// past multiples of 2166784. Each group holds four vectors at +0, +64,
// +128, +192, named Re, Im, Re, Im.
// NOTE(review): the 64-unit step matches one 64-byte __m512 if sfPtr3 is a
// byte pointer, and the Re/Im names suggest real/imaginary parts -- confirm
// both against the declaration and the producer of this buffer.
__m512 sfRe177 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm177 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe181 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm181 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe178 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm178 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe182 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm182 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe179 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm179 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe183 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm183 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe180 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm180 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfRe184 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
__m512 sfIm184 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k33+768*r10+256*t17);
// Inverse-transform arithmetic on the sixteen vectors loaded from sfPtr3.
// Two independent chains run in lockstep and share every constant vector:
//   chain A: sfRe177..sfIm180 -> ifft2642..ifft2732
//   chain B: sfRe181..sfIm184 -> ifft2733..ifft2816
// Reminder: _mm512_set_epi32 / _mm512_set_ps list lane 15 first and lane 0 last.
//
// Step 1: rearrange the lanes of the first Re/Im pair of each chain with two
// fixed index vectors (ifft2641, ifft2643).
// NOTE(review): this looks like unpacking a packed real-input spectrum before
// the butterflies - confirm against the generator.
__m512i ifft2641 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2642 = _mm512_permutexvar_ps(ifft2641, sfRe177);
__m512 ifft2733 = _mm512_permutexvar_ps(ifft2641, sfRe181);
__m512i ifft2643 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2644 = _mm512_permutexvar_ps(ifft2643, sfRe177);
__m512 ifft2734 = _mm512_permutexvar_ps(ifft2643, sfRe181);
__m512 ifft2645 = _mm512_permutexvar_ps(ifft2641, sfIm177);
__m512 ifft2735 = _mm512_permutexvar_ps(ifft2641, sfIm181);
__m512 ifft2646 = _mm512_permutexvar_ps(ifft2643, sfIm177);
__m512 ifft2736 = _mm512_permutexvar_ps(ifft2643, sfIm181);
// Step 2: masked multiply-add with a 0 / +1 / -1 coefficient vector.
// Mask 65021 (0xFDFD) covers every lane except 1 and 9; those two lanes pass the
// first operand through unchanged.
//   fmadd : first*coef + third      fnmadd: -(first*coef) + third
__m512 ifft2647 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2648 = _mm512_mask_fmadd_ps(ifft2646, 65021, ifft2647, ifft2642);
__m512 ifft2737 = _mm512_mask_fmadd_ps(ifft2736, 65021, ifft2647, ifft2733);
__m512 ifft2649 = _mm512_mask_fnmadd_ps(ifft2645, 65021, ifft2647, ifft2644);
__m512 ifft2738 = _mm512_mask_fnmadd_ps(ifft2735, 65021, ifft2647, ifft2734);
// Step 3: butterfly on adjacent lanes. shuffle_ps with 177 swaps each lane with
// its neighbour; with signs (+1 even lane, -1 odd lane) the even lane becomes
// x[even]+x[odd] and the odd lane becomes x[even]-x[odd].
__m512 ifft2650 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2651 = _mm512_fmadd_ps(ifft2648, ifft2650, _mm512_shuffle_ps(ifft2648, ifft2648, 177));
__m512 ifft2739 = _mm512_fmadd_ps(ifft2737, ifft2650, _mm512_shuffle_ps(ifft2737, ifft2737, 177));
__m512 ifft2652 = _mm512_fmadd_ps(ifft2649, ifft2650, _mm512_shuffle_ps(ifft2649, ifft2649, 177));
__m512 ifft2740 = _mm512_fmadd_ps(ifft2738, ifft2650, _mm512_shuffle_ps(ifft2738, ifft2738, 177));
__m512 ifft2653 = _mm512_fmadd_ps(sfRe178, ifft2650, _mm512_shuffle_ps(sfRe178, sfRe178, 177));
__m512 ifft2741 = _mm512_fmadd_ps(sfRe182, ifft2650, _mm512_shuffle_ps(sfRe182, sfRe182, 177));
__m512 ifft2654 = _mm512_fmadd_ps(sfIm178, ifft2650, _mm512_shuffle_ps(sfIm178, sfIm178, 177));
__m512 ifft2742 = _mm512_fmadd_ps(sfIm182, ifft2650, _mm512_shuffle_ps(sfIm182, sfIm182, 177));
__m512 ifft2655 = _mm512_fmadd_ps(sfRe179, ifft2650, _mm512_shuffle_ps(sfRe179, sfRe179, 177));
__m512 ifft2743 = _mm512_fmadd_ps(sfRe183, ifft2650, _mm512_shuffle_ps(sfRe183, sfRe183, 177));
__m512 ifft2656 = _mm512_fmadd_ps(sfIm179, ifft2650, _mm512_shuffle_ps(sfIm179, sfIm179, 177));
__m512 ifft2744 = _mm512_fmadd_ps(sfIm183, ifft2650, _mm512_shuffle_ps(sfIm183, sfIm183, 177));
__m512 ifft2657 = _mm512_fmadd_ps(sfRe180, ifft2650, _mm512_shuffle_ps(sfRe180, sfRe180, 177));
__m512 ifft2745 = _mm512_fmadd_ps(sfRe184, ifft2650, _mm512_shuffle_ps(sfRe184, sfRe184, 177));
__m512 ifft2658 = _mm512_fmadd_ps(sfIm180, ifft2650, _mm512_shuffle_ps(sfIm180, sfIm180, 177));
__m512 ifft2746 = _mm512_fmadd_ps(sfIm184, ifft2650, _mm512_shuffle_ps(sfIm184, sfIm184, 177));
// Step 4: per-lane complex rotation of each (re, im) vector pair, done in two
// passes. This pass multiplies by the "c" vector ifft2659; the next pass applies
// the "s" vector ifft2668:
//   re' = re*c - im*s      im' = im*c + re*s
// Lanes where c=1 and s=0 are left unchanged.
__m512 ifft2659 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2660 = _mm512_mul_ps(ifft2651, ifft2659);
__m512 ifft2747 = _mm512_mul_ps(ifft2739, ifft2659);
__m512 ifft2661 = _mm512_mul_ps(ifft2652, ifft2659);
__m512 ifft2748 = _mm512_mul_ps(ifft2740, ifft2659);
__m512 ifft2662 = _mm512_mul_ps(ifft2653, ifft2659);
__m512 ifft2749 = _mm512_mul_ps(ifft2741, ifft2659);
__m512 ifft2663 = _mm512_mul_ps(ifft2654, ifft2659);
__m512 ifft2750 = _mm512_mul_ps(ifft2742, ifft2659);
__m512 ifft2664 = _mm512_mul_ps(ifft2655, ifft2659);
__m512 ifft2751 = _mm512_mul_ps(ifft2743, ifft2659);
__m512 ifft2665 = _mm512_mul_ps(ifft2656, ifft2659);
__m512 ifft2752 = _mm512_mul_ps(ifft2744, ifft2659);
__m512 ifft2666 = _mm512_mul_ps(ifft2657, ifft2659);
__m512 ifft2753 = _mm512_mul_ps(ifft2745, ifft2659);
__m512 ifft2667 = _mm512_mul_ps(ifft2658, ifft2659);
__m512 ifft2754 = _mm512_mul_ps(ifft2746, ifft2659);
// Second pass of the rotation: fnmadd subtracts im*s from re*c, fmadd adds re*s
// to im*c.
__m512 ifft2668 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2669 = _mm512_fnmadd_ps(ifft2652, ifft2668, ifft2660);
__m512 ifft2755 = _mm512_fnmadd_ps(ifft2740, ifft2668, ifft2747);
__m512 ifft2670 = _mm512_fmadd_ps(ifft2651, ifft2668, ifft2661);
__m512 ifft2756 = _mm512_fmadd_ps(ifft2739, ifft2668, ifft2748);
__m512 ifft2671 = _mm512_fnmadd_ps(ifft2654, ifft2668, ifft2662);
__m512 ifft2757 = _mm512_fnmadd_ps(ifft2742, ifft2668, ifft2749);
__m512 ifft2672 = _mm512_fmadd_ps(ifft2653, ifft2668, ifft2663);
__m512 ifft2758 = _mm512_fmadd_ps(ifft2741, ifft2668, ifft2750);
__m512 ifft2673 = _mm512_fnmadd_ps(ifft2656, ifft2668, ifft2664);
__m512 ifft2759 = _mm512_fnmadd_ps(ifft2744, ifft2668, ifft2751);
__m512 ifft2674 = _mm512_fmadd_ps(ifft2655, ifft2668, ifft2665);
__m512 ifft2760 = _mm512_fmadd_ps(ifft2743, ifft2668, ifft2752);
__m512 ifft2675 = _mm512_fnmadd_ps(ifft2658, ifft2668, ifft2666);
__m512 ifft2761 = _mm512_fnmadd_ps(ifft2746, ifft2668, ifft2753);
__m512 ifft2676 = _mm512_fmadd_ps(ifft2657, ifft2668, ifft2667);
__m512 ifft2762 = _mm512_fmadd_ps(ifft2745, ifft2668, ifft2754);
// Step 5: butterfly at lane distance 2. shuffle_ps with 78 swaps the two 64-bit
// halves of every 128-bit group; the sign vector gives sums in the low half and
// differences in the high half.
__m512 ifft2677 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2678 = _mm512_fmadd_ps(ifft2669, ifft2677, _mm512_shuffle_ps(ifft2669, ifft2669, 78));
__m512 ifft2763 = _mm512_fmadd_ps(ifft2755, ifft2677, _mm512_shuffle_ps(ifft2755, ifft2755, 78));
__m512 ifft2679 = _mm512_fmadd_ps(ifft2670, ifft2677, _mm512_shuffle_ps(ifft2670, ifft2670, 78));
__m512 ifft2764 = _mm512_fmadd_ps(ifft2756, ifft2677, _mm512_shuffle_ps(ifft2756, ifft2756, 78));
__m512 ifft2680 = _mm512_fmadd_ps(ifft2671, ifft2677, _mm512_shuffle_ps(ifft2671, ifft2671, 78));
__m512 ifft2765 = _mm512_fmadd_ps(ifft2757, ifft2677, _mm512_shuffle_ps(ifft2757, ifft2757, 78));
__m512 ifft2681 = _mm512_fmadd_ps(ifft2672, ifft2677, _mm512_shuffle_ps(ifft2672, ifft2672, 78));
__m512 ifft2766 = _mm512_fmadd_ps(ifft2758, ifft2677, _mm512_shuffle_ps(ifft2758, ifft2758, 78));
__m512 ifft2682 = _mm512_fmadd_ps(ifft2673, ifft2677, _mm512_shuffle_ps(ifft2673, ifft2673, 78));
__m512 ifft2767 = _mm512_fmadd_ps(ifft2759, ifft2677, _mm512_shuffle_ps(ifft2759, ifft2759, 78));
__m512 ifft2683 = _mm512_fmadd_ps(ifft2674, ifft2677, _mm512_shuffle_ps(ifft2674, ifft2674, 78));
__m512 ifft2768 = _mm512_fmadd_ps(ifft2760, ifft2677, _mm512_shuffle_ps(ifft2760, ifft2760, 78));
__m512 ifft2684 = _mm512_fmadd_ps(ifft2675, ifft2677, _mm512_shuffle_ps(ifft2675, ifft2675, 78));
__m512 ifft2769 = _mm512_fmadd_ps(ifft2761, ifft2677, _mm512_shuffle_ps(ifft2761, ifft2761, 78));
__m512 ifft2685 = _mm512_fmadd_ps(ifft2676, ifft2677, _mm512_shuffle_ps(ifft2676, ifft2676, 78));
__m512 ifft2770 = _mm512_fmadd_ps(ifft2762, ifft2677, _mm512_shuffle_ps(ifft2762, ifft2762, 78));
// Step 6: in lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0), replace the pair
// (a, b) with (-b, a): mask_sub writes 0-b over a, mask_mov writes the old a
// over b. All other lanes are unchanged.
__m512 ifft2686 = _mm512_mask_sub_ps(ifft2678, 49344, _mm512_setzero_ps(), ifft2679);
__m512 ifft2771 = _mm512_mask_sub_ps(ifft2763, 49344, _mm512_setzero_ps(), ifft2764);
__m512 ifft2687 = _mm512_mask_mov_ps(ifft2679, 49344, ifft2678);
__m512 ifft2772 = _mm512_mask_mov_ps(ifft2764, 49344, ifft2763);
__m512 ifft2688 = _mm512_mask_sub_ps(ifft2680, 49344, _mm512_setzero_ps(), ifft2681);
__m512 ifft2773 = _mm512_mask_sub_ps(ifft2765, 49344, _mm512_setzero_ps(), ifft2766);
__m512 ifft2689 = _mm512_mask_mov_ps(ifft2681, 49344, ifft2680);
__m512 ifft2774 = _mm512_mask_mov_ps(ifft2766, 49344, ifft2765);
__m512 ifft2690 = _mm512_mask_sub_ps(ifft2682, 49344, _mm512_setzero_ps(), ifft2683);
__m512 ifft2775 = _mm512_mask_sub_ps(ifft2767, 49344, _mm512_setzero_ps(), ifft2768);
__m512 ifft2691 = _mm512_mask_mov_ps(ifft2683, 49344, ifft2682);
__m512 ifft2776 = _mm512_mask_mov_ps(ifft2768, 49344, ifft2767);
__m512 ifft2692 = _mm512_mask_sub_ps(ifft2684, 49344, _mm512_setzero_ps(), ifft2685);
__m512 ifft2777 = _mm512_mask_sub_ps(ifft2769, 49344, _mm512_setzero_ps(), ifft2770);
__m512 ifft2693 = _mm512_mask_mov_ps(ifft2685, 49344, ifft2684);
__m512 ifft2778 = _mm512_mask_mov_ps(ifft2770, 49344, ifft2769);
// Step 7: butterfly at lane distance 4. shuffle_f32x4 with 177 swaps adjacent
// 128-bit groups. ifft2700 / ifft2784 deliberately use fnmsub, which yields the
// negated result -(x*sign) - swapped, unlike the fmadd used for the others.
__m512 ifft2694 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2695 = _mm512_fmadd_ps(ifft2686, ifft2694, _mm512_shuffle_f32x4(ifft2686, ifft2686, 177));
__m512 ifft2779 = _mm512_fmadd_ps(ifft2771, ifft2694, _mm512_shuffle_f32x4(ifft2771, ifft2771, 177));
__m512 ifft2696 = _mm512_fmadd_ps(ifft2687, ifft2694, _mm512_shuffle_f32x4(ifft2687, ifft2687, 177));
__m512 ifft2780 = _mm512_fmadd_ps(ifft2772, ifft2694, _mm512_shuffle_f32x4(ifft2772, ifft2772, 177));
__m512 ifft2697 = _mm512_fmadd_ps(ifft2688, ifft2694, _mm512_shuffle_f32x4(ifft2688, ifft2688, 177));
__m512 ifft2781 = _mm512_fmadd_ps(ifft2773, ifft2694, _mm512_shuffle_f32x4(ifft2773, ifft2773, 177));
__m512 ifft2698 = _mm512_fmadd_ps(ifft2689, ifft2694, _mm512_shuffle_f32x4(ifft2689, ifft2689, 177));
__m512 ifft2782 = _mm512_fmadd_ps(ifft2774, ifft2694, _mm512_shuffle_f32x4(ifft2774, ifft2774, 177));
__m512 ifft2699 = _mm512_fmadd_ps(ifft2690, ifft2694, _mm512_shuffle_f32x4(ifft2690, ifft2690, 177));
__m512 ifft2783 = _mm512_fmadd_ps(ifft2775, ifft2694, _mm512_shuffle_f32x4(ifft2775, ifft2775, 177));
__m512 ifft2700 = _mm512_fnmsub_ps(ifft2691, ifft2694, _mm512_shuffle_f32x4(ifft2691, ifft2691, 177));
__m512 ifft2784 = _mm512_fnmsub_ps(ifft2776, ifft2694, _mm512_shuffle_f32x4(ifft2776, ifft2776, 177));
__m512 ifft2701 = _mm512_fmadd_ps(ifft2692, ifft2694, _mm512_shuffle_f32x4(ifft2692, ifft2692, 177));
__m512 ifft2785 = _mm512_fmadd_ps(ifft2777, ifft2694, _mm512_shuffle_f32x4(ifft2777, ifft2777, 177));
__m512 ifft2702 = _mm512_fmadd_ps(ifft2693, ifft2694, _mm512_shuffle_f32x4(ifft2693, ifft2693, 177));
__m512 ifft2786 = _mm512_fmadd_ps(ifft2778, ifft2694, _mm512_shuffle_f32x4(ifft2778, ifft2778, 177));
// Step 8: combine whole vectors with each other (no more lane shuffles). The
// eight vectors of each chain are added/subtracted in pairs; the results feed
// the scaled accumulation below.
__m512 ifft2703 = _mm512_add_ps(ifft2695, ifft2696);
__m512 ifft2787 = _mm512_add_ps(ifft2779, ifft2780);
__m512 ifft2704 = _mm512_sub_ps(ifft2695, ifft2696);
__m512 ifft2788 = _mm512_sub_ps(ifft2779, ifft2780);
__m512 ifft2705 = _mm512_sub_ps(ifft2697, ifft2701);
__m512 ifft2789 = _mm512_sub_ps(ifft2781, ifft2785);
__m512 ifft2706 = _mm512_add_ps(ifft2698, ifft2702);
__m512 ifft2790 = _mm512_add_ps(ifft2782, ifft2786);
__m512 ifft2707 = _mm512_add_ps(ifft2697, ifft2701);
__m512 ifft2791 = _mm512_add_ps(ifft2781, ifft2785);
__m512 ifft2708 = _mm512_sub_ps(ifft2698, ifft2702);
__m512 ifft2792 = _mm512_sub_ps(ifft2782, ifft2786);
// Scaling is folded into the arithmetic: 3.125e-02 is 1/32, 1.5625e-02 is 1/64.
// NOTE(review): presumably the inverse-transform normalisation - confirm.
__m512 ifft2709 = _mm512_mul_ps(ifft2699, _mm512_set1_ps(3.125e-02f));
__m512 ifft2793 = _mm512_mul_ps(ifft2783, _mm512_set1_ps(3.125e-02f));
__m512 ifft2710 = _mm512_mul_ps(ifft2700, _mm512_set1_ps(3.125e-02f));
__m512 ifft2794 = _mm512_mul_ps(ifft2784, _mm512_set1_ps(3.125e-02f));
__m512 ifft2711 = _mm512_fmadd_ps(ifft2703, _mm512_set1_ps(1.5625e-02f), ifft2709);
__m512 ifft2795 = _mm512_fmadd_ps(ifft2787, _mm512_set1_ps(1.5625e-02f), ifft2793);
__m512 ifft2712 = _mm512_fmsub_ps(ifft2703, _mm512_set1_ps(1.5625e-02f), ifft2709);
__m512 ifft2796 = _mm512_fmsub_ps(ifft2787, _mm512_set1_ps(1.5625e-02f), ifft2793);
__m512 ifft2713 = _mm512_fmadd_ps(ifft2704, _mm512_set1_ps(1.5625e-02f), ifft2710);
__m512 ifft2797 = _mm512_fmadd_ps(ifft2788, _mm512_set1_ps(1.5625e-02f), ifft2794);
__m512 ifft2714 = _mm512_fmsub_ps(ifft2704, _mm512_set1_ps(1.5625e-02f), ifft2710);
__m512 ifft2798 = _mm512_fmsub_ps(ifft2788, _mm512_set1_ps(1.5625e-02f), ifft2794);
__m512 ifft2715 = _mm512_add_ps(ifft2705, ifft2706);
__m512 ifft2799 = _mm512_add_ps(ifft2789, ifft2790);
__m512 ifft2716 = _mm512_sub_ps(ifft2705, ifft2706);
__m512 ifft2800 = _mm512_sub_ps(ifft2789, ifft2790);
// The 0.70710677 factor is applied to the sum/difference terms before they are
// merged with ifft2707/ifft2708 (chain A) and ifft2791/ifft2792 (chain B).
__m512 ifft2717 = _mm512_fnmadd_ps(ifft2715, _mm512_set1_ps(7.0710677e-01f), ifft2707);
__m512 ifft2801 = _mm512_fnmadd_ps(ifft2799, _mm512_set1_ps(7.0710677e-01f), ifft2791);
__m512 ifft2718 = _mm512_fmadd_ps(ifft2715, _mm512_set1_ps(7.0710677e-01f), ifft2707);
__m512 ifft2802 = _mm512_fmadd_ps(ifft2799, _mm512_set1_ps(7.0710677e-01f), ifft2791);
__m512 ifft2719 = _mm512_fmadd_ps(ifft2716, _mm512_set1_ps(7.0710677e-01f), ifft2708);
__m512 ifft2803 = _mm512_fmadd_ps(ifft2800, _mm512_set1_ps(7.0710677e-01f), ifft2792);
__m512 ifft2720 = _mm512_fmsub_ps(ifft2716, _mm512_set1_ps(7.0710677e-01f), ifft2708);
__m512 ifft2804 = _mm512_fmsub_ps(ifft2800, _mm512_set1_ps(7.0710677e-01f), ifft2792);
__m512 ifft2721 = _mm512_add_ps(ifft2717, ifft2718);
__m512 ifft2805 = _mm512_add_ps(ifft2801, ifft2802);
__m512 ifft2722 = _mm512_sub_ps(ifft2717, ifft2718);
__m512 ifft2806 = _mm512_sub_ps(ifft2801, ifft2802);
__m512 ifft2723 = _mm512_add_ps(ifft2719, ifft2720);
__m512 ifft2807 = _mm512_add_ps(ifft2803, ifft2804);
__m512 ifft2724 = _mm512_sub_ps(ifft2719, ifft2720);
__m512 ifft2808 = _mm512_sub_ps(ifft2803, ifft2804);
// Final merge: eight result vectors per chain, each of the form
// (earlier scaled term) +/- (1/64 * merged term).
__m512 ifft2725 = _mm512_fmadd_ps(ifft2721, _mm512_set1_ps(1.5625e-02f), ifft2711);
__m512 ifft2809 = _mm512_fmadd_ps(ifft2805, _mm512_set1_ps(1.5625e-02f), ifft2795);
__m512 ifft2726 = _mm512_fnmadd_ps(ifft2721, _mm512_set1_ps(1.5625e-02f), ifft2711);
__m512 ifft2810 = _mm512_fnmadd_ps(ifft2805, _mm512_set1_ps(1.5625e-02f), ifft2795);
__m512 ifft2727 = _mm512_fmadd_ps(ifft2723, _mm512_set1_ps(1.5625e-02f), ifft2713);
__m512 ifft2811 = _mm512_fmadd_ps(ifft2807, _mm512_set1_ps(1.5625e-02f), ifft2797);
__m512 ifft2728 = _mm512_fnmadd_ps(ifft2723, _mm512_set1_ps(1.5625e-02f), ifft2713);
__m512 ifft2812 = _mm512_fnmadd_ps(ifft2807, _mm512_set1_ps(1.5625e-02f), ifft2797);
__m512 ifft2729 = _mm512_fnmadd_ps(ifft2724, _mm512_set1_ps(1.5625e-02f), ifft2712);
__m512 ifft2813 = _mm512_fnmadd_ps(ifft2808, _mm512_set1_ps(1.5625e-02f), ifft2796);
__m512 ifft2730 = _mm512_fmadd_ps(ifft2724, _mm512_set1_ps(1.5625e-02f), ifft2712);
__m512 ifft2814 = _mm512_fmadd_ps(ifft2808, _mm512_set1_ps(1.5625e-02f), ifft2796);
__m512 ifft2731 = _mm512_fmadd_ps(ifft2722, _mm512_set1_ps(1.5625e-02f), ifft2714);
__m512 ifft2815 = _mm512_fmadd_ps(ifft2806, _mm512_set1_ps(1.5625e-02f), ifft2798);
__m512 ifft2732 = _mm512_fnmadd_ps(ifft2722, _mm512_set1_ps(1.5625e-02f), ifft2714);
__m512 ifft2816 = _mm512_fnmadd_ps(ifft2806, _mm512_set1_ps(1.5625e-02f), ifft2798);
// Only five of the eight result vectors per chain are kept (dat740..dat744 from
// chain A, dat745..dat749 from chain B).
__m512 dat740 = ifft2725;
__m512 dat745 = ifft2809;
__m512 dat741 = ifft2727;
__m512 dat746 = ifft2811;
__m512 dat742 = ifft2729;
__m512 dat747 = ifft2813;
__m512 dat743 = ifft2731;
__m512 dat748 = ifft2815;
__m512 dat744 = ifft2726;
__m512 dat749 = ifft2810;
// The remaining three per chain are not used in this span; the void casts mark
// them as intentionally unused.
(void)ifft2728;
(void)ifft2812;
(void)ifft2730;
(void)ifft2814;
(void)ifft2732;
(void)ifft2816;
// Pack, clamp and store the ten result vectors dat740..dat749.
// In _mm512_permutex2var_ps(a, idx, b), index values 0..15 pick from a and
// 16..31 pick from b.
// pm27: lanes 0-4 = a[0..4], lanes 5-9 = b[0..4].
// pm28: lanes 0-4 = b[8..12], lanes 5-9 = a[8..12].
// Lanes 10-15 hold repeats and are never stored (see the store mask below).
__m512i pm27 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack131 = _mm512_permutex2var_ps(dat740, pm27, dat745);
__m512i pm28 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack132 = _mm512_permutex2var_ps(dat740, pm28, dat745);
__m512 pack133 = _mm512_permutex2var_ps(dat741, pm27, dat746);
__m512 pack134 = _mm512_permutex2var_ps(dat741, pm28, dat746);
__m512 pack135 = _mm512_permutex2var_ps(dat742, pm27, dat747);
__m512 pack136 = _mm512_permutex2var_ps(dat742, pm28, dat747);
__m512 pack137 = _mm512_permutex2var_ps(dat743, pm27, dat748);
__m512 pack138 = _mm512_permutex2var_ps(dat743, pm28, dat748);
__m512 pack139 = _mm512_permutex2var_ps(dat744, pm27, dat749);
__m512 pack140 = _mm512_permutex2var_ps(dat744, pm28, dat749);
// Lanewise max against zero: every negative lane becomes 0.
// NOTE(review): presumably the ReLU activation fused into this kernel - confirm.
pack131 = _mm512_max_ps(_mm512_setzero_ps(), pack131);
pack132 = _mm512_max_ps(_mm512_setzero_ps(), pack132);
pack133 = _mm512_max_ps(_mm512_setzero_ps(), pack133);
pack134 = _mm512_max_ps(_mm512_setzero_ps(), pack134);
pack135 = _mm512_max_ps(_mm512_setzero_ps(), pack135);
pack136 = _mm512_max_ps(_mm512_setzero_ps(), pack136);
pack137 = _mm512_max_ps(_mm512_setzero_ps(), pack137);
pack138 = _mm512_max_ps(_mm512_setzero_ps(), pack138);
pack139 = _mm512_max_ps(_mm512_setzero_ps(), pack139);
pack140 = _mm512_max_ps(_mm512_setzero_ps(), pack140);
// Masked unaligned stores: mask 1023 (0x3FF) writes only the low 10 lanes.
// pm27 packs go to offsets 1820, 2268, 2716, 3164, 3612 (step 448); each pm28
// pack goes 50240 past its pm27 partner. The 0*t17 term contributes nothing.
// NOTE(review): offsets look like byte offsets (datPtr2 is declared outside this
// span) - confirm the pointer type.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack131);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack132);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack133);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack134);
_mm512_mask_storeu_ps(datPtr2+2716+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack135);
_mm512_mask_storeu_ps(datPtr2+52956+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack136);
_mm512_mask_storeu_ps(datPtr2+3164+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack137);
_mm512_mask_storeu_ps(datPtr2+53404+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack138);
_mm512_mask_storeu_ps(datPtr2+3612+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack139);
_mm512_mask_storeu_ps(datPtr2+53852+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+0*t17, 1023, pack140);
// Load the sixteen input vectors for the next tile from sfPtr3.
// t18 is fixed at 0, so the 256*t18 term in every address is zero.
// The loads form four groups whose base offsets (512, 2167296, 4334080, 6500864)
// are 2166784 apart. Within a group the four vectors sit 64 apart, in the order
// Re, Im, Re, Im.
// NOTE(review): the sfRe*/sfIm* names suggest real and imaginary parts of
// frequency-domain data, and the offsets look like byte offsets - confirm against
// the sfPtr3 declaration, which is outside this span.
ptrdiff_t t18 = 0;
__m512 sfRe185 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm185 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe189 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm189 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe186 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm186 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe190 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm190 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe187 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm187 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe191 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm191 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe188 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm188 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfRe192 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
__m512 sfIm192 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k33+768*r10+256*t18);
// Inverse-transform arithmetic on the sixteen vectors loaded from sfPtr3.
// Two independent chains run in lockstep and share every constant vector:
//   chain A: sfRe185..sfIm188 -> ifft2818..ifft2908
//   chain B: sfRe189..sfIm192 -> ifft2909..ifft2992
// Reminder: _mm512_set_epi32 / _mm512_set_ps list lane 15 first and lane 0 last.
//
// Step 1: rearrange the lanes of the first Re/Im pair of each chain with two
// fixed index vectors (ifft2817, ifft2819).
// NOTE(review): this looks like unpacking a packed real-input spectrum before
// the butterflies - confirm against the generator.
__m512i ifft2817 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2818 = _mm512_permutexvar_ps(ifft2817, sfRe185);
__m512 ifft2909 = _mm512_permutexvar_ps(ifft2817, sfRe189);
__m512i ifft2819 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2820 = _mm512_permutexvar_ps(ifft2819, sfRe185);
__m512 ifft2910 = _mm512_permutexvar_ps(ifft2819, sfRe189);
__m512 ifft2821 = _mm512_permutexvar_ps(ifft2817, sfIm185);
__m512 ifft2911 = _mm512_permutexvar_ps(ifft2817, sfIm189);
__m512 ifft2822 = _mm512_permutexvar_ps(ifft2819, sfIm185);
__m512 ifft2912 = _mm512_permutexvar_ps(ifft2819, sfIm189);
// Step 2: masked multiply-add with a 0 / +1 / -1 coefficient vector.
// Mask 65021 (0xFDFD) covers every lane except 1 and 9; those two lanes pass the
// first operand through unchanged.
//   fmadd : first*coef + third      fnmadd: -(first*coef) + third
__m512 ifft2823 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft2824 = _mm512_mask_fmadd_ps(ifft2822, 65021, ifft2823, ifft2818);
__m512 ifft2913 = _mm512_mask_fmadd_ps(ifft2912, 65021, ifft2823, ifft2909);
__m512 ifft2825 = _mm512_mask_fnmadd_ps(ifft2821, 65021, ifft2823, ifft2820);
__m512 ifft2914 = _mm512_mask_fnmadd_ps(ifft2911, 65021, ifft2823, ifft2910);
// Step 3: butterfly on adjacent lanes. shuffle_ps with 177 swaps each lane with
// its neighbour; with signs (+1 even lane, -1 odd lane) the even lane becomes
// x[even]+x[odd] and the odd lane becomes x[even]-x[odd].
__m512 ifft2826 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft2827 = _mm512_fmadd_ps(ifft2824, ifft2826, _mm512_shuffle_ps(ifft2824, ifft2824, 177));
__m512 ifft2915 = _mm512_fmadd_ps(ifft2913, ifft2826, _mm512_shuffle_ps(ifft2913, ifft2913, 177));
__m512 ifft2828 = _mm512_fmadd_ps(ifft2825, ifft2826, _mm512_shuffle_ps(ifft2825, ifft2825, 177));
__m512 ifft2916 = _mm512_fmadd_ps(ifft2914, ifft2826, _mm512_shuffle_ps(ifft2914, ifft2914, 177));
__m512 ifft2829 = _mm512_fmadd_ps(sfRe186, ifft2826, _mm512_shuffle_ps(sfRe186, sfRe186, 177));
__m512 ifft2917 = _mm512_fmadd_ps(sfRe190, ifft2826, _mm512_shuffle_ps(sfRe190, sfRe190, 177));
__m512 ifft2830 = _mm512_fmadd_ps(sfIm186, ifft2826, _mm512_shuffle_ps(sfIm186, sfIm186, 177));
__m512 ifft2918 = _mm512_fmadd_ps(sfIm190, ifft2826, _mm512_shuffle_ps(sfIm190, sfIm190, 177));
__m512 ifft2831 = _mm512_fmadd_ps(sfRe187, ifft2826, _mm512_shuffle_ps(sfRe187, sfRe187, 177));
__m512 ifft2919 = _mm512_fmadd_ps(sfRe191, ifft2826, _mm512_shuffle_ps(sfRe191, sfRe191, 177));
__m512 ifft2832 = _mm512_fmadd_ps(sfIm187, ifft2826, _mm512_shuffle_ps(sfIm187, sfIm187, 177));
__m512 ifft2920 = _mm512_fmadd_ps(sfIm191, ifft2826, _mm512_shuffle_ps(sfIm191, sfIm191, 177));
__m512 ifft2833 = _mm512_fmadd_ps(sfRe188, ifft2826, _mm512_shuffle_ps(sfRe188, sfRe188, 177));
__m512 ifft2921 = _mm512_fmadd_ps(sfRe192, ifft2826, _mm512_shuffle_ps(sfRe192, sfRe192, 177));
__m512 ifft2834 = _mm512_fmadd_ps(sfIm188, ifft2826, _mm512_shuffle_ps(sfIm188, sfIm188, 177));
__m512 ifft2922 = _mm512_fmadd_ps(sfIm192, ifft2826, _mm512_shuffle_ps(sfIm192, sfIm192, 177));
// Step 4: per-lane complex rotation of each (re, im) vector pair, done in two
// passes. This pass multiplies by the "c" vector ifft2835; the next pass applies
// the "s" vector ifft2844:
//   re' = re*c - im*s      im' = im*c + re*s
// Lanes where c=1 and s=0 are left unchanged.
__m512 ifft2835 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft2836 = _mm512_mul_ps(ifft2827, ifft2835);
__m512 ifft2923 = _mm512_mul_ps(ifft2915, ifft2835);
__m512 ifft2837 = _mm512_mul_ps(ifft2828, ifft2835);
__m512 ifft2924 = _mm512_mul_ps(ifft2916, ifft2835);
__m512 ifft2838 = _mm512_mul_ps(ifft2829, ifft2835);
__m512 ifft2925 = _mm512_mul_ps(ifft2917, ifft2835);
__m512 ifft2839 = _mm512_mul_ps(ifft2830, ifft2835);
__m512 ifft2926 = _mm512_mul_ps(ifft2918, ifft2835);
__m512 ifft2840 = _mm512_mul_ps(ifft2831, ifft2835);
__m512 ifft2927 = _mm512_mul_ps(ifft2919, ifft2835);
__m512 ifft2841 = _mm512_mul_ps(ifft2832, ifft2835);
__m512 ifft2928 = _mm512_mul_ps(ifft2920, ifft2835);
__m512 ifft2842 = _mm512_mul_ps(ifft2833, ifft2835);
__m512 ifft2929 = _mm512_mul_ps(ifft2921, ifft2835);
__m512 ifft2843 = _mm512_mul_ps(ifft2834, ifft2835);
__m512 ifft2930 = _mm512_mul_ps(ifft2922, ifft2835);
// Second pass of the rotation: fnmadd subtracts im*s from re*c, fmadd adds re*s
// to im*c.
__m512 ifft2844 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft2845 = _mm512_fnmadd_ps(ifft2828, ifft2844, ifft2836);
__m512 ifft2931 = _mm512_fnmadd_ps(ifft2916, ifft2844, ifft2923);
__m512 ifft2846 = _mm512_fmadd_ps(ifft2827, ifft2844, ifft2837);
__m512 ifft2932 = _mm512_fmadd_ps(ifft2915, ifft2844, ifft2924);
__m512 ifft2847 = _mm512_fnmadd_ps(ifft2830, ifft2844, ifft2838);
__m512 ifft2933 = _mm512_fnmadd_ps(ifft2918, ifft2844, ifft2925);
__m512 ifft2848 = _mm512_fmadd_ps(ifft2829, ifft2844, ifft2839);
__m512 ifft2934 = _mm512_fmadd_ps(ifft2917, ifft2844, ifft2926);
__m512 ifft2849 = _mm512_fnmadd_ps(ifft2832, ifft2844, ifft2840);
__m512 ifft2935 = _mm512_fnmadd_ps(ifft2920, ifft2844, ifft2927);
__m512 ifft2850 = _mm512_fmadd_ps(ifft2831, ifft2844, ifft2841);
__m512 ifft2936 = _mm512_fmadd_ps(ifft2919, ifft2844, ifft2928);
__m512 ifft2851 = _mm512_fnmadd_ps(ifft2834, ifft2844, ifft2842);
__m512 ifft2937 = _mm512_fnmadd_ps(ifft2922, ifft2844, ifft2929);
__m512 ifft2852 = _mm512_fmadd_ps(ifft2833, ifft2844, ifft2843);
__m512 ifft2938 = _mm512_fmadd_ps(ifft2921, ifft2844, ifft2930);
// Step 5: butterfly at lane distance 2. shuffle_ps with 78 swaps the two 64-bit
// halves of every 128-bit group; the sign vector gives sums in the low half and
// differences in the high half.
__m512 ifft2853 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft2854 = _mm512_fmadd_ps(ifft2845, ifft2853, _mm512_shuffle_ps(ifft2845, ifft2845, 78));
__m512 ifft2939 = _mm512_fmadd_ps(ifft2931, ifft2853, _mm512_shuffle_ps(ifft2931, ifft2931, 78));
__m512 ifft2855 = _mm512_fmadd_ps(ifft2846, ifft2853, _mm512_shuffle_ps(ifft2846, ifft2846, 78));
__m512 ifft2940 = _mm512_fmadd_ps(ifft2932, ifft2853, _mm512_shuffle_ps(ifft2932, ifft2932, 78));
__m512 ifft2856 = _mm512_fmadd_ps(ifft2847, ifft2853, _mm512_shuffle_ps(ifft2847, ifft2847, 78));
__m512 ifft2941 = _mm512_fmadd_ps(ifft2933, ifft2853, _mm512_shuffle_ps(ifft2933, ifft2933, 78));
__m512 ifft2857 = _mm512_fmadd_ps(ifft2848, ifft2853, _mm512_shuffle_ps(ifft2848, ifft2848, 78));
__m512 ifft2942 = _mm512_fmadd_ps(ifft2934, ifft2853, _mm512_shuffle_ps(ifft2934, ifft2934, 78));
__m512 ifft2858 = _mm512_fmadd_ps(ifft2849, ifft2853, _mm512_shuffle_ps(ifft2849, ifft2849, 78));
__m512 ifft2943 = _mm512_fmadd_ps(ifft2935, ifft2853, _mm512_shuffle_ps(ifft2935, ifft2935, 78));
__m512 ifft2859 = _mm512_fmadd_ps(ifft2850, ifft2853, _mm512_shuffle_ps(ifft2850, ifft2850, 78));
__m512 ifft2944 = _mm512_fmadd_ps(ifft2936, ifft2853, _mm512_shuffle_ps(ifft2936, ifft2936, 78));
__m512 ifft2860 = _mm512_fmadd_ps(ifft2851, ifft2853, _mm512_shuffle_ps(ifft2851, ifft2851, 78));
__m512 ifft2945 = _mm512_fmadd_ps(ifft2937, ifft2853, _mm512_shuffle_ps(ifft2937, ifft2937, 78));
__m512 ifft2861 = _mm512_fmadd_ps(ifft2852, ifft2853, _mm512_shuffle_ps(ifft2852, ifft2852, 78));
__m512 ifft2946 = _mm512_fmadd_ps(ifft2938, ifft2853, _mm512_shuffle_ps(ifft2938, ifft2938, 78));
// Step 6: in lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0), replace the pair
// (a, b) with (-b, a): mask_sub writes 0-b over a, mask_mov writes the old a
// over b. All other lanes are unchanged.
__m512 ifft2862 = _mm512_mask_sub_ps(ifft2854, 49344, _mm512_setzero_ps(), ifft2855);
__m512 ifft2947 = _mm512_mask_sub_ps(ifft2939, 49344, _mm512_setzero_ps(), ifft2940);
__m512 ifft2863 = _mm512_mask_mov_ps(ifft2855, 49344, ifft2854);
__m512 ifft2948 = _mm512_mask_mov_ps(ifft2940, 49344, ifft2939);
__m512 ifft2864 = _mm512_mask_sub_ps(ifft2856, 49344, _mm512_setzero_ps(), ifft2857);
__m512 ifft2949 = _mm512_mask_sub_ps(ifft2941, 49344, _mm512_setzero_ps(), ifft2942);
__m512 ifft2865 = _mm512_mask_mov_ps(ifft2857, 49344, ifft2856);
__m512 ifft2950 = _mm512_mask_mov_ps(ifft2942, 49344, ifft2941);
__m512 ifft2866 = _mm512_mask_sub_ps(ifft2858, 49344, _mm512_setzero_ps(), ifft2859);
__m512 ifft2951 = _mm512_mask_sub_ps(ifft2943, 49344, _mm512_setzero_ps(), ifft2944);
__m512 ifft2867 = _mm512_mask_mov_ps(ifft2859, 49344, ifft2858);
__m512 ifft2952 = _mm512_mask_mov_ps(ifft2944, 49344, ifft2943);
__m512 ifft2868 = _mm512_mask_sub_ps(ifft2860, 49344, _mm512_setzero_ps(), ifft2861);
__m512 ifft2953 = _mm512_mask_sub_ps(ifft2945, 49344, _mm512_setzero_ps(), ifft2946);
__m512 ifft2869 = _mm512_mask_mov_ps(ifft2861, 49344, ifft2860);
__m512 ifft2954 = _mm512_mask_mov_ps(ifft2946, 49344, ifft2945);
// Step 7: butterfly at lane distance 4. shuffle_f32x4 with 177 swaps adjacent
// 128-bit groups. ifft2876 / ifft2960 deliberately use fnmsub, which yields the
// negated result -(x*sign) - swapped, unlike the fmadd used for the others.
__m512 ifft2870 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft2871 = _mm512_fmadd_ps(ifft2862, ifft2870, _mm512_shuffle_f32x4(ifft2862, ifft2862, 177));
__m512 ifft2955 = _mm512_fmadd_ps(ifft2947, ifft2870, _mm512_shuffle_f32x4(ifft2947, ifft2947, 177));
__m512 ifft2872 = _mm512_fmadd_ps(ifft2863, ifft2870, _mm512_shuffle_f32x4(ifft2863, ifft2863, 177));
__m512 ifft2956 = _mm512_fmadd_ps(ifft2948, ifft2870, _mm512_shuffle_f32x4(ifft2948, ifft2948, 177));
__m512 ifft2873 = _mm512_fmadd_ps(ifft2864, ifft2870, _mm512_shuffle_f32x4(ifft2864, ifft2864, 177));
__m512 ifft2957 = _mm512_fmadd_ps(ifft2949, ifft2870, _mm512_shuffle_f32x4(ifft2949, ifft2949, 177));
__m512 ifft2874 = _mm512_fmadd_ps(ifft2865, ifft2870, _mm512_shuffle_f32x4(ifft2865, ifft2865, 177));
__m512 ifft2958 = _mm512_fmadd_ps(ifft2950, ifft2870, _mm512_shuffle_f32x4(ifft2950, ifft2950, 177));
__m512 ifft2875 = _mm512_fmadd_ps(ifft2866, ifft2870, _mm512_shuffle_f32x4(ifft2866, ifft2866, 177));
__m512 ifft2959 = _mm512_fmadd_ps(ifft2951, ifft2870, _mm512_shuffle_f32x4(ifft2951, ifft2951, 177));
__m512 ifft2876 = _mm512_fnmsub_ps(ifft2867, ifft2870, _mm512_shuffle_f32x4(ifft2867, ifft2867, 177));
__m512 ifft2960 = _mm512_fnmsub_ps(ifft2952, ifft2870, _mm512_shuffle_f32x4(ifft2952, ifft2952, 177));
__m512 ifft2877 = _mm512_fmadd_ps(ifft2868, ifft2870, _mm512_shuffle_f32x4(ifft2868, ifft2868, 177));
__m512 ifft2961 = _mm512_fmadd_ps(ifft2953, ifft2870, _mm512_shuffle_f32x4(ifft2953, ifft2953, 177));
__m512 ifft2878 = _mm512_fmadd_ps(ifft2869, ifft2870, _mm512_shuffle_f32x4(ifft2869, ifft2869, 177));
__m512 ifft2962 = _mm512_fmadd_ps(ifft2954, ifft2870, _mm512_shuffle_f32x4(ifft2954, ifft2954, 177));
// Step 8: combine whole vectors with each other (no more lane shuffles). The
// eight vectors of each chain are added/subtracted in pairs; the results feed
// the scaled accumulation below.
__m512 ifft2879 = _mm512_add_ps(ifft2871, ifft2872);
__m512 ifft2963 = _mm512_add_ps(ifft2955, ifft2956);
__m512 ifft2880 = _mm512_sub_ps(ifft2871, ifft2872);
__m512 ifft2964 = _mm512_sub_ps(ifft2955, ifft2956);
__m512 ifft2881 = _mm512_sub_ps(ifft2873, ifft2877);
__m512 ifft2965 = _mm512_sub_ps(ifft2957, ifft2961);
__m512 ifft2882 = _mm512_add_ps(ifft2874, ifft2878);
__m512 ifft2966 = _mm512_add_ps(ifft2958, ifft2962);
__m512 ifft2883 = _mm512_add_ps(ifft2873, ifft2877);
__m512 ifft2967 = _mm512_add_ps(ifft2957, ifft2961);
__m512 ifft2884 = _mm512_sub_ps(ifft2874, ifft2878);
__m512 ifft2968 = _mm512_sub_ps(ifft2958, ifft2962);
// Scaling is folded into the arithmetic: 3.125e-02 is 1/32, 1.5625e-02 is 1/64.
// NOTE(review): presumably the inverse-transform normalisation - confirm.
__m512 ifft2885 = _mm512_mul_ps(ifft2875, _mm512_set1_ps(3.125e-02f));
__m512 ifft2969 = _mm512_mul_ps(ifft2959, _mm512_set1_ps(3.125e-02f));
__m512 ifft2886 = _mm512_mul_ps(ifft2876, _mm512_set1_ps(3.125e-02f));
__m512 ifft2970 = _mm512_mul_ps(ifft2960, _mm512_set1_ps(3.125e-02f));
__m512 ifft2887 = _mm512_fmadd_ps(ifft2879, _mm512_set1_ps(1.5625e-02f), ifft2885);
__m512 ifft2971 = _mm512_fmadd_ps(ifft2963, _mm512_set1_ps(1.5625e-02f), ifft2969);
__m512 ifft2888 = _mm512_fmsub_ps(ifft2879, _mm512_set1_ps(1.5625e-02f), ifft2885);
__m512 ifft2972 = _mm512_fmsub_ps(ifft2963, _mm512_set1_ps(1.5625e-02f), ifft2969);
__m512 ifft2889 = _mm512_fmadd_ps(ifft2880, _mm512_set1_ps(1.5625e-02f), ifft2886);
__m512 ifft2973 = _mm512_fmadd_ps(ifft2964, _mm512_set1_ps(1.5625e-02f), ifft2970);
__m512 ifft2890 = _mm512_fmsub_ps(ifft2880, _mm512_set1_ps(1.5625e-02f), ifft2886);
__m512 ifft2974 = _mm512_fmsub_ps(ifft2964, _mm512_set1_ps(1.5625e-02f), ifft2970);
__m512 ifft2891 = _mm512_add_ps(ifft2881, ifft2882);
__m512 ifft2975 = _mm512_add_ps(ifft2965, ifft2966);
__m512 ifft2892 = _mm512_sub_ps(ifft2881, ifft2882);
__m512 ifft2976 = _mm512_sub_ps(ifft2965, ifft2966);
// The 0.70710677 factor is applied to the sum/difference terms before they are
// merged with ifft2883/ifft2884 (chain A) and ifft2967/ifft2968 (chain B).
__m512 ifft2893 = _mm512_fnmadd_ps(ifft2891, _mm512_set1_ps(7.0710677e-01f), ifft2883);
__m512 ifft2977 = _mm512_fnmadd_ps(ifft2975, _mm512_set1_ps(7.0710677e-01f), ifft2967);
__m512 ifft2894 = _mm512_fmadd_ps(ifft2891, _mm512_set1_ps(7.0710677e-01f), ifft2883);
__m512 ifft2978 = _mm512_fmadd_ps(ifft2975, _mm512_set1_ps(7.0710677e-01f), ifft2967);
__m512 ifft2895 = _mm512_fmadd_ps(ifft2892, _mm512_set1_ps(7.0710677e-01f), ifft2884);
__m512 ifft2979 = _mm512_fmadd_ps(ifft2976, _mm512_set1_ps(7.0710677e-01f), ifft2968);
__m512 ifft2896 = _mm512_fmsub_ps(ifft2892, _mm512_set1_ps(7.0710677e-01f), ifft2884);
__m512 ifft2980 = _mm512_fmsub_ps(ifft2976, _mm512_set1_ps(7.0710677e-01f), ifft2968);
__m512 ifft2897 = _mm512_add_ps(ifft2893, ifft2894);
__m512 ifft2981 = _mm512_add_ps(ifft2977, ifft2978);
__m512 ifft2898 = _mm512_sub_ps(ifft2893, ifft2894);
__m512 ifft2982 = _mm512_sub_ps(ifft2977, ifft2978);
__m512 ifft2899 = _mm512_add_ps(ifft2895, ifft2896);
__m512 ifft2983 = _mm512_add_ps(ifft2979, ifft2980);
__m512 ifft2900 = _mm512_sub_ps(ifft2895, ifft2896);
__m512 ifft2984 = _mm512_sub_ps(ifft2979, ifft2980);
// Final merge: eight result vectors per chain, each of the form
// (earlier scaled term) +/- (1/64 * merged term).
__m512 ifft2901 = _mm512_fmadd_ps(ifft2897, _mm512_set1_ps(1.5625e-02f), ifft2887);
__m512 ifft2985 = _mm512_fmadd_ps(ifft2981, _mm512_set1_ps(1.5625e-02f), ifft2971);
__m512 ifft2902 = _mm512_fnmadd_ps(ifft2897, _mm512_set1_ps(1.5625e-02f), ifft2887);
__m512 ifft2986 = _mm512_fnmadd_ps(ifft2981, _mm512_set1_ps(1.5625e-02f), ifft2971);
__m512 ifft2903 = _mm512_fmadd_ps(ifft2899, _mm512_set1_ps(1.5625e-02f), ifft2889);
__m512 ifft2987 = _mm512_fmadd_ps(ifft2983, _mm512_set1_ps(1.5625e-02f), ifft2973);
__m512 ifft2904 = _mm512_fnmadd_ps(ifft2899, _mm512_set1_ps(1.5625e-02f), ifft2889);
__m512 ifft2988 = _mm512_fnmadd_ps(ifft2983, _mm512_set1_ps(1.5625e-02f), ifft2973);
__m512 ifft2905 = _mm512_fnmadd_ps(ifft2900, _mm512_set1_ps(1.5625e-02f), ifft2888);
__m512 ifft2989 = _mm512_fnmadd_ps(ifft2984, _mm512_set1_ps(1.5625e-02f), ifft2972);
__m512 ifft2906 = _mm512_fmadd_ps(ifft2900, _mm512_set1_ps(1.5625e-02f), ifft2888);
__m512 ifft2990 = _mm512_fmadd_ps(ifft2984, _mm512_set1_ps(1.5625e-02f), ifft2972);
__m512 ifft2907 = _mm512_fmadd_ps(ifft2898, _mm512_set1_ps(1.5625e-02f), ifft2890);
__m512 ifft2991 = _mm512_fmadd_ps(ifft2982, _mm512_set1_ps(1.5625e-02f), ifft2974);
__m512 ifft2908 = _mm512_fnmadd_ps(ifft2898, _mm512_set1_ps(1.5625e-02f), ifft2890);
__m512 ifft2992 = _mm512_fnmadd_ps(ifft2982, _mm512_set1_ps(1.5625e-02f), ifft2974);
// Only five of the eight result vectors per chain are kept (dat750..dat754 from
// chain A, dat755..dat759 from chain B).
__m512 dat750 = ifft2901;
__m512 dat755 = ifft2985;
__m512 dat751 = ifft2903;
__m512 dat756 = ifft2987;
__m512 dat752 = ifft2905;
__m512 dat757 = ifft2989;
__m512 dat753 = ifft2907;
__m512 dat758 = ifft2991;
__m512 dat754 = ifft2902;
__m512 dat759 = ifft2986;
// The remaining three per chain are not used in this span; the void casts mark
// them as intentionally unused.
(void)ifft2904;
(void)ifft2988;
(void)ifft2906;
(void)ifft2990;
(void)ifft2908;
(void)ifft2992;
__m512i pm29 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack141 = _mm512_permutex2var_ps(dat750, pm29, dat755);
__m512i pm30 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack142 = _mm512_permutex2var_ps(dat750, pm30, dat755);
__m512 pack143 = _mm512_permutex2var_ps(dat751, pm29, dat756);
__m512 pack144 = _mm512_permutex2var_ps(dat751, pm30, dat756);
__m512 pack145 = _mm512_permutex2var_ps(dat752, pm29, dat757);
__m512 pack146 = _mm512_permutex2var_ps(dat752, pm30, dat757);
__m512 pack147 = _mm512_permutex2var_ps(dat753, pm29, dat758);
__m512 pack148 = _mm512_permutex2var_ps(dat753, pm30, dat758);
__m512 pack149 = _mm512_permutex2var_ps(dat754, pm29, dat759);
__m512 pack150 = _mm512_permutex2var_ps(dat754, pm30, dat759);
pack141 = _mm512_max_ps(_mm512_setzero_ps(), pack141);
pack142 = _mm512_max_ps(_mm512_setzero_ps(), pack142);
pack143 = _mm512_max_ps(_mm512_setzero_ps(), pack143);
pack144 = _mm512_max_ps(_mm512_setzero_ps(), pack144);
pack145 = _mm512_max_ps(_mm512_setzero_ps(), pack145);
pack146 = _mm512_max_ps(_mm512_setzero_ps(), pack146);
pack147 = _mm512_max_ps(_mm512_setzero_ps(), pack147);
pack148 = _mm512_max_ps(_mm512_setzero_ps(), pack148);
pack149 = _mm512_max_ps(_mm512_setzero_ps(), pack149);
pack150 = _mm512_max_ps(_mm512_setzero_ps(), pack150);
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack141);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack142);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack143);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack144);
_mm512_mask_storeu_ps(datPtr2+2756+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack145);
_mm512_mask_storeu_ps(datPtr2+52996+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack146);
_mm512_mask_storeu_ps(datPtr2+3204+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack147);
_mm512_mask_storeu_ps(datPtr2+53444+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack148);
_mm512_mask_storeu_ps(datPtr2+3652+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack149);
_mm512_mask_storeu_ps(datPtr2+53892+3215360*i9+200960*k33+100480*r10+448*toH9+4*toW9+40*t18, 1023, pack150);
}
}
if (j5 >= last2) return;
++j5;
rel5 = 12;
}
// Branch taken while rel5 < 15. Starting from the current j5 it walks j5 up
// to jj15 (= 14-rel5+j5 at entry), advancing the output column offset toW10
// by 30 per step, with the output row fixed at base5+15. For every
// (k34, r11, t19) it loads 16 vectors from sfPtr3, runs them through the
// ifft* arithmetic below, clamps the results at zero, and writes ten
// 10-lane vectors to datPtr2. The function returns early as soon as
// j5 >= last2.
// NOTE(review): the sfRe/sfIm/ifft names suggest real/imaginary spectral
// inputs and an inverse FFT; that reading is inferred from names only.
// NOTE(review): pointer offsets are in units of whatever sfPtr3/datPtr2 point
// to; their declarations are outside this view - confirm (looks like bytes).
if (rel5 < 15) {
ptrdiff_t toH10 = base5+15;
ptrdiff_t toW10 = -340+30*rel5;
ptrdiff_t jj15 = 14-rel5+j5;
// j5 itself is incremented at the bottom of the body, not in the for header.
for (; j5 <= jj15; toW10 += 30) {
ptrdiff_t k34 = 16*w21;
for (; k34 != 16; ++k34) {
ptrdiff_t r11 = 0;
for (; r11 != 2; ++r11) {
ptrdiff_t t19 = 0;
for (; t19 < 3; ++t19) {
// Load 16 vectors from four regions of sfPtr3 (bases 0, 2166784, 4333568,
// 6500352). Each region supplies an Re/Im pair for two streams that are
// processed side by side below: 193..196 and 197..200.
__m512 sfRe193 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm193 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe197 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm197 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe194 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm194 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe198 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm198 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe195 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm195 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe199 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm199 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe196 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm196 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfRe200 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
__m512 sfIm200 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k34+768*r11+256*t19);
// Only the first-region vectors (193 and 197) get this lane permutation and
// masked combine. Mask 65021 (0xFDFD) covers every lane except 1 and 9; in
// those two lanes the masked FMA keeps its first operand unchanged.
__m512i ifft2993 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft2994 = _mm512_permutexvar_ps(ifft2993, sfRe193);
__m512 ifft3085 = _mm512_permutexvar_ps(ifft2993, sfRe197);
__m512i ifft2995 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft2996 = _mm512_permutexvar_ps(ifft2995, sfRe193);
__m512 ifft3086 = _mm512_permutexvar_ps(ifft2995, sfRe197);
__m512 ifft2997 = _mm512_permutexvar_ps(ifft2993, sfIm193);
__m512 ifft3087 = _mm512_permutexvar_ps(ifft2993, sfIm197);
__m512 ifft2998 = _mm512_permutexvar_ps(ifft2995, sfIm193);
__m512 ifft3088 = _mm512_permutexvar_ps(ifft2995, sfIm197);
__m512 ifft2999 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3000 = _mm512_mask_fmadd_ps(ifft2998, 65021, ifft2999, ifft2994);
__m512 ifft3089 = _mm512_mask_fmadd_ps(ifft3088, 65021, ifft2999, ifft3085);
__m512 ifft3001 = _mm512_mask_fnmadd_ps(ifft2997, 65021, ifft2999, ifft2996);
__m512 ifft3090 = _mm512_mask_fnmadd_ps(ifft3087, 65021, ifft2999, ifft3086);
// Sum/difference of adjacent lane pairs: x*(+1,-1) plus x with each pair of
// lanes swapped (shuffle immediate 177 exchanges lanes 0<->1 and 2<->3 in
// every 4-lane group). Applied to all 16 vectors.
__m512 ifft3002 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3003 = _mm512_fmadd_ps(ifft3000, ifft3002, _mm512_shuffle_ps(ifft3000, ifft3000, 177));
__m512 ifft3091 = _mm512_fmadd_ps(ifft3089, ifft3002, _mm512_shuffle_ps(ifft3089, ifft3089, 177));
__m512 ifft3004 = _mm512_fmadd_ps(ifft3001, ifft3002, _mm512_shuffle_ps(ifft3001, ifft3001, 177));
__m512 ifft3092 = _mm512_fmadd_ps(ifft3090, ifft3002, _mm512_shuffle_ps(ifft3090, ifft3090, 177));
__m512 ifft3005 = _mm512_fmadd_ps(sfRe194, ifft3002, _mm512_shuffle_ps(sfRe194, sfRe194, 177));
__m512 ifft3093 = _mm512_fmadd_ps(sfRe198, ifft3002, _mm512_shuffle_ps(sfRe198, sfRe198, 177));
__m512 ifft3006 = _mm512_fmadd_ps(sfIm194, ifft3002, _mm512_shuffle_ps(sfIm194, sfIm194, 177));
__m512 ifft3094 = _mm512_fmadd_ps(sfIm198, ifft3002, _mm512_shuffle_ps(sfIm198, sfIm198, 177));
__m512 ifft3007 = _mm512_fmadd_ps(sfRe195, ifft3002, _mm512_shuffle_ps(sfRe195, sfRe195, 177));
__m512 ifft3095 = _mm512_fmadd_ps(sfRe199, ifft3002, _mm512_shuffle_ps(sfRe199, sfRe199, 177));
__m512 ifft3008 = _mm512_fmadd_ps(sfIm195, ifft3002, _mm512_shuffle_ps(sfIm195, sfIm195, 177));
__m512 ifft3096 = _mm512_fmadd_ps(sfIm199, ifft3002, _mm512_shuffle_ps(sfIm199, sfIm199, 177));
__m512 ifft3009 = _mm512_fmadd_ps(sfRe196, ifft3002, _mm512_shuffle_ps(sfRe196, sfRe196, 177));
__m512 ifft3097 = _mm512_fmadd_ps(sfRe200, ifft3002, _mm512_shuffle_ps(sfRe200, sfRe200, 177));
__m512 ifft3010 = _mm512_fmadd_ps(sfIm196, ifft3002, _mm512_shuffle_ps(sfIm196, sfIm196, 177));
__m512 ifft3098 = _mm512_fmadd_ps(sfIm200, ifft3002, _mm512_shuffle_ps(sfIm200, sfIm200, 177));
// Per-lane scaling by two constant tables c1 (ifft3011) and c2 (ifft3020).
// Each vector pair (a, b) becomes (a*c1 - b*c2, a*c2 + b*c1), which has the
// form of a complex multiply. 7.0710677e-01 is 1/sqrt(2) in float.
__m512 ifft3011 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3012 = _mm512_mul_ps(ifft3003, ifft3011);
__m512 ifft3099 = _mm512_mul_ps(ifft3091, ifft3011);
__m512 ifft3013 = _mm512_mul_ps(ifft3004, ifft3011);
__m512 ifft3100 = _mm512_mul_ps(ifft3092, ifft3011);
__m512 ifft3014 = _mm512_mul_ps(ifft3005, ifft3011);
__m512 ifft3101 = _mm512_mul_ps(ifft3093, ifft3011);
__m512 ifft3015 = _mm512_mul_ps(ifft3006, ifft3011);
__m512 ifft3102 = _mm512_mul_ps(ifft3094, ifft3011);
__m512 ifft3016 = _mm512_mul_ps(ifft3007, ifft3011);
__m512 ifft3103 = _mm512_mul_ps(ifft3095, ifft3011);
__m512 ifft3017 = _mm512_mul_ps(ifft3008, ifft3011);
__m512 ifft3104 = _mm512_mul_ps(ifft3096, ifft3011);
__m512 ifft3018 = _mm512_mul_ps(ifft3009, ifft3011);
__m512 ifft3105 = _mm512_mul_ps(ifft3097, ifft3011);
__m512 ifft3019 = _mm512_mul_ps(ifft3010, ifft3011);
__m512 ifft3106 = _mm512_mul_ps(ifft3098, ifft3011);
__m512 ifft3020 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3021 = _mm512_fnmadd_ps(ifft3004, ifft3020, ifft3012);
__m512 ifft3107 = _mm512_fnmadd_ps(ifft3092, ifft3020, ifft3099);
__m512 ifft3022 = _mm512_fmadd_ps(ifft3003, ifft3020, ifft3013);
__m512 ifft3108 = _mm512_fmadd_ps(ifft3091, ifft3020, ifft3100);
__m512 ifft3023 = _mm512_fnmadd_ps(ifft3006, ifft3020, ifft3014);
__m512 ifft3109 = _mm512_fnmadd_ps(ifft3094, ifft3020, ifft3101);
__m512 ifft3024 = _mm512_fmadd_ps(ifft3005, ifft3020, ifft3015);
__m512 ifft3110 = _mm512_fmadd_ps(ifft3093, ifft3020, ifft3102);
__m512 ifft3025 = _mm512_fnmadd_ps(ifft3008, ifft3020, ifft3016);
__m512 ifft3111 = _mm512_fnmadd_ps(ifft3096, ifft3020, ifft3103);
__m512 ifft3026 = _mm512_fmadd_ps(ifft3007, ifft3020, ifft3017);
__m512 ifft3112 = _mm512_fmadd_ps(ifft3095, ifft3020, ifft3104);
__m512 ifft3027 = _mm512_fnmadd_ps(ifft3010, ifft3020, ifft3018);
__m512 ifft3113 = _mm512_fnmadd_ps(ifft3098, ifft3020, ifft3105);
__m512 ifft3028 = _mm512_fmadd_ps(ifft3009, ifft3020, ifft3019);
__m512 ifft3114 = _mm512_fmadd_ps(ifft3097, ifft3020, ifft3106);
// Sum/difference across the two 64-bit halves of every 128-bit group
// (shuffle immediate 78 swaps lanes {0,1} with lanes {2,3}).
__m512 ifft3029 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3030 = _mm512_fmadd_ps(ifft3021, ifft3029, _mm512_shuffle_ps(ifft3021, ifft3021, 78));
__m512 ifft3115 = _mm512_fmadd_ps(ifft3107, ifft3029, _mm512_shuffle_ps(ifft3107, ifft3107, 78));
__m512 ifft3031 = _mm512_fmadd_ps(ifft3022, ifft3029, _mm512_shuffle_ps(ifft3022, ifft3022, 78));
__m512 ifft3116 = _mm512_fmadd_ps(ifft3108, ifft3029, _mm512_shuffle_ps(ifft3108, ifft3108, 78));
__m512 ifft3032 = _mm512_fmadd_ps(ifft3023, ifft3029, _mm512_shuffle_ps(ifft3023, ifft3023, 78));
__m512 ifft3117 = _mm512_fmadd_ps(ifft3109, ifft3029, _mm512_shuffle_ps(ifft3109, ifft3109, 78));
__m512 ifft3033 = _mm512_fmadd_ps(ifft3024, ifft3029, _mm512_shuffle_ps(ifft3024, ifft3024, 78));
__m512 ifft3118 = _mm512_fmadd_ps(ifft3110, ifft3029, _mm512_shuffle_ps(ifft3110, ifft3110, 78));
__m512 ifft3034 = _mm512_fmadd_ps(ifft3025, ifft3029, _mm512_shuffle_ps(ifft3025, ifft3025, 78));
__m512 ifft3119 = _mm512_fmadd_ps(ifft3111, ifft3029, _mm512_shuffle_ps(ifft3111, ifft3111, 78));
__m512 ifft3035 = _mm512_fmadd_ps(ifft3026, ifft3029, _mm512_shuffle_ps(ifft3026, ifft3026, 78));
__m512 ifft3120 = _mm512_fmadd_ps(ifft3112, ifft3029, _mm512_shuffle_ps(ifft3112, ifft3112, 78));
__m512 ifft3036 = _mm512_fmadd_ps(ifft3027, ifft3029, _mm512_shuffle_ps(ifft3027, ifft3027, 78));
__m512 ifft3121 = _mm512_fmadd_ps(ifft3113, ifft3029, _mm512_shuffle_ps(ifft3113, ifft3113, 78));
__m512 ifft3037 = _mm512_fmadd_ps(ifft3028, ifft3029, _mm512_shuffle_ps(ifft3028, ifft3028, 78));
__m512 ifft3122 = _mm512_fmadd_ps(ifft3114, ifft3029, _mm512_shuffle_ps(ifft3114, ifft3114, 78));
// In lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) each vector pair is
// exchanged with a sign flip: first := 0 - second, second := old first.
// All other lanes pass through unchanged.
__m512 ifft3038 = _mm512_mask_sub_ps(ifft3030, 49344, _mm512_setzero_ps(), ifft3031);
__m512 ifft3123 = _mm512_mask_sub_ps(ifft3115, 49344, _mm512_setzero_ps(), ifft3116);
__m512 ifft3039 = _mm512_mask_mov_ps(ifft3031, 49344, ifft3030);
__m512 ifft3124 = _mm512_mask_mov_ps(ifft3116, 49344, ifft3115);
__m512 ifft3040 = _mm512_mask_sub_ps(ifft3032, 49344, _mm512_setzero_ps(), ifft3033);
__m512 ifft3125 = _mm512_mask_sub_ps(ifft3117, 49344, _mm512_setzero_ps(), ifft3118);
__m512 ifft3041 = _mm512_mask_mov_ps(ifft3033, 49344, ifft3032);
__m512 ifft3126 = _mm512_mask_mov_ps(ifft3118, 49344, ifft3117);
__m512 ifft3042 = _mm512_mask_sub_ps(ifft3034, 49344, _mm512_setzero_ps(), ifft3035);
__m512 ifft3127 = _mm512_mask_sub_ps(ifft3119, 49344, _mm512_setzero_ps(), ifft3120);
__m512 ifft3043 = _mm512_mask_mov_ps(ifft3035, 49344, ifft3034);
__m512 ifft3128 = _mm512_mask_mov_ps(ifft3120, 49344, ifft3119);
__m512 ifft3044 = _mm512_mask_sub_ps(ifft3036, 49344, _mm512_setzero_ps(), ifft3037);
__m512 ifft3129 = _mm512_mask_sub_ps(ifft3121, 49344, _mm512_setzero_ps(), ifft3122);
__m512 ifft3045 = _mm512_mask_mov_ps(ifft3037, 49344, ifft3036);
__m512 ifft3130 = _mm512_mask_mov_ps(ifft3122, 49344, ifft3121);
// Sum/difference across adjacent 128-bit lanes (shuffle_f32x4 immediate 177
// swaps lane 0<->1 and lane 2<->3). ifft3052/ifft3136 use fnmsub instead of
// fmadd, i.e. they hold the negated value of the same expression.
__m512 ifft3046 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3047 = _mm512_fmadd_ps(ifft3038, ifft3046, _mm512_shuffle_f32x4(ifft3038, ifft3038, 177));
__m512 ifft3131 = _mm512_fmadd_ps(ifft3123, ifft3046, _mm512_shuffle_f32x4(ifft3123, ifft3123, 177));
__m512 ifft3048 = _mm512_fmadd_ps(ifft3039, ifft3046, _mm512_shuffle_f32x4(ifft3039, ifft3039, 177));
__m512 ifft3132 = _mm512_fmadd_ps(ifft3124, ifft3046, _mm512_shuffle_f32x4(ifft3124, ifft3124, 177));
__m512 ifft3049 = _mm512_fmadd_ps(ifft3040, ifft3046, _mm512_shuffle_f32x4(ifft3040, ifft3040, 177));
__m512 ifft3133 = _mm512_fmadd_ps(ifft3125, ifft3046, _mm512_shuffle_f32x4(ifft3125, ifft3125, 177));
__m512 ifft3050 = _mm512_fmadd_ps(ifft3041, ifft3046, _mm512_shuffle_f32x4(ifft3041, ifft3041, 177));
__m512 ifft3134 = _mm512_fmadd_ps(ifft3126, ifft3046, _mm512_shuffle_f32x4(ifft3126, ifft3126, 177));
__m512 ifft3051 = _mm512_fmadd_ps(ifft3042, ifft3046, _mm512_shuffle_f32x4(ifft3042, ifft3042, 177));
__m512 ifft3135 = _mm512_fmadd_ps(ifft3127, ifft3046, _mm512_shuffle_f32x4(ifft3127, ifft3127, 177));
__m512 ifft3052 = _mm512_fnmsub_ps(ifft3043, ifft3046, _mm512_shuffle_f32x4(ifft3043, ifft3043, 177));
__m512 ifft3136 = _mm512_fnmsub_ps(ifft3128, ifft3046, _mm512_shuffle_f32x4(ifft3128, ifft3128, 177));
__m512 ifft3053 = _mm512_fmadd_ps(ifft3044, ifft3046, _mm512_shuffle_f32x4(ifft3044, ifft3044, 177));
__m512 ifft3137 = _mm512_fmadd_ps(ifft3129, ifft3046, _mm512_shuffle_f32x4(ifft3129, ifft3129, 177));
__m512 ifft3054 = _mm512_fmadd_ps(ifft3045, ifft3046, _mm512_shuffle_f32x4(ifft3045, ifft3045, 177));
__m512 ifft3138 = _mm512_fmadd_ps(ifft3130, ifft3046, _mm512_shuffle_f32x4(ifft3130, ifft3130, 177));
// Cross-vector add/subtract stages that combine the eight vectors of each
// stream into eight results. The scale factors are folded into the FMAs:
// 3.125e-02 = 1/32 and 1.5625e-02 = 1/64.
// NOTE(review): presumably the inverse-transform normalization - confirm.
__m512 ifft3055 = _mm512_add_ps(ifft3047, ifft3048);
__m512 ifft3139 = _mm512_add_ps(ifft3131, ifft3132);
__m512 ifft3056 = _mm512_sub_ps(ifft3047, ifft3048);
__m512 ifft3140 = _mm512_sub_ps(ifft3131, ifft3132);
__m512 ifft3057 = _mm512_sub_ps(ifft3049, ifft3053);
__m512 ifft3141 = _mm512_sub_ps(ifft3133, ifft3137);
__m512 ifft3058 = _mm512_add_ps(ifft3050, ifft3054);
__m512 ifft3142 = _mm512_add_ps(ifft3134, ifft3138);
__m512 ifft3059 = _mm512_add_ps(ifft3049, ifft3053);
__m512 ifft3143 = _mm512_add_ps(ifft3133, ifft3137);
__m512 ifft3060 = _mm512_sub_ps(ifft3050, ifft3054);
__m512 ifft3144 = _mm512_sub_ps(ifft3134, ifft3138);
__m512 ifft3061 = _mm512_mul_ps(ifft3051, _mm512_set1_ps(3.125e-02f));
__m512 ifft3145 = _mm512_mul_ps(ifft3135, _mm512_set1_ps(3.125e-02f));
__m512 ifft3062 = _mm512_mul_ps(ifft3052, _mm512_set1_ps(3.125e-02f));
__m512 ifft3146 = _mm512_mul_ps(ifft3136, _mm512_set1_ps(3.125e-02f));
__m512 ifft3063 = _mm512_fmadd_ps(ifft3055, _mm512_set1_ps(1.5625e-02f), ifft3061);
__m512 ifft3147 = _mm512_fmadd_ps(ifft3139, _mm512_set1_ps(1.5625e-02f), ifft3145);
__m512 ifft3064 = _mm512_fmsub_ps(ifft3055, _mm512_set1_ps(1.5625e-02f), ifft3061);
__m512 ifft3148 = _mm512_fmsub_ps(ifft3139, _mm512_set1_ps(1.5625e-02f), ifft3145);
__m512 ifft3065 = _mm512_fmadd_ps(ifft3056, _mm512_set1_ps(1.5625e-02f), ifft3062);
__m512 ifft3149 = _mm512_fmadd_ps(ifft3140, _mm512_set1_ps(1.5625e-02f), ifft3146);
__m512 ifft3066 = _mm512_fmsub_ps(ifft3056, _mm512_set1_ps(1.5625e-02f), ifft3062);
__m512 ifft3150 = _mm512_fmsub_ps(ifft3140, _mm512_set1_ps(1.5625e-02f), ifft3146);
__m512 ifft3067 = _mm512_add_ps(ifft3057, ifft3058);
__m512 ifft3151 = _mm512_add_ps(ifft3141, ifft3142);
__m512 ifft3068 = _mm512_sub_ps(ifft3057, ifft3058);
__m512 ifft3152 = _mm512_sub_ps(ifft3141, ifft3142);
__m512 ifft3069 = _mm512_fnmadd_ps(ifft3067, _mm512_set1_ps(7.0710677e-01f), ifft3059);
__m512 ifft3153 = _mm512_fnmadd_ps(ifft3151, _mm512_set1_ps(7.0710677e-01f), ifft3143);
__m512 ifft3070 = _mm512_fmadd_ps(ifft3067, _mm512_set1_ps(7.0710677e-01f), ifft3059);
__m512 ifft3154 = _mm512_fmadd_ps(ifft3151, _mm512_set1_ps(7.0710677e-01f), ifft3143);
__m512 ifft3071 = _mm512_fmadd_ps(ifft3068, _mm512_set1_ps(7.0710677e-01f), ifft3060);
__m512 ifft3155 = _mm512_fmadd_ps(ifft3152, _mm512_set1_ps(7.0710677e-01f), ifft3144);
__m512 ifft3072 = _mm512_fmsub_ps(ifft3068, _mm512_set1_ps(7.0710677e-01f), ifft3060);
__m512 ifft3156 = _mm512_fmsub_ps(ifft3152, _mm512_set1_ps(7.0710677e-01f), ifft3144);
__m512 ifft3073 = _mm512_add_ps(ifft3069, ifft3070);
__m512 ifft3157 = _mm512_add_ps(ifft3153, ifft3154);
__m512 ifft3074 = _mm512_sub_ps(ifft3069, ifft3070);
__m512 ifft3158 = _mm512_sub_ps(ifft3153, ifft3154);
__m512 ifft3075 = _mm512_add_ps(ifft3071, ifft3072);
__m512 ifft3159 = _mm512_add_ps(ifft3155, ifft3156);
__m512 ifft3076 = _mm512_sub_ps(ifft3071, ifft3072);
__m512 ifft3160 = _mm512_sub_ps(ifft3155, ifft3156);
__m512 ifft3077 = _mm512_fmadd_ps(ifft3073, _mm512_set1_ps(1.5625e-02f), ifft3063);
__m512 ifft3161 = _mm512_fmadd_ps(ifft3157, _mm512_set1_ps(1.5625e-02f), ifft3147);
__m512 ifft3078 = _mm512_fnmadd_ps(ifft3073, _mm512_set1_ps(1.5625e-02f), ifft3063);
__m512 ifft3162 = _mm512_fnmadd_ps(ifft3157, _mm512_set1_ps(1.5625e-02f), ifft3147);
__m512 ifft3079 = _mm512_fmadd_ps(ifft3075, _mm512_set1_ps(1.5625e-02f), ifft3065);
__m512 ifft3163 = _mm512_fmadd_ps(ifft3159, _mm512_set1_ps(1.5625e-02f), ifft3149);
__m512 ifft3080 = _mm512_fnmadd_ps(ifft3075, _mm512_set1_ps(1.5625e-02f), ifft3065);
__m512 ifft3164 = _mm512_fnmadd_ps(ifft3159, _mm512_set1_ps(1.5625e-02f), ifft3149);
__m512 ifft3081 = _mm512_fnmadd_ps(ifft3076, _mm512_set1_ps(1.5625e-02f), ifft3064);
__m512 ifft3165 = _mm512_fnmadd_ps(ifft3160, _mm512_set1_ps(1.5625e-02f), ifft3148);
__m512 ifft3082 = _mm512_fmadd_ps(ifft3076, _mm512_set1_ps(1.5625e-02f), ifft3064);
__m512 ifft3166 = _mm512_fmadd_ps(ifft3160, _mm512_set1_ps(1.5625e-02f), ifft3148);
__m512 ifft3083 = _mm512_fmadd_ps(ifft3074, _mm512_set1_ps(1.5625e-02f), ifft3066);
__m512 ifft3167 = _mm512_fmadd_ps(ifft3158, _mm512_set1_ps(1.5625e-02f), ifft3150);
__m512 ifft3084 = _mm512_fnmadd_ps(ifft3074, _mm512_set1_ps(1.5625e-02f), ifft3066);
__m512 ifft3168 = _mm512_fnmadd_ps(ifft3158, _mm512_set1_ps(1.5625e-02f), ifft3150);
// Five of the eight results per stream are kept (dat760..764 for the first
// stream, dat765..769 for the second). The remaining three per stream are
// not stored in this block; the (void) casts mark them as deliberately unused.
__m512 dat760 = ifft3077;
__m512 dat765 = ifft3161;
__m512 dat761 = ifft3079;
__m512 dat766 = ifft3163;
__m512 dat762 = ifft3081;
__m512 dat767 = ifft3165;
__m512 dat763 = ifft3083;
__m512 dat768 = ifft3167;
__m512 dat764 = ifft3078;
__m512 dat769 = ifft3162;
(void)ifft3080;
(void)ifft3164;
(void)ifft3082;
(void)ifft3166;
(void)ifft3084;
(void)ifft3168;
// Merge the two streams into output vectors. Index values >= 16 select from
// the second operand. With pm31 the low 10 lanes are lanes 0-4 of the first
// operand followed by lanes 0-4 of the second; with pm32 they are lanes 8-12
// of the second followed by lanes 8-12 of the first. Lanes 10-15 of each
// pack are never written out (see the store mask below).
__m512i pm31 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack151 = _mm512_permutex2var_ps(dat760, pm31, dat765);
__m512i pm32 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack152 = _mm512_permutex2var_ps(dat760, pm32, dat765);
__m512 pack153 = _mm512_permutex2var_ps(dat761, pm31, dat766);
__m512 pack154 = _mm512_permutex2var_ps(dat761, pm32, dat766);
__m512 pack155 = _mm512_permutex2var_ps(dat762, pm31, dat767);
__m512 pack156 = _mm512_permutex2var_ps(dat762, pm32, dat767);
__m512 pack157 = _mm512_permutex2var_ps(dat763, pm31, dat768);
__m512 pack158 = _mm512_permutex2var_ps(dat763, pm32, dat768);
__m512 pack159 = _mm512_permutex2var_ps(dat764, pm31, dat769);
__m512 pack160 = _mm512_permutex2var_ps(dat764, pm32, dat769);
// Elementwise max with zero: negative values become 0.
// NOTE(review): presumably a fused ReLU activation - confirm against the graph.
pack151 = _mm512_max_ps(_mm512_setzero_ps(), pack151);
pack152 = _mm512_max_ps(_mm512_setzero_ps(), pack152);
pack153 = _mm512_max_ps(_mm512_setzero_ps(), pack153);
pack154 = _mm512_max_ps(_mm512_setzero_ps(), pack154);
pack155 = _mm512_max_ps(_mm512_setzero_ps(), pack155);
pack156 = _mm512_max_ps(_mm512_setzero_ps(), pack156);
pack157 = _mm512_max_ps(_mm512_setzero_ps(), pack157);
pack158 = _mm512_max_ps(_mm512_setzero_ps(), pack158);
pack159 = _mm512_max_ps(_mm512_setzero_ps(), pack159);
pack160 = _mm512_max_ps(_mm512_setzero_ps(), pack160);
// Masked stores: mask 1023 (0x3FF) writes lanes 0-9 only. Odd-numbered packs
// go to constant offsets 0, 448, ..., 1792 and even-numbered packs to the
// same offsets plus 50240.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack151);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack152);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack153);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack154);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack155);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack156);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack157);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack158);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack159);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k34+100480*r11+448*toH10+4*toW10+40*t19, 1023, pack160);
}
}
}
// Leave the function once j5 has reached last2; otherwise move to the next j5.
if (j5 >= last2) return;
++j5;
}
// Record that this branch has run, so rel5 is 15 for the code that follows.
rel5 = 15;
}
ptrdiff_t toH11 = base5+15;
ptrdiff_t toW11 = 110;
ptrdiff_t k35 = 16*w21;
for (; k35 != 16; ++k35) {
ptrdiff_t r12 = 0;
for (; r12 != 2; ++r12) {
ptrdiff_t t20 = 0;
__m512 sfRe201 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm201 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe205 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm205 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe202 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm202 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe206 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm206 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe203 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm203 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe207 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm207 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe204 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm204 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfRe208 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512 sfIm208 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k35+768*r12+256*t20);
__m512i ifft3169 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3170 = _mm512_permutexvar_ps(ifft3169, sfRe201);
__m512 ifft3261 = _mm512_permutexvar_ps(ifft3169, sfRe205);
__m512i ifft3171 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3172 = _mm512_permutexvar_ps(ifft3171, sfRe201);
__m512 ifft3262 = _mm512_permutexvar_ps(ifft3171, sfRe205);
__m512 ifft3173 = _mm512_permutexvar_ps(ifft3169, sfIm201);
__m512 ifft3263 = _mm512_permutexvar_ps(ifft3169, sfIm205);
__m512 ifft3174 = _mm512_permutexvar_ps(ifft3171, sfIm201);
__m512 ifft3264 = _mm512_permutexvar_ps(ifft3171, sfIm205);
__m512 ifft3175 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3176 = _mm512_mask_fmadd_ps(ifft3174, 65021, ifft3175, ifft3170);
__m512 ifft3265 = _mm512_mask_fmadd_ps(ifft3264, 65021, ifft3175, ifft3261);
__m512 ifft3177 = _mm512_mask_fnmadd_ps(ifft3173, 65021, ifft3175, ifft3172);
__m512 ifft3266 = _mm512_mask_fnmadd_ps(ifft3263, 65021, ifft3175, ifft3262);
__m512 ifft3178 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3179 = _mm512_fmadd_ps(ifft3176, ifft3178, _mm512_shuffle_ps(ifft3176, ifft3176, 177));
__m512 ifft3267 = _mm512_fmadd_ps(ifft3265, ifft3178, _mm512_shuffle_ps(ifft3265, ifft3265, 177));
__m512 ifft3180 = _mm512_fmadd_ps(ifft3177, ifft3178, _mm512_shuffle_ps(ifft3177, ifft3177, 177));
__m512 ifft3268 = _mm512_fmadd_ps(ifft3266, ifft3178, _mm512_shuffle_ps(ifft3266, ifft3266, 177));
__m512 ifft3181 = _mm512_fmadd_ps(sfRe202, ifft3178, _mm512_shuffle_ps(sfRe202, sfRe202, 177));
__m512 ifft3269 = _mm512_fmadd_ps(sfRe206, ifft3178, _mm512_shuffle_ps(sfRe206, sfRe206, 177));
__m512 ifft3182 = _mm512_fmadd_ps(sfIm202, ifft3178, _mm512_shuffle_ps(sfIm202, sfIm202, 177));
__m512 ifft3270 = _mm512_fmadd_ps(sfIm206, ifft3178, _mm512_shuffle_ps(sfIm206, sfIm206, 177));
__m512 ifft3183 = _mm512_fmadd_ps(sfRe203, ifft3178, _mm512_shuffle_ps(sfRe203, sfRe203, 177));
__m512 ifft3271 = _mm512_fmadd_ps(sfRe207, ifft3178, _mm512_shuffle_ps(sfRe207, sfRe207, 177));
__m512 ifft3184 = _mm512_fmadd_ps(sfIm203, ifft3178, _mm512_shuffle_ps(sfIm203, sfIm203, 177));
__m512 ifft3272 = _mm512_fmadd_ps(sfIm207, ifft3178, _mm512_shuffle_ps(sfIm207, sfIm207, 177));
__m512 ifft3185 = _mm512_fmadd_ps(sfRe204, ifft3178, _mm512_shuffle_ps(sfRe204, sfRe204, 177));
__m512 ifft3273 = _mm512_fmadd_ps(sfRe208, ifft3178, _mm512_shuffle_ps(sfRe208, sfRe208, 177));
__m512 ifft3186 = _mm512_fmadd_ps(sfIm204, ifft3178, _mm512_shuffle_ps(sfIm204, sfIm204, 177));
__m512 ifft3274 = _mm512_fmadd_ps(sfIm208, ifft3178, _mm512_shuffle_ps(sfIm208, sfIm208, 177));
__m512 ifft3187 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3188 = _mm512_mul_ps(ifft3179, ifft3187);
__m512 ifft3275 = _mm512_mul_ps(ifft3267, ifft3187);
__m512 ifft3189 = _mm512_mul_ps(ifft3180, ifft3187);
__m512 ifft3276 = _mm512_mul_ps(ifft3268, ifft3187);
__m512 ifft3190 = _mm512_mul_ps(ifft3181, ifft3187);
__m512 ifft3277 = _mm512_mul_ps(ifft3269, ifft3187);
__m512 ifft3191 = _mm512_mul_ps(ifft3182, ifft3187);
__m512 ifft3278 = _mm512_mul_ps(ifft3270, ifft3187);
__m512 ifft3192 = _mm512_mul_ps(ifft3183, ifft3187);
__m512 ifft3279 = _mm512_mul_ps(ifft3271, ifft3187);
__m512 ifft3193 = _mm512_mul_ps(ifft3184, ifft3187);
__m512 ifft3280 = _mm512_mul_ps(ifft3272, ifft3187);
__m512 ifft3194 = _mm512_mul_ps(ifft3185, ifft3187);
__m512 ifft3281 = _mm512_mul_ps(ifft3273, ifft3187);
__m512 ifft3195 = _mm512_mul_ps(ifft3186, ifft3187);
__m512 ifft3282 = _mm512_mul_ps(ifft3274, ifft3187);
__m512 ifft3196 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3197 = _mm512_fnmadd_ps(ifft3180, ifft3196, ifft3188);
__m512 ifft3283 = _mm512_fnmadd_ps(ifft3268, ifft3196, ifft3275);
__m512 ifft3198 = _mm512_fmadd_ps(ifft3179, ifft3196, ifft3189);
__m512 ifft3284 = _mm512_fmadd_ps(ifft3267, ifft3196, ifft3276);
__m512 ifft3199 = _mm512_fnmadd_ps(ifft3182, ifft3196, ifft3190);
__m512 ifft3285 = _mm512_fnmadd_ps(ifft3270, ifft3196, ifft3277);
__m512 ifft3200 = _mm512_fmadd_ps(ifft3181, ifft3196, ifft3191);
__m512 ifft3286 = _mm512_fmadd_ps(ifft3269, ifft3196, ifft3278);
__m512 ifft3201 = _mm512_fnmadd_ps(ifft3184, ifft3196, ifft3192);
__m512 ifft3287 = _mm512_fnmadd_ps(ifft3272, ifft3196, ifft3279);
__m512 ifft3202 = _mm512_fmadd_ps(ifft3183, ifft3196, ifft3193);
__m512 ifft3288 = _mm512_fmadd_ps(ifft3271, ifft3196, ifft3280);
__m512 ifft3203 = _mm512_fnmadd_ps(ifft3186, ifft3196, ifft3194);
__m512 ifft3289 = _mm512_fnmadd_ps(ifft3274, ifft3196, ifft3281);
__m512 ifft3204 = _mm512_fmadd_ps(ifft3185, ifft3196, ifft3195);
__m512 ifft3290 = _mm512_fmadd_ps(ifft3273, ifft3196, ifft3282);
__m512 ifft3205 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3206 = _mm512_fmadd_ps(ifft3197, ifft3205, _mm512_shuffle_ps(ifft3197, ifft3197, 78));
__m512 ifft3291 = _mm512_fmadd_ps(ifft3283, ifft3205, _mm512_shuffle_ps(ifft3283, ifft3283, 78));
__m512 ifft3207 = _mm512_fmadd_ps(ifft3198, ifft3205, _mm512_shuffle_ps(ifft3198, ifft3198, 78));
__m512 ifft3292 = _mm512_fmadd_ps(ifft3284, ifft3205, _mm512_shuffle_ps(ifft3284, ifft3284, 78));
__m512 ifft3208 = _mm512_fmadd_ps(ifft3199, ifft3205, _mm512_shuffle_ps(ifft3199, ifft3199, 78));
__m512 ifft3293 = _mm512_fmadd_ps(ifft3285, ifft3205, _mm512_shuffle_ps(ifft3285, ifft3285, 78));
__m512 ifft3209 = _mm512_fmadd_ps(ifft3200, ifft3205, _mm512_shuffle_ps(ifft3200, ifft3200, 78));
__m512 ifft3294 = _mm512_fmadd_ps(ifft3286, ifft3205, _mm512_shuffle_ps(ifft3286, ifft3286, 78));
__m512 ifft3210 = _mm512_fmadd_ps(ifft3201, ifft3205, _mm512_shuffle_ps(ifft3201, ifft3201, 78));
__m512 ifft3295 = _mm512_fmadd_ps(ifft3287, ifft3205, _mm512_shuffle_ps(ifft3287, ifft3287, 78));
__m512 ifft3211 = _mm512_fmadd_ps(ifft3202, ifft3205, _mm512_shuffle_ps(ifft3202, ifft3202, 78));
__m512 ifft3296 = _mm512_fmadd_ps(ifft3288, ifft3205, _mm512_shuffle_ps(ifft3288, ifft3288, 78));
__m512 ifft3212 = _mm512_fmadd_ps(ifft3203, ifft3205, _mm512_shuffle_ps(ifft3203, ifft3203, 78));
__m512 ifft3297 = _mm512_fmadd_ps(ifft3289, ifft3205, _mm512_shuffle_ps(ifft3289, ifft3289, 78));
__m512 ifft3213 = _mm512_fmadd_ps(ifft3204, ifft3205, _mm512_shuffle_ps(ifft3204, ifft3204, 78));
__m512 ifft3298 = _mm512_fmadd_ps(ifft3290, ifft3205, _mm512_shuffle_ps(ifft3290, ifft3290, 78));
__m512 ifft3214 = _mm512_mask_sub_ps(ifft3206, 49344, _mm512_setzero_ps(), ifft3207);
__m512 ifft3299 = _mm512_mask_sub_ps(ifft3291, 49344, _mm512_setzero_ps(), ifft3292);
__m512 ifft3215 = _mm512_mask_mov_ps(ifft3207, 49344, ifft3206);
__m512 ifft3300 = _mm512_mask_mov_ps(ifft3292, 49344, ifft3291);
__m512 ifft3216 = _mm512_mask_sub_ps(ifft3208, 49344, _mm512_setzero_ps(), ifft3209);
__m512 ifft3301 = _mm512_mask_sub_ps(ifft3293, 49344, _mm512_setzero_ps(), ifft3294);
__m512 ifft3217 = _mm512_mask_mov_ps(ifft3209, 49344, ifft3208);
__m512 ifft3302 = _mm512_mask_mov_ps(ifft3294, 49344, ifft3293);
__m512 ifft3218 = _mm512_mask_sub_ps(ifft3210, 49344, _mm512_setzero_ps(), ifft3211);
__m512 ifft3303 = _mm512_mask_sub_ps(ifft3295, 49344, _mm512_setzero_ps(), ifft3296);
__m512 ifft3219 = _mm512_mask_mov_ps(ifft3211, 49344, ifft3210);
__m512 ifft3304 = _mm512_mask_mov_ps(ifft3296, 49344, ifft3295);
__m512 ifft3220 = _mm512_mask_sub_ps(ifft3212, 49344, _mm512_setzero_ps(), ifft3213);
__m512 ifft3305 = _mm512_mask_sub_ps(ifft3297, 49344, _mm512_setzero_ps(), ifft3298);
__m512 ifft3221 = _mm512_mask_mov_ps(ifft3213, 49344, ifft3212);
__m512 ifft3306 = _mm512_mask_mov_ps(ifft3298, 49344, ifft3297);
__m512 ifft3222 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3223 = _mm512_fmadd_ps(ifft3214, ifft3222, _mm512_shuffle_f32x4(ifft3214, ifft3214, 177));
__m512 ifft3307 = _mm512_fmadd_ps(ifft3299, ifft3222, _mm512_shuffle_f32x4(ifft3299, ifft3299, 177));
__m512 ifft3224 = _mm512_fmadd_ps(ifft3215, ifft3222, _mm512_shuffle_f32x4(ifft3215, ifft3215, 177));
__m512 ifft3308 = _mm512_fmadd_ps(ifft3300, ifft3222, _mm512_shuffle_f32x4(ifft3300, ifft3300, 177));
__m512 ifft3225 = _mm512_fmadd_ps(ifft3216, ifft3222, _mm512_shuffle_f32x4(ifft3216, ifft3216, 177));
__m512 ifft3309 = _mm512_fmadd_ps(ifft3301, ifft3222, _mm512_shuffle_f32x4(ifft3301, ifft3301, 177));
__m512 ifft3226 = _mm512_fmadd_ps(ifft3217, ifft3222, _mm512_shuffle_f32x4(ifft3217, ifft3217, 177));
__m512 ifft3310 = _mm512_fmadd_ps(ifft3302, ifft3222, _mm512_shuffle_f32x4(ifft3302, ifft3302, 177));
__m512 ifft3227 = _mm512_fmadd_ps(ifft3218, ifft3222, _mm512_shuffle_f32x4(ifft3218, ifft3218, 177));
__m512 ifft3311 = _mm512_fmadd_ps(ifft3303, ifft3222, _mm512_shuffle_f32x4(ifft3303, ifft3303, 177));
__m512 ifft3228 = _mm512_fnmsub_ps(ifft3219, ifft3222, _mm512_shuffle_f32x4(ifft3219, ifft3219, 177));
__m512 ifft3312 = _mm512_fnmsub_ps(ifft3304, ifft3222, _mm512_shuffle_f32x4(ifft3304, ifft3304, 177));
__m512 ifft3229 = _mm512_fmadd_ps(ifft3220, ifft3222, _mm512_shuffle_f32x4(ifft3220, ifft3220, 177));
__m512 ifft3313 = _mm512_fmadd_ps(ifft3305, ifft3222, _mm512_shuffle_f32x4(ifft3305, ifft3305, 177));
__m512 ifft3230 = _mm512_fmadd_ps(ifft3221, ifft3222, _mm512_shuffle_f32x4(ifft3221, ifft3221, 177));
__m512 ifft3314 = _mm512_fmadd_ps(ifft3306, ifft3222, _mm512_shuffle_f32x4(ifft3306, ifft3306, 177));
__m512 ifft3231 = _mm512_add_ps(ifft3223, ifft3224);
__m512 ifft3315 = _mm512_add_ps(ifft3307, ifft3308);
__m512 ifft3232 = _mm512_sub_ps(ifft3223, ifft3224);
__m512 ifft3316 = _mm512_sub_ps(ifft3307, ifft3308);
__m512 ifft3233 = _mm512_sub_ps(ifft3225, ifft3229);
__m512 ifft3317 = _mm512_sub_ps(ifft3309, ifft3313);
__m512 ifft3234 = _mm512_add_ps(ifft3226, ifft3230);
__m512 ifft3318 = _mm512_add_ps(ifft3310, ifft3314);
__m512 ifft3235 = _mm512_add_ps(ifft3225, ifft3229);
__m512 ifft3319 = _mm512_add_ps(ifft3309, ifft3313);
__m512 ifft3236 = _mm512_sub_ps(ifft3226, ifft3230);
__m512 ifft3320 = _mm512_sub_ps(ifft3310, ifft3314);
__m512 ifft3237 = _mm512_mul_ps(ifft3227, _mm512_set1_ps(3.125e-02f));
__m512 ifft3321 = _mm512_mul_ps(ifft3311, _mm512_set1_ps(3.125e-02f));
__m512 ifft3238 = _mm512_mul_ps(ifft3228, _mm512_set1_ps(3.125e-02f));
__m512 ifft3322 = _mm512_mul_ps(ifft3312, _mm512_set1_ps(3.125e-02f));
__m512 ifft3239 = _mm512_fmadd_ps(ifft3231, _mm512_set1_ps(1.5625e-02f), ifft3237);
__m512 ifft3323 = _mm512_fmadd_ps(ifft3315, _mm512_set1_ps(1.5625e-02f), ifft3321);
__m512 ifft3240 = _mm512_fmsub_ps(ifft3231, _mm512_set1_ps(1.5625e-02f), ifft3237);
__m512 ifft3324 = _mm512_fmsub_ps(ifft3315, _mm512_set1_ps(1.5625e-02f), ifft3321);
__m512 ifft3241 = _mm512_fmadd_ps(ifft3232, _mm512_set1_ps(1.5625e-02f), ifft3238);
__m512 ifft3325 = _mm512_fmadd_ps(ifft3316, _mm512_set1_ps(1.5625e-02f), ifft3322);
__m512 ifft3242 = _mm512_fmsub_ps(ifft3232, _mm512_set1_ps(1.5625e-02f), ifft3238);
__m512 ifft3326 = _mm512_fmsub_ps(ifft3316, _mm512_set1_ps(1.5625e-02f), ifft3322);
__m512 ifft3243 = _mm512_add_ps(ifft3233, ifft3234);
__m512 ifft3327 = _mm512_add_ps(ifft3317, ifft3318);
__m512 ifft3244 = _mm512_sub_ps(ifft3233, ifft3234);
__m512 ifft3328 = _mm512_sub_ps(ifft3317, ifft3318);
__m512 ifft3245 = _mm512_fnmadd_ps(ifft3243, _mm512_set1_ps(7.0710677e-01f), ifft3235);
__m512 ifft3329 = _mm512_fnmadd_ps(ifft3327, _mm512_set1_ps(7.0710677e-01f), ifft3319);
__m512 ifft3246 = _mm512_fmadd_ps(ifft3243, _mm512_set1_ps(7.0710677e-01f), ifft3235);
__m512 ifft3330 = _mm512_fmadd_ps(ifft3327, _mm512_set1_ps(7.0710677e-01f), ifft3319);
__m512 ifft3247 = _mm512_fmadd_ps(ifft3244, _mm512_set1_ps(7.0710677e-01f), ifft3236);
__m512 ifft3331 = _mm512_fmadd_ps(ifft3328, _mm512_set1_ps(7.0710677e-01f), ifft3320);
__m512 ifft3248 = _mm512_fmsub_ps(ifft3244, _mm512_set1_ps(7.0710677e-01f), ifft3236);
__m512 ifft3332 = _mm512_fmsub_ps(ifft3328, _mm512_set1_ps(7.0710677e-01f), ifft3320);
__m512 ifft3249 = _mm512_add_ps(ifft3245, ifft3246);
__m512 ifft3333 = _mm512_add_ps(ifft3329, ifft3330);
__m512 ifft3250 = _mm512_sub_ps(ifft3245, ifft3246);
__m512 ifft3334 = _mm512_sub_ps(ifft3329, ifft3330);
__m512 ifft3251 = _mm512_add_ps(ifft3247, ifft3248);
__m512 ifft3335 = _mm512_add_ps(ifft3331, ifft3332);
__m512 ifft3252 = _mm512_sub_ps(ifft3247, ifft3248);
__m512 ifft3336 = _mm512_sub_ps(ifft3331, ifft3332);
__m512 ifft3253 = _mm512_fmadd_ps(ifft3249, _mm512_set1_ps(1.5625e-02f), ifft3239);
__m512 ifft3337 = _mm512_fmadd_ps(ifft3333, _mm512_set1_ps(1.5625e-02f), ifft3323);
__m512 ifft3254 = _mm512_fnmadd_ps(ifft3249, _mm512_set1_ps(1.5625e-02f), ifft3239);
__m512 ifft3338 = _mm512_fnmadd_ps(ifft3333, _mm512_set1_ps(1.5625e-02f), ifft3323);
__m512 ifft3255 = _mm512_fmadd_ps(ifft3251, _mm512_set1_ps(1.5625e-02f), ifft3241);
__m512 ifft3339 = _mm512_fmadd_ps(ifft3335, _mm512_set1_ps(1.5625e-02f), ifft3325);
__m512 ifft3256 = _mm512_fnmadd_ps(ifft3251, _mm512_set1_ps(1.5625e-02f), ifft3241);
__m512 ifft3340 = _mm512_fnmadd_ps(ifft3335, _mm512_set1_ps(1.5625e-02f), ifft3325);
__m512 ifft3257 = _mm512_fnmadd_ps(ifft3252, _mm512_set1_ps(1.5625e-02f), ifft3240);
__m512 ifft3341 = _mm512_fnmadd_ps(ifft3336, _mm512_set1_ps(1.5625e-02f), ifft3324);
__m512 ifft3258 = _mm512_fmadd_ps(ifft3252, _mm512_set1_ps(1.5625e-02f), ifft3240);
__m512 ifft3342 = _mm512_fmadd_ps(ifft3336, _mm512_set1_ps(1.5625e-02f), ifft3324);
__m512 ifft3259 = _mm512_fmadd_ps(ifft3250, _mm512_set1_ps(1.5625e-02f), ifft3242);
__m512 ifft3343 = _mm512_fmadd_ps(ifft3334, _mm512_set1_ps(1.5625e-02f), ifft3326);
__m512 ifft3260 = _mm512_fnmadd_ps(ifft3250, _mm512_set1_ps(1.5625e-02f), ifft3242);
__m512 ifft3344 = _mm512_fnmadd_ps(ifft3334, _mm512_set1_ps(1.5625e-02f), ifft3326);
__m512 dat770 = ifft3253;
__m512 dat775 = ifft3337;
__m512 dat771 = ifft3255;
__m512 dat776 = ifft3339;
__m512 dat772 = ifft3257;
__m512 dat777 = ifft3341;
__m512 dat773 = ifft3259;
__m512 dat778 = ifft3343;
__m512 dat774 = ifft3254;
__m512 dat779 = ifft3338;
(void)ifft3256;
(void)ifft3340;
(void)ifft3258;
(void)ifft3342;
(void)ifft3260;
(void)ifft3344;
dat770 = _mm512_max_ps(_mm512_setzero_ps(), dat770);
dat775 = _mm512_max_ps(_mm512_setzero_ps(), dat775);
dat771 = _mm512_max_ps(_mm512_setzero_ps(), dat771);
dat776 = _mm512_max_ps(_mm512_setzero_ps(), dat776);
dat772 = _mm512_max_ps(_mm512_setzero_ps(), dat772);
dat777 = _mm512_max_ps(_mm512_setzero_ps(), dat777);
dat773 = _mm512_max_ps(_mm512_setzero_ps(), dat773);
dat778 = _mm512_max_ps(_mm512_setzero_ps(), dat778);
dat774 = _mm512_max_ps(_mm512_setzero_ps(), dat774);
dat779 = _mm512_max_ps(_mm512_setzero_ps(), dat779);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat770);
_mm512_mask_storeu_ps(datPtr2+52008+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat770);
_mm512_mask_storeu_ps(datPtr2+1800+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat775);
_mm512_mask_storeu_ps(datPtr2+50208+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat775);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat771);
_mm512_mask_storeu_ps(datPtr2+52456+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat771);
_mm512_mask_storeu_ps(datPtr2+2248+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat776);
_mm512_mask_storeu_ps(datPtr2+50656+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat776);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat772);
_mm512_mask_storeu_ps(datPtr2+52904+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat772);
_mm512_mask_storeu_ps(datPtr2+2696+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat777);
_mm512_mask_storeu_ps(datPtr2+51104+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat777);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat773);
_mm512_mask_storeu_ps(datPtr2+53352+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat773);
_mm512_mask_storeu_ps(datPtr2+3144+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat778);
_mm512_mask_storeu_ps(datPtr2+51552+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat778);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 3, dat774);
_mm512_mask_storeu_ps(datPtr2+53800+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 7936, dat774);
_mm512_mask_storeu_ps(datPtr2+3592+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 31, dat779);
_mm512_mask_storeu_ps(datPtr2+52000+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+0*t20, 768, dat779);
// Two iterations (t21 = 0, 1). Each one loads sixteen 512-bit vectors from
// sfPtr3 (named as real/imaginary pairs), pushes them through a fixed chain of
// permute / butterfly / scale steps (the ifft* temporaries -- by their names an
// inverse FFT; the transform is defined by the generator and not spelled out
// here), keeps ten of the resulting vectors, packs them pairwise, clamps them
// at zero and stores the low ten lanes of every packed vector to datPtr2.
// Names taken from the enclosing scope: sfPtr3, datPtr2, i9, j5, k35, r12,
// toH11, toW11.
// Throughout, statements come in pairs: the first works on the 209..212
// inputs, the second performs the identical operation on the 213..216 inputs.
ptrdiff_t t21 = 0;
for (; t21 < 2; ++t21) {
// Four groups of loads whose bases are 2166784 apart (256, 2167040, 4333824,
// 6500608); inside a group the order is Re, Im, Re, Im at +0, +64, +128, +192.
// The t21 step is 256, i.e. one such group of four vectors.
// NOTE(review): these look like byte offsets (neighbouring loads are 64 apart,
// the size of one __m512) -- confirm against the declaration of sfPtr3.
__m512 sfRe209 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm209 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe213 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm213 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe210 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm210 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe214 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm214 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe211 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm211 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe215 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm215 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe212 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm212 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfRe216 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
__m512 sfIm216 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k35+768*r12+256*t21);
// Only the first Re/Im pair of each stream (209 and 213) gets this extra step:
// two lane permutations of Re and of Im are recombined with a +1/-1/0 vector.
// Mask 65021 (0xFDFD) has bits 1 and 9 clear, so in lanes 1 and 9 the masked
// FMA simply passes its first operand through.
__m512i ifft3345 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3346 = _mm512_permutexvar_ps(ifft3345, sfRe209);
__m512 ifft3437 = _mm512_permutexvar_ps(ifft3345, sfRe213);
__m512i ifft3347 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3348 = _mm512_permutexvar_ps(ifft3347, sfRe209);
__m512 ifft3438 = _mm512_permutexvar_ps(ifft3347, sfRe213);
__m512 ifft3349 = _mm512_permutexvar_ps(ifft3345, sfIm209);
__m512 ifft3439 = _mm512_permutexvar_ps(ifft3345, sfIm213);
__m512 ifft3350 = _mm512_permutexvar_ps(ifft3347, sfIm209);
__m512 ifft3440 = _mm512_permutexvar_ps(ifft3347, sfIm213);
__m512 ifft3351 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3352 = _mm512_mask_fmadd_ps(ifft3350, 65021, ifft3351, ifft3346);
__m512 ifft3441 = _mm512_mask_fmadd_ps(ifft3440, 65021, ifft3351, ifft3437);
__m512 ifft3353 = _mm512_mask_fnmadd_ps(ifft3349, 65021, ifft3351, ifft3348);
__m512 ifft3442 = _mm512_mask_fnmadd_ps(ifft3439, 65021, ifft3351, ifft3438);
// Butterfly over adjacent lanes: shuffle immediate 177 swaps the two floats of
// every pair, and the +1/-1 pattern makes even lanes x[2i]+x[2i+1] and odd
// lanes x[2i]-x[2i+1]. Applied to all eight Re/Im vectors of each stream.
__m512 ifft3354 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3355 = _mm512_fmadd_ps(ifft3352, ifft3354, _mm512_shuffle_ps(ifft3352, ifft3352, 177));
__m512 ifft3443 = _mm512_fmadd_ps(ifft3441, ifft3354, _mm512_shuffle_ps(ifft3441, ifft3441, 177));
__m512 ifft3356 = _mm512_fmadd_ps(ifft3353, ifft3354, _mm512_shuffle_ps(ifft3353, ifft3353, 177));
__m512 ifft3444 = _mm512_fmadd_ps(ifft3442, ifft3354, _mm512_shuffle_ps(ifft3442, ifft3442, 177));
__m512 ifft3357 = _mm512_fmadd_ps(sfRe210, ifft3354, _mm512_shuffle_ps(sfRe210, sfRe210, 177));
__m512 ifft3445 = _mm512_fmadd_ps(sfRe214, ifft3354, _mm512_shuffle_ps(sfRe214, sfRe214, 177));
__m512 ifft3358 = _mm512_fmadd_ps(sfIm210, ifft3354, _mm512_shuffle_ps(sfIm210, sfIm210, 177));
__m512 ifft3446 = _mm512_fmadd_ps(sfIm214, ifft3354, _mm512_shuffle_ps(sfIm214, sfIm214, 177));
__m512 ifft3359 = _mm512_fmadd_ps(sfRe211, ifft3354, _mm512_shuffle_ps(sfRe211, sfRe211, 177));
__m512 ifft3447 = _mm512_fmadd_ps(sfRe215, ifft3354, _mm512_shuffle_ps(sfRe215, sfRe215, 177));
__m512 ifft3360 = _mm512_fmadd_ps(sfIm211, ifft3354, _mm512_shuffle_ps(sfIm211, sfIm211, 177));
__m512 ifft3448 = _mm512_fmadd_ps(sfIm215, ifft3354, _mm512_shuffle_ps(sfIm215, sfIm215, 177));
__m512 ifft3361 = _mm512_fmadd_ps(sfRe212, ifft3354, _mm512_shuffle_ps(sfRe212, sfRe212, 177));
__m512 ifft3449 = _mm512_fmadd_ps(sfRe216, ifft3354, _mm512_shuffle_ps(sfRe216, sfRe216, 177));
__m512 ifft3362 = _mm512_fmadd_ps(sfIm212, ifft3354, _mm512_shuffle_ps(sfIm212, sfIm212, 177));
__m512 ifft3450 = _mm512_fmadd_ps(sfIm216, ifft3354, _mm512_shuffle_ps(sfIm216, sfIm216, 177));
// Per-lane constant multiply, part 1: every vector times ifft3363
// (entries 0, 1, +/-0.70710677).
__m512 ifft3363 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3364 = _mm512_mul_ps(ifft3355, ifft3363);
__m512 ifft3451 = _mm512_mul_ps(ifft3443, ifft3363);
__m512 ifft3365 = _mm512_mul_ps(ifft3356, ifft3363);
__m512 ifft3452 = _mm512_mul_ps(ifft3444, ifft3363);
__m512 ifft3366 = _mm512_mul_ps(ifft3357, ifft3363);
__m512 ifft3453 = _mm512_mul_ps(ifft3445, ifft3363);
__m512 ifft3367 = _mm512_mul_ps(ifft3358, ifft3363);
__m512 ifft3454 = _mm512_mul_ps(ifft3446, ifft3363);
__m512 ifft3368 = _mm512_mul_ps(ifft3359, ifft3363);
__m512 ifft3455 = _mm512_mul_ps(ifft3447, ifft3363);
__m512 ifft3369 = _mm512_mul_ps(ifft3360, ifft3363);
__m512 ifft3456 = _mm512_mul_ps(ifft3448, ifft3363);
__m512 ifft3370 = _mm512_mul_ps(ifft3361, ifft3363);
__m512 ifft3457 = _mm512_mul_ps(ifft3449, ifft3363);
__m512 ifft3371 = _mm512_mul_ps(ifft3362, ifft3363);
__m512 ifft3458 = _mm512_mul_ps(ifft3450, ifft3363);
// Part 2: for each (x, y) pair of vectors the results are
// x*ifft3363 - y*ifft3372 and y*ifft3363 + x*ifft3372.
// NOTE(review): this has the shape of a complex multiply by per-lane
// constants (twiddle factors) -- interpretation inferred, not stated in code.
__m512 ifft3372 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3373 = _mm512_fnmadd_ps(ifft3356, ifft3372, ifft3364);
__m512 ifft3459 = _mm512_fnmadd_ps(ifft3444, ifft3372, ifft3451);
__m512 ifft3374 = _mm512_fmadd_ps(ifft3355, ifft3372, ifft3365);
__m512 ifft3460 = _mm512_fmadd_ps(ifft3443, ifft3372, ifft3452);
__m512 ifft3375 = _mm512_fnmadd_ps(ifft3358, ifft3372, ifft3366);
__m512 ifft3461 = _mm512_fnmadd_ps(ifft3446, ifft3372, ifft3453);
__m512 ifft3376 = _mm512_fmadd_ps(ifft3357, ifft3372, ifft3367);
__m512 ifft3462 = _mm512_fmadd_ps(ifft3445, ifft3372, ifft3454);
__m512 ifft3377 = _mm512_fnmadd_ps(ifft3360, ifft3372, ifft3368);
__m512 ifft3463 = _mm512_fnmadd_ps(ifft3448, ifft3372, ifft3455);
__m512 ifft3378 = _mm512_fmadd_ps(ifft3359, ifft3372, ifft3369);
__m512 ifft3464 = _mm512_fmadd_ps(ifft3447, ifft3372, ifft3456);
__m512 ifft3379 = _mm512_fnmadd_ps(ifft3362, ifft3372, ifft3370);
__m512 ifft3465 = _mm512_fnmadd_ps(ifft3450, ifft3372, ifft3457);
__m512 ifft3380 = _mm512_fmadd_ps(ifft3361, ifft3372, ifft3371);
__m512 ifft3466 = _mm512_fmadd_ps(ifft3449, ifft3372, ifft3458);
// Butterfly at distance two lanes: shuffle immediate 78 swaps the two 64-bit
// halves of every 128-bit lane; signs are + for the lower half, - for the upper.
__m512 ifft3381 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3382 = _mm512_fmadd_ps(ifft3373, ifft3381, _mm512_shuffle_ps(ifft3373, ifft3373, 78));
__m512 ifft3467 = _mm512_fmadd_ps(ifft3459, ifft3381, _mm512_shuffle_ps(ifft3459, ifft3459, 78));
__m512 ifft3383 = _mm512_fmadd_ps(ifft3374, ifft3381, _mm512_shuffle_ps(ifft3374, ifft3374, 78));
__m512 ifft3468 = _mm512_fmadd_ps(ifft3460, ifft3381, _mm512_shuffle_ps(ifft3460, ifft3460, 78));
__m512 ifft3384 = _mm512_fmadd_ps(ifft3375, ifft3381, _mm512_shuffle_ps(ifft3375, ifft3375, 78));
__m512 ifft3469 = _mm512_fmadd_ps(ifft3461, ifft3381, _mm512_shuffle_ps(ifft3461, ifft3461, 78));
__m512 ifft3385 = _mm512_fmadd_ps(ifft3376, ifft3381, _mm512_shuffle_ps(ifft3376, ifft3376, 78));
__m512 ifft3470 = _mm512_fmadd_ps(ifft3462, ifft3381, _mm512_shuffle_ps(ifft3462, ifft3462, 78));
__m512 ifft3386 = _mm512_fmadd_ps(ifft3377, ifft3381, _mm512_shuffle_ps(ifft3377, ifft3377, 78));
__m512 ifft3471 = _mm512_fmadd_ps(ifft3463, ifft3381, _mm512_shuffle_ps(ifft3463, ifft3463, 78));
__m512 ifft3387 = _mm512_fmadd_ps(ifft3378, ifft3381, _mm512_shuffle_ps(ifft3378, ifft3378, 78));
__m512 ifft3472 = _mm512_fmadd_ps(ifft3464, ifft3381, _mm512_shuffle_ps(ifft3464, ifft3464, 78));
__m512 ifft3388 = _mm512_fmadd_ps(ifft3379, ifft3381, _mm512_shuffle_ps(ifft3379, ifft3379, 78));
__m512 ifft3473 = _mm512_fmadd_ps(ifft3465, ifft3381, _mm512_shuffle_ps(ifft3465, ifft3465, 78));
__m512 ifft3389 = _mm512_fmadd_ps(ifft3380, ifft3381, _mm512_shuffle_ps(ifft3380, ifft3380, 78));
__m512 ifft3474 = _mm512_fmadd_ps(ifft3466, ifft3381, _mm512_shuffle_ps(ifft3466, ifft3466, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// (x, y) pair becomes (-y, x); all other lanes are left as they were.
__m512 ifft3390 = _mm512_mask_sub_ps(ifft3382, 49344, _mm512_setzero_ps(), ifft3383);
__m512 ifft3475 = _mm512_mask_sub_ps(ifft3467, 49344, _mm512_setzero_ps(), ifft3468);
__m512 ifft3391 = _mm512_mask_mov_ps(ifft3383, 49344, ifft3382);
__m512 ifft3476 = _mm512_mask_mov_ps(ifft3468, 49344, ifft3467);
__m512 ifft3392 = _mm512_mask_sub_ps(ifft3384, 49344, _mm512_setzero_ps(), ifft3385);
__m512 ifft3477 = _mm512_mask_sub_ps(ifft3469, 49344, _mm512_setzero_ps(), ifft3470);
__m512 ifft3393 = _mm512_mask_mov_ps(ifft3385, 49344, ifft3384);
__m512 ifft3478 = _mm512_mask_mov_ps(ifft3470, 49344, ifft3469);
__m512 ifft3394 = _mm512_mask_sub_ps(ifft3386, 49344, _mm512_setzero_ps(), ifft3387);
__m512 ifft3479 = _mm512_mask_sub_ps(ifft3471, 49344, _mm512_setzero_ps(), ifft3472);
__m512 ifft3395 = _mm512_mask_mov_ps(ifft3387, 49344, ifft3386);
__m512 ifft3480 = _mm512_mask_mov_ps(ifft3472, 49344, ifft3471);
__m512 ifft3396 = _mm512_mask_sub_ps(ifft3388, 49344, _mm512_setzero_ps(), ifft3389);
__m512 ifft3481 = _mm512_mask_sub_ps(ifft3473, 49344, _mm512_setzero_ps(), ifft3474);
__m512 ifft3397 = _mm512_mask_mov_ps(ifft3389, 49344, ifft3388);
__m512 ifft3482 = _mm512_mask_mov_ps(ifft3474, 49344, ifft3473);
// Butterfly at distance four lanes: shuffle_f32x4 immediate 177 swaps
// neighbouring 128-bit lanes; signs are + for lanes 0-3 and 8-11, - otherwise.
// Note the one deviation: ifft3404 / ifft3488 use fnmsub (negated result),
// every other line uses fmadd.
__m512 ifft3398 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3399 = _mm512_fmadd_ps(ifft3390, ifft3398, _mm512_shuffle_f32x4(ifft3390, ifft3390, 177));
__m512 ifft3483 = _mm512_fmadd_ps(ifft3475, ifft3398, _mm512_shuffle_f32x4(ifft3475, ifft3475, 177));
__m512 ifft3400 = _mm512_fmadd_ps(ifft3391, ifft3398, _mm512_shuffle_f32x4(ifft3391, ifft3391, 177));
__m512 ifft3484 = _mm512_fmadd_ps(ifft3476, ifft3398, _mm512_shuffle_f32x4(ifft3476, ifft3476, 177));
__m512 ifft3401 = _mm512_fmadd_ps(ifft3392, ifft3398, _mm512_shuffle_f32x4(ifft3392, ifft3392, 177));
__m512 ifft3485 = _mm512_fmadd_ps(ifft3477, ifft3398, _mm512_shuffle_f32x4(ifft3477, ifft3477, 177));
__m512 ifft3402 = _mm512_fmadd_ps(ifft3393, ifft3398, _mm512_shuffle_f32x4(ifft3393, ifft3393, 177));
__m512 ifft3486 = _mm512_fmadd_ps(ifft3478, ifft3398, _mm512_shuffle_f32x4(ifft3478, ifft3478, 177));
__m512 ifft3403 = _mm512_fmadd_ps(ifft3394, ifft3398, _mm512_shuffle_f32x4(ifft3394, ifft3394, 177));
__m512 ifft3487 = _mm512_fmadd_ps(ifft3479, ifft3398, _mm512_shuffle_f32x4(ifft3479, ifft3479, 177));
__m512 ifft3404 = _mm512_fnmsub_ps(ifft3395, ifft3398, _mm512_shuffle_f32x4(ifft3395, ifft3395, 177));
__m512 ifft3488 = _mm512_fnmsub_ps(ifft3480, ifft3398, _mm512_shuffle_f32x4(ifft3480, ifft3480, 177));
__m512 ifft3405 = _mm512_fmadd_ps(ifft3396, ifft3398, _mm512_shuffle_f32x4(ifft3396, ifft3396, 177));
__m512 ifft3489 = _mm512_fmadd_ps(ifft3481, ifft3398, _mm512_shuffle_f32x4(ifft3481, ifft3481, 177));
__m512 ifft3406 = _mm512_fmadd_ps(ifft3397, ifft3398, _mm512_shuffle_f32x4(ifft3397, ifft3397, 177));
__m512 ifft3490 = _mm512_fmadd_ps(ifft3482, ifft3398, _mm512_shuffle_f32x4(ifft3482, ifft3482, 177));
// From here on the arithmetic is element-wise across the eight vectors of a
// stream (no more lane shuffles): sums and differences of vector pairs.
__m512 ifft3407 = _mm512_add_ps(ifft3399, ifft3400);
__m512 ifft3491 = _mm512_add_ps(ifft3483, ifft3484);
__m512 ifft3408 = _mm512_sub_ps(ifft3399, ifft3400);
__m512 ifft3492 = _mm512_sub_ps(ifft3483, ifft3484);
__m512 ifft3409 = _mm512_sub_ps(ifft3401, ifft3405);
__m512 ifft3493 = _mm512_sub_ps(ifft3485, ifft3489);
__m512 ifft3410 = _mm512_add_ps(ifft3402, ifft3406);
__m512 ifft3494 = _mm512_add_ps(ifft3486, ifft3490);
__m512 ifft3411 = _mm512_add_ps(ifft3401, ifft3405);
__m512 ifft3495 = _mm512_add_ps(ifft3485, ifft3489);
__m512 ifft3412 = _mm512_sub_ps(ifft3402, ifft3406);
__m512 ifft3496 = _mm512_sub_ps(ifft3486, ifft3490);
// Scaling is folded into the combine steps: 3.125e-02 is 1/32 and
// 1.5625e-02 is 1/64.
__m512 ifft3413 = _mm512_mul_ps(ifft3403, _mm512_set1_ps(3.125e-02f));
__m512 ifft3497 = _mm512_mul_ps(ifft3487, _mm512_set1_ps(3.125e-02f));
__m512 ifft3414 = _mm512_mul_ps(ifft3404, _mm512_set1_ps(3.125e-02f));
__m512 ifft3498 = _mm512_mul_ps(ifft3488, _mm512_set1_ps(3.125e-02f));
__m512 ifft3415 = _mm512_fmadd_ps(ifft3407, _mm512_set1_ps(1.5625e-02f), ifft3413);
__m512 ifft3499 = _mm512_fmadd_ps(ifft3491, _mm512_set1_ps(1.5625e-02f), ifft3497);
__m512 ifft3416 = _mm512_fmsub_ps(ifft3407, _mm512_set1_ps(1.5625e-02f), ifft3413);
__m512 ifft3500 = _mm512_fmsub_ps(ifft3491, _mm512_set1_ps(1.5625e-02f), ifft3497);
__m512 ifft3417 = _mm512_fmadd_ps(ifft3408, _mm512_set1_ps(1.5625e-02f), ifft3414);
__m512 ifft3501 = _mm512_fmadd_ps(ifft3492, _mm512_set1_ps(1.5625e-02f), ifft3498);
__m512 ifft3418 = _mm512_fmsub_ps(ifft3408, _mm512_set1_ps(1.5625e-02f), ifft3414);
__m512 ifft3502 = _mm512_fmsub_ps(ifft3492, _mm512_set1_ps(1.5625e-02f), ifft3498);
__m512 ifft3419 = _mm512_add_ps(ifft3409, ifft3410);
__m512 ifft3503 = _mm512_add_ps(ifft3493, ifft3494);
__m512 ifft3420 = _mm512_sub_ps(ifft3409, ifft3410);
__m512 ifft3504 = _mm512_sub_ps(ifft3493, ifft3494);
// Combine with the constant 0.70710677 (approximately 1/sqrt(2)).
__m512 ifft3421 = _mm512_fnmadd_ps(ifft3419, _mm512_set1_ps(7.0710677e-01f), ifft3411);
__m512 ifft3505 = _mm512_fnmadd_ps(ifft3503, _mm512_set1_ps(7.0710677e-01f), ifft3495);
__m512 ifft3422 = _mm512_fmadd_ps(ifft3419, _mm512_set1_ps(7.0710677e-01f), ifft3411);
__m512 ifft3506 = _mm512_fmadd_ps(ifft3503, _mm512_set1_ps(7.0710677e-01f), ifft3495);
__m512 ifft3423 = _mm512_fmadd_ps(ifft3420, _mm512_set1_ps(7.0710677e-01f), ifft3412);
__m512 ifft3507 = _mm512_fmadd_ps(ifft3504, _mm512_set1_ps(7.0710677e-01f), ifft3496);
__m512 ifft3424 = _mm512_fmsub_ps(ifft3420, _mm512_set1_ps(7.0710677e-01f), ifft3412);
__m512 ifft3508 = _mm512_fmsub_ps(ifft3504, _mm512_set1_ps(7.0710677e-01f), ifft3496);
__m512 ifft3425 = _mm512_add_ps(ifft3421, ifft3422);
__m512 ifft3509 = _mm512_add_ps(ifft3505, ifft3506);
__m512 ifft3426 = _mm512_sub_ps(ifft3421, ifft3422);
__m512 ifft3510 = _mm512_sub_ps(ifft3505, ifft3506);
__m512 ifft3427 = _mm512_add_ps(ifft3423, ifft3424);
__m512 ifft3511 = _mm512_add_ps(ifft3507, ifft3508);
__m512 ifft3428 = _mm512_sub_ps(ifft3423, ifft3424);
__m512 ifft3512 = _mm512_sub_ps(ifft3507, ifft3508);
// Final combine: eight result vectors per stream (ifft3429..ifft3436 and
// ifft3513..ifft3520).
__m512 ifft3429 = _mm512_fmadd_ps(ifft3425, _mm512_set1_ps(1.5625e-02f), ifft3415);
__m512 ifft3513 = _mm512_fmadd_ps(ifft3509, _mm512_set1_ps(1.5625e-02f), ifft3499);
__m512 ifft3430 = _mm512_fnmadd_ps(ifft3425, _mm512_set1_ps(1.5625e-02f), ifft3415);
__m512 ifft3514 = _mm512_fnmadd_ps(ifft3509, _mm512_set1_ps(1.5625e-02f), ifft3499);
__m512 ifft3431 = _mm512_fmadd_ps(ifft3427, _mm512_set1_ps(1.5625e-02f), ifft3417);
__m512 ifft3515 = _mm512_fmadd_ps(ifft3511, _mm512_set1_ps(1.5625e-02f), ifft3501);
__m512 ifft3432 = _mm512_fnmadd_ps(ifft3427, _mm512_set1_ps(1.5625e-02f), ifft3417);
__m512 ifft3516 = _mm512_fnmadd_ps(ifft3511, _mm512_set1_ps(1.5625e-02f), ifft3501);
__m512 ifft3433 = _mm512_fnmadd_ps(ifft3428, _mm512_set1_ps(1.5625e-02f), ifft3416);
__m512 ifft3517 = _mm512_fnmadd_ps(ifft3512, _mm512_set1_ps(1.5625e-02f), ifft3500);
__m512 ifft3434 = _mm512_fmadd_ps(ifft3428, _mm512_set1_ps(1.5625e-02f), ifft3416);
__m512 ifft3518 = _mm512_fmadd_ps(ifft3512, _mm512_set1_ps(1.5625e-02f), ifft3500);
__m512 ifft3435 = _mm512_fmadd_ps(ifft3426, _mm512_set1_ps(1.5625e-02f), ifft3418);
__m512 ifft3519 = _mm512_fmadd_ps(ifft3510, _mm512_set1_ps(1.5625e-02f), ifft3502);
__m512 ifft3436 = _mm512_fnmadd_ps(ifft3426, _mm512_set1_ps(1.5625e-02f), ifft3418);
__m512 ifft3520 = _mm512_fnmadd_ps(ifft3510, _mm512_set1_ps(1.5625e-02f), ifft3502);
// Five of the eight results per stream are kept (780..784 from the first
// stream, 785..789 from the second).
__m512 dat780 = ifft3429;
__m512 dat785 = ifft3513;
__m512 dat781 = ifft3431;
__m512 dat786 = ifft3515;
__m512 dat782 = ifft3433;
__m512 dat787 = ifft3517;
__m512 dat783 = ifft3435;
__m512 dat788 = ifft3519;
__m512 dat784 = ifft3430;
__m512 dat789 = ifft3514;
// The remaining three results per stream are not used; the void casts only
// mark them as intentionally unused.
(void)ifft3432;
(void)ifft3516;
(void)ifft3434;
(void)ifft3518;
(void)ifft3436;
(void)ifft3520;
// Pack one vector from each stream into ten contiguous lanes.
// pm33: lanes 0-4 <- first operand lanes 0-4, lanes 5-9 <- second operand lanes 0-4.
// pm34: lanes 0-4 <- second operand lanes 8-12, lanes 5-9 <- first operand lanes 8-12.
// (Index values >= 16 select from the second operand.) Lanes 10-15 of the
// packed vectors are never stored.
__m512i pm33 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack161 = _mm512_permutex2var_ps(dat780, pm33, dat785);
__m512i pm34 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack162 = _mm512_permutex2var_ps(dat780, pm34, dat785);
__m512 pack163 = _mm512_permutex2var_ps(dat781, pm33, dat786);
__m512 pack164 = _mm512_permutex2var_ps(dat781, pm34, dat786);
__m512 pack165 = _mm512_permutex2var_ps(dat782, pm33, dat787);
__m512 pack166 = _mm512_permutex2var_ps(dat782, pm34, dat787);
__m512 pack167 = _mm512_permutex2var_ps(dat783, pm33, dat788);
__m512 pack168 = _mm512_permutex2var_ps(dat783, pm34, dat788);
__m512 pack169 = _mm512_permutex2var_ps(dat784, pm33, dat789);
__m512 pack170 = _mm512_permutex2var_ps(dat784, pm34, dat789);
// Clamp negatives to zero (max with 0), i.e. a ReLU on the packed values.
pack161 = _mm512_max_ps(_mm512_setzero_ps(), pack161);
pack162 = _mm512_max_ps(_mm512_setzero_ps(), pack162);
pack163 = _mm512_max_ps(_mm512_setzero_ps(), pack163);
pack164 = _mm512_max_ps(_mm512_setzero_ps(), pack164);
pack165 = _mm512_max_ps(_mm512_setzero_ps(), pack165);
pack166 = _mm512_max_ps(_mm512_setzero_ps(), pack166);
pack167 = _mm512_max_ps(_mm512_setzero_ps(), pack167);
pack168 = _mm512_max_ps(_mm512_setzero_ps(), pack168);
pack169 = _mm512_max_ps(_mm512_setzero_ps(), pack169);
pack170 = _mm512_max_ps(_mm512_setzero_ps(), pack170);
// Mask 1023 (0x3FF) writes lanes 0-9 only. The pm33 packs go to bases
// 1820, 2268, ... (448 apart); the pm34 packs go to bases 52060, 52508, ...
// (also 448 apart). Each t21 iteration advances the destination by 40.
// NOTE(review): 40 matches ten 4-byte floats and 448*toH11 / 4*toW11 look like
// row / column byte strides -- presumably datPtr2 is a byte pointer; confirm
// against its declaration.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack161);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack162);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack163);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack164);
_mm512_mask_storeu_ps(datPtr2+2716+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack165);
_mm512_mask_storeu_ps(datPtr2+52956+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack166);
_mm512_mask_storeu_ps(datPtr2+3164+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack167);
_mm512_mask_storeu_ps(datPtr2+53404+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack168);
_mm512_mask_storeu_ps(datPtr2+3612+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack169);
_mm512_mask_storeu_ps(datPtr2+53852+3215360*i9+200960*k35+100480*r12+448*toH11+4*toW11+40*t21, 1023, pack170);
}
}
}
if (j5 >= last2) return;
++j5;
rel5 = 16;
}
if (rel5 < 19) {
if (rel5 < 18) {
ptrdiff_t toH12 = base5+20;
ptrdiff_t toW12 = -455+30*rel5;
ptrdiff_t jj16 = 17-rel5+j5;
for (; j5 <= jj16; toW12 += 30) {
ptrdiff_t k36 = 16*w21;
for (; k36 != 16; ++k36) {
ptrdiff_t r13 = 0;
for (; r13 != 2; ++r13) {
ptrdiff_t t22 = 0;
for (; t22 < 3; ++t22) {
__m512 sfRe217 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm217 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe221 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm221 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe218 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm218 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe222 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm222 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe219 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm219 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe223 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm223 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe220 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm220 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfRe224 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512 sfIm224 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k36+768*r13+256*t22);
__m512i ifft3521 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3522 = _mm512_permutexvar_ps(ifft3521, sfRe217);
__m512 ifft3613 = _mm512_permutexvar_ps(ifft3521, sfRe221);
__m512i ifft3523 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3524 = _mm512_permutexvar_ps(ifft3523, sfRe217);
__m512 ifft3614 = _mm512_permutexvar_ps(ifft3523, sfRe221);
__m512 ifft3525 = _mm512_permutexvar_ps(ifft3521, sfIm217);
__m512 ifft3615 = _mm512_permutexvar_ps(ifft3521, sfIm221);
__m512 ifft3526 = _mm512_permutexvar_ps(ifft3523, sfIm217);
__m512 ifft3616 = _mm512_permutexvar_ps(ifft3523, sfIm221);
__m512 ifft3527 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3528 = _mm512_mask_fmadd_ps(ifft3526, 65021, ifft3527, ifft3522);
__m512 ifft3617 = _mm512_mask_fmadd_ps(ifft3616, 65021, ifft3527, ifft3613);
__m512 ifft3529 = _mm512_mask_fnmadd_ps(ifft3525, 65021, ifft3527, ifft3524);
__m512 ifft3618 = _mm512_mask_fnmadd_ps(ifft3615, 65021, ifft3527, ifft3614);
__m512 ifft3530 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3531 = _mm512_fmadd_ps(ifft3528, ifft3530, _mm512_shuffle_ps(ifft3528, ifft3528, 177));
__m512 ifft3619 = _mm512_fmadd_ps(ifft3617, ifft3530, _mm512_shuffle_ps(ifft3617, ifft3617, 177));
__m512 ifft3532 = _mm512_fmadd_ps(ifft3529, ifft3530, _mm512_shuffle_ps(ifft3529, ifft3529, 177));
__m512 ifft3620 = _mm512_fmadd_ps(ifft3618, ifft3530, _mm512_shuffle_ps(ifft3618, ifft3618, 177));
__m512 ifft3533 = _mm512_fmadd_ps(sfRe218, ifft3530, _mm512_shuffle_ps(sfRe218, sfRe218, 177));
__m512 ifft3621 = _mm512_fmadd_ps(sfRe222, ifft3530, _mm512_shuffle_ps(sfRe222, sfRe222, 177));
__m512 ifft3534 = _mm512_fmadd_ps(sfIm218, ifft3530, _mm512_shuffle_ps(sfIm218, sfIm218, 177));
__m512 ifft3622 = _mm512_fmadd_ps(sfIm222, ifft3530, _mm512_shuffle_ps(sfIm222, sfIm222, 177));
__m512 ifft3535 = _mm512_fmadd_ps(sfRe219, ifft3530, _mm512_shuffle_ps(sfRe219, sfRe219, 177));
__m512 ifft3623 = _mm512_fmadd_ps(sfRe223, ifft3530, _mm512_shuffle_ps(sfRe223, sfRe223, 177));
__m512 ifft3536 = _mm512_fmadd_ps(sfIm219, ifft3530, _mm512_shuffle_ps(sfIm219, sfIm219, 177));
__m512 ifft3624 = _mm512_fmadd_ps(sfIm223, ifft3530, _mm512_shuffle_ps(sfIm223, sfIm223, 177));
__m512 ifft3537 = _mm512_fmadd_ps(sfRe220, ifft3530, _mm512_shuffle_ps(sfRe220, sfRe220, 177));
__m512 ifft3625 = _mm512_fmadd_ps(sfRe224, ifft3530, _mm512_shuffle_ps(sfRe224, sfRe224, 177));
__m512 ifft3538 = _mm512_fmadd_ps(sfIm220, ifft3530, _mm512_shuffle_ps(sfIm220, sfIm220, 177));
__m512 ifft3626 = _mm512_fmadd_ps(sfIm224, ifft3530, _mm512_shuffle_ps(sfIm224, sfIm224, 177));
__m512 ifft3539 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3540 = _mm512_mul_ps(ifft3531, ifft3539);
__m512 ifft3627 = _mm512_mul_ps(ifft3619, ifft3539);
__m512 ifft3541 = _mm512_mul_ps(ifft3532, ifft3539);
__m512 ifft3628 = _mm512_mul_ps(ifft3620, ifft3539);
__m512 ifft3542 = _mm512_mul_ps(ifft3533, ifft3539);
__m512 ifft3629 = _mm512_mul_ps(ifft3621, ifft3539);
__m512 ifft3543 = _mm512_mul_ps(ifft3534, ifft3539);
__m512 ifft3630 = _mm512_mul_ps(ifft3622, ifft3539);
__m512 ifft3544 = _mm512_mul_ps(ifft3535, ifft3539);
__m512 ifft3631 = _mm512_mul_ps(ifft3623, ifft3539);
__m512 ifft3545 = _mm512_mul_ps(ifft3536, ifft3539);
__m512 ifft3632 = _mm512_mul_ps(ifft3624, ifft3539);
__m512 ifft3546 = _mm512_mul_ps(ifft3537, ifft3539);
__m512 ifft3633 = _mm512_mul_ps(ifft3625, ifft3539);
__m512 ifft3547 = _mm512_mul_ps(ifft3538, ifft3539);
__m512 ifft3634 = _mm512_mul_ps(ifft3626, ifft3539);
__m512 ifft3548 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3549 = _mm512_fnmadd_ps(ifft3532, ifft3548, ifft3540);
__m512 ifft3635 = _mm512_fnmadd_ps(ifft3620, ifft3548, ifft3627);
__m512 ifft3550 = _mm512_fmadd_ps(ifft3531, ifft3548, ifft3541);
__m512 ifft3636 = _mm512_fmadd_ps(ifft3619, ifft3548, ifft3628);
__m512 ifft3551 = _mm512_fnmadd_ps(ifft3534, ifft3548, ifft3542);
__m512 ifft3637 = _mm512_fnmadd_ps(ifft3622, ifft3548, ifft3629);
__m512 ifft3552 = _mm512_fmadd_ps(ifft3533, ifft3548, ifft3543);
__m512 ifft3638 = _mm512_fmadd_ps(ifft3621, ifft3548, ifft3630);
__m512 ifft3553 = _mm512_fnmadd_ps(ifft3536, ifft3548, ifft3544);
__m512 ifft3639 = _mm512_fnmadd_ps(ifft3624, ifft3548, ifft3631);
__m512 ifft3554 = _mm512_fmadd_ps(ifft3535, ifft3548, ifft3545);
__m512 ifft3640 = _mm512_fmadd_ps(ifft3623, ifft3548, ifft3632);
__m512 ifft3555 = _mm512_fnmadd_ps(ifft3538, ifft3548, ifft3546);
__m512 ifft3641 = _mm512_fnmadd_ps(ifft3626, ifft3548, ifft3633);
__m512 ifft3556 = _mm512_fmadd_ps(ifft3537, ifft3548, ifft3547);
__m512 ifft3642 = _mm512_fmadd_ps(ifft3625, ifft3548, ifft3634);
__m512 ifft3557 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3558 = _mm512_fmadd_ps(ifft3549, ifft3557, _mm512_shuffle_ps(ifft3549, ifft3549, 78));
__m512 ifft3643 = _mm512_fmadd_ps(ifft3635, ifft3557, _mm512_shuffle_ps(ifft3635, ifft3635, 78));
__m512 ifft3559 = _mm512_fmadd_ps(ifft3550, ifft3557, _mm512_shuffle_ps(ifft3550, ifft3550, 78));
__m512 ifft3644 = _mm512_fmadd_ps(ifft3636, ifft3557, _mm512_shuffle_ps(ifft3636, ifft3636, 78));
__m512 ifft3560 = _mm512_fmadd_ps(ifft3551, ifft3557, _mm512_shuffle_ps(ifft3551, ifft3551, 78));
__m512 ifft3645 = _mm512_fmadd_ps(ifft3637, ifft3557, _mm512_shuffle_ps(ifft3637, ifft3637, 78));
__m512 ifft3561 = _mm512_fmadd_ps(ifft3552, ifft3557, _mm512_shuffle_ps(ifft3552, ifft3552, 78));
__m512 ifft3646 = _mm512_fmadd_ps(ifft3638, ifft3557, _mm512_shuffle_ps(ifft3638, ifft3638, 78));
__m512 ifft3562 = _mm512_fmadd_ps(ifft3553, ifft3557, _mm512_shuffle_ps(ifft3553, ifft3553, 78));
__m512 ifft3647 = _mm512_fmadd_ps(ifft3639, ifft3557, _mm512_shuffle_ps(ifft3639, ifft3639, 78));
__m512 ifft3563 = _mm512_fmadd_ps(ifft3554, ifft3557, _mm512_shuffle_ps(ifft3554, ifft3554, 78));
__m512 ifft3648 = _mm512_fmadd_ps(ifft3640, ifft3557, _mm512_shuffle_ps(ifft3640, ifft3640, 78));
__m512 ifft3564 = _mm512_fmadd_ps(ifft3555, ifft3557, _mm512_shuffle_ps(ifft3555, ifft3555, 78));
__m512 ifft3649 = _mm512_fmadd_ps(ifft3641, ifft3557, _mm512_shuffle_ps(ifft3641, ifft3641, 78));
__m512 ifft3565 = _mm512_fmadd_ps(ifft3556, ifft3557, _mm512_shuffle_ps(ifft3556, ifft3556, 78));
__m512 ifft3650 = _mm512_fmadd_ps(ifft3642, ifft3557, _mm512_shuffle_ps(ifft3642, ifft3642, 78));
__m512 ifft3566 = _mm512_mask_sub_ps(ifft3558, 49344, _mm512_setzero_ps(), ifft3559);
__m512 ifft3651 = _mm512_mask_sub_ps(ifft3643, 49344, _mm512_setzero_ps(), ifft3644);
__m512 ifft3567 = _mm512_mask_mov_ps(ifft3559, 49344, ifft3558);
__m512 ifft3652 = _mm512_mask_mov_ps(ifft3644, 49344, ifft3643);
__m512 ifft3568 = _mm512_mask_sub_ps(ifft3560, 49344, _mm512_setzero_ps(), ifft3561);
__m512 ifft3653 = _mm512_mask_sub_ps(ifft3645, 49344, _mm512_setzero_ps(), ifft3646);
__m512 ifft3569 = _mm512_mask_mov_ps(ifft3561, 49344, ifft3560);
__m512 ifft3654 = _mm512_mask_mov_ps(ifft3646, 49344, ifft3645);
__m512 ifft3570 = _mm512_mask_sub_ps(ifft3562, 49344, _mm512_setzero_ps(), ifft3563);
__m512 ifft3655 = _mm512_mask_sub_ps(ifft3647, 49344, _mm512_setzero_ps(), ifft3648);
__m512 ifft3571 = _mm512_mask_mov_ps(ifft3563, 49344, ifft3562);
__m512 ifft3656 = _mm512_mask_mov_ps(ifft3648, 49344, ifft3647);
__m512 ifft3572 = _mm512_mask_sub_ps(ifft3564, 49344, _mm512_setzero_ps(), ifft3565);
__m512 ifft3657 = _mm512_mask_sub_ps(ifft3649, 49344, _mm512_setzero_ps(), ifft3650);
__m512 ifft3573 = _mm512_mask_mov_ps(ifft3565, 49344, ifft3564);
__m512 ifft3658 = _mm512_mask_mov_ps(ifft3650, 49344, ifft3649);
__m512 ifft3574 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3575 = _mm512_fmadd_ps(ifft3566, ifft3574, _mm512_shuffle_f32x4(ifft3566, ifft3566, 177));
__m512 ifft3659 = _mm512_fmadd_ps(ifft3651, ifft3574, _mm512_shuffle_f32x4(ifft3651, ifft3651, 177));
__m512 ifft3576 = _mm512_fmadd_ps(ifft3567, ifft3574, _mm512_shuffle_f32x4(ifft3567, ifft3567, 177));
__m512 ifft3660 = _mm512_fmadd_ps(ifft3652, ifft3574, _mm512_shuffle_f32x4(ifft3652, ifft3652, 177));
__m512 ifft3577 = _mm512_fmadd_ps(ifft3568, ifft3574, _mm512_shuffle_f32x4(ifft3568, ifft3568, 177));
__m512 ifft3661 = _mm512_fmadd_ps(ifft3653, ifft3574, _mm512_shuffle_f32x4(ifft3653, ifft3653, 177));
__m512 ifft3578 = _mm512_fmadd_ps(ifft3569, ifft3574, _mm512_shuffle_f32x4(ifft3569, ifft3569, 177));
__m512 ifft3662 = _mm512_fmadd_ps(ifft3654, ifft3574, _mm512_shuffle_f32x4(ifft3654, ifft3654, 177));
__m512 ifft3579 = _mm512_fmadd_ps(ifft3570, ifft3574, _mm512_shuffle_f32x4(ifft3570, ifft3570, 177));
__m512 ifft3663 = _mm512_fmadd_ps(ifft3655, ifft3574, _mm512_shuffle_f32x4(ifft3655, ifft3655, 177));
__m512 ifft3580 = _mm512_fnmsub_ps(ifft3571, ifft3574, _mm512_shuffle_f32x4(ifft3571, ifft3571, 177));
__m512 ifft3664 = _mm512_fnmsub_ps(ifft3656, ifft3574, _mm512_shuffle_f32x4(ifft3656, ifft3656, 177));
__m512 ifft3581 = _mm512_fmadd_ps(ifft3572, ifft3574, _mm512_shuffle_f32x4(ifft3572, ifft3572, 177));
__m512 ifft3665 = _mm512_fmadd_ps(ifft3657, ifft3574, _mm512_shuffle_f32x4(ifft3657, ifft3657, 177));
__m512 ifft3582 = _mm512_fmadd_ps(ifft3573, ifft3574, _mm512_shuffle_f32x4(ifft3573, ifft3573, 177));
__m512 ifft3666 = _mm512_fmadd_ps(ifft3658, ifft3574, _mm512_shuffle_f32x4(ifft3658, ifft3658, 177));
__m512 ifft3583 = _mm512_add_ps(ifft3575, ifft3576);
__m512 ifft3667 = _mm512_add_ps(ifft3659, ifft3660);
__m512 ifft3584 = _mm512_sub_ps(ifft3575, ifft3576);
__m512 ifft3668 = _mm512_sub_ps(ifft3659, ifft3660);
__m512 ifft3585 = _mm512_sub_ps(ifft3577, ifft3581);
__m512 ifft3669 = _mm512_sub_ps(ifft3661, ifft3665);
__m512 ifft3586 = _mm512_add_ps(ifft3578, ifft3582);
__m512 ifft3670 = _mm512_add_ps(ifft3662, ifft3666);
__m512 ifft3587 = _mm512_add_ps(ifft3577, ifft3581);
__m512 ifft3671 = _mm512_add_ps(ifft3661, ifft3665);
__m512 ifft3588 = _mm512_sub_ps(ifft3578, ifft3582);
__m512 ifft3672 = _mm512_sub_ps(ifft3662, ifft3666);
__m512 ifft3589 = _mm512_mul_ps(ifft3579, _mm512_set1_ps(3.125e-02f));
__m512 ifft3673 = _mm512_mul_ps(ifft3663, _mm512_set1_ps(3.125e-02f));
__m512 ifft3590 = _mm512_mul_ps(ifft3580, _mm512_set1_ps(3.125e-02f));
__m512 ifft3674 = _mm512_mul_ps(ifft3664, _mm512_set1_ps(3.125e-02f));
__m512 ifft3591 = _mm512_fmadd_ps(ifft3583, _mm512_set1_ps(1.5625e-02f), ifft3589);
__m512 ifft3675 = _mm512_fmadd_ps(ifft3667, _mm512_set1_ps(1.5625e-02f), ifft3673);
__m512 ifft3592 = _mm512_fmsub_ps(ifft3583, _mm512_set1_ps(1.5625e-02f), ifft3589);
__m512 ifft3676 = _mm512_fmsub_ps(ifft3667, _mm512_set1_ps(1.5625e-02f), ifft3673);
__m512 ifft3593 = _mm512_fmadd_ps(ifft3584, _mm512_set1_ps(1.5625e-02f), ifft3590);
__m512 ifft3677 = _mm512_fmadd_ps(ifft3668, _mm512_set1_ps(1.5625e-02f), ifft3674);
__m512 ifft3594 = _mm512_fmsub_ps(ifft3584, _mm512_set1_ps(1.5625e-02f), ifft3590);
__m512 ifft3678 = _mm512_fmsub_ps(ifft3668, _mm512_set1_ps(1.5625e-02f), ifft3674);
__m512 ifft3595 = _mm512_add_ps(ifft3585, ifft3586);
__m512 ifft3679 = _mm512_add_ps(ifft3669, ifft3670);
__m512 ifft3596 = _mm512_sub_ps(ifft3585, ifft3586);
__m512 ifft3680 = _mm512_sub_ps(ifft3669, ifft3670);
__m512 ifft3597 = _mm512_fnmadd_ps(ifft3595, _mm512_set1_ps(7.0710677e-01f), ifft3587);
__m512 ifft3681 = _mm512_fnmadd_ps(ifft3679, _mm512_set1_ps(7.0710677e-01f), ifft3671);
__m512 ifft3598 = _mm512_fmadd_ps(ifft3595, _mm512_set1_ps(7.0710677e-01f), ifft3587);
__m512 ifft3682 = _mm512_fmadd_ps(ifft3679, _mm512_set1_ps(7.0710677e-01f), ifft3671);
__m512 ifft3599 = _mm512_fmadd_ps(ifft3596, _mm512_set1_ps(7.0710677e-01f), ifft3588);
__m512 ifft3683 = _mm512_fmadd_ps(ifft3680, _mm512_set1_ps(7.0710677e-01f), ifft3672);
__m512 ifft3600 = _mm512_fmsub_ps(ifft3596, _mm512_set1_ps(7.0710677e-01f), ifft3588);
__m512 ifft3684 = _mm512_fmsub_ps(ifft3680, _mm512_set1_ps(7.0710677e-01f), ifft3672);
__m512 ifft3601 = _mm512_add_ps(ifft3597, ifft3598);
__m512 ifft3685 = _mm512_add_ps(ifft3681, ifft3682);
__m512 ifft3602 = _mm512_sub_ps(ifft3597, ifft3598);
__m512 ifft3686 = _mm512_sub_ps(ifft3681, ifft3682);
__m512 ifft3603 = _mm512_add_ps(ifft3599, ifft3600);
__m512 ifft3687 = _mm512_add_ps(ifft3683, ifft3684);
__m512 ifft3604 = _mm512_sub_ps(ifft3599, ifft3600);
__m512 ifft3688 = _mm512_sub_ps(ifft3683, ifft3684);
__m512 ifft3605 = _mm512_fmadd_ps(ifft3601, _mm512_set1_ps(1.5625e-02f), ifft3591);
__m512 ifft3689 = _mm512_fmadd_ps(ifft3685, _mm512_set1_ps(1.5625e-02f), ifft3675);
__m512 ifft3606 = _mm512_fnmadd_ps(ifft3601, _mm512_set1_ps(1.5625e-02f), ifft3591);
__m512 ifft3690 = _mm512_fnmadd_ps(ifft3685, _mm512_set1_ps(1.5625e-02f), ifft3675);
__m512 ifft3607 = _mm512_fmadd_ps(ifft3603, _mm512_set1_ps(1.5625e-02f), ifft3593);
__m512 ifft3691 = _mm512_fmadd_ps(ifft3687, _mm512_set1_ps(1.5625e-02f), ifft3677);
__m512 ifft3608 = _mm512_fnmadd_ps(ifft3603, _mm512_set1_ps(1.5625e-02f), ifft3593);
__m512 ifft3692 = _mm512_fnmadd_ps(ifft3687, _mm512_set1_ps(1.5625e-02f), ifft3677);
__m512 ifft3609 = _mm512_fnmadd_ps(ifft3604, _mm512_set1_ps(1.5625e-02f), ifft3592);
__m512 ifft3693 = _mm512_fnmadd_ps(ifft3688, _mm512_set1_ps(1.5625e-02f), ifft3676);
__m512 ifft3610 = _mm512_fmadd_ps(ifft3604, _mm512_set1_ps(1.5625e-02f), ifft3592);
__m512 ifft3694 = _mm512_fmadd_ps(ifft3688, _mm512_set1_ps(1.5625e-02f), ifft3676);
__m512 ifft3611 = _mm512_fmadd_ps(ifft3602, _mm512_set1_ps(1.5625e-02f), ifft3594);
__m512 ifft3695 = _mm512_fmadd_ps(ifft3686, _mm512_set1_ps(1.5625e-02f), ifft3678);
__m512 ifft3612 = _mm512_fnmadd_ps(ifft3602, _mm512_set1_ps(1.5625e-02f), ifft3594);
__m512 ifft3696 = _mm512_fnmadd_ps(ifft3686, _mm512_set1_ps(1.5625e-02f), ifft3678);
__m512 dat790 = ifft3605;
__m512 dat795 = ifft3689;
__m512 dat791 = ifft3607;
__m512 dat796 = ifft3691;
__m512 dat792 = ifft3609;
__m512 dat797 = ifft3693;
__m512 dat793 = ifft3611;
__m512 dat798 = ifft3695;
__m512 dat794 = ifft3606;
__m512 dat799 = ifft3690;
(void)ifft3608;
(void)ifft3692;
(void)ifft3610;
(void)ifft3694;
(void)ifft3612;
(void)ifft3696;
__m512i pm35 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack171 = _mm512_permutex2var_ps(dat790, pm35, dat795);
__m512i pm36 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack172 = _mm512_permutex2var_ps(dat790, pm36, dat795);
__m512 pack173 = _mm512_permutex2var_ps(dat791, pm35, dat796);
__m512 pack174 = _mm512_permutex2var_ps(dat791, pm36, dat796);
__m512 pack175 = _mm512_permutex2var_ps(dat792, pm35, dat797);
__m512 pack176 = _mm512_permutex2var_ps(dat792, pm36, dat797);
__m512 pack177 = _mm512_permutex2var_ps(dat793, pm35, dat798);
__m512 pack178 = _mm512_permutex2var_ps(dat793, pm36, dat798);
__m512 pack179 = _mm512_permutex2var_ps(dat794, pm35, dat799);
__m512 pack180 = _mm512_permutex2var_ps(dat794, pm36, dat799);
pack171 = _mm512_max_ps(_mm512_setzero_ps(), pack171);
pack172 = _mm512_max_ps(_mm512_setzero_ps(), pack172);
pack173 = _mm512_max_ps(_mm512_setzero_ps(), pack173);
pack174 = _mm512_max_ps(_mm512_setzero_ps(), pack174);
pack175 = _mm512_max_ps(_mm512_setzero_ps(), pack175);
pack176 = _mm512_max_ps(_mm512_setzero_ps(), pack176);
pack177 = _mm512_max_ps(_mm512_setzero_ps(), pack177);
pack178 = _mm512_max_ps(_mm512_setzero_ps(), pack178);
pack179 = _mm512_max_ps(_mm512_setzero_ps(), pack179);
pack180 = _mm512_max_ps(_mm512_setzero_ps(), pack180);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack171);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack172);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack173);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack174);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack175);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack176);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack177);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack178);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack179);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k36+100480*r13+448*toH12+4*toW12+40*t22, 1023, pack180);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel5 = 18;
}
ptrdiff_t toH13 = base5+20;
ptrdiff_t toW13 = 85;
ptrdiff_t k37 = 16*w21;
for (; k37 != 16; ++k37) {
ptrdiff_t r14 = 0;
for (; r14 != 2; ++r14) {
ptrdiff_t t23 = 0;
// Loop over t23 while it is below 2 (t23 is initialised by the enclosing code).
// Each iteration loads 16 vectors from sfPtr3, runs the ifft* computation on them,
// clamps the results at zero and writes ten masked 10-float stores through datPtr2.
// NOTE(review): the "ifft"/"sfRe"/"sfIm" names suggest an inverse FFT over split
// real/imaginary data -- inferred from naming only, confirm against the generator.
for (; t23 < 2; ++t23) {
// Sixteen unaligned 16-float loads. They form four groups (additive offsets 0,
// 2166784, 4333568, 6500352); inside a group the four loads sit 64 apart and are
// named Re, Im, Re, Im. t23 advances the address by 256 per iteration.
__m512 sfRe225 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm225 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe229 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm229 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe226 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm226 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe230 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm230 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe227 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm227 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe231 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm231 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe228 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm228 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfRe232 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
__m512 sfIm232 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k37+768*r14+256*t23);
// Only the first group (sfRe/sfIm 225 and 229) is lane-permuted, using two fixed
// index tables, and then combined with a masked multiply-add.
__m512i ifft3697 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3698 = _mm512_permutexvar_ps(ifft3697, sfRe225);
__m512 ifft3789 = _mm512_permutexvar_ps(ifft3697, sfRe229);
__m512i ifft3699 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3700 = _mm512_permutexvar_ps(ifft3699, sfRe225);
__m512 ifft3790 = _mm512_permutexvar_ps(ifft3699, sfRe229);
__m512 ifft3701 = _mm512_permutexvar_ps(ifft3697, sfIm225);
__m512 ifft3791 = _mm512_permutexvar_ps(ifft3697, sfIm229);
__m512 ifft3702 = _mm512_permutexvar_ps(ifft3699, sfIm225);
__m512 ifft3792 = _mm512_permutexvar_ps(ifft3699, sfIm229);
__m512 ifft3703 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 is 0xFDFD: every lane except 1 and 9. Those two lanes pass the first
// operand through unchanged; all other lanes get a*b+c (fmadd) or -(a*b)+c (fnmadd).
__m512 ifft3704 = _mm512_mask_fmadd_ps(ifft3702, 65021, ifft3703, ifft3698);
__m512 ifft3793 = _mm512_mask_fmadd_ps(ifft3792, 65021, ifft3703, ifft3789);
__m512 ifft3705 = _mm512_mask_fnmadd_ps(ifft3701, 65021, ifft3703, ifft3700);
__m512 ifft3794 = _mm512_mask_fnmadd_ps(ifft3791, 65021, ifft3703, ifft3790);
// Distance-1 butterfly: shuffle immediate 177 swaps neighbouring elements, and the
// +1/-1 pattern makes even lanes x[i]+x[i+1] and odd lanes x[i-1]-x[i].
__m512 ifft3706 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3707 = _mm512_fmadd_ps(ifft3704, ifft3706, _mm512_shuffle_ps(ifft3704, ifft3704, 177));
__m512 ifft3795 = _mm512_fmadd_ps(ifft3793, ifft3706, _mm512_shuffle_ps(ifft3793, ifft3793, 177));
__m512 ifft3708 = _mm512_fmadd_ps(ifft3705, ifft3706, _mm512_shuffle_ps(ifft3705, ifft3705, 177));
__m512 ifft3796 = _mm512_fmadd_ps(ifft3794, ifft3706, _mm512_shuffle_ps(ifft3794, ifft3794, 177));
__m512 ifft3709 = _mm512_fmadd_ps(sfRe226, ifft3706, _mm512_shuffle_ps(sfRe226, sfRe226, 177));
__m512 ifft3797 = _mm512_fmadd_ps(sfRe230, ifft3706, _mm512_shuffle_ps(sfRe230, sfRe230, 177));
__m512 ifft3710 = _mm512_fmadd_ps(sfIm226, ifft3706, _mm512_shuffle_ps(sfIm226, sfIm226, 177));
__m512 ifft3798 = _mm512_fmadd_ps(sfIm230, ifft3706, _mm512_shuffle_ps(sfIm230, sfIm230, 177));
__m512 ifft3711 = _mm512_fmadd_ps(sfRe227, ifft3706, _mm512_shuffle_ps(sfRe227, sfRe227, 177));
__m512 ifft3799 = _mm512_fmadd_ps(sfRe231, ifft3706, _mm512_shuffle_ps(sfRe231, sfRe231, 177));
__m512 ifft3712 = _mm512_fmadd_ps(sfIm227, ifft3706, _mm512_shuffle_ps(sfIm227, sfIm227, 177));
__m512 ifft3800 = _mm512_fmadd_ps(sfIm231, ifft3706, _mm512_shuffle_ps(sfIm231, sfIm231, 177));
__m512 ifft3713 = _mm512_fmadd_ps(sfRe228, ifft3706, _mm512_shuffle_ps(sfRe228, sfRe228, 177));
__m512 ifft3801 = _mm512_fmadd_ps(sfRe232, ifft3706, _mm512_shuffle_ps(sfRe232, sfRe232, 177));
__m512 ifft3714 = _mm512_fmadd_ps(sfIm228, ifft3706, _mm512_shuffle_ps(sfIm228, sfIm228, 177));
__m512 ifft3802 = _mm512_fmadd_ps(sfIm232, ifft3706, _mm512_shuffle_ps(sfIm232, sfIm232, 177));
// For each (a, b) pair of vectors this computes a*c - b*s and b*c + a*s lane-wise,
// with c = ifft3715 and s = ifft3724 (7.0710677e-01f is approximately sqrt(1/2)).
// NOTE(review): this has the shape of a complex multiply by fixed per-lane twiddle
// factors -- confirm.
__m512 ifft3715 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3716 = _mm512_mul_ps(ifft3707, ifft3715);
__m512 ifft3803 = _mm512_mul_ps(ifft3795, ifft3715);
__m512 ifft3717 = _mm512_mul_ps(ifft3708, ifft3715);
__m512 ifft3804 = _mm512_mul_ps(ifft3796, ifft3715);
__m512 ifft3718 = _mm512_mul_ps(ifft3709, ifft3715);
__m512 ifft3805 = _mm512_mul_ps(ifft3797, ifft3715);
__m512 ifft3719 = _mm512_mul_ps(ifft3710, ifft3715);
__m512 ifft3806 = _mm512_mul_ps(ifft3798, ifft3715);
__m512 ifft3720 = _mm512_mul_ps(ifft3711, ifft3715);
__m512 ifft3807 = _mm512_mul_ps(ifft3799, ifft3715);
__m512 ifft3721 = _mm512_mul_ps(ifft3712, ifft3715);
__m512 ifft3808 = _mm512_mul_ps(ifft3800, ifft3715);
__m512 ifft3722 = _mm512_mul_ps(ifft3713, ifft3715);
__m512 ifft3809 = _mm512_mul_ps(ifft3801, ifft3715);
__m512 ifft3723 = _mm512_mul_ps(ifft3714, ifft3715);
__m512 ifft3810 = _mm512_mul_ps(ifft3802, ifft3715);
__m512 ifft3724 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3725 = _mm512_fnmadd_ps(ifft3708, ifft3724, ifft3716);
__m512 ifft3811 = _mm512_fnmadd_ps(ifft3796, ifft3724, ifft3803);
__m512 ifft3726 = _mm512_fmadd_ps(ifft3707, ifft3724, ifft3717);
__m512 ifft3812 = _mm512_fmadd_ps(ifft3795, ifft3724, ifft3804);
__m512 ifft3727 = _mm512_fnmadd_ps(ifft3710, ifft3724, ifft3718);
__m512 ifft3813 = _mm512_fnmadd_ps(ifft3798, ifft3724, ifft3805);
__m512 ifft3728 = _mm512_fmadd_ps(ifft3709, ifft3724, ifft3719);
__m512 ifft3814 = _mm512_fmadd_ps(ifft3797, ifft3724, ifft3806);
__m512 ifft3729 = _mm512_fnmadd_ps(ifft3712, ifft3724, ifft3720);
__m512 ifft3815 = _mm512_fnmadd_ps(ifft3800, ifft3724, ifft3807);
__m512 ifft3730 = _mm512_fmadd_ps(ifft3711, ifft3724, ifft3721);
__m512 ifft3816 = _mm512_fmadd_ps(ifft3799, ifft3724, ifft3808);
__m512 ifft3731 = _mm512_fnmadd_ps(ifft3714, ifft3724, ifft3722);
__m512 ifft3817 = _mm512_fnmadd_ps(ifft3802, ifft3724, ifft3809);
__m512 ifft3732 = _mm512_fmadd_ps(ifft3713, ifft3724, ifft3723);
__m512 ifft3818 = _mm512_fmadd_ps(ifft3801, ifft3724, ifft3810);
// Distance-2 butterfly: shuffle immediate 78 swaps the two 64-bit halves of every
// 128-bit lane; with signs (+,+,-,-) the low half becomes a sum, the high half a
// difference.
__m512 ifft3733 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3734 = _mm512_fmadd_ps(ifft3725, ifft3733, _mm512_shuffle_ps(ifft3725, ifft3725, 78));
__m512 ifft3819 = _mm512_fmadd_ps(ifft3811, ifft3733, _mm512_shuffle_ps(ifft3811, ifft3811, 78));
__m512 ifft3735 = _mm512_fmadd_ps(ifft3726, ifft3733, _mm512_shuffle_ps(ifft3726, ifft3726, 78));
__m512 ifft3820 = _mm512_fmadd_ps(ifft3812, ifft3733, _mm512_shuffle_ps(ifft3812, ifft3812, 78));
__m512 ifft3736 = _mm512_fmadd_ps(ifft3727, ifft3733, _mm512_shuffle_ps(ifft3727, ifft3727, 78));
__m512 ifft3821 = _mm512_fmadd_ps(ifft3813, ifft3733, _mm512_shuffle_ps(ifft3813, ifft3813, 78));
__m512 ifft3737 = _mm512_fmadd_ps(ifft3728, ifft3733, _mm512_shuffle_ps(ifft3728, ifft3728, 78));
__m512 ifft3822 = _mm512_fmadd_ps(ifft3814, ifft3733, _mm512_shuffle_ps(ifft3814, ifft3814, 78));
__m512 ifft3738 = _mm512_fmadd_ps(ifft3729, ifft3733, _mm512_shuffle_ps(ifft3729, ifft3729, 78));
__m512 ifft3823 = _mm512_fmadd_ps(ifft3815, ifft3733, _mm512_shuffle_ps(ifft3815, ifft3815, 78));
__m512 ifft3739 = _mm512_fmadd_ps(ifft3730, ifft3733, _mm512_shuffle_ps(ifft3730, ifft3730, 78));
__m512 ifft3824 = _mm512_fmadd_ps(ifft3816, ifft3733, _mm512_shuffle_ps(ifft3816, ifft3816, 78));
__m512 ifft3740 = _mm512_fmadd_ps(ifft3731, ifft3733, _mm512_shuffle_ps(ifft3731, ifft3731, 78));
__m512 ifft3825 = _mm512_fmadd_ps(ifft3817, ifft3733, _mm512_shuffle_ps(ifft3817, ifft3817, 78));
__m512 ifft3741 = _mm512_fmadd_ps(ifft3732, ifft3733, _mm512_shuffle_ps(ifft3732, ifft3732, 78));
__m512 ifft3826 = _mm512_fmadd_ps(ifft3818, ifft3733, _mm512_shuffle_ps(ifft3818, ifft3818, 78));
// Mask 49344 is 0xC0C0, i.e. lanes 6, 7, 14 and 15. In those lanes the pair (a, b)
// becomes (-b, a); every other lane is left as it was.
__m512 ifft3742 = _mm512_mask_sub_ps(ifft3734, 49344, _mm512_setzero_ps(), ifft3735);
__m512 ifft3827 = _mm512_mask_sub_ps(ifft3819, 49344, _mm512_setzero_ps(), ifft3820);
__m512 ifft3743 = _mm512_mask_mov_ps(ifft3735, 49344, ifft3734);
__m512 ifft3828 = _mm512_mask_mov_ps(ifft3820, 49344, ifft3819);
__m512 ifft3744 = _mm512_mask_sub_ps(ifft3736, 49344, _mm512_setzero_ps(), ifft3737);
__m512 ifft3829 = _mm512_mask_sub_ps(ifft3821, 49344, _mm512_setzero_ps(), ifft3822);
__m512 ifft3745 = _mm512_mask_mov_ps(ifft3737, 49344, ifft3736);
__m512 ifft3830 = _mm512_mask_mov_ps(ifft3822, 49344, ifft3821);
__m512 ifft3746 = _mm512_mask_sub_ps(ifft3738, 49344, _mm512_setzero_ps(), ifft3739);
__m512 ifft3831 = _mm512_mask_sub_ps(ifft3823, 49344, _mm512_setzero_ps(), ifft3824);
__m512 ifft3747 = _mm512_mask_mov_ps(ifft3739, 49344, ifft3738);
__m512 ifft3832 = _mm512_mask_mov_ps(ifft3824, 49344, ifft3823);
__m512 ifft3748 = _mm512_mask_sub_ps(ifft3740, 49344, _mm512_setzero_ps(), ifft3741);
__m512 ifft3833 = _mm512_mask_sub_ps(ifft3825, 49344, _mm512_setzero_ps(), ifft3826);
__m512 ifft3749 = _mm512_mask_mov_ps(ifft3741, 49344, ifft3740);
__m512 ifft3834 = _mm512_mask_mov_ps(ifft3826, 49344, ifft3825);
// Distance-4 butterfly: shuffle_f32x4 immediate 177 swaps neighbouring 128-bit
// lanes; signs are + for lanes 0-3 and 8-11, - for lanes 4-7 and 12-15.
// ifft3756 and ifft3840 use fnmsub instead, which yields the negated result.
__m512 ifft3750 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3751 = _mm512_fmadd_ps(ifft3742, ifft3750, _mm512_shuffle_f32x4(ifft3742, ifft3742, 177));
__m512 ifft3835 = _mm512_fmadd_ps(ifft3827, ifft3750, _mm512_shuffle_f32x4(ifft3827, ifft3827, 177));
__m512 ifft3752 = _mm512_fmadd_ps(ifft3743, ifft3750, _mm512_shuffle_f32x4(ifft3743, ifft3743, 177));
__m512 ifft3836 = _mm512_fmadd_ps(ifft3828, ifft3750, _mm512_shuffle_f32x4(ifft3828, ifft3828, 177));
__m512 ifft3753 = _mm512_fmadd_ps(ifft3744, ifft3750, _mm512_shuffle_f32x4(ifft3744, ifft3744, 177));
__m512 ifft3837 = _mm512_fmadd_ps(ifft3829, ifft3750, _mm512_shuffle_f32x4(ifft3829, ifft3829, 177));
__m512 ifft3754 = _mm512_fmadd_ps(ifft3745, ifft3750, _mm512_shuffle_f32x4(ifft3745, ifft3745, 177));
__m512 ifft3838 = _mm512_fmadd_ps(ifft3830, ifft3750, _mm512_shuffle_f32x4(ifft3830, ifft3830, 177));
__m512 ifft3755 = _mm512_fmadd_ps(ifft3746, ifft3750, _mm512_shuffle_f32x4(ifft3746, ifft3746, 177));
__m512 ifft3839 = _mm512_fmadd_ps(ifft3831, ifft3750, _mm512_shuffle_f32x4(ifft3831, ifft3831, 177));
__m512 ifft3756 = _mm512_fnmsub_ps(ifft3747, ifft3750, _mm512_shuffle_f32x4(ifft3747, ifft3747, 177));
__m512 ifft3840 = _mm512_fnmsub_ps(ifft3832, ifft3750, _mm512_shuffle_f32x4(ifft3832, ifft3832, 177));
__m512 ifft3757 = _mm512_fmadd_ps(ifft3748, ifft3750, _mm512_shuffle_f32x4(ifft3748, ifft3748, 177));
__m512 ifft3841 = _mm512_fmadd_ps(ifft3833, ifft3750, _mm512_shuffle_f32x4(ifft3833, ifft3833, 177));
__m512 ifft3758 = _mm512_fmadd_ps(ifft3749, ifft3750, _mm512_shuffle_f32x4(ifft3749, ifft3749, 177));
__m512 ifft3842 = _mm512_fmadd_ps(ifft3834, ifft3750, _mm512_shuffle_f32x4(ifft3834, ifft3834, 177));
// From here on the work is across whole vectors (no more lane shuffles): sums and
// differences of the eight intermediate vectors of each of the two parallel chains.
__m512 ifft3759 = _mm512_add_ps(ifft3751, ifft3752);
__m512 ifft3843 = _mm512_add_ps(ifft3835, ifft3836);
__m512 ifft3760 = _mm512_sub_ps(ifft3751, ifft3752);
__m512 ifft3844 = _mm512_sub_ps(ifft3835, ifft3836);
__m512 ifft3761 = _mm512_sub_ps(ifft3753, ifft3757);
__m512 ifft3845 = _mm512_sub_ps(ifft3837, ifft3841);
__m512 ifft3762 = _mm512_add_ps(ifft3754, ifft3758);
__m512 ifft3846 = _mm512_add_ps(ifft3838, ifft3842);
__m512 ifft3763 = _mm512_add_ps(ifft3753, ifft3757);
__m512 ifft3847 = _mm512_add_ps(ifft3837, ifft3841);
__m512 ifft3764 = _mm512_sub_ps(ifft3754, ifft3758);
__m512 ifft3848 = _mm512_sub_ps(ifft3838, ifft3842);
// Scale factors are folded into the arithmetic here: 3.125e-02f is 1/32 and
// 1.5625e-02f is 1/64.
__m512 ifft3765 = _mm512_mul_ps(ifft3755, _mm512_set1_ps(3.125e-02f));
__m512 ifft3849 = _mm512_mul_ps(ifft3839, _mm512_set1_ps(3.125e-02f));
__m512 ifft3766 = _mm512_mul_ps(ifft3756, _mm512_set1_ps(3.125e-02f));
__m512 ifft3850 = _mm512_mul_ps(ifft3840, _mm512_set1_ps(3.125e-02f));
__m512 ifft3767 = _mm512_fmadd_ps(ifft3759, _mm512_set1_ps(1.5625e-02f), ifft3765);
__m512 ifft3851 = _mm512_fmadd_ps(ifft3843, _mm512_set1_ps(1.5625e-02f), ifft3849);
__m512 ifft3768 = _mm512_fmsub_ps(ifft3759, _mm512_set1_ps(1.5625e-02f), ifft3765);
__m512 ifft3852 = _mm512_fmsub_ps(ifft3843, _mm512_set1_ps(1.5625e-02f), ifft3849);
__m512 ifft3769 = _mm512_fmadd_ps(ifft3760, _mm512_set1_ps(1.5625e-02f), ifft3766);
__m512 ifft3853 = _mm512_fmadd_ps(ifft3844, _mm512_set1_ps(1.5625e-02f), ifft3850);
__m512 ifft3770 = _mm512_fmsub_ps(ifft3760, _mm512_set1_ps(1.5625e-02f), ifft3766);
__m512 ifft3854 = _mm512_fmsub_ps(ifft3844, _mm512_set1_ps(1.5625e-02f), ifft3850);
__m512 ifft3771 = _mm512_add_ps(ifft3761, ifft3762);
__m512 ifft3855 = _mm512_add_ps(ifft3845, ifft3846);
__m512 ifft3772 = _mm512_sub_ps(ifft3761, ifft3762);
__m512 ifft3856 = _mm512_sub_ps(ifft3845, ifft3846);
// Combine with the constant 7.0710677e-01f (approximately sqrt(1/2)).
__m512 ifft3773 = _mm512_fnmadd_ps(ifft3771, _mm512_set1_ps(7.0710677e-01f), ifft3763);
__m512 ifft3857 = _mm512_fnmadd_ps(ifft3855, _mm512_set1_ps(7.0710677e-01f), ifft3847);
__m512 ifft3774 = _mm512_fmadd_ps(ifft3771, _mm512_set1_ps(7.0710677e-01f), ifft3763);
__m512 ifft3858 = _mm512_fmadd_ps(ifft3855, _mm512_set1_ps(7.0710677e-01f), ifft3847);
__m512 ifft3775 = _mm512_fmadd_ps(ifft3772, _mm512_set1_ps(7.0710677e-01f), ifft3764);
__m512 ifft3859 = _mm512_fmadd_ps(ifft3856, _mm512_set1_ps(7.0710677e-01f), ifft3848);
__m512 ifft3776 = _mm512_fmsub_ps(ifft3772, _mm512_set1_ps(7.0710677e-01f), ifft3764);
__m512 ifft3860 = _mm512_fmsub_ps(ifft3856, _mm512_set1_ps(7.0710677e-01f), ifft3848);
__m512 ifft3777 = _mm512_add_ps(ifft3773, ifft3774);
__m512 ifft3861 = _mm512_add_ps(ifft3857, ifft3858);
__m512 ifft3778 = _mm512_sub_ps(ifft3773, ifft3774);
__m512 ifft3862 = _mm512_sub_ps(ifft3857, ifft3858);
__m512 ifft3779 = _mm512_add_ps(ifft3775, ifft3776);
__m512 ifft3863 = _mm512_add_ps(ifft3859, ifft3860);
__m512 ifft3780 = _mm512_sub_ps(ifft3775, ifft3776);
__m512 ifft3864 = _mm512_sub_ps(ifft3859, ifft3860);
// Final combination producing eight result vectors per chain (scaled by 1/64).
__m512 ifft3781 = _mm512_fmadd_ps(ifft3777, _mm512_set1_ps(1.5625e-02f), ifft3767);
__m512 ifft3865 = _mm512_fmadd_ps(ifft3861, _mm512_set1_ps(1.5625e-02f), ifft3851);
__m512 ifft3782 = _mm512_fnmadd_ps(ifft3777, _mm512_set1_ps(1.5625e-02f), ifft3767);
__m512 ifft3866 = _mm512_fnmadd_ps(ifft3861, _mm512_set1_ps(1.5625e-02f), ifft3851);
__m512 ifft3783 = _mm512_fmadd_ps(ifft3779, _mm512_set1_ps(1.5625e-02f), ifft3769);
__m512 ifft3867 = _mm512_fmadd_ps(ifft3863, _mm512_set1_ps(1.5625e-02f), ifft3853);
__m512 ifft3784 = _mm512_fnmadd_ps(ifft3779, _mm512_set1_ps(1.5625e-02f), ifft3769);
__m512 ifft3868 = _mm512_fnmadd_ps(ifft3863, _mm512_set1_ps(1.5625e-02f), ifft3853);
__m512 ifft3785 = _mm512_fnmadd_ps(ifft3780, _mm512_set1_ps(1.5625e-02f), ifft3768);
__m512 ifft3869 = _mm512_fnmadd_ps(ifft3864, _mm512_set1_ps(1.5625e-02f), ifft3852);
__m512 ifft3786 = _mm512_fmadd_ps(ifft3780, _mm512_set1_ps(1.5625e-02f), ifft3768);
__m512 ifft3870 = _mm512_fmadd_ps(ifft3864, _mm512_set1_ps(1.5625e-02f), ifft3852);
__m512 ifft3787 = _mm512_fmadd_ps(ifft3778, _mm512_set1_ps(1.5625e-02f), ifft3770);
__m512 ifft3871 = _mm512_fmadd_ps(ifft3862, _mm512_set1_ps(1.5625e-02f), ifft3854);
__m512 ifft3788 = _mm512_fnmadd_ps(ifft3778, _mm512_set1_ps(1.5625e-02f), ifft3770);
__m512 ifft3872 = _mm512_fnmadd_ps(ifft3862, _mm512_set1_ps(1.5625e-02f), ifft3854);
// Five of the eight results per chain are kept as dat800..dat809.
__m512 dat800 = ifft3781;
__m512 dat805 = ifft3865;
__m512 dat801 = ifft3783;
__m512 dat806 = ifft3867;
__m512 dat802 = ifft3785;
__m512 dat807 = ifft3869;
__m512 dat803 = ifft3787;
__m512 dat808 = ifft3871;
__m512 dat804 = ifft3782;
__m512 dat809 = ifft3866;
// The remaining three results per chain are not stored; the (void) casts mark them
// as deliberately unused.
(void)ifft3784;
(void)ifft3868;
(void)ifft3786;
(void)ifft3870;
(void)ifft3788;
(void)ifft3872;
// Two-source permutes (index bit 4 selects the second source). In the low ten
// lanes pm37 yields first-source lanes 0-4 followed by second-source lanes 0-4;
// pm38 yields second-source lanes 8-12 followed by first-source lanes 8-12.
// Lanes 10-15 are filled too but never stored (see the store mask below).
__m512i pm37 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack181 = _mm512_permutex2var_ps(dat800, pm37, dat805);
__m512i pm38 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack182 = _mm512_permutex2var_ps(dat800, pm38, dat805);
__m512 pack183 = _mm512_permutex2var_ps(dat801, pm37, dat806);
__m512 pack184 = _mm512_permutex2var_ps(dat801, pm38, dat806);
__m512 pack185 = _mm512_permutex2var_ps(dat802, pm37, dat807);
__m512 pack186 = _mm512_permutex2var_ps(dat802, pm38, dat807);
__m512 pack187 = _mm512_permutex2var_ps(dat803, pm37, dat808);
__m512 pack188 = _mm512_permutex2var_ps(dat803, pm38, dat808);
__m512 pack189 = _mm512_permutex2var_ps(dat804, pm37, dat809);
__m512 pack190 = _mm512_permutex2var_ps(dat804, pm38, dat809);
// Clamp at zero: max(0, x) in every lane (ReLU-style activation).
pack181 = _mm512_max_ps(_mm512_setzero_ps(), pack181);
pack182 = _mm512_max_ps(_mm512_setzero_ps(), pack182);
pack183 = _mm512_max_ps(_mm512_setzero_ps(), pack183);
pack184 = _mm512_max_ps(_mm512_setzero_ps(), pack184);
pack185 = _mm512_max_ps(_mm512_setzero_ps(), pack185);
pack186 = _mm512_max_ps(_mm512_setzero_ps(), pack186);
pack187 = _mm512_max_ps(_mm512_setzero_ps(), pack187);
pack188 = _mm512_max_ps(_mm512_setzero_ps(), pack188);
pack189 = _mm512_max_ps(_mm512_setzero_ps(), pack189);
pack190 = _mm512_max_ps(_mm512_setzero_ps(), pack190);
// Masked unaligned stores: mask 1023 (0x3FF) writes only the low ten lanes.
// The pm37 packs go to additive offsets 0, 448, 896, 1344, 1792; the matching pm38
// packs go to the same offsets plus 50240. t23 advances the address by 40.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack181);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack182);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack183);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack184);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack185);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack186);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack187);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack188);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack189);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t23, 1023, pack190);
}
// Tail tile for the current i9 / j5 / k37 / r14 / toH13 / toW13 values.
// t24 is fixed at 0, so the 256*t24 and 40*t24 address terms contribute nothing.
// The run loads 16 vectors from sfPtr3, pushes them through a fixed
// add/sub/permute network (the ifft* temporaries), clamps the results at zero
// and writes 7 floats per masked store into datPtr2.
// NOTE(review): the sfRe/sfIm/ifft names suggest real/imaginary parts going
// through an inverse FFT -- inferred from naming only, confirm with the generator.
ptrdiff_t t24 = 0;
// Four groups of four loads (Re, Im, Re, Im at +0, +64, +128, +192 from the
// group start). Group starts are 512, 2167296, 4334080 and 6500864, i.e.
// 2166784 apart. Offsets are in units of sfPtr3's pointee type, which is not
// visible here (the 64-step would be one 512-bit vector if it is a byte pointer).
__m512 sfRe233 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm233 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe237 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm237 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe234 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm234 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe238 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm238 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe235 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm235 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe239 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm239 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe236 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm236 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfRe240 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
__m512 sfIm240 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k37+768*r14+256*t24);
// First stage applies only to the first group (sfRe/sfIm 233 and 237).
// _mm512_set_epi32 lists lanes 15 down to 0, so the last argument is lane 0.
__m512i ifft3873 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft3874 = _mm512_permutexvar_ps(ifft3873, sfRe233);
__m512 ifft3965 = _mm512_permutexvar_ps(ifft3873, sfRe237);
__m512i ifft3875 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft3876 = _mm512_permutexvar_ps(ifft3875, sfRe233);
__m512 ifft3966 = _mm512_permutexvar_ps(ifft3875, sfRe237);
__m512 ifft3877 = _mm512_permutexvar_ps(ifft3873, sfIm233);
__m512 ifft3967 = _mm512_permutexvar_ps(ifft3873, sfIm237);
__m512 ifft3878 = _mm512_permutexvar_ps(ifft3875, sfIm233);
__m512 ifft3968 = _mm512_permutexvar_ps(ifft3875, sfIm237);
// Mask 65021 (0xFDFD) covers every lane except 1 and 9; in those two lanes the
// masked fmadd/fnmadd just passes the first operand through unchanged.
__m512 ifft3879 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft3880 = _mm512_mask_fmadd_ps(ifft3878, 65021, ifft3879, ifft3874);
__m512 ifft3969 = _mm512_mask_fmadd_ps(ifft3968, 65021, ifft3879, ifft3965);
__m512 ifft3881 = _mm512_mask_fnmadd_ps(ifft3877, 65021, ifft3879, ifft3876);
__m512 ifft3970 = _mm512_mask_fnmadd_ps(ifft3967, 65021, ifft3879, ifft3966);
// Butterfly on adjacent lanes: shuffle imm 177 swaps each even/odd lane pair,
// and the +1/-1 pattern makes even lanes x[2k]+x[2k+1], odd lanes x[2k]-x[2k+1].
__m512 ifft3882 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft3883 = _mm512_fmadd_ps(ifft3880, ifft3882, _mm512_shuffle_ps(ifft3880, ifft3880, 177));
__m512 ifft3971 = _mm512_fmadd_ps(ifft3969, ifft3882, _mm512_shuffle_ps(ifft3969, ifft3969, 177));
__m512 ifft3884 = _mm512_fmadd_ps(ifft3881, ifft3882, _mm512_shuffle_ps(ifft3881, ifft3881, 177));
__m512 ifft3972 = _mm512_fmadd_ps(ifft3970, ifft3882, _mm512_shuffle_ps(ifft3970, ifft3970, 177));
__m512 ifft3885 = _mm512_fmadd_ps(sfRe234, ifft3882, _mm512_shuffle_ps(sfRe234, sfRe234, 177));
__m512 ifft3973 = _mm512_fmadd_ps(sfRe238, ifft3882, _mm512_shuffle_ps(sfRe238, sfRe238, 177));
__m512 ifft3886 = _mm512_fmadd_ps(sfIm234, ifft3882, _mm512_shuffle_ps(sfIm234, sfIm234, 177));
__m512 ifft3974 = _mm512_fmadd_ps(sfIm238, ifft3882, _mm512_shuffle_ps(sfIm238, sfIm238, 177));
__m512 ifft3887 = _mm512_fmadd_ps(sfRe235, ifft3882, _mm512_shuffle_ps(sfRe235, sfRe235, 177));
__m512 ifft3975 = _mm512_fmadd_ps(sfRe239, ifft3882, _mm512_shuffle_ps(sfRe239, sfRe239, 177));
__m512 ifft3888 = _mm512_fmadd_ps(sfIm235, ifft3882, _mm512_shuffle_ps(sfIm235, sfIm235, 177));
__m512 ifft3976 = _mm512_fmadd_ps(sfIm239, ifft3882, _mm512_shuffle_ps(sfIm239, sfIm239, 177));
__m512 ifft3889 = _mm512_fmadd_ps(sfRe236, ifft3882, _mm512_shuffle_ps(sfRe236, sfRe236, 177));
__m512 ifft3977 = _mm512_fmadd_ps(sfRe240, ifft3882, _mm512_shuffle_ps(sfRe240, sfRe240, 177));
__m512 ifft3890 = _mm512_fmadd_ps(sfIm236, ifft3882, _mm512_shuffle_ps(sfIm236, sfIm236, 177));
__m512 ifft3978 = _mm512_fmadd_ps(sfIm240, ifft3882, _mm512_shuffle_ps(sfIm240, sfIm240, 177));
// Per-lane rotation of each vector pair (a, b): with c = ifft3891 and
// s = ifft3900, the two steps below yield a*c - b*s and a*s + b*c.
// 7.0710677e-01f is sqrt(2)/2 rounded to float.
__m512 ifft3891 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft3892 = _mm512_mul_ps(ifft3883, ifft3891);
__m512 ifft3979 = _mm512_mul_ps(ifft3971, ifft3891);
__m512 ifft3893 = _mm512_mul_ps(ifft3884, ifft3891);
__m512 ifft3980 = _mm512_mul_ps(ifft3972, ifft3891);
__m512 ifft3894 = _mm512_mul_ps(ifft3885, ifft3891);
__m512 ifft3981 = _mm512_mul_ps(ifft3973, ifft3891);
__m512 ifft3895 = _mm512_mul_ps(ifft3886, ifft3891);
__m512 ifft3982 = _mm512_mul_ps(ifft3974, ifft3891);
__m512 ifft3896 = _mm512_mul_ps(ifft3887, ifft3891);
__m512 ifft3983 = _mm512_mul_ps(ifft3975, ifft3891);
__m512 ifft3897 = _mm512_mul_ps(ifft3888, ifft3891);
__m512 ifft3984 = _mm512_mul_ps(ifft3976, ifft3891);
__m512 ifft3898 = _mm512_mul_ps(ifft3889, ifft3891);
__m512 ifft3985 = _mm512_mul_ps(ifft3977, ifft3891);
__m512 ifft3899 = _mm512_mul_ps(ifft3890, ifft3891);
__m512 ifft3986 = _mm512_mul_ps(ifft3978, ifft3891);
__m512 ifft3900 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft3901 = _mm512_fnmadd_ps(ifft3884, ifft3900, ifft3892);
__m512 ifft3987 = _mm512_fnmadd_ps(ifft3972, ifft3900, ifft3979);
__m512 ifft3902 = _mm512_fmadd_ps(ifft3883, ifft3900, ifft3893);
__m512 ifft3988 = _mm512_fmadd_ps(ifft3971, ifft3900, ifft3980);
__m512 ifft3903 = _mm512_fnmadd_ps(ifft3886, ifft3900, ifft3894);
__m512 ifft3989 = _mm512_fnmadd_ps(ifft3974, ifft3900, ifft3981);
__m512 ifft3904 = _mm512_fmadd_ps(ifft3885, ifft3900, ifft3895);
__m512 ifft3990 = _mm512_fmadd_ps(ifft3973, ifft3900, ifft3982);
__m512 ifft3905 = _mm512_fnmadd_ps(ifft3888, ifft3900, ifft3896);
__m512 ifft3991 = _mm512_fnmadd_ps(ifft3976, ifft3900, ifft3983);
__m512 ifft3906 = _mm512_fmadd_ps(ifft3887, ifft3900, ifft3897);
__m512 ifft3992 = _mm512_fmadd_ps(ifft3975, ifft3900, ifft3984);
__m512 ifft3907 = _mm512_fnmadd_ps(ifft3890, ifft3900, ifft3898);
__m512 ifft3993 = _mm512_fnmadd_ps(ifft3978, ifft3900, ifft3985);
__m512 ifft3908 = _mm512_fmadd_ps(ifft3889, ifft3900, ifft3899);
__m512 ifft3994 = _mm512_fmadd_ps(ifft3977, ifft3900, ifft3986);
// Butterfly at distance two: shuffle imm 78 swaps the two 64-bit halves of
// every 128-bit lane; lanes 0-1 of each quad get the sum, lanes 2-3 the difference.
__m512 ifft3909 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft3910 = _mm512_fmadd_ps(ifft3901, ifft3909, _mm512_shuffle_ps(ifft3901, ifft3901, 78));
__m512 ifft3995 = _mm512_fmadd_ps(ifft3987, ifft3909, _mm512_shuffle_ps(ifft3987, ifft3987, 78));
__m512 ifft3911 = _mm512_fmadd_ps(ifft3902, ifft3909, _mm512_shuffle_ps(ifft3902, ifft3902, 78));
__m512 ifft3996 = _mm512_fmadd_ps(ifft3988, ifft3909, _mm512_shuffle_ps(ifft3988, ifft3988, 78));
__m512 ifft3912 = _mm512_fmadd_ps(ifft3903, ifft3909, _mm512_shuffle_ps(ifft3903, ifft3903, 78));
__m512 ifft3997 = _mm512_fmadd_ps(ifft3989, ifft3909, _mm512_shuffle_ps(ifft3989, ifft3989, 78));
__m512 ifft3913 = _mm512_fmadd_ps(ifft3904, ifft3909, _mm512_shuffle_ps(ifft3904, ifft3904, 78));
__m512 ifft3998 = _mm512_fmadd_ps(ifft3990, ifft3909, _mm512_shuffle_ps(ifft3990, ifft3990, 78));
__m512 ifft3914 = _mm512_fmadd_ps(ifft3905, ifft3909, _mm512_shuffle_ps(ifft3905, ifft3905, 78));
__m512 ifft3999 = _mm512_fmadd_ps(ifft3991, ifft3909, _mm512_shuffle_ps(ifft3991, ifft3991, 78));
__m512 ifft3915 = _mm512_fmadd_ps(ifft3906, ifft3909, _mm512_shuffle_ps(ifft3906, ifft3906, 78));
__m512 ifft4000 = _mm512_fmadd_ps(ifft3992, ifft3909, _mm512_shuffle_ps(ifft3992, ifft3992, 78));
__m512 ifft3916 = _mm512_fmadd_ps(ifft3907, ifft3909, _mm512_shuffle_ps(ifft3907, ifft3907, 78));
__m512 ifft4001 = _mm512_fmadd_ps(ifft3993, ifft3909, _mm512_shuffle_ps(ifft3993, ifft3993, 78));
__m512 ifft3917 = _mm512_fmadd_ps(ifft3908, ifft3909, _mm512_shuffle_ps(ifft3908, ifft3908, 78));
__m512 ifft4002 = _mm512_fmadd_ps(ifft3994, ifft3909, _mm512_shuffle_ps(ifft3994, ifft3994, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each pair
// (a, b) becomes (-b, a); every other lane passes through unchanged.
__m512 ifft3918 = _mm512_mask_sub_ps(ifft3910, 49344, _mm512_setzero_ps(), ifft3911);
__m512 ifft4003 = _mm512_mask_sub_ps(ifft3995, 49344, _mm512_setzero_ps(), ifft3996);
__m512 ifft3919 = _mm512_mask_mov_ps(ifft3911, 49344, ifft3910);
__m512 ifft4004 = _mm512_mask_mov_ps(ifft3996, 49344, ifft3995);
__m512 ifft3920 = _mm512_mask_sub_ps(ifft3912, 49344, _mm512_setzero_ps(), ifft3913);
__m512 ifft4005 = _mm512_mask_sub_ps(ifft3997, 49344, _mm512_setzero_ps(), ifft3998);
__m512 ifft3921 = _mm512_mask_mov_ps(ifft3913, 49344, ifft3912);
__m512 ifft4006 = _mm512_mask_mov_ps(ifft3998, 49344, ifft3997);
__m512 ifft3922 = _mm512_mask_sub_ps(ifft3914, 49344, _mm512_setzero_ps(), ifft3915);
__m512 ifft4007 = _mm512_mask_sub_ps(ifft3999, 49344, _mm512_setzero_ps(), ifft4000);
__m512 ifft3923 = _mm512_mask_mov_ps(ifft3915, 49344, ifft3914);
__m512 ifft4008 = _mm512_mask_mov_ps(ifft4000, 49344, ifft3999);
__m512 ifft3924 = _mm512_mask_sub_ps(ifft3916, 49344, _mm512_setzero_ps(), ifft3917);
__m512 ifft4009 = _mm512_mask_sub_ps(ifft4001, 49344, _mm512_setzero_ps(), ifft4002);
__m512 ifft3925 = _mm512_mask_mov_ps(ifft3917, 49344, ifft3916);
__m512 ifft4010 = _mm512_mask_mov_ps(ifft4002, 49344, ifft4001);
// Butterfly at distance four: shuffle_f32x4 imm 177 swaps neighbouring 128-bit
// lanes. ifft3932 / ifft4016 deliberately use fnmsub, i.e. the negated form of
// the fmadd used for the other vectors in this stage.
__m512 ifft3926 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft3927 = _mm512_fmadd_ps(ifft3918, ifft3926, _mm512_shuffle_f32x4(ifft3918, ifft3918, 177));
__m512 ifft4011 = _mm512_fmadd_ps(ifft4003, ifft3926, _mm512_shuffle_f32x4(ifft4003, ifft4003, 177));
__m512 ifft3928 = _mm512_fmadd_ps(ifft3919, ifft3926, _mm512_shuffle_f32x4(ifft3919, ifft3919, 177));
__m512 ifft4012 = _mm512_fmadd_ps(ifft4004, ifft3926, _mm512_shuffle_f32x4(ifft4004, ifft4004, 177));
__m512 ifft3929 = _mm512_fmadd_ps(ifft3920, ifft3926, _mm512_shuffle_f32x4(ifft3920, ifft3920, 177));
__m512 ifft4013 = _mm512_fmadd_ps(ifft4005, ifft3926, _mm512_shuffle_f32x4(ifft4005, ifft4005, 177));
__m512 ifft3930 = _mm512_fmadd_ps(ifft3921, ifft3926, _mm512_shuffle_f32x4(ifft3921, ifft3921, 177));
__m512 ifft4014 = _mm512_fmadd_ps(ifft4006, ifft3926, _mm512_shuffle_f32x4(ifft4006, ifft4006, 177));
__m512 ifft3931 = _mm512_fmadd_ps(ifft3922, ifft3926, _mm512_shuffle_f32x4(ifft3922, ifft3922, 177));
__m512 ifft4015 = _mm512_fmadd_ps(ifft4007, ifft3926, _mm512_shuffle_f32x4(ifft4007, ifft4007, 177));
__m512 ifft3932 = _mm512_fnmsub_ps(ifft3923, ifft3926, _mm512_shuffle_f32x4(ifft3923, ifft3923, 177));
__m512 ifft4016 = _mm512_fnmsub_ps(ifft4008, ifft3926, _mm512_shuffle_f32x4(ifft4008, ifft4008, 177));
__m512 ifft3933 = _mm512_fmadd_ps(ifft3924, ifft3926, _mm512_shuffle_f32x4(ifft3924, ifft3924, 177));
__m512 ifft4017 = _mm512_fmadd_ps(ifft4009, ifft3926, _mm512_shuffle_f32x4(ifft4009, ifft4009, 177));
__m512 ifft3934 = _mm512_fmadd_ps(ifft3925, ifft3926, _mm512_shuffle_f32x4(ifft3925, ifft3925, 177));
__m512 ifft4018 = _mm512_fmadd_ps(ifft4010, ifft3926, _mm512_shuffle_f32x4(ifft4010, ifft4010, 177));
// Combine across the eight vectors of each set. Scaling is folded into the
// fused ops: 1.5625e-02f is 1/64 and 3.125e-02f is 1/32.
__m512 ifft3935 = _mm512_add_ps(ifft3927, ifft3928);
__m512 ifft4019 = _mm512_add_ps(ifft4011, ifft4012);
__m512 ifft3936 = _mm512_sub_ps(ifft3927, ifft3928);
__m512 ifft4020 = _mm512_sub_ps(ifft4011, ifft4012);
__m512 ifft3937 = _mm512_sub_ps(ifft3929, ifft3933);
__m512 ifft4021 = _mm512_sub_ps(ifft4013, ifft4017);
__m512 ifft3938 = _mm512_add_ps(ifft3930, ifft3934);
__m512 ifft4022 = _mm512_add_ps(ifft4014, ifft4018);
__m512 ifft3939 = _mm512_add_ps(ifft3929, ifft3933);
__m512 ifft4023 = _mm512_add_ps(ifft4013, ifft4017);
__m512 ifft3940 = _mm512_sub_ps(ifft3930, ifft3934);
__m512 ifft4024 = _mm512_sub_ps(ifft4014, ifft4018);
__m512 ifft3941 = _mm512_mul_ps(ifft3931, _mm512_set1_ps(3.125e-02f));
__m512 ifft4025 = _mm512_mul_ps(ifft4015, _mm512_set1_ps(3.125e-02f));
__m512 ifft3942 = _mm512_mul_ps(ifft3932, _mm512_set1_ps(3.125e-02f));
__m512 ifft4026 = _mm512_mul_ps(ifft4016, _mm512_set1_ps(3.125e-02f));
__m512 ifft3943 = _mm512_fmadd_ps(ifft3935, _mm512_set1_ps(1.5625e-02f), ifft3941);
__m512 ifft4027 = _mm512_fmadd_ps(ifft4019, _mm512_set1_ps(1.5625e-02f), ifft4025);
__m512 ifft3944 = _mm512_fmsub_ps(ifft3935, _mm512_set1_ps(1.5625e-02f), ifft3941);
__m512 ifft4028 = _mm512_fmsub_ps(ifft4019, _mm512_set1_ps(1.5625e-02f), ifft4025);
__m512 ifft3945 = _mm512_fmadd_ps(ifft3936, _mm512_set1_ps(1.5625e-02f), ifft3942);
__m512 ifft4029 = _mm512_fmadd_ps(ifft4020, _mm512_set1_ps(1.5625e-02f), ifft4026);
__m512 ifft3946 = _mm512_fmsub_ps(ifft3936, _mm512_set1_ps(1.5625e-02f), ifft3942);
__m512 ifft4030 = _mm512_fmsub_ps(ifft4020, _mm512_set1_ps(1.5625e-02f), ifft4026);
__m512 ifft3947 = _mm512_add_ps(ifft3937, ifft3938);
__m512 ifft4031 = _mm512_add_ps(ifft4021, ifft4022);
__m512 ifft3948 = _mm512_sub_ps(ifft3937, ifft3938);
__m512 ifft4032 = _mm512_sub_ps(ifft4021, ifft4022);
__m512 ifft3949 = _mm512_fnmadd_ps(ifft3947, _mm512_set1_ps(7.0710677e-01f), ifft3939);
__m512 ifft4033 = _mm512_fnmadd_ps(ifft4031, _mm512_set1_ps(7.0710677e-01f), ifft4023);
__m512 ifft3950 = _mm512_fmadd_ps(ifft3947, _mm512_set1_ps(7.0710677e-01f), ifft3939);
__m512 ifft4034 = _mm512_fmadd_ps(ifft4031, _mm512_set1_ps(7.0710677e-01f), ifft4023);
__m512 ifft3951 = _mm512_fmadd_ps(ifft3948, _mm512_set1_ps(7.0710677e-01f), ifft3940);
__m512 ifft4035 = _mm512_fmadd_ps(ifft4032, _mm512_set1_ps(7.0710677e-01f), ifft4024);
__m512 ifft3952 = _mm512_fmsub_ps(ifft3948, _mm512_set1_ps(7.0710677e-01f), ifft3940);
__m512 ifft4036 = _mm512_fmsub_ps(ifft4032, _mm512_set1_ps(7.0710677e-01f), ifft4024);
__m512 ifft3953 = _mm512_add_ps(ifft3949, ifft3950);
__m512 ifft4037 = _mm512_add_ps(ifft4033, ifft4034);
__m512 ifft3954 = _mm512_sub_ps(ifft3949, ifft3950);
__m512 ifft4038 = _mm512_sub_ps(ifft4033, ifft4034);
__m512 ifft3955 = _mm512_add_ps(ifft3951, ifft3952);
__m512 ifft4039 = _mm512_add_ps(ifft4035, ifft4036);
__m512 ifft3956 = _mm512_sub_ps(ifft3951, ifft3952);
__m512 ifft4040 = _mm512_sub_ps(ifft4035, ifft4036);
__m512 ifft3957 = _mm512_fmadd_ps(ifft3953, _mm512_set1_ps(1.5625e-02f), ifft3943);
__m512 ifft4041 = _mm512_fmadd_ps(ifft4037, _mm512_set1_ps(1.5625e-02f), ifft4027);
__m512 ifft3958 = _mm512_fnmadd_ps(ifft3953, _mm512_set1_ps(1.5625e-02f), ifft3943);
__m512 ifft4042 = _mm512_fnmadd_ps(ifft4037, _mm512_set1_ps(1.5625e-02f), ifft4027);
__m512 ifft3959 = _mm512_fmadd_ps(ifft3955, _mm512_set1_ps(1.5625e-02f), ifft3945);
__m512 ifft4043 = _mm512_fmadd_ps(ifft4039, _mm512_set1_ps(1.5625e-02f), ifft4029);
__m512 ifft3960 = _mm512_fnmadd_ps(ifft3955, _mm512_set1_ps(1.5625e-02f), ifft3945);
__m512 ifft4044 = _mm512_fnmadd_ps(ifft4039, _mm512_set1_ps(1.5625e-02f), ifft4029);
__m512 ifft3961 = _mm512_fnmadd_ps(ifft3956, _mm512_set1_ps(1.5625e-02f), ifft3944);
__m512 ifft4045 = _mm512_fnmadd_ps(ifft4040, _mm512_set1_ps(1.5625e-02f), ifft4028);
__m512 ifft3962 = _mm512_fmadd_ps(ifft3956, _mm512_set1_ps(1.5625e-02f), ifft3944);
__m512 ifft4046 = _mm512_fmadd_ps(ifft4040, _mm512_set1_ps(1.5625e-02f), ifft4028);
__m512 ifft3963 = _mm512_fmadd_ps(ifft3954, _mm512_set1_ps(1.5625e-02f), ifft3946);
__m512 ifft4047 = _mm512_fmadd_ps(ifft4038, _mm512_set1_ps(1.5625e-02f), ifft4030);
__m512 ifft3964 = _mm512_fnmadd_ps(ifft3954, _mm512_set1_ps(1.5625e-02f), ifft3946);
__m512 ifft4048 = _mm512_fnmadd_ps(ifft4038, _mm512_set1_ps(1.5625e-02f), ifft4030);
// Five of the eight results per set are kept (dat810..814 and dat815..819).
// The remaining three per set are cast to void to mark them as unused.
__m512 dat810 = ifft3957;
__m512 dat815 = ifft4041;
__m512 dat811 = ifft3959;
__m512 dat816 = ifft4043;
__m512 dat812 = ifft3961;
__m512 dat817 = ifft4045;
__m512 dat813 = ifft3963;
__m512 dat818 = ifft4047;
__m512 dat814 = ifft3958;
__m512 dat819 = ifft4042;
(void)ifft3960;
(void)ifft4044;
(void)ifft3962;
(void)ifft4046;
(void)ifft3964;
(void)ifft4048;
// Merge the two sets. In _mm512_permutex2var_ps(a, idx, b) index bit 4 picks b.
// Only lanes 0-6 are stored below, so the lanes that matter are:
//   pm39: lanes 0-4 = a[0..4],  lanes 5-6 = b[0..1]
//   pm40: lanes 0-4 = b[8..12], lanes 5-6 = a[8..9]
__m512i pm39 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack191 = _mm512_permutex2var_ps(dat810, pm39, dat815);
__m512i pm40 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack192 = _mm512_permutex2var_ps(dat810, pm40, dat815);
__m512 pack193 = _mm512_permutex2var_ps(dat811, pm39, dat816);
__m512 pack194 = _mm512_permutex2var_ps(dat811, pm40, dat816);
__m512 pack195 = _mm512_permutex2var_ps(dat812, pm39, dat817);
__m512 pack196 = _mm512_permutex2var_ps(dat812, pm40, dat817);
__m512 pack197 = _mm512_permutex2var_ps(dat813, pm39, dat818);
__m512 pack198 = _mm512_permutex2var_ps(dat813, pm40, dat818);
__m512 pack199 = _mm512_permutex2var_ps(dat814, pm39, dat819);
__m512 pack200 = _mm512_permutex2var_ps(dat814, pm40, dat819);
// Clamp negatives to zero (max with 0), i.e. a ReLU applied before the store.
pack191 = _mm512_max_ps(_mm512_setzero_ps(), pack191);
pack192 = _mm512_max_ps(_mm512_setzero_ps(), pack192);
pack193 = _mm512_max_ps(_mm512_setzero_ps(), pack193);
pack194 = _mm512_max_ps(_mm512_setzero_ps(), pack194);
pack195 = _mm512_max_ps(_mm512_setzero_ps(), pack195);
pack196 = _mm512_max_ps(_mm512_setzero_ps(), pack196);
pack197 = _mm512_max_ps(_mm512_setzero_ps(), pack197);
pack198 = _mm512_max_ps(_mm512_setzero_ps(), pack198);
pack199 = _mm512_max_ps(_mm512_setzero_ps(), pack199);
pack200 = _mm512_max_ps(_mm512_setzero_ps(), pack200);
// Masked stores: mask 127 writes only the low 7 lanes. The constant offsets
// come in pairs X and X+50240 (pm39 result, then pm40 result), and X steps by
// 448 between pairs (80, 528, 976, 1424, 1872) -- the same 448 that scales toH13.
// NOTE(review): presumably 448 is the output row pitch and 50240 a plane pitch;
// confirm against the tensor layout.
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack191);
_mm512_mask_storeu_ps(datPtr2+50320+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack192);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack193);
_mm512_mask_storeu_ps(datPtr2+50768+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack194);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack195);
_mm512_mask_storeu_ps(datPtr2+51216+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack196);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack197);
_mm512_mask_storeu_ps(datPtr2+51664+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack198);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack199);
_mm512_mask_storeu_ps(datPtr2+52112+3215360*i9+200960*k37+100480*r14+448*toH13+4*toW13+40*t24, 127, pack200);
}
}
// Return from the function once j5 has reached last2; otherwise step to the
// next j5 and set rel5 to 19.
// NOTE(review): 19 satisfies the `rel5 < 20` test that follows the closing
// brace below, assuming control falls through to it -- the enclosing scope is
// not visible here, confirm.
if (j5 >= last2) return;
++j5;
rel5 = 19;
}
if (rel5 < 20) {
ptrdiff_t toH14 = base5+25;
ptrdiff_t toW14 = 0;
ptrdiff_t k38 = 16*w21;
for (; k38 != 16; ++k38) {
ptrdiff_t r15 = 0;
for (; r15 != 2; ++r15) {
ptrdiff_t t25 = 0;
__m512 sfRe241 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm241 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe245 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm245 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe242 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm242 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe246 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm246 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe243 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm243 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe247 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm247 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe244 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm244 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfRe248 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512 sfIm248 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k38+768*r15+256*t25);
__m512i ifft4049 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4050 = _mm512_permutexvar_ps(ifft4049, sfRe241);
__m512 ifft4141 = _mm512_permutexvar_ps(ifft4049, sfRe245);
__m512i ifft4051 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4052 = _mm512_permutexvar_ps(ifft4051, sfRe241);
__m512 ifft4142 = _mm512_permutexvar_ps(ifft4051, sfRe245);
__m512 ifft4053 = _mm512_permutexvar_ps(ifft4049, sfIm241);
__m512 ifft4143 = _mm512_permutexvar_ps(ifft4049, sfIm245);
__m512 ifft4054 = _mm512_permutexvar_ps(ifft4051, sfIm241);
__m512 ifft4144 = _mm512_permutexvar_ps(ifft4051, sfIm245);
__m512 ifft4055 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4056 = _mm512_mask_fmadd_ps(ifft4054, 65021, ifft4055, ifft4050);
__m512 ifft4145 = _mm512_mask_fmadd_ps(ifft4144, 65021, ifft4055, ifft4141);
__m512 ifft4057 = _mm512_mask_fnmadd_ps(ifft4053, 65021, ifft4055, ifft4052);
__m512 ifft4146 = _mm512_mask_fnmadd_ps(ifft4143, 65021, ifft4055, ifft4142);
__m512 ifft4058 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4059 = _mm512_fmadd_ps(ifft4056, ifft4058, _mm512_shuffle_ps(ifft4056, ifft4056, 177));
__m512 ifft4147 = _mm512_fmadd_ps(ifft4145, ifft4058, _mm512_shuffle_ps(ifft4145, ifft4145, 177));
__m512 ifft4060 = _mm512_fmadd_ps(ifft4057, ifft4058, _mm512_shuffle_ps(ifft4057, ifft4057, 177));
__m512 ifft4148 = _mm512_fmadd_ps(ifft4146, ifft4058, _mm512_shuffle_ps(ifft4146, ifft4146, 177));
__m512 ifft4061 = _mm512_fmadd_ps(sfRe242, ifft4058, _mm512_shuffle_ps(sfRe242, sfRe242, 177));
__m512 ifft4149 = _mm512_fmadd_ps(sfRe246, ifft4058, _mm512_shuffle_ps(sfRe246, sfRe246, 177));
__m512 ifft4062 = _mm512_fmadd_ps(sfIm242, ifft4058, _mm512_shuffle_ps(sfIm242, sfIm242, 177));
__m512 ifft4150 = _mm512_fmadd_ps(sfIm246, ifft4058, _mm512_shuffle_ps(sfIm246, sfIm246, 177));
__m512 ifft4063 = _mm512_fmadd_ps(sfRe243, ifft4058, _mm512_shuffle_ps(sfRe243, sfRe243, 177));
__m512 ifft4151 = _mm512_fmadd_ps(sfRe247, ifft4058, _mm512_shuffle_ps(sfRe247, sfRe247, 177));
__m512 ifft4064 = _mm512_fmadd_ps(sfIm243, ifft4058, _mm512_shuffle_ps(sfIm243, sfIm243, 177));
__m512 ifft4152 = _mm512_fmadd_ps(sfIm247, ifft4058, _mm512_shuffle_ps(sfIm247, sfIm247, 177));
__m512 ifft4065 = _mm512_fmadd_ps(sfRe244, ifft4058, _mm512_shuffle_ps(sfRe244, sfRe244, 177));
__m512 ifft4153 = _mm512_fmadd_ps(sfRe248, ifft4058, _mm512_shuffle_ps(sfRe248, sfRe248, 177));
__m512 ifft4066 = _mm512_fmadd_ps(sfIm244, ifft4058, _mm512_shuffle_ps(sfIm244, sfIm244, 177));
__m512 ifft4154 = _mm512_fmadd_ps(sfIm248, ifft4058, _mm512_shuffle_ps(sfIm248, sfIm248, 177));
__m512 ifft4067 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4068 = _mm512_mul_ps(ifft4059, ifft4067);
__m512 ifft4155 = _mm512_mul_ps(ifft4147, ifft4067);
__m512 ifft4069 = _mm512_mul_ps(ifft4060, ifft4067);
__m512 ifft4156 = _mm512_mul_ps(ifft4148, ifft4067);
__m512 ifft4070 = _mm512_mul_ps(ifft4061, ifft4067);
__m512 ifft4157 = _mm512_mul_ps(ifft4149, ifft4067);
__m512 ifft4071 = _mm512_mul_ps(ifft4062, ifft4067);
__m512 ifft4158 = _mm512_mul_ps(ifft4150, ifft4067);
__m512 ifft4072 = _mm512_mul_ps(ifft4063, ifft4067);
__m512 ifft4159 = _mm512_mul_ps(ifft4151, ifft4067);
__m512 ifft4073 = _mm512_mul_ps(ifft4064, ifft4067);
__m512 ifft4160 = _mm512_mul_ps(ifft4152, ifft4067);
__m512 ifft4074 = _mm512_mul_ps(ifft4065, ifft4067);
__m512 ifft4161 = _mm512_mul_ps(ifft4153, ifft4067);
__m512 ifft4075 = _mm512_mul_ps(ifft4066, ifft4067);
__m512 ifft4162 = _mm512_mul_ps(ifft4154, ifft4067);
__m512 ifft4076 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4077 = _mm512_fnmadd_ps(ifft4060, ifft4076, ifft4068);
__m512 ifft4163 = _mm512_fnmadd_ps(ifft4148, ifft4076, ifft4155);
__m512 ifft4078 = _mm512_fmadd_ps(ifft4059, ifft4076, ifft4069);
__m512 ifft4164 = _mm512_fmadd_ps(ifft4147, ifft4076, ifft4156);
__m512 ifft4079 = _mm512_fnmadd_ps(ifft4062, ifft4076, ifft4070);
__m512 ifft4165 = _mm512_fnmadd_ps(ifft4150, ifft4076, ifft4157);
__m512 ifft4080 = _mm512_fmadd_ps(ifft4061, ifft4076, ifft4071);
__m512 ifft4166 = _mm512_fmadd_ps(ifft4149, ifft4076, ifft4158);
__m512 ifft4081 = _mm512_fnmadd_ps(ifft4064, ifft4076, ifft4072);
__m512 ifft4167 = _mm512_fnmadd_ps(ifft4152, ifft4076, ifft4159);
__m512 ifft4082 = _mm512_fmadd_ps(ifft4063, ifft4076, ifft4073);
__m512 ifft4168 = _mm512_fmadd_ps(ifft4151, ifft4076, ifft4160);
__m512 ifft4083 = _mm512_fnmadd_ps(ifft4066, ifft4076, ifft4074);
__m512 ifft4169 = _mm512_fnmadd_ps(ifft4154, ifft4076, ifft4161);
__m512 ifft4084 = _mm512_fmadd_ps(ifft4065, ifft4076, ifft4075);
__m512 ifft4170 = _mm512_fmadd_ps(ifft4153, ifft4076, ifft4162);
__m512 ifft4085 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4086 = _mm512_fmadd_ps(ifft4077, ifft4085, _mm512_shuffle_ps(ifft4077, ifft4077, 78));
__m512 ifft4171 = _mm512_fmadd_ps(ifft4163, ifft4085, _mm512_shuffle_ps(ifft4163, ifft4163, 78));
__m512 ifft4087 = _mm512_fmadd_ps(ifft4078, ifft4085, _mm512_shuffle_ps(ifft4078, ifft4078, 78));
__m512 ifft4172 = _mm512_fmadd_ps(ifft4164, ifft4085, _mm512_shuffle_ps(ifft4164, ifft4164, 78));
__m512 ifft4088 = _mm512_fmadd_ps(ifft4079, ifft4085, _mm512_shuffle_ps(ifft4079, ifft4079, 78));
__m512 ifft4173 = _mm512_fmadd_ps(ifft4165, ifft4085, _mm512_shuffle_ps(ifft4165, ifft4165, 78));
__m512 ifft4089 = _mm512_fmadd_ps(ifft4080, ifft4085, _mm512_shuffle_ps(ifft4080, ifft4080, 78));
__m512 ifft4174 = _mm512_fmadd_ps(ifft4166, ifft4085, _mm512_shuffle_ps(ifft4166, ifft4166, 78));
__m512 ifft4090 = _mm512_fmadd_ps(ifft4081, ifft4085, _mm512_shuffle_ps(ifft4081, ifft4081, 78));
__m512 ifft4175 = _mm512_fmadd_ps(ifft4167, ifft4085, _mm512_shuffle_ps(ifft4167, ifft4167, 78));
__m512 ifft4091 = _mm512_fmadd_ps(ifft4082, ifft4085, _mm512_shuffle_ps(ifft4082, ifft4082, 78));
__m512 ifft4176 = _mm512_fmadd_ps(ifft4168, ifft4085, _mm512_shuffle_ps(ifft4168, ifft4168, 78));
__m512 ifft4092 = _mm512_fmadd_ps(ifft4083, ifft4085, _mm512_shuffle_ps(ifft4083, ifft4083, 78));
__m512 ifft4177 = _mm512_fmadd_ps(ifft4169, ifft4085, _mm512_shuffle_ps(ifft4169, ifft4169, 78));
__m512 ifft4093 = _mm512_fmadd_ps(ifft4084, ifft4085, _mm512_shuffle_ps(ifft4084, ifft4084, 78));
__m512 ifft4178 = _mm512_fmadd_ps(ifft4170, ifft4085, _mm512_shuffle_ps(ifft4170, ifft4170, 78));
__m512 ifft4094 = _mm512_mask_sub_ps(ifft4086, 49344, _mm512_setzero_ps(), ifft4087);
__m512 ifft4179 = _mm512_mask_sub_ps(ifft4171, 49344, _mm512_setzero_ps(), ifft4172);
__m512 ifft4095 = _mm512_mask_mov_ps(ifft4087, 49344, ifft4086);
__m512 ifft4180 = _mm512_mask_mov_ps(ifft4172, 49344, ifft4171);
__m512 ifft4096 = _mm512_mask_sub_ps(ifft4088, 49344, _mm512_setzero_ps(), ifft4089);
__m512 ifft4181 = _mm512_mask_sub_ps(ifft4173, 49344, _mm512_setzero_ps(), ifft4174);
__m512 ifft4097 = _mm512_mask_mov_ps(ifft4089, 49344, ifft4088);
__m512 ifft4182 = _mm512_mask_mov_ps(ifft4174, 49344, ifft4173);
__m512 ifft4098 = _mm512_mask_sub_ps(ifft4090, 49344, _mm512_setzero_ps(), ifft4091);
__m512 ifft4183 = _mm512_mask_sub_ps(ifft4175, 49344, _mm512_setzero_ps(), ifft4176);
__m512 ifft4099 = _mm512_mask_mov_ps(ifft4091, 49344, ifft4090);
__m512 ifft4184 = _mm512_mask_mov_ps(ifft4176, 49344, ifft4175);
__m512 ifft4100 = _mm512_mask_sub_ps(ifft4092, 49344, _mm512_setzero_ps(), ifft4093);
__m512 ifft4185 = _mm512_mask_sub_ps(ifft4177, 49344, _mm512_setzero_ps(), ifft4178);
__m512 ifft4101 = _mm512_mask_mov_ps(ifft4093, 49344, ifft4092);
__m512 ifft4186 = _mm512_mask_mov_ps(ifft4178, 49344, ifft4177);
__m512 ifft4102 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4103 = _mm512_fmadd_ps(ifft4094, ifft4102, _mm512_shuffle_f32x4(ifft4094, ifft4094, 177));
__m512 ifft4187 = _mm512_fmadd_ps(ifft4179, ifft4102, _mm512_shuffle_f32x4(ifft4179, ifft4179, 177));
__m512 ifft4104 = _mm512_fmadd_ps(ifft4095, ifft4102, _mm512_shuffle_f32x4(ifft4095, ifft4095, 177));
__m512 ifft4188 = _mm512_fmadd_ps(ifft4180, ifft4102, _mm512_shuffle_f32x4(ifft4180, ifft4180, 177));
__m512 ifft4105 = _mm512_fmadd_ps(ifft4096, ifft4102, _mm512_shuffle_f32x4(ifft4096, ifft4096, 177));
__m512 ifft4189 = _mm512_fmadd_ps(ifft4181, ifft4102, _mm512_shuffle_f32x4(ifft4181, ifft4181, 177));
__m512 ifft4106 = _mm512_fmadd_ps(ifft4097, ifft4102, _mm512_shuffle_f32x4(ifft4097, ifft4097, 177));
__m512 ifft4190 = _mm512_fmadd_ps(ifft4182, ifft4102, _mm512_shuffle_f32x4(ifft4182, ifft4182, 177));
__m512 ifft4107 = _mm512_fmadd_ps(ifft4098, ifft4102, _mm512_shuffle_f32x4(ifft4098, ifft4098, 177));
__m512 ifft4191 = _mm512_fmadd_ps(ifft4183, ifft4102, _mm512_shuffle_f32x4(ifft4183, ifft4183, 177));
__m512 ifft4108 = _mm512_fnmsub_ps(ifft4099, ifft4102, _mm512_shuffle_f32x4(ifft4099, ifft4099, 177));
__m512 ifft4192 = _mm512_fnmsub_ps(ifft4184, ifft4102, _mm512_shuffle_f32x4(ifft4184, ifft4184, 177));
__m512 ifft4109 = _mm512_fmadd_ps(ifft4100, ifft4102, _mm512_shuffle_f32x4(ifft4100, ifft4100, 177));
__m512 ifft4193 = _mm512_fmadd_ps(ifft4185, ifft4102, _mm512_shuffle_f32x4(ifft4185, ifft4185, 177));
__m512 ifft4110 = _mm512_fmadd_ps(ifft4101, ifft4102, _mm512_shuffle_f32x4(ifft4101, ifft4101, 177));
__m512 ifft4194 = _mm512_fmadd_ps(ifft4186, ifft4102, _mm512_shuffle_f32x4(ifft4186, ifft4186, 177));
__m512 ifft4111 = _mm512_add_ps(ifft4103, ifft4104);
__m512 ifft4195 = _mm512_add_ps(ifft4187, ifft4188);
__m512 ifft4112 = _mm512_sub_ps(ifft4103, ifft4104);
__m512 ifft4196 = _mm512_sub_ps(ifft4187, ifft4188);
__m512 ifft4113 = _mm512_sub_ps(ifft4105, ifft4109);
__m512 ifft4197 = _mm512_sub_ps(ifft4189, ifft4193);
__m512 ifft4114 = _mm512_add_ps(ifft4106, ifft4110);
__m512 ifft4198 = _mm512_add_ps(ifft4190, ifft4194);
__m512 ifft4115 = _mm512_add_ps(ifft4105, ifft4109);
__m512 ifft4199 = _mm512_add_ps(ifft4189, ifft4193);
__m512 ifft4116 = _mm512_sub_ps(ifft4106, ifft4110);
__m512 ifft4200 = _mm512_sub_ps(ifft4190, ifft4194);
__m512 ifft4117 = _mm512_mul_ps(ifft4107, _mm512_set1_ps(3.125e-02f));
__m512 ifft4201 = _mm512_mul_ps(ifft4191, _mm512_set1_ps(3.125e-02f));
__m512 ifft4118 = _mm512_mul_ps(ifft4108, _mm512_set1_ps(3.125e-02f));
__m512 ifft4202 = _mm512_mul_ps(ifft4192, _mm512_set1_ps(3.125e-02f));
__m512 ifft4119 = _mm512_fmadd_ps(ifft4111, _mm512_set1_ps(1.5625e-02f), ifft4117);
__m512 ifft4203 = _mm512_fmadd_ps(ifft4195, _mm512_set1_ps(1.5625e-02f), ifft4201);
__m512 ifft4120 = _mm512_fmsub_ps(ifft4111, _mm512_set1_ps(1.5625e-02f), ifft4117);
__m512 ifft4204 = _mm512_fmsub_ps(ifft4195, _mm512_set1_ps(1.5625e-02f), ifft4201);
__m512 ifft4121 = _mm512_fmadd_ps(ifft4112, _mm512_set1_ps(1.5625e-02f), ifft4118);
__m512 ifft4205 = _mm512_fmadd_ps(ifft4196, _mm512_set1_ps(1.5625e-02f), ifft4202);
__m512 ifft4122 = _mm512_fmsub_ps(ifft4112, _mm512_set1_ps(1.5625e-02f), ifft4118);
__m512 ifft4206 = _mm512_fmsub_ps(ifft4196, _mm512_set1_ps(1.5625e-02f), ifft4202);
// Tail of a longer unrolled computation: ifft4113..ifft4122 and
// ifft4197..ifft4206 are defined above this span. Two independent sets of
// values (ifft4123..ifft4140 and ifft4207..ifft4224) are processed in
// lock-step, one statement per set.
// NOTE(review): the "ifft" naming suggests the last stages of an inverse FFT --
// confirm against the generator.
//
// Sum and difference of one input pair per set.
__m512 ifft4123 = _mm512_add_ps(ifft4113, ifft4114);
__m512 ifft4207 = _mm512_add_ps(ifft4197, ifft4198);
__m512 ifft4124 = _mm512_sub_ps(ifft4113, ifft4114);
__m512 ifft4208 = _mm512_sub_ps(ifft4197, ifft4198);
// Combine with the sum/difference scaled by 7.0710677e-01f (approximately
// sqrt(0.5)): fnmadd yields c - a*b, fmadd yields a*b + c, fmsub yields a*b - c.
__m512 ifft4125 = _mm512_fnmadd_ps(ifft4123, _mm512_set1_ps(7.0710677e-01f), ifft4115);
__m512 ifft4209 = _mm512_fnmadd_ps(ifft4207, _mm512_set1_ps(7.0710677e-01f), ifft4199);
__m512 ifft4126 = _mm512_fmadd_ps(ifft4123, _mm512_set1_ps(7.0710677e-01f), ifft4115);
__m512 ifft4210 = _mm512_fmadd_ps(ifft4207, _mm512_set1_ps(7.0710677e-01f), ifft4199);
__m512 ifft4127 = _mm512_fmadd_ps(ifft4124, _mm512_set1_ps(7.0710677e-01f), ifft4116);
__m512 ifft4211 = _mm512_fmadd_ps(ifft4208, _mm512_set1_ps(7.0710677e-01f), ifft4200);
__m512 ifft4128 = _mm512_fmsub_ps(ifft4124, _mm512_set1_ps(7.0710677e-01f), ifft4116);
__m512 ifft4212 = _mm512_fmsub_ps(ifft4208, _mm512_set1_ps(7.0710677e-01f), ifft4200);
// Another add/subtract layer on the four values just produced.
__m512 ifft4129 = _mm512_add_ps(ifft4125, ifft4126);
__m512 ifft4213 = _mm512_add_ps(ifft4209, ifft4210);
__m512 ifft4130 = _mm512_sub_ps(ifft4125, ifft4126);
__m512 ifft4214 = _mm512_sub_ps(ifft4209, ifft4210);
__m512 ifft4131 = _mm512_add_ps(ifft4127, ifft4128);
__m512 ifft4215 = _mm512_add_ps(ifft4211, ifft4212);
__m512 ifft4132 = _mm512_sub_ps(ifft4127, ifft4128);
__m512 ifft4216 = _mm512_sub_ps(ifft4211, ifft4212);
// Final layer: scale by 1.5625e-02f (1/64) and add to / subtract from the
// partial results ifft4119..ifft4122 (ifft4203..ifft4206) computed earlier.
__m512 ifft4133 = _mm512_fmadd_ps(ifft4129, _mm512_set1_ps(1.5625e-02f), ifft4119);
__m512 ifft4217 = _mm512_fmadd_ps(ifft4213, _mm512_set1_ps(1.5625e-02f), ifft4203);
__m512 ifft4134 = _mm512_fnmadd_ps(ifft4129, _mm512_set1_ps(1.5625e-02f), ifft4119);
__m512 ifft4218 = _mm512_fnmadd_ps(ifft4213, _mm512_set1_ps(1.5625e-02f), ifft4203);
__m512 ifft4135 = _mm512_fmadd_ps(ifft4131, _mm512_set1_ps(1.5625e-02f), ifft4121);
__m512 ifft4219 = _mm512_fmadd_ps(ifft4215, _mm512_set1_ps(1.5625e-02f), ifft4205);
__m512 ifft4136 = _mm512_fnmadd_ps(ifft4131, _mm512_set1_ps(1.5625e-02f), ifft4121);
__m512 ifft4220 = _mm512_fnmadd_ps(ifft4215, _mm512_set1_ps(1.5625e-02f), ifft4205);
__m512 ifft4137 = _mm512_fnmadd_ps(ifft4132, _mm512_set1_ps(1.5625e-02f), ifft4120);
__m512 ifft4221 = _mm512_fnmadd_ps(ifft4216, _mm512_set1_ps(1.5625e-02f), ifft4204);
__m512 ifft4138 = _mm512_fmadd_ps(ifft4132, _mm512_set1_ps(1.5625e-02f), ifft4120);
__m512 ifft4222 = _mm512_fmadd_ps(ifft4216, _mm512_set1_ps(1.5625e-02f), ifft4204);
__m512 ifft4139 = _mm512_fmadd_ps(ifft4130, _mm512_set1_ps(1.5625e-02f), ifft4122);
__m512 ifft4223 = _mm512_fmadd_ps(ifft4214, _mm512_set1_ps(1.5625e-02f), ifft4206);
__m512 ifft4140 = _mm512_fnmadd_ps(ifft4130, _mm512_set1_ps(1.5625e-02f), ifft4122);
__m512 ifft4224 = _mm512_fnmadd_ps(ifft4214, _mm512_set1_ps(1.5625e-02f), ifft4206);
// Five of the eight results per set are carried forward under dat* names.
__m512 dat820 = ifft4133;
__m512 dat825 = ifft4217;
__m512 dat821 = ifft4135;
__m512 dat826 = ifft4219;
__m512 dat822 = ifft4137;
__m512 dat827 = ifft4221;
__m512 dat823 = ifft4139;
__m512 dat828 = ifft4223;
__m512 dat824 = ifft4134;
__m512 dat829 = ifft4218;
// The remaining three results per set are not used; the (void) casts only
// reference them so they do not count as unused variables.
(void)ifft4136;
(void)ifft4220;
(void)ifft4138;
(void)ifft4222;
(void)ifft4140;
(void)ifft4224;
// Merge one vector from each set into a single vector. Index values >= 16
// address the second source of permutex2var.
// pm41: result lanes 0-4 <- first source lanes 0-4,
//       result lanes 5-9 <- second source lanes 0-4.
// pm42: result lanes 0-4 <- second source lanes 8-12,
//       result lanes 5-9 <- first source lanes 8-12.
// Result lanes 10-15 are filled as well but are never stored (see mask below).
__m512i pm41 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack201 = _mm512_permutex2var_ps(dat820, pm41, dat825);
__m512i pm42 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack202 = _mm512_permutex2var_ps(dat820, pm42, dat825);
__m512 pack203 = _mm512_permutex2var_ps(dat821, pm41, dat826);
__m512 pack204 = _mm512_permutex2var_ps(dat821, pm42, dat826);
__m512 pack205 = _mm512_permutex2var_ps(dat822, pm41, dat827);
__m512 pack206 = _mm512_permutex2var_ps(dat822, pm42, dat827);
__m512 pack207 = _mm512_permutex2var_ps(dat823, pm41, dat828);
__m512 pack208 = _mm512_permutex2var_ps(dat823, pm42, dat828);
__m512 pack209 = _mm512_permutex2var_ps(dat824, pm41, dat829);
__m512 pack210 = _mm512_permutex2var_ps(dat824, pm42, dat829);
// Element-wise max against zero.
// NOTE(review): presumably the fused ReLU activation -- confirm.
pack201 = _mm512_max_ps(_mm512_setzero_ps(), pack201);
pack202 = _mm512_max_ps(_mm512_setzero_ps(), pack202);
pack203 = _mm512_max_ps(_mm512_setzero_ps(), pack203);
pack204 = _mm512_max_ps(_mm512_setzero_ps(), pack204);
pack205 = _mm512_max_ps(_mm512_setzero_ps(), pack205);
pack206 = _mm512_max_ps(_mm512_setzero_ps(), pack206);
pack207 = _mm512_max_ps(_mm512_setzero_ps(), pack207);
pack208 = _mm512_max_ps(_mm512_setzero_ps(), pack208);
pack209 = _mm512_max_ps(_mm512_setzero_ps(), pack209);
pack210 = _mm512_max_ps(_mm512_setzero_ps(), pack210);
// Masked stores: mask 1023 (0x3FF) writes only lanes 0-9 of each vector.
// The pm41 vectors go to constant offsets 0, 448, 896, 1344, 1792 and the
// pm42 vectors to the same offsets plus 50240. The t25 term is multiplied by
// 0, so it does not affect the address.
// NOTE(review): offsets look like byte offsets into datPtr2 (4*toW14, and ten
// floats = 40 bytes) -- confirm datPtr2's pointee type.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack201);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack202);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack203);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack204);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack205);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack206);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack207);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack208);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack209);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+0*t25, 1023, pack210);
// Loop over t26 = 0, 1. Each pass loads sixteen 512-bit vectors from sfPtr3
// (sfRe*/sfIm*), pushes them through a fixed sequence of permutes, sign-flip
// butterflies and scaled add/subtract layers (ifft*), takes max(0, x) of the
// results and stores ten lanes per row to datPtr2. Two independent sets of
// values are processed in lock-step (ifft4226..ifft4316 and
// ifft4317..ifft4400), one statement per set.
// NOTE(review): the sfRe/sfIm/ifft names suggest an inverse FFT over the real
// and imaginary parts of spectral data -- confirm against the generator.
ptrdiff_t t26 = 0;
for (; t26 < 2; ++t26) {
// Load four groups of (Re, Im, Re, Im) vectors. Inside a group the constant
// offsets step by 64; consecutive groups start 2166784 apart; t26 advances
// the base by 256.
// NOTE(review): offsets look like byte offsets (64 = one 16-float vector) --
// confirm sfPtr3's pointee type.
__m512 sfRe249 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm249 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe253 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm253 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe250 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm250 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe254 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm254 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe251 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm251 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe255 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm255 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe252 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm252 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfRe256 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
__m512 sfIm256 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k38+768*r15+256*t26);
// Only the first group (sfRe249/sfIm249 and sfRe253/sfIm253) gets this extra
// step: two lane permutations of the Re and Im vectors are combined by masked
// FMAs. Mask 65021 (0xFDFD) covers every lane except 1 and 9; in those two
// lanes the masked FMA passes its first operand through unchanged.
__m512i ifft4225 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4226 = _mm512_permutexvar_ps(ifft4225, sfRe249);
__m512 ifft4317 = _mm512_permutexvar_ps(ifft4225, sfRe253);
__m512i ifft4227 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4228 = _mm512_permutexvar_ps(ifft4227, sfRe249);
__m512 ifft4318 = _mm512_permutexvar_ps(ifft4227, sfRe253);
__m512 ifft4229 = _mm512_permutexvar_ps(ifft4225, sfIm249);
__m512 ifft4319 = _mm512_permutexvar_ps(ifft4225, sfIm253);
__m512 ifft4230 = _mm512_permutexvar_ps(ifft4227, sfIm249);
__m512 ifft4320 = _mm512_permutexvar_ps(ifft4227, sfIm253);
__m512 ifft4231 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4232 = _mm512_mask_fmadd_ps(ifft4230, 65021, ifft4231, ifft4226);
__m512 ifft4321 = _mm512_mask_fmadd_ps(ifft4320, 65021, ifft4231, ifft4317);
__m512 ifft4233 = _mm512_mask_fnmadd_ps(ifft4229, 65021, ifft4231, ifft4228);
__m512 ifft4322 = _mm512_mask_fnmadd_ps(ifft4319, 65021, ifft4231, ifft4318);
// Adjacent-lane butterfly on all sixteen vectors: shuffle immediate 177 swaps
// lanes 2i and 2i+1, and the +1/-1 pattern makes lane 2i hold x[2i]+x[2i+1]
// and lane 2i+1 hold x[2i]-x[2i+1].
__m512 ifft4234 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4235 = _mm512_fmadd_ps(ifft4232, ifft4234, _mm512_shuffle_ps(ifft4232, ifft4232, 177));
__m512 ifft4323 = _mm512_fmadd_ps(ifft4321, ifft4234, _mm512_shuffle_ps(ifft4321, ifft4321, 177));
__m512 ifft4236 = _mm512_fmadd_ps(ifft4233, ifft4234, _mm512_shuffle_ps(ifft4233, ifft4233, 177));
__m512 ifft4324 = _mm512_fmadd_ps(ifft4322, ifft4234, _mm512_shuffle_ps(ifft4322, ifft4322, 177));
__m512 ifft4237 = _mm512_fmadd_ps(sfRe250, ifft4234, _mm512_shuffle_ps(sfRe250, sfRe250, 177));
__m512 ifft4325 = _mm512_fmadd_ps(sfRe254, ifft4234, _mm512_shuffle_ps(sfRe254, sfRe254, 177));
__m512 ifft4238 = _mm512_fmadd_ps(sfIm250, ifft4234, _mm512_shuffle_ps(sfIm250, sfIm250, 177));
__m512 ifft4326 = _mm512_fmadd_ps(sfIm254, ifft4234, _mm512_shuffle_ps(sfIm254, sfIm254, 177));
__m512 ifft4239 = _mm512_fmadd_ps(sfRe251, ifft4234, _mm512_shuffle_ps(sfRe251, sfRe251, 177));
__m512 ifft4327 = _mm512_fmadd_ps(sfRe255, ifft4234, _mm512_shuffle_ps(sfRe255, sfRe255, 177));
__m512 ifft4240 = _mm512_fmadd_ps(sfIm251, ifft4234, _mm512_shuffle_ps(sfIm251, sfIm251, 177));
__m512 ifft4328 = _mm512_fmadd_ps(sfIm255, ifft4234, _mm512_shuffle_ps(sfIm255, sfIm255, 177));
__m512 ifft4241 = _mm512_fmadd_ps(sfRe252, ifft4234, _mm512_shuffle_ps(sfRe252, sfRe252, 177));
__m512 ifft4329 = _mm512_fmadd_ps(sfRe256, ifft4234, _mm512_shuffle_ps(sfRe256, sfRe256, 177));
__m512 ifft4242 = _mm512_fmadd_ps(sfIm252, ifft4234, _mm512_shuffle_ps(sfIm252, sfIm252, 177));
__m512 ifft4330 = _mm512_fmadd_ps(sfIm256, ifft4234, _mm512_shuffle_ps(sfIm256, sfIm256, 177));
// Per-lane multiply by the constant vector c = ifft4243. Together with the
// next block (d = ifft4252) each (re, im) pair becomes
// (re*c - im*d, im*c + re*d), i.e. a per-lane complex multiply by (c + i*d).
// 7.0710677e-01f is approximately sqrt(0.5).
__m512 ifft4243 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4244 = _mm512_mul_ps(ifft4235, ifft4243);
__m512 ifft4331 = _mm512_mul_ps(ifft4323, ifft4243);
__m512 ifft4245 = _mm512_mul_ps(ifft4236, ifft4243);
__m512 ifft4332 = _mm512_mul_ps(ifft4324, ifft4243);
__m512 ifft4246 = _mm512_mul_ps(ifft4237, ifft4243);
__m512 ifft4333 = _mm512_mul_ps(ifft4325, ifft4243);
__m512 ifft4247 = _mm512_mul_ps(ifft4238, ifft4243);
__m512 ifft4334 = _mm512_mul_ps(ifft4326, ifft4243);
__m512 ifft4248 = _mm512_mul_ps(ifft4239, ifft4243);
__m512 ifft4335 = _mm512_mul_ps(ifft4327, ifft4243);
__m512 ifft4249 = _mm512_mul_ps(ifft4240, ifft4243);
__m512 ifft4336 = _mm512_mul_ps(ifft4328, ifft4243);
__m512 ifft4250 = _mm512_mul_ps(ifft4241, ifft4243);
__m512 ifft4337 = _mm512_mul_ps(ifft4329, ifft4243);
__m512 ifft4251 = _mm512_mul_ps(ifft4242, ifft4243);
__m512 ifft4338 = _mm512_mul_ps(ifft4330, ifft4243);
// Cross terms of the multiply: fnmadd gives (re*c) - im*d for the new real
// part, fmadd gives re*d + (im*c) for the new imaginary part.
__m512 ifft4252 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4253 = _mm512_fnmadd_ps(ifft4236, ifft4252, ifft4244);
__m512 ifft4339 = _mm512_fnmadd_ps(ifft4324, ifft4252, ifft4331);
__m512 ifft4254 = _mm512_fmadd_ps(ifft4235, ifft4252, ifft4245);
__m512 ifft4340 = _mm512_fmadd_ps(ifft4323, ifft4252, ifft4332);
__m512 ifft4255 = _mm512_fnmadd_ps(ifft4238, ifft4252, ifft4246);
__m512 ifft4341 = _mm512_fnmadd_ps(ifft4326, ifft4252, ifft4333);
__m512 ifft4256 = _mm512_fmadd_ps(ifft4237, ifft4252, ifft4247);
__m512 ifft4342 = _mm512_fmadd_ps(ifft4325, ifft4252, ifft4334);
__m512 ifft4257 = _mm512_fnmadd_ps(ifft4240, ifft4252, ifft4248);
__m512 ifft4343 = _mm512_fnmadd_ps(ifft4328, ifft4252, ifft4335);
__m512 ifft4258 = _mm512_fmadd_ps(ifft4239, ifft4252, ifft4249);
__m512 ifft4344 = _mm512_fmadd_ps(ifft4327, ifft4252, ifft4336);
__m512 ifft4259 = _mm512_fnmadd_ps(ifft4242, ifft4252, ifft4250);
__m512 ifft4345 = _mm512_fnmadd_ps(ifft4330, ifft4252, ifft4337);
__m512 ifft4260 = _mm512_fmadd_ps(ifft4241, ifft4252, ifft4251);
__m512 ifft4346 = _mm512_fmadd_ps(ifft4329, ifft4252, ifft4338);
// Distance-2 butterfly inside every group of four lanes: shuffle immediate 78
// swaps lane pairs (0,1) and (2,3); lanes 0,1 receive sums and lanes 2,3
// receive differences.
__m512 ifft4261 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4262 = _mm512_fmadd_ps(ifft4253, ifft4261, _mm512_shuffle_ps(ifft4253, ifft4253, 78));
__m512 ifft4347 = _mm512_fmadd_ps(ifft4339, ifft4261, _mm512_shuffle_ps(ifft4339, ifft4339, 78));
__m512 ifft4263 = _mm512_fmadd_ps(ifft4254, ifft4261, _mm512_shuffle_ps(ifft4254, ifft4254, 78));
__m512 ifft4348 = _mm512_fmadd_ps(ifft4340, ifft4261, _mm512_shuffle_ps(ifft4340, ifft4340, 78));
__m512 ifft4264 = _mm512_fmadd_ps(ifft4255, ifft4261, _mm512_shuffle_ps(ifft4255, ifft4255, 78));
__m512 ifft4349 = _mm512_fmadd_ps(ifft4341, ifft4261, _mm512_shuffle_ps(ifft4341, ifft4341, 78));
__m512 ifft4265 = _mm512_fmadd_ps(ifft4256, ifft4261, _mm512_shuffle_ps(ifft4256, ifft4256, 78));
__m512 ifft4350 = _mm512_fmadd_ps(ifft4342, ifft4261, _mm512_shuffle_ps(ifft4342, ifft4342, 78));
__m512 ifft4266 = _mm512_fmadd_ps(ifft4257, ifft4261, _mm512_shuffle_ps(ifft4257, ifft4257, 78));
__m512 ifft4351 = _mm512_fmadd_ps(ifft4343, ifft4261, _mm512_shuffle_ps(ifft4343, ifft4343, 78));
__m512 ifft4267 = _mm512_fmadd_ps(ifft4258, ifft4261, _mm512_shuffle_ps(ifft4258, ifft4258, 78));
__m512 ifft4352 = _mm512_fmadd_ps(ifft4344, ifft4261, _mm512_shuffle_ps(ifft4344, ifft4344, 78));
__m512 ifft4268 = _mm512_fmadd_ps(ifft4259, ifft4261, _mm512_shuffle_ps(ifft4259, ifft4259, 78));
__m512 ifft4353 = _mm512_fmadd_ps(ifft4345, ifft4261, _mm512_shuffle_ps(ifft4345, ifft4345, 78));
__m512 ifft4269 = _mm512_fmadd_ps(ifft4260, ifft4261, _mm512_shuffle_ps(ifft4260, ifft4260, 78));
__m512 ifft4354 = _mm512_fmadd_ps(ifft4346, ifft4261, _mm512_shuffle_ps(ifft4346, ifft4346, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes the pair
// (re, im) is replaced by (-im, re) -- the same as multiplying by i; all other
// lanes pass through unchanged.
__m512 ifft4270 = _mm512_mask_sub_ps(ifft4262, 49344, _mm512_setzero_ps(), ifft4263);
__m512 ifft4355 = _mm512_mask_sub_ps(ifft4347, 49344, _mm512_setzero_ps(), ifft4348);
__m512 ifft4271 = _mm512_mask_mov_ps(ifft4263, 49344, ifft4262);
__m512 ifft4356 = _mm512_mask_mov_ps(ifft4348, 49344, ifft4347);
__m512 ifft4272 = _mm512_mask_sub_ps(ifft4264, 49344, _mm512_setzero_ps(), ifft4265);
__m512 ifft4357 = _mm512_mask_sub_ps(ifft4349, 49344, _mm512_setzero_ps(), ifft4350);
__m512 ifft4273 = _mm512_mask_mov_ps(ifft4265, 49344, ifft4264);
__m512 ifft4358 = _mm512_mask_mov_ps(ifft4350, 49344, ifft4349);
__m512 ifft4274 = _mm512_mask_sub_ps(ifft4266, 49344, _mm512_setzero_ps(), ifft4267);
__m512 ifft4359 = _mm512_mask_sub_ps(ifft4351, 49344, _mm512_setzero_ps(), ifft4352);
__m512 ifft4275 = _mm512_mask_mov_ps(ifft4267, 49344, ifft4266);
__m512 ifft4360 = _mm512_mask_mov_ps(ifft4352, 49344, ifft4351);
__m512 ifft4276 = _mm512_mask_sub_ps(ifft4268, 49344, _mm512_setzero_ps(), ifft4269);
__m512 ifft4361 = _mm512_mask_sub_ps(ifft4353, 49344, _mm512_setzero_ps(), ifft4354);
__m512 ifft4277 = _mm512_mask_mov_ps(ifft4269, 49344, ifft4268);
__m512 ifft4362 = _mm512_mask_mov_ps(ifft4354, 49344, ifft4353);
// Distance-4 butterfly: shuffle_f32x4 immediate 177 swaps neighbouring 128-bit
// blocks; lanes 0-3 / 8-11 receive sums, lanes 4-7 / 12-15 receive differences.
// ifft4284 and ifft4368 use fnmsub (-(a*b) - c) instead of fmadd, so they hold
// the negated butterfly result.
__m512 ifft4278 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4279 = _mm512_fmadd_ps(ifft4270, ifft4278, _mm512_shuffle_f32x4(ifft4270, ifft4270, 177));
__m512 ifft4363 = _mm512_fmadd_ps(ifft4355, ifft4278, _mm512_shuffle_f32x4(ifft4355, ifft4355, 177));
__m512 ifft4280 = _mm512_fmadd_ps(ifft4271, ifft4278, _mm512_shuffle_f32x4(ifft4271, ifft4271, 177));
__m512 ifft4364 = _mm512_fmadd_ps(ifft4356, ifft4278, _mm512_shuffle_f32x4(ifft4356, ifft4356, 177));
__m512 ifft4281 = _mm512_fmadd_ps(ifft4272, ifft4278, _mm512_shuffle_f32x4(ifft4272, ifft4272, 177));
__m512 ifft4365 = _mm512_fmadd_ps(ifft4357, ifft4278, _mm512_shuffle_f32x4(ifft4357, ifft4357, 177));
__m512 ifft4282 = _mm512_fmadd_ps(ifft4273, ifft4278, _mm512_shuffle_f32x4(ifft4273, ifft4273, 177));
__m512 ifft4366 = _mm512_fmadd_ps(ifft4358, ifft4278, _mm512_shuffle_f32x4(ifft4358, ifft4358, 177));
__m512 ifft4283 = _mm512_fmadd_ps(ifft4274, ifft4278, _mm512_shuffle_f32x4(ifft4274, ifft4274, 177));
__m512 ifft4367 = _mm512_fmadd_ps(ifft4359, ifft4278, _mm512_shuffle_f32x4(ifft4359, ifft4359, 177));
__m512 ifft4284 = _mm512_fnmsub_ps(ifft4275, ifft4278, _mm512_shuffle_f32x4(ifft4275, ifft4275, 177));
__m512 ifft4368 = _mm512_fnmsub_ps(ifft4360, ifft4278, _mm512_shuffle_f32x4(ifft4360, ifft4360, 177));
__m512 ifft4285 = _mm512_fmadd_ps(ifft4276, ifft4278, _mm512_shuffle_f32x4(ifft4276, ifft4276, 177));
__m512 ifft4369 = _mm512_fmadd_ps(ifft4361, ifft4278, _mm512_shuffle_f32x4(ifft4361, ifft4361, 177));
__m512 ifft4286 = _mm512_fmadd_ps(ifft4277, ifft4278, _mm512_shuffle_f32x4(ifft4277, ifft4277, 177));
__m512 ifft4370 = _mm512_fmadd_ps(ifft4362, ifft4278, _mm512_shuffle_f32x4(ifft4362, ifft4362, 177));
// From here on the work is across vectors rather than across lanes: sums and
// differences of whole vectors produced above.
__m512 ifft4287 = _mm512_add_ps(ifft4279, ifft4280);
__m512 ifft4371 = _mm512_add_ps(ifft4363, ifft4364);
__m512 ifft4288 = _mm512_sub_ps(ifft4279, ifft4280);
__m512 ifft4372 = _mm512_sub_ps(ifft4363, ifft4364);
__m512 ifft4289 = _mm512_sub_ps(ifft4281, ifft4285);
__m512 ifft4373 = _mm512_sub_ps(ifft4365, ifft4369);
__m512 ifft4290 = _mm512_add_ps(ifft4282, ifft4286);
__m512 ifft4374 = _mm512_add_ps(ifft4366, ifft4370);
__m512 ifft4291 = _mm512_add_ps(ifft4281, ifft4285);
__m512 ifft4375 = _mm512_add_ps(ifft4365, ifft4369);
__m512 ifft4292 = _mm512_sub_ps(ifft4282, ifft4286);
__m512 ifft4376 = _mm512_sub_ps(ifft4366, ifft4370);
// Scaling is folded into the arithmetic: 3.125e-02f is 1/32 and 1.5625e-02f
// is 1/64. fmsub yields a*b - c.
__m512 ifft4293 = _mm512_mul_ps(ifft4283, _mm512_set1_ps(3.125e-02f));
__m512 ifft4377 = _mm512_mul_ps(ifft4367, _mm512_set1_ps(3.125e-02f));
__m512 ifft4294 = _mm512_mul_ps(ifft4284, _mm512_set1_ps(3.125e-02f));
__m512 ifft4378 = _mm512_mul_ps(ifft4368, _mm512_set1_ps(3.125e-02f));
__m512 ifft4295 = _mm512_fmadd_ps(ifft4287, _mm512_set1_ps(1.5625e-02f), ifft4293);
__m512 ifft4379 = _mm512_fmadd_ps(ifft4371, _mm512_set1_ps(1.5625e-02f), ifft4377);
__m512 ifft4296 = _mm512_fmsub_ps(ifft4287, _mm512_set1_ps(1.5625e-02f), ifft4293);
__m512 ifft4380 = _mm512_fmsub_ps(ifft4371, _mm512_set1_ps(1.5625e-02f), ifft4377);
__m512 ifft4297 = _mm512_fmadd_ps(ifft4288, _mm512_set1_ps(1.5625e-02f), ifft4294);
__m512 ifft4381 = _mm512_fmadd_ps(ifft4372, _mm512_set1_ps(1.5625e-02f), ifft4378);
__m512 ifft4298 = _mm512_fmsub_ps(ifft4288, _mm512_set1_ps(1.5625e-02f), ifft4294);
__m512 ifft4382 = _mm512_fmsub_ps(ifft4372, _mm512_set1_ps(1.5625e-02f), ifft4378);
// Sum and difference of one pair per set, then combination with
// 7.0710677e-01f (approximately sqrt(0.5)): fnmadd yields c - a*b.
__m512 ifft4299 = _mm512_add_ps(ifft4289, ifft4290);
__m512 ifft4383 = _mm512_add_ps(ifft4373, ifft4374);
__m512 ifft4300 = _mm512_sub_ps(ifft4289, ifft4290);
__m512 ifft4384 = _mm512_sub_ps(ifft4373, ifft4374);
__m512 ifft4301 = _mm512_fnmadd_ps(ifft4299, _mm512_set1_ps(7.0710677e-01f), ifft4291);
__m512 ifft4385 = _mm512_fnmadd_ps(ifft4383, _mm512_set1_ps(7.0710677e-01f), ifft4375);
__m512 ifft4302 = _mm512_fmadd_ps(ifft4299, _mm512_set1_ps(7.0710677e-01f), ifft4291);
__m512 ifft4386 = _mm512_fmadd_ps(ifft4383, _mm512_set1_ps(7.0710677e-01f), ifft4375);
__m512 ifft4303 = _mm512_fmadd_ps(ifft4300, _mm512_set1_ps(7.0710677e-01f), ifft4292);
__m512 ifft4387 = _mm512_fmadd_ps(ifft4384, _mm512_set1_ps(7.0710677e-01f), ifft4376);
__m512 ifft4304 = _mm512_fmsub_ps(ifft4300, _mm512_set1_ps(7.0710677e-01f), ifft4292);
__m512 ifft4388 = _mm512_fmsub_ps(ifft4384, _mm512_set1_ps(7.0710677e-01f), ifft4376);
// Another add/subtract layer on the four values just produced.
__m512 ifft4305 = _mm512_add_ps(ifft4301, ifft4302);
__m512 ifft4389 = _mm512_add_ps(ifft4385, ifft4386);
__m512 ifft4306 = _mm512_sub_ps(ifft4301, ifft4302);
__m512 ifft4390 = _mm512_sub_ps(ifft4385, ifft4386);
__m512 ifft4307 = _mm512_add_ps(ifft4303, ifft4304);
__m512 ifft4391 = _mm512_add_ps(ifft4387, ifft4388);
__m512 ifft4308 = _mm512_sub_ps(ifft4303, ifft4304);
__m512 ifft4392 = _mm512_sub_ps(ifft4387, ifft4388);
// Final layer: scale by 1/64 and add to / subtract from the partial results
// ifft4295..ifft4298 (ifft4379..ifft4382).
__m512 ifft4309 = _mm512_fmadd_ps(ifft4305, _mm512_set1_ps(1.5625e-02f), ifft4295);
__m512 ifft4393 = _mm512_fmadd_ps(ifft4389, _mm512_set1_ps(1.5625e-02f), ifft4379);
__m512 ifft4310 = _mm512_fnmadd_ps(ifft4305, _mm512_set1_ps(1.5625e-02f), ifft4295);
__m512 ifft4394 = _mm512_fnmadd_ps(ifft4389, _mm512_set1_ps(1.5625e-02f), ifft4379);
__m512 ifft4311 = _mm512_fmadd_ps(ifft4307, _mm512_set1_ps(1.5625e-02f), ifft4297);
__m512 ifft4395 = _mm512_fmadd_ps(ifft4391, _mm512_set1_ps(1.5625e-02f), ifft4381);
__m512 ifft4312 = _mm512_fnmadd_ps(ifft4307, _mm512_set1_ps(1.5625e-02f), ifft4297);
__m512 ifft4396 = _mm512_fnmadd_ps(ifft4391, _mm512_set1_ps(1.5625e-02f), ifft4381);
__m512 ifft4313 = _mm512_fnmadd_ps(ifft4308, _mm512_set1_ps(1.5625e-02f), ifft4296);
__m512 ifft4397 = _mm512_fnmadd_ps(ifft4392, _mm512_set1_ps(1.5625e-02f), ifft4380);
__m512 ifft4314 = _mm512_fmadd_ps(ifft4308, _mm512_set1_ps(1.5625e-02f), ifft4296);
__m512 ifft4398 = _mm512_fmadd_ps(ifft4392, _mm512_set1_ps(1.5625e-02f), ifft4380);
__m512 ifft4315 = _mm512_fmadd_ps(ifft4306, _mm512_set1_ps(1.5625e-02f), ifft4298);
__m512 ifft4399 = _mm512_fmadd_ps(ifft4390, _mm512_set1_ps(1.5625e-02f), ifft4382);
__m512 ifft4316 = _mm512_fnmadd_ps(ifft4306, _mm512_set1_ps(1.5625e-02f), ifft4298);
__m512 ifft4400 = _mm512_fnmadd_ps(ifft4390, _mm512_set1_ps(1.5625e-02f), ifft4382);
// Five of the eight results per set are carried forward under dat* names.
__m512 dat830 = ifft4309;
__m512 dat835 = ifft4393;
__m512 dat831 = ifft4311;
__m512 dat836 = ifft4395;
__m512 dat832 = ifft4313;
__m512 dat837 = ifft4397;
__m512 dat833 = ifft4315;
__m512 dat838 = ifft4399;
__m512 dat834 = ifft4310;
__m512 dat839 = ifft4394;
// The remaining three results per set are not used; the (void) casts only
// reference them so they do not count as unused variables.
(void)ifft4312;
(void)ifft4396;
(void)ifft4314;
(void)ifft4398;
(void)ifft4316;
(void)ifft4400;
// Merge one vector from each set into a single vector. Index values >= 16
// address the second source of permutex2var.
// pm43: result lanes 0-4 <- first source lanes 0-4,
//       result lanes 5-9 <- second source lanes 0-4.
// pm44: result lanes 0-4 <- second source lanes 8-12,
//       result lanes 5-9 <- first source lanes 8-12.
// Result lanes 10-15 are filled as well but are never stored (see mask below).
// Both index vectors are constants that do not depend on t26.
__m512i pm43 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack211 = _mm512_permutex2var_ps(dat830, pm43, dat835);
__m512i pm44 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack212 = _mm512_permutex2var_ps(dat830, pm44, dat835);
__m512 pack213 = _mm512_permutex2var_ps(dat831, pm43, dat836);
__m512 pack214 = _mm512_permutex2var_ps(dat831, pm44, dat836);
__m512 pack215 = _mm512_permutex2var_ps(dat832, pm43, dat837);
__m512 pack216 = _mm512_permutex2var_ps(dat832, pm44, dat837);
__m512 pack217 = _mm512_permutex2var_ps(dat833, pm43, dat838);
__m512 pack218 = _mm512_permutex2var_ps(dat833, pm44, dat838);
__m512 pack219 = _mm512_permutex2var_ps(dat834, pm43, dat839);
__m512 pack220 = _mm512_permutex2var_ps(dat834, pm44, dat839);
// Element-wise max against zero.
// NOTE(review): presumably the fused ReLU activation -- confirm.
pack211 = _mm512_max_ps(_mm512_setzero_ps(), pack211);
pack212 = _mm512_max_ps(_mm512_setzero_ps(), pack212);
pack213 = _mm512_max_ps(_mm512_setzero_ps(), pack213);
pack214 = _mm512_max_ps(_mm512_setzero_ps(), pack214);
pack215 = _mm512_max_ps(_mm512_setzero_ps(), pack215);
pack216 = _mm512_max_ps(_mm512_setzero_ps(), pack216);
pack217 = _mm512_max_ps(_mm512_setzero_ps(), pack217);
pack218 = _mm512_max_ps(_mm512_setzero_ps(), pack218);
pack219 = _mm512_max_ps(_mm512_setzero_ps(), pack219);
pack220 = _mm512_max_ps(_mm512_setzero_ps(), pack220);
// Masked stores: mask 1023 (0x3FF) writes only lanes 0-9 of each vector.
// The pm43 vectors go to constant offsets 40, 488, 936, 1384, 1832 (448
// apart) and the pm44 vectors to the same offsets plus 50240. Each pass of
// t26 moves the destination by 40.
// NOTE(review): offsets look like byte offsets into datPtr2 (ten floats =
// 40 bytes, matching the t26 step) -- confirm datPtr2's pointee type.
_mm512_mask_storeu_ps(datPtr2+40+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack211);
_mm512_mask_storeu_ps(datPtr2+50280+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack212);
_mm512_mask_storeu_ps(datPtr2+488+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack213);
_mm512_mask_storeu_ps(datPtr2+50728+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack214);
_mm512_mask_storeu_ps(datPtr2+936+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack215);
_mm512_mask_storeu_ps(datPtr2+51176+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack216);
_mm512_mask_storeu_ps(datPtr2+1384+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack217);
_mm512_mask_storeu_ps(datPtr2+51624+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack218);
_mm512_mask_storeu_ps(datPtr2+1832+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack219);
_mm512_mask_storeu_ps(datPtr2+52072+3215360*i9+200960*k38+100480*r15+448*toH14+4*toW14+40*t26, 1023, pack220);
}
}
}
if (j5 >= last2) return;
++j5;
rel5 = 20;
}
if (rel5 < 22) {
ptrdiff_t toH15 = base5+25;
ptrdiff_t toW15 = -570+30*rel5;
ptrdiff_t jj17 = 21-rel5+j5;
for (; j5 <= jj17; toW15 += 30) {
ptrdiff_t k39 = 16*w21;
for (; k39 != 16; ++k39) {
ptrdiff_t r16 = 0;
for (; r16 != 2; ++r16) {
ptrdiff_t t27 = 0;
for (; t27 < 3; ++t27) {
// First part of a loop body indexed by t27 (it appears in every address
// below); later stages follow these statements. Sixteen 512-bit vectors are
// loaded from sfPtr3 (sfRe*/sfIm*) and pushed through permutes, sign-flip
// butterflies and scaled add/subtract layers (ifft*). Two independent sets of
// values are processed in lock-step (ifft4402.. and ifft4493..), one statement
// per set.
// NOTE(review): the sfRe/sfIm/ifft names suggest an inverse FFT over the real
// and imaginary parts of spectral data -- confirm against the generator.
//
// Load four groups of (Re, Im, Re, Im) vectors. Inside a group the constant
// offsets step by 64; consecutive groups start 2166784 apart; t27 advances
// the base by 256.
// NOTE(review): offsets look like byte offsets (64 = one 16-float vector) --
// confirm sfPtr3's pointee type.
__m512 sfRe257 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm257 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe261 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm261 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe258 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm258 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe262 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm262 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe259 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm259 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe263 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm263 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe260 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm260 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfRe264 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
__m512 sfIm264 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k39+768*r16+256*t27);
// Only the first group (sfRe257/sfIm257 and sfRe261/sfIm261) gets this extra
// step: two lane permutations of the Re and Im vectors are combined by masked
// FMAs. Mask 65021 (0xFDFD) covers every lane except 1 and 9; in those two
// lanes the masked FMA passes its first operand through unchanged.
__m512i ifft4401 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4402 = _mm512_permutexvar_ps(ifft4401, sfRe257);
__m512 ifft4493 = _mm512_permutexvar_ps(ifft4401, sfRe261);
__m512i ifft4403 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4404 = _mm512_permutexvar_ps(ifft4403, sfRe257);
__m512 ifft4494 = _mm512_permutexvar_ps(ifft4403, sfRe261);
__m512 ifft4405 = _mm512_permutexvar_ps(ifft4401, sfIm257);
__m512 ifft4495 = _mm512_permutexvar_ps(ifft4401, sfIm261);
__m512 ifft4406 = _mm512_permutexvar_ps(ifft4403, sfIm257);
__m512 ifft4496 = _mm512_permutexvar_ps(ifft4403, sfIm261);
__m512 ifft4407 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4408 = _mm512_mask_fmadd_ps(ifft4406, 65021, ifft4407, ifft4402);
__m512 ifft4497 = _mm512_mask_fmadd_ps(ifft4496, 65021, ifft4407, ifft4493);
__m512 ifft4409 = _mm512_mask_fnmadd_ps(ifft4405, 65021, ifft4407, ifft4404);
__m512 ifft4498 = _mm512_mask_fnmadd_ps(ifft4495, 65021, ifft4407, ifft4494);
// Adjacent-lane butterfly on all sixteen vectors: shuffle immediate 177 swaps
// lanes 2i and 2i+1, and the +1/-1 pattern makes lane 2i hold x[2i]+x[2i+1]
// and lane 2i+1 hold x[2i]-x[2i+1].
__m512 ifft4410 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4411 = _mm512_fmadd_ps(ifft4408, ifft4410, _mm512_shuffle_ps(ifft4408, ifft4408, 177));
__m512 ifft4499 = _mm512_fmadd_ps(ifft4497, ifft4410, _mm512_shuffle_ps(ifft4497, ifft4497, 177));
__m512 ifft4412 = _mm512_fmadd_ps(ifft4409, ifft4410, _mm512_shuffle_ps(ifft4409, ifft4409, 177));
__m512 ifft4500 = _mm512_fmadd_ps(ifft4498, ifft4410, _mm512_shuffle_ps(ifft4498, ifft4498, 177));
__m512 ifft4413 = _mm512_fmadd_ps(sfRe258, ifft4410, _mm512_shuffle_ps(sfRe258, sfRe258, 177));
__m512 ifft4501 = _mm512_fmadd_ps(sfRe262, ifft4410, _mm512_shuffle_ps(sfRe262, sfRe262, 177));
__m512 ifft4414 = _mm512_fmadd_ps(sfIm258, ifft4410, _mm512_shuffle_ps(sfIm258, sfIm258, 177));
__m512 ifft4502 = _mm512_fmadd_ps(sfIm262, ifft4410, _mm512_shuffle_ps(sfIm262, sfIm262, 177));
__m512 ifft4415 = _mm512_fmadd_ps(sfRe259, ifft4410, _mm512_shuffle_ps(sfRe259, sfRe259, 177));
__m512 ifft4503 = _mm512_fmadd_ps(sfRe263, ifft4410, _mm512_shuffle_ps(sfRe263, sfRe263, 177));
__m512 ifft4416 = _mm512_fmadd_ps(sfIm259, ifft4410, _mm512_shuffle_ps(sfIm259, sfIm259, 177));
__m512 ifft4504 = _mm512_fmadd_ps(sfIm263, ifft4410, _mm512_shuffle_ps(sfIm263, sfIm263, 177));
__m512 ifft4417 = _mm512_fmadd_ps(sfRe260, ifft4410, _mm512_shuffle_ps(sfRe260, sfRe260, 177));
__m512 ifft4505 = _mm512_fmadd_ps(sfRe264, ifft4410, _mm512_shuffle_ps(sfRe264, sfRe264, 177));
__m512 ifft4418 = _mm512_fmadd_ps(sfIm260, ifft4410, _mm512_shuffle_ps(sfIm260, sfIm260, 177));
__m512 ifft4506 = _mm512_fmadd_ps(sfIm264, ifft4410, _mm512_shuffle_ps(sfIm264, sfIm264, 177));
// Per-lane multiply by the constant vector c = ifft4419. Together with the
// next block (d = ifft4428) each (re, im) pair becomes
// (re*c - im*d, im*c + re*d), i.e. a per-lane complex multiply by (c + i*d).
// 7.0710677e-01f is approximately sqrt(0.5).
__m512 ifft4419 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4420 = _mm512_mul_ps(ifft4411, ifft4419);
__m512 ifft4507 = _mm512_mul_ps(ifft4499, ifft4419);
__m512 ifft4421 = _mm512_mul_ps(ifft4412, ifft4419);
__m512 ifft4508 = _mm512_mul_ps(ifft4500, ifft4419);
__m512 ifft4422 = _mm512_mul_ps(ifft4413, ifft4419);
__m512 ifft4509 = _mm512_mul_ps(ifft4501, ifft4419);
__m512 ifft4423 = _mm512_mul_ps(ifft4414, ifft4419);
__m512 ifft4510 = _mm512_mul_ps(ifft4502, ifft4419);
__m512 ifft4424 = _mm512_mul_ps(ifft4415, ifft4419);
__m512 ifft4511 = _mm512_mul_ps(ifft4503, ifft4419);
__m512 ifft4425 = _mm512_mul_ps(ifft4416, ifft4419);
__m512 ifft4512 = _mm512_mul_ps(ifft4504, ifft4419);
__m512 ifft4426 = _mm512_mul_ps(ifft4417, ifft4419);
__m512 ifft4513 = _mm512_mul_ps(ifft4505, ifft4419);
__m512 ifft4427 = _mm512_mul_ps(ifft4418, ifft4419);
__m512 ifft4514 = _mm512_mul_ps(ifft4506, ifft4419);
// Cross terms of the multiply: fnmadd gives (re*c) - im*d for the new real
// part, fmadd gives re*d + (im*c) for the new imaginary part.
__m512 ifft4428 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4429 = _mm512_fnmadd_ps(ifft4412, ifft4428, ifft4420);
__m512 ifft4515 = _mm512_fnmadd_ps(ifft4500, ifft4428, ifft4507);
__m512 ifft4430 = _mm512_fmadd_ps(ifft4411, ifft4428, ifft4421);
__m512 ifft4516 = _mm512_fmadd_ps(ifft4499, ifft4428, ifft4508);
__m512 ifft4431 = _mm512_fnmadd_ps(ifft4414, ifft4428, ifft4422);
__m512 ifft4517 = _mm512_fnmadd_ps(ifft4502, ifft4428, ifft4509);
__m512 ifft4432 = _mm512_fmadd_ps(ifft4413, ifft4428, ifft4423);
__m512 ifft4518 = _mm512_fmadd_ps(ifft4501, ifft4428, ifft4510);
__m512 ifft4433 = _mm512_fnmadd_ps(ifft4416, ifft4428, ifft4424);
__m512 ifft4519 = _mm512_fnmadd_ps(ifft4504, ifft4428, ifft4511);
__m512 ifft4434 = _mm512_fmadd_ps(ifft4415, ifft4428, ifft4425);
__m512 ifft4520 = _mm512_fmadd_ps(ifft4503, ifft4428, ifft4512);
__m512 ifft4435 = _mm512_fnmadd_ps(ifft4418, ifft4428, ifft4426);
__m512 ifft4521 = _mm512_fnmadd_ps(ifft4506, ifft4428, ifft4513);
__m512 ifft4436 = _mm512_fmadd_ps(ifft4417, ifft4428, ifft4427);
__m512 ifft4522 = _mm512_fmadd_ps(ifft4505, ifft4428, ifft4514);
// Distance-2 butterfly inside every group of four lanes: shuffle immediate 78
// swaps lane pairs (0,1) and (2,3); lanes 0,1 receive sums and lanes 2,3
// receive differences.
__m512 ifft4437 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4438 = _mm512_fmadd_ps(ifft4429, ifft4437, _mm512_shuffle_ps(ifft4429, ifft4429, 78));
__m512 ifft4523 = _mm512_fmadd_ps(ifft4515, ifft4437, _mm512_shuffle_ps(ifft4515, ifft4515, 78));
__m512 ifft4439 = _mm512_fmadd_ps(ifft4430, ifft4437, _mm512_shuffle_ps(ifft4430, ifft4430, 78));
__m512 ifft4524 = _mm512_fmadd_ps(ifft4516, ifft4437, _mm512_shuffle_ps(ifft4516, ifft4516, 78));
__m512 ifft4440 = _mm512_fmadd_ps(ifft4431, ifft4437, _mm512_shuffle_ps(ifft4431, ifft4431, 78));
__m512 ifft4525 = _mm512_fmadd_ps(ifft4517, ifft4437, _mm512_shuffle_ps(ifft4517, ifft4517, 78));
__m512 ifft4441 = _mm512_fmadd_ps(ifft4432, ifft4437, _mm512_shuffle_ps(ifft4432, ifft4432, 78));
__m512 ifft4526 = _mm512_fmadd_ps(ifft4518, ifft4437, _mm512_shuffle_ps(ifft4518, ifft4518, 78));
__m512 ifft4442 = _mm512_fmadd_ps(ifft4433, ifft4437, _mm512_shuffle_ps(ifft4433, ifft4433, 78));
__m512 ifft4527 = _mm512_fmadd_ps(ifft4519, ifft4437, _mm512_shuffle_ps(ifft4519, ifft4519, 78));
__m512 ifft4443 = _mm512_fmadd_ps(ifft4434, ifft4437, _mm512_shuffle_ps(ifft4434, ifft4434, 78));
__m512 ifft4528 = _mm512_fmadd_ps(ifft4520, ifft4437, _mm512_shuffle_ps(ifft4520, ifft4520, 78));
__m512 ifft4444 = _mm512_fmadd_ps(ifft4435, ifft4437, _mm512_shuffle_ps(ifft4435, ifft4435, 78));
__m512 ifft4529 = _mm512_fmadd_ps(ifft4521, ifft4437, _mm512_shuffle_ps(ifft4521, ifft4521, 78));
__m512 ifft4445 = _mm512_fmadd_ps(ifft4436, ifft4437, _mm512_shuffle_ps(ifft4436, ifft4436, 78));
__m512 ifft4530 = _mm512_fmadd_ps(ifft4522, ifft4437, _mm512_shuffle_ps(ifft4522, ifft4522, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes the pair
// (re, im) is replaced by (-im, re) -- the same as multiplying by i; all other
// lanes pass through unchanged.
__m512 ifft4446 = _mm512_mask_sub_ps(ifft4438, 49344, _mm512_setzero_ps(), ifft4439);
__m512 ifft4531 = _mm512_mask_sub_ps(ifft4523, 49344, _mm512_setzero_ps(), ifft4524);
__m512 ifft4447 = _mm512_mask_mov_ps(ifft4439, 49344, ifft4438);
__m512 ifft4532 = _mm512_mask_mov_ps(ifft4524, 49344, ifft4523);
__m512 ifft4448 = _mm512_mask_sub_ps(ifft4440, 49344, _mm512_setzero_ps(), ifft4441);
__m512 ifft4533 = _mm512_mask_sub_ps(ifft4525, 49344, _mm512_setzero_ps(), ifft4526);
__m512 ifft4449 = _mm512_mask_mov_ps(ifft4441, 49344, ifft4440);
__m512 ifft4534 = _mm512_mask_mov_ps(ifft4526, 49344, ifft4525);
__m512 ifft4450 = _mm512_mask_sub_ps(ifft4442, 49344, _mm512_setzero_ps(), ifft4443);
__m512 ifft4535 = _mm512_mask_sub_ps(ifft4527, 49344, _mm512_setzero_ps(), ifft4528);
__m512 ifft4451 = _mm512_mask_mov_ps(ifft4443, 49344, ifft4442);
__m512 ifft4536 = _mm512_mask_mov_ps(ifft4528, 49344, ifft4527);
__m512 ifft4452 = _mm512_mask_sub_ps(ifft4444, 49344, _mm512_setzero_ps(), ifft4445);
__m512 ifft4537 = _mm512_mask_sub_ps(ifft4529, 49344, _mm512_setzero_ps(), ifft4530);
__m512 ifft4453 = _mm512_mask_mov_ps(ifft4445, 49344, ifft4444);
__m512 ifft4538 = _mm512_mask_mov_ps(ifft4530, 49344, ifft4529);
// Distance-4 butterfly: shuffle_f32x4 immediate 177 swaps neighbouring 128-bit
// blocks; lanes 0-3 / 8-11 receive sums, lanes 4-7 / 12-15 receive differences.
// ifft4460 and ifft4544 use fnmsub (-(a*b) - c) instead of fmadd, so they hold
// the negated butterfly result.
__m512 ifft4454 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4455 = _mm512_fmadd_ps(ifft4446, ifft4454, _mm512_shuffle_f32x4(ifft4446, ifft4446, 177));
__m512 ifft4539 = _mm512_fmadd_ps(ifft4531, ifft4454, _mm512_shuffle_f32x4(ifft4531, ifft4531, 177));
__m512 ifft4456 = _mm512_fmadd_ps(ifft4447, ifft4454, _mm512_shuffle_f32x4(ifft4447, ifft4447, 177));
__m512 ifft4540 = _mm512_fmadd_ps(ifft4532, ifft4454, _mm512_shuffle_f32x4(ifft4532, ifft4532, 177));
__m512 ifft4457 = _mm512_fmadd_ps(ifft4448, ifft4454, _mm512_shuffle_f32x4(ifft4448, ifft4448, 177));
__m512 ifft4541 = _mm512_fmadd_ps(ifft4533, ifft4454, _mm512_shuffle_f32x4(ifft4533, ifft4533, 177));
__m512 ifft4458 = _mm512_fmadd_ps(ifft4449, ifft4454, _mm512_shuffle_f32x4(ifft4449, ifft4449, 177));
__m512 ifft4542 = _mm512_fmadd_ps(ifft4534, ifft4454, _mm512_shuffle_f32x4(ifft4534, ifft4534, 177));
__m512 ifft4459 = _mm512_fmadd_ps(ifft4450, ifft4454, _mm512_shuffle_f32x4(ifft4450, ifft4450, 177));
__m512 ifft4543 = _mm512_fmadd_ps(ifft4535, ifft4454, _mm512_shuffle_f32x4(ifft4535, ifft4535, 177));
__m512 ifft4460 = _mm512_fnmsub_ps(ifft4451, ifft4454, _mm512_shuffle_f32x4(ifft4451, ifft4451, 177));
__m512 ifft4544 = _mm512_fnmsub_ps(ifft4536, ifft4454, _mm512_shuffle_f32x4(ifft4536, ifft4536, 177));
__m512 ifft4461 = _mm512_fmadd_ps(ifft4452, ifft4454, _mm512_shuffle_f32x4(ifft4452, ifft4452, 177));
__m512 ifft4545 = _mm512_fmadd_ps(ifft4537, ifft4454, _mm512_shuffle_f32x4(ifft4537, ifft4537, 177));
__m512 ifft4462 = _mm512_fmadd_ps(ifft4453, ifft4454, _mm512_shuffle_f32x4(ifft4453, ifft4453, 177));
__m512 ifft4546 = _mm512_fmadd_ps(ifft4538, ifft4454, _mm512_shuffle_f32x4(ifft4538, ifft4538, 177));
// From here on the work is across vectors rather than across lanes: sums and
// differences of whole vectors produced above.
__m512 ifft4463 = _mm512_add_ps(ifft4455, ifft4456);
__m512 ifft4547 = _mm512_add_ps(ifft4539, ifft4540);
__m512 ifft4464 = _mm512_sub_ps(ifft4455, ifft4456);
__m512 ifft4548 = _mm512_sub_ps(ifft4539, ifft4540);
__m512 ifft4465 = _mm512_sub_ps(ifft4457, ifft4461);
__m512 ifft4549 = _mm512_sub_ps(ifft4541, ifft4545);
__m512 ifft4466 = _mm512_add_ps(ifft4458, ifft4462);
__m512 ifft4550 = _mm512_add_ps(ifft4542, ifft4546);
__m512 ifft4467 = _mm512_add_ps(ifft4457, ifft4461);
__m512 ifft4551 = _mm512_add_ps(ifft4541, ifft4545);
__m512 ifft4468 = _mm512_sub_ps(ifft4458, ifft4462);
__m512 ifft4552 = _mm512_sub_ps(ifft4542, ifft4546);
// Scaling is folded into the arithmetic: 3.125e-02f is 1/32 and 1.5625e-02f
// is 1/64. fmsub yields a*b - c.
__m512 ifft4469 = _mm512_mul_ps(ifft4459, _mm512_set1_ps(3.125e-02f));
__m512 ifft4553 = _mm512_mul_ps(ifft4543, _mm512_set1_ps(3.125e-02f));
__m512 ifft4470 = _mm512_mul_ps(ifft4460, _mm512_set1_ps(3.125e-02f));
__m512 ifft4554 = _mm512_mul_ps(ifft4544, _mm512_set1_ps(3.125e-02f));
__m512 ifft4471 = _mm512_fmadd_ps(ifft4463, _mm512_set1_ps(1.5625e-02f), ifft4469);
__m512 ifft4555 = _mm512_fmadd_ps(ifft4547, _mm512_set1_ps(1.5625e-02f), ifft4553);
__m512 ifft4472 = _mm512_fmsub_ps(ifft4463, _mm512_set1_ps(1.5625e-02f), ifft4469);
__m512 ifft4556 = _mm512_fmsub_ps(ifft4547, _mm512_set1_ps(1.5625e-02f), ifft4553);
__m512 ifft4473 = _mm512_fmadd_ps(ifft4464, _mm512_set1_ps(1.5625e-02f), ifft4470);
__m512 ifft4557 = _mm512_fmadd_ps(ifft4548, _mm512_set1_ps(1.5625e-02f), ifft4554);
__m512 ifft4474 = _mm512_fmsub_ps(ifft4464, _mm512_set1_ps(1.5625e-02f), ifft4470);
__m512 ifft4558 = _mm512_fmsub_ps(ifft4548, _mm512_set1_ps(1.5625e-02f), ifft4554);
__m512 ifft4475 = _mm512_add_ps(ifft4465, ifft4466);
__m512 ifft4559 = _mm512_add_ps(ifft4549, ifft4550);
__m512 ifft4476 = _mm512_sub_ps(ifft4465, ifft4466);
__m512 ifft4560 = _mm512_sub_ps(ifft4549, ifft4550);
__m512 ifft4477 = _mm512_fnmadd_ps(ifft4475, _mm512_set1_ps(7.0710677e-01f), ifft4467);
__m512 ifft4561 = _mm512_fnmadd_ps(ifft4559, _mm512_set1_ps(7.0710677e-01f), ifft4551);
__m512 ifft4478 = _mm512_fmadd_ps(ifft4475, _mm512_set1_ps(7.0710677e-01f), ifft4467);
__m512 ifft4562 = _mm512_fmadd_ps(ifft4559, _mm512_set1_ps(7.0710677e-01f), ifft4551);
__m512 ifft4479 = _mm512_fmadd_ps(ifft4476, _mm512_set1_ps(7.0710677e-01f), ifft4468);
__m512 ifft4563 = _mm512_fmadd_ps(ifft4560, _mm512_set1_ps(7.0710677e-01f), ifft4552);
__m512 ifft4480 = _mm512_fmsub_ps(ifft4476, _mm512_set1_ps(7.0710677e-01f), ifft4468);
__m512 ifft4564 = _mm512_fmsub_ps(ifft4560, _mm512_set1_ps(7.0710677e-01f), ifft4552);
__m512 ifft4481 = _mm512_add_ps(ifft4477, ifft4478);
__m512 ifft4565 = _mm512_add_ps(ifft4561, ifft4562);
__m512 ifft4482 = _mm512_sub_ps(ifft4477, ifft4478);
__m512 ifft4566 = _mm512_sub_ps(ifft4561, ifft4562);
__m512 ifft4483 = _mm512_add_ps(ifft4479, ifft4480);
__m512 ifft4567 = _mm512_add_ps(ifft4563, ifft4564);
__m512 ifft4484 = _mm512_sub_ps(ifft4479, ifft4480);
__m512 ifft4568 = _mm512_sub_ps(ifft4563, ifft4564);
__m512 ifft4485 = _mm512_fmadd_ps(ifft4481, _mm512_set1_ps(1.5625e-02f), ifft4471);
__m512 ifft4569 = _mm512_fmadd_ps(ifft4565, _mm512_set1_ps(1.5625e-02f), ifft4555);
__m512 ifft4486 = _mm512_fnmadd_ps(ifft4481, _mm512_set1_ps(1.5625e-02f), ifft4471);
__m512 ifft4570 = _mm512_fnmadd_ps(ifft4565, _mm512_set1_ps(1.5625e-02f), ifft4555);
__m512 ifft4487 = _mm512_fmadd_ps(ifft4483, _mm512_set1_ps(1.5625e-02f), ifft4473);
__m512 ifft4571 = _mm512_fmadd_ps(ifft4567, _mm512_set1_ps(1.5625e-02f), ifft4557);
__m512 ifft4488 = _mm512_fnmadd_ps(ifft4483, _mm512_set1_ps(1.5625e-02f), ifft4473);
__m512 ifft4572 = _mm512_fnmadd_ps(ifft4567, _mm512_set1_ps(1.5625e-02f), ifft4557);
__m512 ifft4489 = _mm512_fnmadd_ps(ifft4484, _mm512_set1_ps(1.5625e-02f), ifft4472);
__m512 ifft4573 = _mm512_fnmadd_ps(ifft4568, _mm512_set1_ps(1.5625e-02f), ifft4556);
__m512 ifft4490 = _mm512_fmadd_ps(ifft4484, _mm512_set1_ps(1.5625e-02f), ifft4472);
__m512 ifft4574 = _mm512_fmadd_ps(ifft4568, _mm512_set1_ps(1.5625e-02f), ifft4556);
__m512 ifft4491 = _mm512_fmadd_ps(ifft4482, _mm512_set1_ps(1.5625e-02f), ifft4474);
__m512 ifft4575 = _mm512_fmadd_ps(ifft4566, _mm512_set1_ps(1.5625e-02f), ifft4558);
__m512 ifft4492 = _mm512_fnmadd_ps(ifft4482, _mm512_set1_ps(1.5625e-02f), ifft4474);
__m512 ifft4576 = _mm512_fnmadd_ps(ifft4566, _mm512_set1_ps(1.5625e-02f), ifft4558);
__m512 dat840 = ifft4485;
__m512 dat845 = ifft4569;
__m512 dat841 = ifft4487;
__m512 dat846 = ifft4571;
__m512 dat842 = ifft4489;
__m512 dat847 = ifft4573;
__m512 dat843 = ifft4491;
__m512 dat848 = ifft4575;
__m512 dat844 = ifft4486;
__m512 dat849 = ifft4570;
(void)ifft4488;
(void)ifft4572;
(void)ifft4490;
(void)ifft4574;
(void)ifft4492;
(void)ifft4576;
__m512i pm45 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack221 = _mm512_permutex2var_ps(dat840, pm45, dat845);
__m512i pm46 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack222 = _mm512_permutex2var_ps(dat840, pm46, dat845);
__m512 pack223 = _mm512_permutex2var_ps(dat841, pm45, dat846);
__m512 pack224 = _mm512_permutex2var_ps(dat841, pm46, dat846);
__m512 pack225 = _mm512_permutex2var_ps(dat842, pm45, dat847);
__m512 pack226 = _mm512_permutex2var_ps(dat842, pm46, dat847);
__m512 pack227 = _mm512_permutex2var_ps(dat843, pm45, dat848);
__m512 pack228 = _mm512_permutex2var_ps(dat843, pm46, dat848);
__m512 pack229 = _mm512_permutex2var_ps(dat844, pm45, dat849);
__m512 pack230 = _mm512_permutex2var_ps(dat844, pm46, dat849);
pack221 = _mm512_max_ps(_mm512_setzero_ps(), pack221);
pack222 = _mm512_max_ps(_mm512_setzero_ps(), pack222);
pack223 = _mm512_max_ps(_mm512_setzero_ps(), pack223);
pack224 = _mm512_max_ps(_mm512_setzero_ps(), pack224);
pack225 = _mm512_max_ps(_mm512_setzero_ps(), pack225);
pack226 = _mm512_max_ps(_mm512_setzero_ps(), pack226);
pack227 = _mm512_max_ps(_mm512_setzero_ps(), pack227);
pack228 = _mm512_max_ps(_mm512_setzero_ps(), pack228);
pack229 = _mm512_max_ps(_mm512_setzero_ps(), pack229);
pack230 = _mm512_max_ps(_mm512_setzero_ps(), pack230);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack221);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack222);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack223);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack224);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack225);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack226);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack227);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack228);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack229);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k39+100480*r16+448*toH15+4*toW15+40*t27, 1023, pack230);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel5 = 22;
}
ptrdiff_t toH16 = base5+25;
ptrdiff_t toW16 = 90;
ptrdiff_t k40 = 16*w21;
for (; k40 != 16; ++k40) {
ptrdiff_t r17 = 0;
for (; r17 != 2; ++r17) {
// Two iterations (t28 = 0, 1). Each iteration loads 16 vectors from sfPtr3
// (Re/Im pairs for two independent streams, 265..268 and 269..272, whose
// statements are interleaved), runs them through the ifft* arithmetic below,
// merges the two streams, clamps the results at zero and writes ten masked
// rows of 10 floats to datPtr2.
// NOTE(review): the sf*/ifft*/dat* names suggest an inverse FFT from a
// spectral buffer back to spatial data -- confirm against the generator.
ptrdiff_t t28 = 0;
for (; t28 < 2; ++t28) {
// Loads: four groups whose base offsets are multiples of 2166784; inside a
// group the offsets step by 64 (Re, Im of the first stream, then Re, Im of
// the second). t28 advances the source by 256.
// NOTE(review): a step of 64 equals one 16-float vector only if sfPtr3 is a
// byte pointer -- its declaration is not visible here, confirm.
__m512 sfRe265 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm265 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe269 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm269 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe266 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm266 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe270 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm270 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe267 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm267 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe271 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm271 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe268 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm268 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfRe272 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
__m512 sfIm272 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k40+768*r17+256*t28);
// Only the first Re/Im pair of each stream (265 and 269) gets this extra
// step: each vector is permuted with two index patterns and the permuted
// copies are recombined with a +/-1/0 weight vector.
__m512i ifft4577 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4578 = _mm512_permutexvar_ps(ifft4577, sfRe265);
__m512 ifft4669 = _mm512_permutexvar_ps(ifft4577, sfRe269);
__m512i ifft4579 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4580 = _mm512_permutexvar_ps(ifft4579, sfRe265);
__m512 ifft4670 = _mm512_permutexvar_ps(ifft4579, sfRe269);
__m512 ifft4581 = _mm512_permutexvar_ps(ifft4577, sfIm265);
__m512 ifft4671 = _mm512_permutexvar_ps(ifft4577, sfIm269);
__m512 ifft4582 = _mm512_permutexvar_ps(ifft4579, sfIm265);
__m512 ifft4672 = _mm512_permutexvar_ps(ifft4579, sfIm269);
__m512 ifft4583 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 (0xFDFD) selects every lane except 1 and 9; those two lanes
// keep the first operand of the masked FMA unchanged.
__m512 ifft4584 = _mm512_mask_fmadd_ps(ifft4582, 65021, ifft4583, ifft4578);
__m512 ifft4673 = _mm512_mask_fmadd_ps(ifft4672, 65021, ifft4583, ifft4669);
__m512 ifft4585 = _mm512_mask_fnmadd_ps(ifft4581, 65021, ifft4583, ifft4580);
__m512 ifft4674 = _mm512_mask_fnmadd_ps(ifft4671, 65021, ifft4583, ifft4670);
// Adjacent-element sum/difference: shuffle immediate 177 swaps neighbours
// (0<->1, 2<->3), so with the +1/-1 weights the even lane gets x0+x1 and
// the odd lane gets x0-x1.
__m512 ifft4586 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4587 = _mm512_fmadd_ps(ifft4584, ifft4586, _mm512_shuffle_ps(ifft4584, ifft4584, 177));
__m512 ifft4675 = _mm512_fmadd_ps(ifft4673, ifft4586, _mm512_shuffle_ps(ifft4673, ifft4673, 177));
__m512 ifft4588 = _mm512_fmadd_ps(ifft4585, ifft4586, _mm512_shuffle_ps(ifft4585, ifft4585, 177));
__m512 ifft4676 = _mm512_fmadd_ps(ifft4674, ifft4586, _mm512_shuffle_ps(ifft4674, ifft4674, 177));
__m512 ifft4589 = _mm512_fmadd_ps(sfRe266, ifft4586, _mm512_shuffle_ps(sfRe266, sfRe266, 177));
__m512 ifft4677 = _mm512_fmadd_ps(sfRe270, ifft4586, _mm512_shuffle_ps(sfRe270, sfRe270, 177));
__m512 ifft4590 = _mm512_fmadd_ps(sfIm266, ifft4586, _mm512_shuffle_ps(sfIm266, sfIm266, 177));
__m512 ifft4678 = _mm512_fmadd_ps(sfIm270, ifft4586, _mm512_shuffle_ps(sfIm270, sfIm270, 177));
__m512 ifft4591 = _mm512_fmadd_ps(sfRe267, ifft4586, _mm512_shuffle_ps(sfRe267, sfRe267, 177));
__m512 ifft4679 = _mm512_fmadd_ps(sfRe271, ifft4586, _mm512_shuffle_ps(sfRe271, sfRe271, 177));
__m512 ifft4592 = _mm512_fmadd_ps(sfIm267, ifft4586, _mm512_shuffle_ps(sfIm267, sfIm267, 177));
__m512 ifft4680 = _mm512_fmadd_ps(sfIm271, ifft4586, _mm512_shuffle_ps(sfIm271, sfIm271, 177));
__m512 ifft4593 = _mm512_fmadd_ps(sfRe268, ifft4586, _mm512_shuffle_ps(sfRe268, sfRe268, 177));
__m512 ifft4681 = _mm512_fmadd_ps(sfRe272, ifft4586, _mm512_shuffle_ps(sfRe272, sfRe272, 177));
__m512 ifft4594 = _mm512_fmadd_ps(sfIm268, ifft4586, _mm512_shuffle_ps(sfIm268, sfIm268, 177));
__m512 ifft4682 = _mm512_fmadd_ps(sfIm272, ifft4586, _mm512_shuffle_ps(sfIm272, sfIm272, 177));
// The next two groups together form a per-lane complex multiply of each
// (first, second) vector pair by (ifft4595 + i*ifft4604):
//   first'  = first*ifft4595 - second*ifft4604
//   second' = second*ifft4595 + first*ifft4604
// The constants are 0, 1 and +/-7.0710677e-01 (approximately sqrt(1/2)).
__m512 ifft4595 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4596 = _mm512_mul_ps(ifft4587, ifft4595);
__m512 ifft4683 = _mm512_mul_ps(ifft4675, ifft4595);
__m512 ifft4597 = _mm512_mul_ps(ifft4588, ifft4595);
__m512 ifft4684 = _mm512_mul_ps(ifft4676, ifft4595);
__m512 ifft4598 = _mm512_mul_ps(ifft4589, ifft4595);
__m512 ifft4685 = _mm512_mul_ps(ifft4677, ifft4595);
__m512 ifft4599 = _mm512_mul_ps(ifft4590, ifft4595);
__m512 ifft4686 = _mm512_mul_ps(ifft4678, ifft4595);
__m512 ifft4600 = _mm512_mul_ps(ifft4591, ifft4595);
__m512 ifft4687 = _mm512_mul_ps(ifft4679, ifft4595);
__m512 ifft4601 = _mm512_mul_ps(ifft4592, ifft4595);
__m512 ifft4688 = _mm512_mul_ps(ifft4680, ifft4595);
__m512 ifft4602 = _mm512_mul_ps(ifft4593, ifft4595);
__m512 ifft4689 = _mm512_mul_ps(ifft4681, ifft4595);
__m512 ifft4603 = _mm512_mul_ps(ifft4594, ifft4595);
__m512 ifft4690 = _mm512_mul_ps(ifft4682, ifft4595);
__m512 ifft4604 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4605 = _mm512_fnmadd_ps(ifft4588, ifft4604, ifft4596);
__m512 ifft4691 = _mm512_fnmadd_ps(ifft4676, ifft4604, ifft4683);
__m512 ifft4606 = _mm512_fmadd_ps(ifft4587, ifft4604, ifft4597);
__m512 ifft4692 = _mm512_fmadd_ps(ifft4675, ifft4604, ifft4684);
__m512 ifft4607 = _mm512_fnmadd_ps(ifft4590, ifft4604, ifft4598);
__m512 ifft4693 = _mm512_fnmadd_ps(ifft4678, ifft4604, ifft4685);
__m512 ifft4608 = _mm512_fmadd_ps(ifft4589, ifft4604, ifft4599);
__m512 ifft4694 = _mm512_fmadd_ps(ifft4677, ifft4604, ifft4686);
__m512 ifft4609 = _mm512_fnmadd_ps(ifft4592, ifft4604, ifft4600);
__m512 ifft4695 = _mm512_fnmadd_ps(ifft4680, ifft4604, ifft4687);
__m512 ifft4610 = _mm512_fmadd_ps(ifft4591, ifft4604, ifft4601);
__m512 ifft4696 = _mm512_fmadd_ps(ifft4679, ifft4604, ifft4688);
__m512 ifft4611 = _mm512_fnmadd_ps(ifft4594, ifft4604, ifft4602);
__m512 ifft4697 = _mm512_fnmadd_ps(ifft4682, ifft4604, ifft4689);
__m512 ifft4612 = _mm512_fmadd_ps(ifft4593, ifft4604, ifft4603);
__m512 ifft4698 = _mm512_fmadd_ps(ifft4681, ifft4604, ifft4690);
// Sum/difference at distance two: shuffle immediate 78 swaps the two
// 64-bit halves of every 128-bit group, so elements 0,1 receive x0+x2,
// x1+x3 and elements 2,3 receive x0-x2, x1-x3.
__m512 ifft4613 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4614 = _mm512_fmadd_ps(ifft4605, ifft4613, _mm512_shuffle_ps(ifft4605, ifft4605, 78));
__m512 ifft4699 = _mm512_fmadd_ps(ifft4691, ifft4613, _mm512_shuffle_ps(ifft4691, ifft4691, 78));
__m512 ifft4615 = _mm512_fmadd_ps(ifft4606, ifft4613, _mm512_shuffle_ps(ifft4606, ifft4606, 78));
__m512 ifft4700 = _mm512_fmadd_ps(ifft4692, ifft4613, _mm512_shuffle_ps(ifft4692, ifft4692, 78));
__m512 ifft4616 = _mm512_fmadd_ps(ifft4607, ifft4613, _mm512_shuffle_ps(ifft4607, ifft4607, 78));
__m512 ifft4701 = _mm512_fmadd_ps(ifft4693, ifft4613, _mm512_shuffle_ps(ifft4693, ifft4693, 78));
__m512 ifft4617 = _mm512_fmadd_ps(ifft4608, ifft4613, _mm512_shuffle_ps(ifft4608, ifft4608, 78));
__m512 ifft4702 = _mm512_fmadd_ps(ifft4694, ifft4613, _mm512_shuffle_ps(ifft4694, ifft4694, 78));
__m512 ifft4618 = _mm512_fmadd_ps(ifft4609, ifft4613, _mm512_shuffle_ps(ifft4609, ifft4609, 78));
__m512 ifft4703 = _mm512_fmadd_ps(ifft4695, ifft4613, _mm512_shuffle_ps(ifft4695, ifft4695, 78));
__m512 ifft4619 = _mm512_fmadd_ps(ifft4610, ifft4613, _mm512_shuffle_ps(ifft4610, ifft4610, 78));
__m512 ifft4704 = _mm512_fmadd_ps(ifft4696, ifft4613, _mm512_shuffle_ps(ifft4696, ifft4696, 78));
__m512 ifft4620 = _mm512_fmadd_ps(ifft4611, ifft4613, _mm512_shuffle_ps(ifft4611, ifft4611, 78));
__m512 ifft4705 = _mm512_fmadd_ps(ifft4697, ifft4613, _mm512_shuffle_ps(ifft4697, ifft4697, 78));
__m512 ifft4621 = _mm512_fmadd_ps(ifft4612, ifft4613, _mm512_shuffle_ps(ifft4612, ifft4612, 78));
__m512 ifft4706 = _mm512_fmadd_ps(ifft4698, ifft4613, _mm512_shuffle_ps(ifft4698, ifft4698, 78));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each
// (a, b) pair becomes (-b, a); every other lane passes through unchanged.
__m512 ifft4622 = _mm512_mask_sub_ps(ifft4614, 49344, _mm512_setzero_ps(), ifft4615);
__m512 ifft4707 = _mm512_mask_sub_ps(ifft4699, 49344, _mm512_setzero_ps(), ifft4700);
__m512 ifft4623 = _mm512_mask_mov_ps(ifft4615, 49344, ifft4614);
__m512 ifft4708 = _mm512_mask_mov_ps(ifft4700, 49344, ifft4699);
__m512 ifft4624 = _mm512_mask_sub_ps(ifft4616, 49344, _mm512_setzero_ps(), ifft4617);
__m512 ifft4709 = _mm512_mask_sub_ps(ifft4701, 49344, _mm512_setzero_ps(), ifft4702);
__m512 ifft4625 = _mm512_mask_mov_ps(ifft4617, 49344, ifft4616);
__m512 ifft4710 = _mm512_mask_mov_ps(ifft4702, 49344, ifft4701);
__m512 ifft4626 = _mm512_mask_sub_ps(ifft4618, 49344, _mm512_setzero_ps(), ifft4619);
__m512 ifft4711 = _mm512_mask_sub_ps(ifft4703, 49344, _mm512_setzero_ps(), ifft4704);
__m512 ifft4627 = _mm512_mask_mov_ps(ifft4619, 49344, ifft4618);
__m512 ifft4712 = _mm512_mask_mov_ps(ifft4704, 49344, ifft4703);
__m512 ifft4628 = _mm512_mask_sub_ps(ifft4620, 49344, _mm512_setzero_ps(), ifft4621);
__m512 ifft4713 = _mm512_mask_sub_ps(ifft4705, 49344, _mm512_setzero_ps(), ifft4706);
__m512 ifft4629 = _mm512_mask_mov_ps(ifft4621, 49344, ifft4620);
__m512 ifft4714 = _mm512_mask_mov_ps(ifft4706, 49344, ifft4705);
// Sum/difference at distance four: shuffle_f32x4 immediate 177 swaps
// neighbouring 128-bit groups, giving x[0..3]+x[4..7] in the lower group and
// x[0..3]-x[4..7] in the upper one. ifft4636 / ifft4720 use fnmsub instead of
// fmadd, which yields the negation of that result.
__m512 ifft4630 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4631 = _mm512_fmadd_ps(ifft4622, ifft4630, _mm512_shuffle_f32x4(ifft4622, ifft4622, 177));
__m512 ifft4715 = _mm512_fmadd_ps(ifft4707, ifft4630, _mm512_shuffle_f32x4(ifft4707, ifft4707, 177));
__m512 ifft4632 = _mm512_fmadd_ps(ifft4623, ifft4630, _mm512_shuffle_f32x4(ifft4623, ifft4623, 177));
__m512 ifft4716 = _mm512_fmadd_ps(ifft4708, ifft4630, _mm512_shuffle_f32x4(ifft4708, ifft4708, 177));
__m512 ifft4633 = _mm512_fmadd_ps(ifft4624, ifft4630, _mm512_shuffle_f32x4(ifft4624, ifft4624, 177));
__m512 ifft4717 = _mm512_fmadd_ps(ifft4709, ifft4630, _mm512_shuffle_f32x4(ifft4709, ifft4709, 177));
__m512 ifft4634 = _mm512_fmadd_ps(ifft4625, ifft4630, _mm512_shuffle_f32x4(ifft4625, ifft4625, 177));
__m512 ifft4718 = _mm512_fmadd_ps(ifft4710, ifft4630, _mm512_shuffle_f32x4(ifft4710, ifft4710, 177));
__m512 ifft4635 = _mm512_fmadd_ps(ifft4626, ifft4630, _mm512_shuffle_f32x4(ifft4626, ifft4626, 177));
__m512 ifft4719 = _mm512_fmadd_ps(ifft4711, ifft4630, _mm512_shuffle_f32x4(ifft4711, ifft4711, 177));
__m512 ifft4636 = _mm512_fnmsub_ps(ifft4627, ifft4630, _mm512_shuffle_f32x4(ifft4627, ifft4627, 177));
__m512 ifft4720 = _mm512_fnmsub_ps(ifft4712, ifft4630, _mm512_shuffle_f32x4(ifft4712, ifft4712, 177));
__m512 ifft4637 = _mm512_fmadd_ps(ifft4628, ifft4630, _mm512_shuffle_f32x4(ifft4628, ifft4628, 177));
__m512 ifft4721 = _mm512_fmadd_ps(ifft4713, ifft4630, _mm512_shuffle_f32x4(ifft4713, ifft4713, 177));
__m512 ifft4638 = _mm512_fmadd_ps(ifft4629, ifft4630, _mm512_shuffle_f32x4(ifft4629, ifft4629, 177));
__m512 ifft4722 = _mm512_fmadd_ps(ifft4714, ifft4630, _mm512_shuffle_f32x4(ifft4714, ifft4714, 177));
// Final cross-vector combination: the eight vectors of each stream are
// added/subtracted against each other. The scale factors are folded into
// the FMAs: 1.5625e-02 is 1/64 and 3.125e-02 is 1/32.
__m512 ifft4639 = _mm512_add_ps(ifft4631, ifft4632);
__m512 ifft4723 = _mm512_add_ps(ifft4715, ifft4716);
__m512 ifft4640 = _mm512_sub_ps(ifft4631, ifft4632);
__m512 ifft4724 = _mm512_sub_ps(ifft4715, ifft4716);
__m512 ifft4641 = _mm512_sub_ps(ifft4633, ifft4637);
__m512 ifft4725 = _mm512_sub_ps(ifft4717, ifft4721);
__m512 ifft4642 = _mm512_add_ps(ifft4634, ifft4638);
__m512 ifft4726 = _mm512_add_ps(ifft4718, ifft4722);
__m512 ifft4643 = _mm512_add_ps(ifft4633, ifft4637);
__m512 ifft4727 = _mm512_add_ps(ifft4717, ifft4721);
__m512 ifft4644 = _mm512_sub_ps(ifft4634, ifft4638);
__m512 ifft4728 = _mm512_sub_ps(ifft4718, ifft4722);
__m512 ifft4645 = _mm512_mul_ps(ifft4635, _mm512_set1_ps(3.125e-02f));
__m512 ifft4729 = _mm512_mul_ps(ifft4719, _mm512_set1_ps(3.125e-02f));
__m512 ifft4646 = _mm512_mul_ps(ifft4636, _mm512_set1_ps(3.125e-02f));
__m512 ifft4730 = _mm512_mul_ps(ifft4720, _mm512_set1_ps(3.125e-02f));
__m512 ifft4647 = _mm512_fmadd_ps(ifft4639, _mm512_set1_ps(1.5625e-02f), ifft4645);
__m512 ifft4731 = _mm512_fmadd_ps(ifft4723, _mm512_set1_ps(1.5625e-02f), ifft4729);
__m512 ifft4648 = _mm512_fmsub_ps(ifft4639, _mm512_set1_ps(1.5625e-02f), ifft4645);
__m512 ifft4732 = _mm512_fmsub_ps(ifft4723, _mm512_set1_ps(1.5625e-02f), ifft4729);
__m512 ifft4649 = _mm512_fmadd_ps(ifft4640, _mm512_set1_ps(1.5625e-02f), ifft4646);
__m512 ifft4733 = _mm512_fmadd_ps(ifft4724, _mm512_set1_ps(1.5625e-02f), ifft4730);
__m512 ifft4650 = _mm512_fmsub_ps(ifft4640, _mm512_set1_ps(1.5625e-02f), ifft4646);
__m512 ifft4734 = _mm512_fmsub_ps(ifft4724, _mm512_set1_ps(1.5625e-02f), ifft4730);
__m512 ifft4651 = _mm512_add_ps(ifft4641, ifft4642);
__m512 ifft4735 = _mm512_add_ps(ifft4725, ifft4726);
__m512 ifft4652 = _mm512_sub_ps(ifft4641, ifft4642);
__m512 ifft4736 = _mm512_sub_ps(ifft4725, ifft4726);
__m512 ifft4653 = _mm512_fnmadd_ps(ifft4651, _mm512_set1_ps(7.0710677e-01f), ifft4643);
__m512 ifft4737 = _mm512_fnmadd_ps(ifft4735, _mm512_set1_ps(7.0710677e-01f), ifft4727);
__m512 ifft4654 = _mm512_fmadd_ps(ifft4651, _mm512_set1_ps(7.0710677e-01f), ifft4643);
__m512 ifft4738 = _mm512_fmadd_ps(ifft4735, _mm512_set1_ps(7.0710677e-01f), ifft4727);
__m512 ifft4655 = _mm512_fmadd_ps(ifft4652, _mm512_set1_ps(7.0710677e-01f), ifft4644);
__m512 ifft4739 = _mm512_fmadd_ps(ifft4736, _mm512_set1_ps(7.0710677e-01f), ifft4728);
__m512 ifft4656 = _mm512_fmsub_ps(ifft4652, _mm512_set1_ps(7.0710677e-01f), ifft4644);
__m512 ifft4740 = _mm512_fmsub_ps(ifft4736, _mm512_set1_ps(7.0710677e-01f), ifft4728);
__m512 ifft4657 = _mm512_add_ps(ifft4653, ifft4654);
__m512 ifft4741 = _mm512_add_ps(ifft4737, ifft4738);
__m512 ifft4658 = _mm512_sub_ps(ifft4653, ifft4654);
__m512 ifft4742 = _mm512_sub_ps(ifft4737, ifft4738);
__m512 ifft4659 = _mm512_add_ps(ifft4655, ifft4656);
__m512 ifft4743 = _mm512_add_ps(ifft4739, ifft4740);
__m512 ifft4660 = _mm512_sub_ps(ifft4655, ifft4656);
__m512 ifft4744 = _mm512_sub_ps(ifft4739, ifft4740);
__m512 ifft4661 = _mm512_fmadd_ps(ifft4657, _mm512_set1_ps(1.5625e-02f), ifft4647);
__m512 ifft4745 = _mm512_fmadd_ps(ifft4741, _mm512_set1_ps(1.5625e-02f), ifft4731);
__m512 ifft4662 = _mm512_fnmadd_ps(ifft4657, _mm512_set1_ps(1.5625e-02f), ifft4647);
__m512 ifft4746 = _mm512_fnmadd_ps(ifft4741, _mm512_set1_ps(1.5625e-02f), ifft4731);
__m512 ifft4663 = _mm512_fmadd_ps(ifft4659, _mm512_set1_ps(1.5625e-02f), ifft4649);
__m512 ifft4747 = _mm512_fmadd_ps(ifft4743, _mm512_set1_ps(1.5625e-02f), ifft4733);
__m512 ifft4664 = _mm512_fnmadd_ps(ifft4659, _mm512_set1_ps(1.5625e-02f), ifft4649);
__m512 ifft4748 = _mm512_fnmadd_ps(ifft4743, _mm512_set1_ps(1.5625e-02f), ifft4733);
__m512 ifft4665 = _mm512_fnmadd_ps(ifft4660, _mm512_set1_ps(1.5625e-02f), ifft4648);
__m512 ifft4749 = _mm512_fnmadd_ps(ifft4744, _mm512_set1_ps(1.5625e-02f), ifft4732);
__m512 ifft4666 = _mm512_fmadd_ps(ifft4660, _mm512_set1_ps(1.5625e-02f), ifft4648);
__m512 ifft4750 = _mm512_fmadd_ps(ifft4744, _mm512_set1_ps(1.5625e-02f), ifft4732);
__m512 ifft4667 = _mm512_fmadd_ps(ifft4658, _mm512_set1_ps(1.5625e-02f), ifft4650);
__m512 ifft4751 = _mm512_fmadd_ps(ifft4742, _mm512_set1_ps(1.5625e-02f), ifft4734);
__m512 ifft4668 = _mm512_fnmadd_ps(ifft4658, _mm512_set1_ps(1.5625e-02f), ifft4650);
__m512 ifft4752 = _mm512_fnmadd_ps(ifft4742, _mm512_set1_ps(1.5625e-02f), ifft4734);
// Five of the eight result vectors per stream are kept as dat*. The other
// three are computed but not stored; the (void) casts only mark them as
// intentionally unused.
__m512 dat850 = ifft4661;
__m512 dat855 = ifft4745;
__m512 dat851 = ifft4663;
__m512 dat856 = ifft4747;
__m512 dat852 = ifft4665;
__m512 dat857 = ifft4749;
__m512 dat853 = ifft4667;
__m512 dat858 = ifft4751;
__m512 dat854 = ifft4662;
__m512 dat859 = ifft4746;
(void)ifft4664;
(void)ifft4748;
(void)ifft4666;
(void)ifft4750;
(void)ifft4668;
(void)ifft4752;
// Merge the two streams (indices 0..15 pick from the first operand, 16..31
// from the second). In the ten lanes that are later stored:
//   pm47 -> first[0..4] followed by second[0..4]
//   pm48 -> second[8..12] followed by first[8..12]
__m512i pm47 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack231 = _mm512_permutex2var_ps(dat850, pm47, dat855);
__m512i pm48 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack232 = _mm512_permutex2var_ps(dat850, pm48, dat855);
__m512 pack233 = _mm512_permutex2var_ps(dat851, pm47, dat856);
__m512 pack234 = _mm512_permutex2var_ps(dat851, pm48, dat856);
__m512 pack235 = _mm512_permutex2var_ps(dat852, pm47, dat857);
__m512 pack236 = _mm512_permutex2var_ps(dat852, pm48, dat857);
__m512 pack237 = _mm512_permutex2var_ps(dat853, pm47, dat858);
__m512 pack238 = _mm512_permutex2var_ps(dat853, pm48, dat858);
__m512 pack239 = _mm512_permutex2var_ps(dat854, pm47, dat859);
__m512 pack240 = _mm512_permutex2var_ps(dat854, pm48, dat859);
// Clamp negative values to zero (max(0, x), i.e. ReLU).
pack231 = _mm512_max_ps(_mm512_setzero_ps(), pack231);
pack232 = _mm512_max_ps(_mm512_setzero_ps(), pack232);
pack233 = _mm512_max_ps(_mm512_setzero_ps(), pack233);
pack234 = _mm512_max_ps(_mm512_setzero_ps(), pack234);
pack235 = _mm512_max_ps(_mm512_setzero_ps(), pack235);
pack236 = _mm512_max_ps(_mm512_setzero_ps(), pack236);
pack237 = _mm512_max_ps(_mm512_setzero_ps(), pack237);
pack238 = _mm512_max_ps(_mm512_setzero_ps(), pack238);
pack239 = _mm512_max_ps(_mm512_setzero_ps(), pack239);
pack240 = _mm512_max_ps(_mm512_setzero_ps(), pack240);
// Masked stores: mask 1023 writes lanes 0..9 only. The pm47 results go to
// offsets 0, 448, 896, 1344, 1792; the pm48 results go to the same offsets
// plus 50240. t28 advances the destination by 40.
// NOTE(review): 40 equals ten floats only if datPtr2 is a byte pointer; 448
// and 50240 look like row and plane strides -- confirm against the
// declarations outside this view.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack231);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack232);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack233);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack234);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack235);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack236);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack237);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack238);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack239);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+40*t28, 1023, pack240);
}
ptrdiff_t t29 = 0;
__m512 sfRe273 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm273 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe277 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm277 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe274 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm274 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe278 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm278 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe275 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm275 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe279 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm279 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe276 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm276 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfRe280 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512 sfIm280 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k40+768*r17+256*t29);
__m512i ifft4753 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4754 = _mm512_permutexvar_ps(ifft4753, sfRe273);
__m512 ifft4845 = _mm512_permutexvar_ps(ifft4753, sfRe277);
__m512i ifft4755 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4756 = _mm512_permutexvar_ps(ifft4755, sfRe273);
__m512 ifft4846 = _mm512_permutexvar_ps(ifft4755, sfRe277);
__m512 ifft4757 = _mm512_permutexvar_ps(ifft4753, sfIm273);
__m512 ifft4847 = _mm512_permutexvar_ps(ifft4753, sfIm277);
__m512 ifft4758 = _mm512_permutexvar_ps(ifft4755, sfIm273);
__m512 ifft4848 = _mm512_permutexvar_ps(ifft4755, sfIm277);
__m512 ifft4759 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4760 = _mm512_mask_fmadd_ps(ifft4758, 65021, ifft4759, ifft4754);
__m512 ifft4849 = _mm512_mask_fmadd_ps(ifft4848, 65021, ifft4759, ifft4845);
__m512 ifft4761 = _mm512_mask_fnmadd_ps(ifft4757, 65021, ifft4759, ifft4756);
__m512 ifft4850 = _mm512_mask_fnmadd_ps(ifft4847, 65021, ifft4759, ifft4846);
__m512 ifft4762 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4763 = _mm512_fmadd_ps(ifft4760, ifft4762, _mm512_shuffle_ps(ifft4760, ifft4760, 177));
__m512 ifft4851 = _mm512_fmadd_ps(ifft4849, ifft4762, _mm512_shuffle_ps(ifft4849, ifft4849, 177));
__m512 ifft4764 = _mm512_fmadd_ps(ifft4761, ifft4762, _mm512_shuffle_ps(ifft4761, ifft4761, 177));
__m512 ifft4852 = _mm512_fmadd_ps(ifft4850, ifft4762, _mm512_shuffle_ps(ifft4850, ifft4850, 177));
__m512 ifft4765 = _mm512_fmadd_ps(sfRe274, ifft4762, _mm512_shuffle_ps(sfRe274, sfRe274, 177));
__m512 ifft4853 = _mm512_fmadd_ps(sfRe278, ifft4762, _mm512_shuffle_ps(sfRe278, sfRe278, 177));
__m512 ifft4766 = _mm512_fmadd_ps(sfIm274, ifft4762, _mm512_shuffle_ps(sfIm274, sfIm274, 177));
__m512 ifft4854 = _mm512_fmadd_ps(sfIm278, ifft4762, _mm512_shuffle_ps(sfIm278, sfIm278, 177));
__m512 ifft4767 = _mm512_fmadd_ps(sfRe275, ifft4762, _mm512_shuffle_ps(sfRe275, sfRe275, 177));
__m512 ifft4855 = _mm512_fmadd_ps(sfRe279, ifft4762, _mm512_shuffle_ps(sfRe279, sfRe279, 177));
__m512 ifft4768 = _mm512_fmadd_ps(sfIm275, ifft4762, _mm512_shuffle_ps(sfIm275, sfIm275, 177));
__m512 ifft4856 = _mm512_fmadd_ps(sfIm279, ifft4762, _mm512_shuffle_ps(sfIm279, sfIm279, 177));
__m512 ifft4769 = _mm512_fmadd_ps(sfRe276, ifft4762, _mm512_shuffle_ps(sfRe276, sfRe276, 177));
__m512 ifft4857 = _mm512_fmadd_ps(sfRe280, ifft4762, _mm512_shuffle_ps(sfRe280, sfRe280, 177));
__m512 ifft4770 = _mm512_fmadd_ps(sfIm276, ifft4762, _mm512_shuffle_ps(sfIm276, sfIm276, 177));
__m512 ifft4858 = _mm512_fmadd_ps(sfIm280, ifft4762, _mm512_shuffle_ps(sfIm280, sfIm280, 177));
__m512 ifft4771 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4772 = _mm512_mul_ps(ifft4763, ifft4771);
__m512 ifft4859 = _mm512_mul_ps(ifft4851, ifft4771);
__m512 ifft4773 = _mm512_mul_ps(ifft4764, ifft4771);
__m512 ifft4860 = _mm512_mul_ps(ifft4852, ifft4771);
__m512 ifft4774 = _mm512_mul_ps(ifft4765, ifft4771);
__m512 ifft4861 = _mm512_mul_ps(ifft4853, ifft4771);
__m512 ifft4775 = _mm512_mul_ps(ifft4766, ifft4771);
__m512 ifft4862 = _mm512_mul_ps(ifft4854, ifft4771);
__m512 ifft4776 = _mm512_mul_ps(ifft4767, ifft4771);
__m512 ifft4863 = _mm512_mul_ps(ifft4855, ifft4771);
__m512 ifft4777 = _mm512_mul_ps(ifft4768, ifft4771);
__m512 ifft4864 = _mm512_mul_ps(ifft4856, ifft4771);
__m512 ifft4778 = _mm512_mul_ps(ifft4769, ifft4771);
__m512 ifft4865 = _mm512_mul_ps(ifft4857, ifft4771);
__m512 ifft4779 = _mm512_mul_ps(ifft4770, ifft4771);
__m512 ifft4866 = _mm512_mul_ps(ifft4858, ifft4771);
__m512 ifft4780 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4781 = _mm512_fnmadd_ps(ifft4764, ifft4780, ifft4772);
__m512 ifft4867 = _mm512_fnmadd_ps(ifft4852, ifft4780, ifft4859);
__m512 ifft4782 = _mm512_fmadd_ps(ifft4763, ifft4780, ifft4773);
__m512 ifft4868 = _mm512_fmadd_ps(ifft4851, ifft4780, ifft4860);
__m512 ifft4783 = _mm512_fnmadd_ps(ifft4766, ifft4780, ifft4774);
__m512 ifft4869 = _mm512_fnmadd_ps(ifft4854, ifft4780, ifft4861);
__m512 ifft4784 = _mm512_fmadd_ps(ifft4765, ifft4780, ifft4775);
__m512 ifft4870 = _mm512_fmadd_ps(ifft4853, ifft4780, ifft4862);
__m512 ifft4785 = _mm512_fnmadd_ps(ifft4768, ifft4780, ifft4776);
__m512 ifft4871 = _mm512_fnmadd_ps(ifft4856, ifft4780, ifft4863);
__m512 ifft4786 = _mm512_fmadd_ps(ifft4767, ifft4780, ifft4777);
__m512 ifft4872 = _mm512_fmadd_ps(ifft4855, ifft4780, ifft4864);
__m512 ifft4787 = _mm512_fnmadd_ps(ifft4770, ifft4780, ifft4778);
__m512 ifft4873 = _mm512_fnmadd_ps(ifft4858, ifft4780, ifft4865);
__m512 ifft4788 = _mm512_fmadd_ps(ifft4769, ifft4780, ifft4779);
__m512 ifft4874 = _mm512_fmadd_ps(ifft4857, ifft4780, ifft4866);
__m512 ifft4789 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4790 = _mm512_fmadd_ps(ifft4781, ifft4789, _mm512_shuffle_ps(ifft4781, ifft4781, 78));
__m512 ifft4875 = _mm512_fmadd_ps(ifft4867, ifft4789, _mm512_shuffle_ps(ifft4867, ifft4867, 78));
__m512 ifft4791 = _mm512_fmadd_ps(ifft4782, ifft4789, _mm512_shuffle_ps(ifft4782, ifft4782, 78));
__m512 ifft4876 = _mm512_fmadd_ps(ifft4868, ifft4789, _mm512_shuffle_ps(ifft4868, ifft4868, 78));
__m512 ifft4792 = _mm512_fmadd_ps(ifft4783, ifft4789, _mm512_shuffle_ps(ifft4783, ifft4783, 78));
__m512 ifft4877 = _mm512_fmadd_ps(ifft4869, ifft4789, _mm512_shuffle_ps(ifft4869, ifft4869, 78));
__m512 ifft4793 = _mm512_fmadd_ps(ifft4784, ifft4789, _mm512_shuffle_ps(ifft4784, ifft4784, 78));
__m512 ifft4878 = _mm512_fmadd_ps(ifft4870, ifft4789, _mm512_shuffle_ps(ifft4870, ifft4870, 78));
__m512 ifft4794 = _mm512_fmadd_ps(ifft4785, ifft4789, _mm512_shuffle_ps(ifft4785, ifft4785, 78));
__m512 ifft4879 = _mm512_fmadd_ps(ifft4871, ifft4789, _mm512_shuffle_ps(ifft4871, ifft4871, 78));
__m512 ifft4795 = _mm512_fmadd_ps(ifft4786, ifft4789, _mm512_shuffle_ps(ifft4786, ifft4786, 78));
__m512 ifft4880 = _mm512_fmadd_ps(ifft4872, ifft4789, _mm512_shuffle_ps(ifft4872, ifft4872, 78));
__m512 ifft4796 = _mm512_fmadd_ps(ifft4787, ifft4789, _mm512_shuffle_ps(ifft4787, ifft4787, 78));
__m512 ifft4881 = _mm512_fmadd_ps(ifft4873, ifft4789, _mm512_shuffle_ps(ifft4873, ifft4873, 78));
__m512 ifft4797 = _mm512_fmadd_ps(ifft4788, ifft4789, _mm512_shuffle_ps(ifft4788, ifft4788, 78));
__m512 ifft4882 = _mm512_fmadd_ps(ifft4874, ifft4789, _mm512_shuffle_ps(ifft4874, ifft4874, 78));
__m512 ifft4798 = _mm512_mask_sub_ps(ifft4790, 49344, _mm512_setzero_ps(), ifft4791);
__m512 ifft4883 = _mm512_mask_sub_ps(ifft4875, 49344, _mm512_setzero_ps(), ifft4876);
__m512 ifft4799 = _mm512_mask_mov_ps(ifft4791, 49344, ifft4790);
__m512 ifft4884 = _mm512_mask_mov_ps(ifft4876, 49344, ifft4875);
__m512 ifft4800 = _mm512_mask_sub_ps(ifft4792, 49344, _mm512_setzero_ps(), ifft4793);
__m512 ifft4885 = _mm512_mask_sub_ps(ifft4877, 49344, _mm512_setzero_ps(), ifft4878);
__m512 ifft4801 = _mm512_mask_mov_ps(ifft4793, 49344, ifft4792);
__m512 ifft4886 = _mm512_mask_mov_ps(ifft4878, 49344, ifft4877);
__m512 ifft4802 = _mm512_mask_sub_ps(ifft4794, 49344, _mm512_setzero_ps(), ifft4795);
__m512 ifft4887 = _mm512_mask_sub_ps(ifft4879, 49344, _mm512_setzero_ps(), ifft4880);
__m512 ifft4803 = _mm512_mask_mov_ps(ifft4795, 49344, ifft4794);
__m512 ifft4888 = _mm512_mask_mov_ps(ifft4880, 49344, ifft4879);
__m512 ifft4804 = _mm512_mask_sub_ps(ifft4796, 49344, _mm512_setzero_ps(), ifft4797);
__m512 ifft4889 = _mm512_mask_sub_ps(ifft4881, 49344, _mm512_setzero_ps(), ifft4882);
__m512 ifft4805 = _mm512_mask_mov_ps(ifft4797, 49344, ifft4796);
__m512 ifft4890 = _mm512_mask_mov_ps(ifft4882, 49344, ifft4881);
__m512 ifft4806 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4807 = _mm512_fmadd_ps(ifft4798, ifft4806, _mm512_shuffle_f32x4(ifft4798, ifft4798, 177));
__m512 ifft4891 = _mm512_fmadd_ps(ifft4883, ifft4806, _mm512_shuffle_f32x4(ifft4883, ifft4883, 177));
__m512 ifft4808 = _mm512_fmadd_ps(ifft4799, ifft4806, _mm512_shuffle_f32x4(ifft4799, ifft4799, 177));
__m512 ifft4892 = _mm512_fmadd_ps(ifft4884, ifft4806, _mm512_shuffle_f32x4(ifft4884, ifft4884, 177));
__m512 ifft4809 = _mm512_fmadd_ps(ifft4800, ifft4806, _mm512_shuffle_f32x4(ifft4800, ifft4800, 177));
__m512 ifft4893 = _mm512_fmadd_ps(ifft4885, ifft4806, _mm512_shuffle_f32x4(ifft4885, ifft4885, 177));
__m512 ifft4810 = _mm512_fmadd_ps(ifft4801, ifft4806, _mm512_shuffle_f32x4(ifft4801, ifft4801, 177));
__m512 ifft4894 = _mm512_fmadd_ps(ifft4886, ifft4806, _mm512_shuffle_f32x4(ifft4886, ifft4886, 177));
__m512 ifft4811 = _mm512_fmadd_ps(ifft4802, ifft4806, _mm512_shuffle_f32x4(ifft4802, ifft4802, 177));
__m512 ifft4895 = _mm512_fmadd_ps(ifft4887, ifft4806, _mm512_shuffle_f32x4(ifft4887, ifft4887, 177));
__m512 ifft4812 = _mm512_fnmsub_ps(ifft4803, ifft4806, _mm512_shuffle_f32x4(ifft4803, ifft4803, 177));
__m512 ifft4896 = _mm512_fnmsub_ps(ifft4888, ifft4806, _mm512_shuffle_f32x4(ifft4888, ifft4888, 177));
__m512 ifft4813 = _mm512_fmadd_ps(ifft4804, ifft4806, _mm512_shuffle_f32x4(ifft4804, ifft4804, 177));
__m512 ifft4897 = _mm512_fmadd_ps(ifft4889, ifft4806, _mm512_shuffle_f32x4(ifft4889, ifft4889, 177));
__m512 ifft4814 = _mm512_fmadd_ps(ifft4805, ifft4806, _mm512_shuffle_f32x4(ifft4805, ifft4805, 177));
__m512 ifft4898 = _mm512_fmadd_ps(ifft4890, ifft4806, _mm512_shuffle_f32x4(ifft4890, ifft4890, 177));
__m512 ifft4815 = _mm512_add_ps(ifft4807, ifft4808);
__m512 ifft4899 = _mm512_add_ps(ifft4891, ifft4892);
__m512 ifft4816 = _mm512_sub_ps(ifft4807, ifft4808);
__m512 ifft4900 = _mm512_sub_ps(ifft4891, ifft4892);
__m512 ifft4817 = _mm512_sub_ps(ifft4809, ifft4813);
__m512 ifft4901 = _mm512_sub_ps(ifft4893, ifft4897);
__m512 ifft4818 = _mm512_add_ps(ifft4810, ifft4814);
__m512 ifft4902 = _mm512_add_ps(ifft4894, ifft4898);
__m512 ifft4819 = _mm512_add_ps(ifft4809, ifft4813);
__m512 ifft4903 = _mm512_add_ps(ifft4893, ifft4897);
__m512 ifft4820 = _mm512_sub_ps(ifft4810, ifft4814);
__m512 ifft4904 = _mm512_sub_ps(ifft4894, ifft4898);
__m512 ifft4821 = _mm512_mul_ps(ifft4811, _mm512_set1_ps(3.125e-02f));
__m512 ifft4905 = _mm512_mul_ps(ifft4895, _mm512_set1_ps(3.125e-02f));
__m512 ifft4822 = _mm512_mul_ps(ifft4812, _mm512_set1_ps(3.125e-02f));
__m512 ifft4906 = _mm512_mul_ps(ifft4896, _mm512_set1_ps(3.125e-02f));
__m512 ifft4823 = _mm512_fmadd_ps(ifft4815, _mm512_set1_ps(1.5625e-02f), ifft4821);
__m512 ifft4907 = _mm512_fmadd_ps(ifft4899, _mm512_set1_ps(1.5625e-02f), ifft4905);
__m512 ifft4824 = _mm512_fmsub_ps(ifft4815, _mm512_set1_ps(1.5625e-02f), ifft4821);
__m512 ifft4908 = _mm512_fmsub_ps(ifft4899, _mm512_set1_ps(1.5625e-02f), ifft4905);
__m512 ifft4825 = _mm512_fmadd_ps(ifft4816, _mm512_set1_ps(1.5625e-02f), ifft4822);
__m512 ifft4909 = _mm512_fmadd_ps(ifft4900, _mm512_set1_ps(1.5625e-02f), ifft4906);
__m512 ifft4826 = _mm512_fmsub_ps(ifft4816, _mm512_set1_ps(1.5625e-02f), ifft4822);
__m512 ifft4910 = _mm512_fmsub_ps(ifft4900, _mm512_set1_ps(1.5625e-02f), ifft4906);
__m512 ifft4827 = _mm512_add_ps(ifft4817, ifft4818);
__m512 ifft4911 = _mm512_add_ps(ifft4901, ifft4902);
__m512 ifft4828 = _mm512_sub_ps(ifft4817, ifft4818);
__m512 ifft4912 = _mm512_sub_ps(ifft4901, ifft4902);
__m512 ifft4829 = _mm512_fnmadd_ps(ifft4827, _mm512_set1_ps(7.0710677e-01f), ifft4819);
__m512 ifft4913 = _mm512_fnmadd_ps(ifft4911, _mm512_set1_ps(7.0710677e-01f), ifft4903);
__m512 ifft4830 = _mm512_fmadd_ps(ifft4827, _mm512_set1_ps(7.0710677e-01f), ifft4819);
__m512 ifft4914 = _mm512_fmadd_ps(ifft4911, _mm512_set1_ps(7.0710677e-01f), ifft4903);
__m512 ifft4831 = _mm512_fmadd_ps(ifft4828, _mm512_set1_ps(7.0710677e-01f), ifft4820);
__m512 ifft4915 = _mm512_fmadd_ps(ifft4912, _mm512_set1_ps(7.0710677e-01f), ifft4904);
__m512 ifft4832 = _mm512_fmsub_ps(ifft4828, _mm512_set1_ps(7.0710677e-01f), ifft4820);
__m512 ifft4916 = _mm512_fmsub_ps(ifft4912, _mm512_set1_ps(7.0710677e-01f), ifft4904);
__m512 ifft4833 = _mm512_add_ps(ifft4829, ifft4830);
__m512 ifft4917 = _mm512_add_ps(ifft4913, ifft4914);
__m512 ifft4834 = _mm512_sub_ps(ifft4829, ifft4830);
__m512 ifft4918 = _mm512_sub_ps(ifft4913, ifft4914);
__m512 ifft4835 = _mm512_add_ps(ifft4831, ifft4832);
__m512 ifft4919 = _mm512_add_ps(ifft4915, ifft4916);
__m512 ifft4836 = _mm512_sub_ps(ifft4831, ifft4832);
__m512 ifft4920 = _mm512_sub_ps(ifft4915, ifft4916);
__m512 ifft4837 = _mm512_fmadd_ps(ifft4833, _mm512_set1_ps(1.5625e-02f), ifft4823);
__m512 ifft4921 = _mm512_fmadd_ps(ifft4917, _mm512_set1_ps(1.5625e-02f), ifft4907);
__m512 ifft4838 = _mm512_fnmadd_ps(ifft4833, _mm512_set1_ps(1.5625e-02f), ifft4823);
__m512 ifft4922 = _mm512_fnmadd_ps(ifft4917, _mm512_set1_ps(1.5625e-02f), ifft4907);
__m512 ifft4839 = _mm512_fmadd_ps(ifft4835, _mm512_set1_ps(1.5625e-02f), ifft4825);
__m512 ifft4923 = _mm512_fmadd_ps(ifft4919, _mm512_set1_ps(1.5625e-02f), ifft4909);
__m512 ifft4840 = _mm512_fnmadd_ps(ifft4835, _mm512_set1_ps(1.5625e-02f), ifft4825);
__m512 ifft4924 = _mm512_fnmadd_ps(ifft4919, _mm512_set1_ps(1.5625e-02f), ifft4909);
__m512 ifft4841 = _mm512_fnmadd_ps(ifft4836, _mm512_set1_ps(1.5625e-02f), ifft4824);
__m512 ifft4925 = _mm512_fnmadd_ps(ifft4920, _mm512_set1_ps(1.5625e-02f), ifft4908);
__m512 ifft4842 = _mm512_fmadd_ps(ifft4836, _mm512_set1_ps(1.5625e-02f), ifft4824);
__m512 ifft4926 = _mm512_fmadd_ps(ifft4920, _mm512_set1_ps(1.5625e-02f), ifft4908);
__m512 ifft4843 = _mm512_fmadd_ps(ifft4834, _mm512_set1_ps(1.5625e-02f), ifft4826);
__m512 ifft4927 = _mm512_fmadd_ps(ifft4918, _mm512_set1_ps(1.5625e-02f), ifft4910);
__m512 ifft4844 = _mm512_fnmadd_ps(ifft4834, _mm512_set1_ps(1.5625e-02f), ifft4826);
__m512 ifft4928 = _mm512_fnmadd_ps(ifft4918, _mm512_set1_ps(1.5625e-02f), ifft4910);
// Pick the result vectors that will be written out. Five vectors are kept
// from each of the two interleaved computations above (dat860..dat864 from
// the ifft48xx chain ending at ifft4844, dat865..dat869 from the chain
// ending at ifft4928).
__m512 dat860 = ifft4837;
__m512 dat865 = ifft4921;
__m512 dat861 = ifft4839;
__m512 dat866 = ifft4923;
__m512 dat862 = ifft4841;
__m512 dat867 = ifft4925;
__m512 dat863 = ifft4843;
__m512 dat868 = ifft4927;
__m512 dat864 = ifft4838;
__m512 dat869 = ifft4922;
// The remaining six results are not stored; the (void) casts only reference
// them (presumably to silence unused-variable warnings).
(void)ifft4840;
(void)ifft4924;
(void)ifft4842;
(void)ifft4926;
(void)ifft4844;
(void)ifft4928;
// Elementwise max(0, x) on every kept vector, i.e. negative lanes become 0
// (ReLU-style clamp).
dat860 = _mm512_max_ps(_mm512_setzero_ps(), dat860);
dat865 = _mm512_max_ps(_mm512_setzero_ps(), dat865);
dat861 = _mm512_max_ps(_mm512_setzero_ps(), dat861);
dat866 = _mm512_max_ps(_mm512_setzero_ps(), dat866);
dat862 = _mm512_max_ps(_mm512_setzero_ps(), dat862);
dat867 = _mm512_max_ps(_mm512_setzero_ps(), dat867);
dat863 = _mm512_max_ps(_mm512_setzero_ps(), dat863);
dat868 = _mm512_max_ps(_mm512_setzero_ps(), dat868);
dat864 = _mm512_max_ps(_mm512_setzero_ps(), dat864);
dat869 = _mm512_max_ps(_mm512_setzero_ps(), dat869);
// Masked unaligned stores: lane i is written to element i past the given
// address, and only lanes whose mask bit is set are written.
//   mask 3    (0x0003) -> lanes 0-1
//   mask 7936 (0x1F00) -> lanes 8-12
//   mask 31   (0x001F) -> lanes 0-4
//   mask 768  (0x0300) -> lanes 8-9
// Each vector is stored twice, once for its low-lane group and once for its
// lane-8+ group, at two different destinations.
// The 0*t29 term contributes nothing to the address.
// NOTE(review): offsets look like byte offsets (4*toW16 per float column,
// 448*toH16 per row) -- confirm against the declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+80+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat860);
_mm512_mask_storeu_ps(datPtr2+52088+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat860);
_mm512_mask_storeu_ps(datPtr2+1880+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat865);
_mm512_mask_storeu_ps(datPtr2+50288+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat865);
_mm512_mask_storeu_ps(datPtr2+528+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat861);
_mm512_mask_storeu_ps(datPtr2+52536+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat861);
_mm512_mask_storeu_ps(datPtr2+2328+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat866);
_mm512_mask_storeu_ps(datPtr2+50736+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat866);
_mm512_mask_storeu_ps(datPtr2+976+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat862);
_mm512_mask_storeu_ps(datPtr2+52984+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat862);
_mm512_mask_storeu_ps(datPtr2+2776+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat867);
_mm512_mask_storeu_ps(datPtr2+51184+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat867);
_mm512_mask_storeu_ps(datPtr2+1424+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat863);
_mm512_mask_storeu_ps(datPtr2+53432+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat863);
_mm512_mask_storeu_ps(datPtr2+3224+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat868);
_mm512_mask_storeu_ps(datPtr2+51632+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat868);
_mm512_mask_storeu_ps(datPtr2+1872+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 3, dat864);
_mm512_mask_storeu_ps(datPtr2+53880+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 7936, dat864);
_mm512_mask_storeu_ps(datPtr2+3672+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 31, dat869);
_mm512_mask_storeu_ps(datPtr2+52080+3215360*i9+200960*k40+100480*r17+448*toH16+4*toW16+0*t29, 768, dat869);
}
}
// Finished the current j5 step: return once j5 has reached last2, otherwise
// move on to the next j5.
if (j5 >= last2) return;
++j5;
}
// Falling out of the preceding section leaves j5 at 84, the first index
// handled by the section below.
j5 = 84;
}
// rel6 is j5 relative to 84; the branch below is taken when rel6 < 1.
// NOTE(review): presumably j5 >= 84 at this point, so this is the j5 == 84
// case only -- confirm against the code that precedes this section.
ptrdiff_t rel6 = j5-84;
ptrdiff_t base6 = 105;
if (rel6 < 1) {
// Fixed output coordinates for this section; both are 105 and feed the
// 448*toH17 and 4*toW17 terms of the store addresses further down.
ptrdiff_t toH17 = base6+0;
ptrdiff_t toW17 = 105;
// k41 starts at 16*w21 and the loop stops when k41 == 16 (16 iterations when
// w21 == 0, none when w21 == 1).
// NOTE(review): the != test assumes 16*w21 <= 16 -- confirm w21's range.
ptrdiff_t k41 = 16*w21;
for (; k41 != 16; ++k41) {
// Two iterations per k41 (r18 = 0, 1).
ptrdiff_t r18 = 0;
for (; r18 != 2; ++r18) {
// t30 is fixed at 0 here, so every 256*t30 / 0*t30 term below is zero.
ptrdiff_t t30 = 0;
// Load sixteen vectors with unaligned loads. They come in four groups whose
// base offsets are 0, 2166784, 4333568 and 6500352 (multiples of 2166784);
// inside each group the four loads sit at +0, +64, +128 and +192 and are
// named Re/Im for one chain (sfRe281..284 / sfIm281..284) and Re/Im for a
// second chain (sfRe285..288 / sfIm285..288).
// NOTE(review): a step of 64 matches one 64-byte __m512 if sfPtr3 is a byte
// pointer -- confirm against the declaration of sfPtr3.
__m512 sfRe281 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm281 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe285 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm285 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe282 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm282 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe286 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm286 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe283 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm283 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe287 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm287 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe284 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm284 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfRe288 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
__m512 sfIm288 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k41+768*r18+256*t30);
// In-register transform of the sixteen loaded vectors. Two independent
// chains are interleaved statement by statement: one starting from
// sfRe281..284 / sfIm281..284 and one from sfRe285..288 / sfIm285..288.
// NOTE(review): the ifft* naming suggests an inverse FFT; the comments below
// describe only the lane arithmetic that each statement performs.
//
// Stage 1 (first Re/Im pair of each chain only): gather lanes with two index
// vectors, then combine the Re and Im gathers with a masked FMA. Mask 65021
// (0xFDFD) covers every lane except 1 and 9; in the covered lanes the result
// is a*b+c (fmadd) or -(a*b)+c (fnmadd), while lanes 1 and 9 pass the first
// operand through unchanged.
__m512i ifft4929 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft4930 = _mm512_permutexvar_ps(ifft4929, sfRe281);
__m512 ifft5021 = _mm512_permutexvar_ps(ifft4929, sfRe285);
__m512i ifft4931 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft4932 = _mm512_permutexvar_ps(ifft4931, sfRe281);
__m512 ifft5022 = _mm512_permutexvar_ps(ifft4931, sfRe285);
__m512 ifft4933 = _mm512_permutexvar_ps(ifft4929, sfIm281);
__m512 ifft5023 = _mm512_permutexvar_ps(ifft4929, sfIm285);
__m512 ifft4934 = _mm512_permutexvar_ps(ifft4931, sfIm281);
__m512 ifft5024 = _mm512_permutexvar_ps(ifft4931, sfIm285);
__m512 ifft4935 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft4936 = _mm512_mask_fmadd_ps(ifft4934, 65021, ifft4935, ifft4930);
__m512 ifft5025 = _mm512_mask_fmadd_ps(ifft5024, 65021, ifft4935, ifft5021);
__m512 ifft4937 = _mm512_mask_fnmadd_ps(ifft4933, 65021, ifft4935, ifft4932);
__m512 ifft5026 = _mm512_mask_fnmadd_ps(ifft5023, 65021, ifft4935, ifft5022);
// Stage 2: butterfly over adjacent lanes. shuffle_ps(x, x, 177) swaps the two
// lanes of every pair, and the +1/-1 pattern makes lane 2k = x[2k] + x[2k+1]
// and lane 2k+1 = x[2k] - x[2k+1].
__m512 ifft4938 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft4939 = _mm512_fmadd_ps(ifft4936, ifft4938, _mm512_shuffle_ps(ifft4936, ifft4936, 177));
__m512 ifft5027 = _mm512_fmadd_ps(ifft5025, ifft4938, _mm512_shuffle_ps(ifft5025, ifft5025, 177));
__m512 ifft4940 = _mm512_fmadd_ps(ifft4937, ifft4938, _mm512_shuffle_ps(ifft4937, ifft4937, 177));
__m512 ifft5028 = _mm512_fmadd_ps(ifft5026, ifft4938, _mm512_shuffle_ps(ifft5026, ifft5026, 177));
__m512 ifft4941 = _mm512_fmadd_ps(sfRe282, ifft4938, _mm512_shuffle_ps(sfRe282, sfRe282, 177));
__m512 ifft5029 = _mm512_fmadd_ps(sfRe286, ifft4938, _mm512_shuffle_ps(sfRe286, sfRe286, 177));
__m512 ifft4942 = _mm512_fmadd_ps(sfIm282, ifft4938, _mm512_shuffle_ps(sfIm282, sfIm282, 177));
__m512 ifft5030 = _mm512_fmadd_ps(sfIm286, ifft4938, _mm512_shuffle_ps(sfIm286, sfIm286, 177));
__m512 ifft4943 = _mm512_fmadd_ps(sfRe283, ifft4938, _mm512_shuffle_ps(sfRe283, sfRe283, 177));
__m512 ifft5031 = _mm512_fmadd_ps(sfRe287, ifft4938, _mm512_shuffle_ps(sfRe287, sfRe287, 177));
__m512 ifft4944 = _mm512_fmadd_ps(sfIm283, ifft4938, _mm512_shuffle_ps(sfIm283, sfIm283, 177));
__m512 ifft5032 = _mm512_fmadd_ps(sfIm287, ifft4938, _mm512_shuffle_ps(sfIm287, sfIm287, 177));
__m512 ifft4945 = _mm512_fmadd_ps(sfRe284, ifft4938, _mm512_shuffle_ps(sfRe284, sfRe284, 177));
__m512 ifft5033 = _mm512_fmadd_ps(sfRe288, ifft4938, _mm512_shuffle_ps(sfRe288, sfRe288, 177));
__m512 ifft4946 = _mm512_fmadd_ps(sfIm284, ifft4938, _mm512_shuffle_ps(sfIm284, sfIm284, 177));
__m512 ifft5034 = _mm512_fmadd_ps(sfIm288, ifft4938, _mm512_shuffle_ps(sfIm288, sfIm288, 177));
// Stage 3: per-lane multiply of each (a, b) vector pair by the constant pair
// (C, S) = (ifft4947, ifft4956): first the products a*C and b*C ...
__m512 ifft4947 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft4948 = _mm512_mul_ps(ifft4939, ifft4947);
__m512 ifft5035 = _mm512_mul_ps(ifft5027, ifft4947);
__m512 ifft4949 = _mm512_mul_ps(ifft4940, ifft4947);
__m512 ifft5036 = _mm512_mul_ps(ifft5028, ifft4947);
__m512 ifft4950 = _mm512_mul_ps(ifft4941, ifft4947);
__m512 ifft5037 = _mm512_mul_ps(ifft5029, ifft4947);
__m512 ifft4951 = _mm512_mul_ps(ifft4942, ifft4947);
__m512 ifft5038 = _mm512_mul_ps(ifft5030, ifft4947);
__m512 ifft4952 = _mm512_mul_ps(ifft4943, ifft4947);
__m512 ifft5039 = _mm512_mul_ps(ifft5031, ifft4947);
__m512 ifft4953 = _mm512_mul_ps(ifft4944, ifft4947);
__m512 ifft5040 = _mm512_mul_ps(ifft5032, ifft4947);
__m512 ifft4954 = _mm512_mul_ps(ifft4945, ifft4947);
__m512 ifft5041 = _mm512_mul_ps(ifft5033, ifft4947);
__m512 ifft4955 = _mm512_mul_ps(ifft4946, ifft4947);
__m512 ifft5042 = _mm512_mul_ps(ifft5034, ifft4947);
// ... then a' = a*C - b*S (fnmadd) and b' = b*C + a*S (fmadd). This has the
// form of a complex multiply of (a + i*b) by (C + i*S).
__m512 ifft4956 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft4957 = _mm512_fnmadd_ps(ifft4940, ifft4956, ifft4948);
__m512 ifft5043 = _mm512_fnmadd_ps(ifft5028, ifft4956, ifft5035);
__m512 ifft4958 = _mm512_fmadd_ps(ifft4939, ifft4956, ifft4949);
__m512 ifft5044 = _mm512_fmadd_ps(ifft5027, ifft4956, ifft5036);
__m512 ifft4959 = _mm512_fnmadd_ps(ifft4942, ifft4956, ifft4950);
__m512 ifft5045 = _mm512_fnmadd_ps(ifft5030, ifft4956, ifft5037);
__m512 ifft4960 = _mm512_fmadd_ps(ifft4941, ifft4956, ifft4951);
__m512 ifft5046 = _mm512_fmadd_ps(ifft5029, ifft4956, ifft5038);
__m512 ifft4961 = _mm512_fnmadd_ps(ifft4944, ifft4956, ifft4952);
__m512 ifft5047 = _mm512_fnmadd_ps(ifft5032, ifft4956, ifft5039);
__m512 ifft4962 = _mm512_fmadd_ps(ifft4943, ifft4956, ifft4953);
__m512 ifft5048 = _mm512_fmadd_ps(ifft5031, ifft4956, ifft5040);
__m512 ifft4963 = _mm512_fnmadd_ps(ifft4946, ifft4956, ifft4954);
__m512 ifft5049 = _mm512_fnmadd_ps(ifft5034, ifft4956, ifft5041);
__m512 ifft4964 = _mm512_fmadd_ps(ifft4945, ifft4956, ifft4955);
__m512 ifft5050 = _mm512_fmadd_ps(ifft5033, ifft4956, ifft5042);
// Stage 4: butterfly over lanes two apart. shuffle_ps(x, x, 78) swaps the
// 64-bit halves of every 128-bit group, so within each group of four lanes
// lane j (j = 0, 1) = x[j] + x[j+2] and lane j+2 = x[j] - x[j+2].
__m512 ifft4965 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft4966 = _mm512_fmadd_ps(ifft4957, ifft4965, _mm512_shuffle_ps(ifft4957, ifft4957, 78));
__m512 ifft5051 = _mm512_fmadd_ps(ifft5043, ifft4965, _mm512_shuffle_ps(ifft5043, ifft5043, 78));
__m512 ifft4967 = _mm512_fmadd_ps(ifft4958, ifft4965, _mm512_shuffle_ps(ifft4958, ifft4958, 78));
__m512 ifft5052 = _mm512_fmadd_ps(ifft5044, ifft4965, _mm512_shuffle_ps(ifft5044, ifft5044, 78));
__m512 ifft4968 = _mm512_fmadd_ps(ifft4959, ifft4965, _mm512_shuffle_ps(ifft4959, ifft4959, 78));
__m512 ifft5053 = _mm512_fmadd_ps(ifft5045, ifft4965, _mm512_shuffle_ps(ifft5045, ifft5045, 78));
__m512 ifft4969 = _mm512_fmadd_ps(ifft4960, ifft4965, _mm512_shuffle_ps(ifft4960, ifft4960, 78));
__m512 ifft5054 = _mm512_fmadd_ps(ifft5046, ifft4965, _mm512_shuffle_ps(ifft5046, ifft5046, 78));
__m512 ifft4970 = _mm512_fmadd_ps(ifft4961, ifft4965, _mm512_shuffle_ps(ifft4961, ifft4961, 78));
__m512 ifft5055 = _mm512_fmadd_ps(ifft5047, ifft4965, _mm512_shuffle_ps(ifft5047, ifft5047, 78));
__m512 ifft4971 = _mm512_fmadd_ps(ifft4962, ifft4965, _mm512_shuffle_ps(ifft4962, ifft4962, 78));
__m512 ifft5056 = _mm512_fmadd_ps(ifft5048, ifft4965, _mm512_shuffle_ps(ifft5048, ifft5048, 78));
__m512 ifft4972 = _mm512_fmadd_ps(ifft4963, ifft4965, _mm512_shuffle_ps(ifft4963, ifft4963, 78));
__m512 ifft5057 = _mm512_fmadd_ps(ifft5049, ifft4965, _mm512_shuffle_ps(ifft5049, ifft5049, 78));
__m512 ifft4973 = _mm512_fmadd_ps(ifft4964, ifft4965, _mm512_shuffle_ps(ifft4964, ifft4964, 78));
__m512 ifft5058 = _mm512_fmadd_ps(ifft5050, ifft4965, _mm512_shuffle_ps(ifft5050, ifft5050, 78));
// Stage 5: in lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) replace each
// (a, b) pair by (-b, a); all other lanes keep (a, b). The mask_sub computes
// 0 - b in the masked lanes and the mask_mov copies a into them.
__m512 ifft4974 = _mm512_mask_sub_ps(ifft4966, 49344, _mm512_setzero_ps(), ifft4967);
__m512 ifft5059 = _mm512_mask_sub_ps(ifft5051, 49344, _mm512_setzero_ps(), ifft5052);
__m512 ifft4975 = _mm512_mask_mov_ps(ifft4967, 49344, ifft4966);
__m512 ifft5060 = _mm512_mask_mov_ps(ifft5052, 49344, ifft5051);
__m512 ifft4976 = _mm512_mask_sub_ps(ifft4968, 49344, _mm512_setzero_ps(), ifft4969);
__m512 ifft5061 = _mm512_mask_sub_ps(ifft5053, 49344, _mm512_setzero_ps(), ifft5054);
__m512 ifft4977 = _mm512_mask_mov_ps(ifft4969, 49344, ifft4968);
__m512 ifft5062 = _mm512_mask_mov_ps(ifft5054, 49344, ifft5053);
__m512 ifft4978 = _mm512_mask_sub_ps(ifft4970, 49344, _mm512_setzero_ps(), ifft4971);
__m512 ifft5063 = _mm512_mask_sub_ps(ifft5055, 49344, _mm512_setzero_ps(), ifft5056);
__m512 ifft4979 = _mm512_mask_mov_ps(ifft4971, 49344, ifft4970);
__m512 ifft5064 = _mm512_mask_mov_ps(ifft5056, 49344, ifft5055);
__m512 ifft4980 = _mm512_mask_sub_ps(ifft4972, 49344, _mm512_setzero_ps(), ifft4973);
__m512 ifft5065 = _mm512_mask_sub_ps(ifft5057, 49344, _mm512_setzero_ps(), ifft5058);
__m512 ifft4981 = _mm512_mask_mov_ps(ifft4973, 49344, ifft4972);
__m512 ifft5066 = _mm512_mask_mov_ps(ifft5058, 49344, ifft5057);
// Stage 6: butterfly over lanes four apart. shuffle_f32x4(x, x, 177) swaps
// adjacent 128-bit groups, so lane j of an even group = x[j] + x[j+4] and
// lane j+4 = x[j] - x[j+4]. The one fnmsub per chain (ifft4988 / ifft5072)
// yields the negation of that same butterfly.
__m512 ifft4982 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft4983 = _mm512_fmadd_ps(ifft4974, ifft4982, _mm512_shuffle_f32x4(ifft4974, ifft4974, 177));
__m512 ifft5067 = _mm512_fmadd_ps(ifft5059, ifft4982, _mm512_shuffle_f32x4(ifft5059, ifft5059, 177));
__m512 ifft4984 = _mm512_fmadd_ps(ifft4975, ifft4982, _mm512_shuffle_f32x4(ifft4975, ifft4975, 177));
__m512 ifft5068 = _mm512_fmadd_ps(ifft5060, ifft4982, _mm512_shuffle_f32x4(ifft5060, ifft5060, 177));
__m512 ifft4985 = _mm512_fmadd_ps(ifft4976, ifft4982, _mm512_shuffle_f32x4(ifft4976, ifft4976, 177));
__m512 ifft5069 = _mm512_fmadd_ps(ifft5061, ifft4982, _mm512_shuffle_f32x4(ifft5061, ifft5061, 177));
__m512 ifft4986 = _mm512_fmadd_ps(ifft4977, ifft4982, _mm512_shuffle_f32x4(ifft4977, ifft4977, 177));
__m512 ifft5070 = _mm512_fmadd_ps(ifft5062, ifft4982, _mm512_shuffle_f32x4(ifft5062, ifft5062, 177));
__m512 ifft4987 = _mm512_fmadd_ps(ifft4978, ifft4982, _mm512_shuffle_f32x4(ifft4978, ifft4978, 177));
__m512 ifft5071 = _mm512_fmadd_ps(ifft5063, ifft4982, _mm512_shuffle_f32x4(ifft5063, ifft5063, 177));
__m512 ifft4988 = _mm512_fnmsub_ps(ifft4979, ifft4982, _mm512_shuffle_f32x4(ifft4979, ifft4979, 177));
__m512 ifft5072 = _mm512_fnmsub_ps(ifft5064, ifft4982, _mm512_shuffle_f32x4(ifft5064, ifft5064, 177));
__m512 ifft4989 = _mm512_fmadd_ps(ifft4980, ifft4982, _mm512_shuffle_f32x4(ifft4980, ifft4980, 177));
__m512 ifft5073 = _mm512_fmadd_ps(ifft5065, ifft4982, _mm512_shuffle_f32x4(ifft5065, ifft5065, 177));
__m512 ifft4990 = _mm512_fmadd_ps(ifft4981, ifft4982, _mm512_shuffle_f32x4(ifft4981, ifft4981, 177));
__m512 ifft5074 = _mm512_fmadd_ps(ifft5066, ifft4982, _mm512_shuffle_f32x4(ifft5066, ifft5066, 177));
// Stage 7: combine the eight vectors of each chain with one another (whole
// vectors, no lane movement): sums and differences first ...
__m512 ifft4991 = _mm512_add_ps(ifft4983, ifft4984);
__m512 ifft5075 = _mm512_add_ps(ifft5067, ifft5068);
__m512 ifft4992 = _mm512_sub_ps(ifft4983, ifft4984);
__m512 ifft5076 = _mm512_sub_ps(ifft5067, ifft5068);
__m512 ifft4993 = _mm512_sub_ps(ifft4985, ifft4989);
__m512 ifft5077 = _mm512_sub_ps(ifft5069, ifft5073);
__m512 ifft4994 = _mm512_add_ps(ifft4986, ifft4990);
__m512 ifft5078 = _mm512_add_ps(ifft5070, ifft5074);
__m512 ifft4995 = _mm512_add_ps(ifft4985, ifft4989);
__m512 ifft5079 = _mm512_add_ps(ifft5069, ifft5073);
__m512 ifft4996 = _mm512_sub_ps(ifft4986, ifft4990);
__m512 ifft5080 = _mm512_sub_ps(ifft5070, ifft5074);
// ... with scaling folded in: 3.125e-02 is 1/32 and 1.5625e-02 is 1/64.
// NOTE(review): presumably the transform's normalization -- confirm.
__m512 ifft4997 = _mm512_mul_ps(ifft4987, _mm512_set1_ps(3.125e-02f));
__m512 ifft5081 = _mm512_mul_ps(ifft5071, _mm512_set1_ps(3.125e-02f));
__m512 ifft4998 = _mm512_mul_ps(ifft4988, _mm512_set1_ps(3.125e-02f));
__m512 ifft5082 = _mm512_mul_ps(ifft5072, _mm512_set1_ps(3.125e-02f));
__m512 ifft4999 = _mm512_fmadd_ps(ifft4991, _mm512_set1_ps(1.5625e-02f), ifft4997);
__m512 ifft5083 = _mm512_fmadd_ps(ifft5075, _mm512_set1_ps(1.5625e-02f), ifft5081);
__m512 ifft5000 = _mm512_fmsub_ps(ifft4991, _mm512_set1_ps(1.5625e-02f), ifft4997);
__m512 ifft5084 = _mm512_fmsub_ps(ifft5075, _mm512_set1_ps(1.5625e-02f), ifft5081);
__m512 ifft5001 = _mm512_fmadd_ps(ifft4992, _mm512_set1_ps(1.5625e-02f), ifft4998);
__m512 ifft5085 = _mm512_fmadd_ps(ifft5076, _mm512_set1_ps(1.5625e-02f), ifft5082);
__m512 ifft5002 = _mm512_fmsub_ps(ifft4992, _mm512_set1_ps(1.5625e-02f), ifft4998);
__m512 ifft5086 = _mm512_fmsub_ps(ifft5076, _mm512_set1_ps(1.5625e-02f), ifft5082);
// The remaining four vectors are mixed using the constant 7.0710677e-01
// (approximately 1/sqrt(2)).
__m512 ifft5003 = _mm512_add_ps(ifft4993, ifft4994);
__m512 ifft5087 = _mm512_add_ps(ifft5077, ifft5078);
__m512 ifft5004 = _mm512_sub_ps(ifft4993, ifft4994);
__m512 ifft5088 = _mm512_sub_ps(ifft5077, ifft5078);
__m512 ifft5005 = _mm512_fnmadd_ps(ifft5003, _mm512_set1_ps(7.0710677e-01f), ifft4995);
__m512 ifft5089 = _mm512_fnmadd_ps(ifft5087, _mm512_set1_ps(7.0710677e-01f), ifft5079);
__m512 ifft5006 = _mm512_fmadd_ps(ifft5003, _mm512_set1_ps(7.0710677e-01f), ifft4995);
__m512 ifft5090 = _mm512_fmadd_ps(ifft5087, _mm512_set1_ps(7.0710677e-01f), ifft5079);
__m512 ifft5007 = _mm512_fmadd_ps(ifft5004, _mm512_set1_ps(7.0710677e-01f), ifft4996);
__m512 ifft5091 = _mm512_fmadd_ps(ifft5088, _mm512_set1_ps(7.0710677e-01f), ifft5080);
__m512 ifft5008 = _mm512_fmsub_ps(ifft5004, _mm512_set1_ps(7.0710677e-01f), ifft4996);
__m512 ifft5092 = _mm512_fmsub_ps(ifft5088, _mm512_set1_ps(7.0710677e-01f), ifft5080);
__m512 ifft5009 = _mm512_add_ps(ifft5005, ifft5006);
__m512 ifft5093 = _mm512_add_ps(ifft5089, ifft5090);
__m512 ifft5010 = _mm512_sub_ps(ifft5005, ifft5006);
__m512 ifft5094 = _mm512_sub_ps(ifft5089, ifft5090);
__m512 ifft5011 = _mm512_add_ps(ifft5007, ifft5008);
__m512 ifft5095 = _mm512_add_ps(ifft5091, ifft5092);
__m512 ifft5012 = _mm512_sub_ps(ifft5007, ifft5008);
__m512 ifft5096 = _mm512_sub_ps(ifft5091, ifft5092);
// Final combination: each of the eight outputs per chain is (scaled term)
// plus or minus 1/64 times one of the mixed vectors.
__m512 ifft5013 = _mm512_fmadd_ps(ifft5009, _mm512_set1_ps(1.5625e-02f), ifft4999);
__m512 ifft5097 = _mm512_fmadd_ps(ifft5093, _mm512_set1_ps(1.5625e-02f), ifft5083);
__m512 ifft5014 = _mm512_fnmadd_ps(ifft5009, _mm512_set1_ps(1.5625e-02f), ifft4999);
__m512 ifft5098 = _mm512_fnmadd_ps(ifft5093, _mm512_set1_ps(1.5625e-02f), ifft5083);
__m512 ifft5015 = _mm512_fmadd_ps(ifft5011, _mm512_set1_ps(1.5625e-02f), ifft5001);
__m512 ifft5099 = _mm512_fmadd_ps(ifft5095, _mm512_set1_ps(1.5625e-02f), ifft5085);
__m512 ifft5016 = _mm512_fnmadd_ps(ifft5011, _mm512_set1_ps(1.5625e-02f), ifft5001);
__m512 ifft5100 = _mm512_fnmadd_ps(ifft5095, _mm512_set1_ps(1.5625e-02f), ifft5085);
__m512 ifft5017 = _mm512_fnmadd_ps(ifft5012, _mm512_set1_ps(1.5625e-02f), ifft5000);
__m512 ifft5101 = _mm512_fnmadd_ps(ifft5096, _mm512_set1_ps(1.5625e-02f), ifft5084);
__m512 ifft5018 = _mm512_fmadd_ps(ifft5012, _mm512_set1_ps(1.5625e-02f), ifft5000);
__m512 ifft5102 = _mm512_fmadd_ps(ifft5096, _mm512_set1_ps(1.5625e-02f), ifft5084);
__m512 ifft5019 = _mm512_fmadd_ps(ifft5010, _mm512_set1_ps(1.5625e-02f), ifft5002);
__m512 ifft5103 = _mm512_fmadd_ps(ifft5094, _mm512_set1_ps(1.5625e-02f), ifft5086);
__m512 ifft5020 = _mm512_fnmadd_ps(ifft5010, _mm512_set1_ps(1.5625e-02f), ifft5002);
__m512 ifft5104 = _mm512_fnmadd_ps(ifft5094, _mm512_set1_ps(1.5625e-02f), ifft5086);
// Pick the result vectors that will be written out: five from each of the
// two interleaved chains (dat870..dat874 and dat875..dat879).
__m512 dat870 = ifft5013;
__m512 dat875 = ifft5097;
__m512 dat871 = ifft5015;
__m512 dat876 = ifft5099;
__m512 dat872 = ifft5017;
__m512 dat877 = ifft5101;
__m512 dat873 = ifft5019;
__m512 dat878 = ifft5103;
__m512 dat874 = ifft5014;
__m512 dat879 = ifft5098;
// The remaining six results are not stored; the (void) casts only reference
// them (presumably to silence unused-variable warnings).
(void)ifft5016;
(void)ifft5100;
(void)ifft5018;
(void)ifft5102;
(void)ifft5020;
(void)ifft5104;
// Merge each pair of vectors (first source datN, second source datN+5) with
// a two-source permute; index bit 4 (values 16-31) selects the second source.
//   pm49: lanes 0-4 <- first[0..4],   lanes 5-9 <- second[0..4]
//   pm50: lanes 0-4 <- second[8..12], lanes 5-9 <- first[8..12]
// Lanes 10-15 repeat the same pattern, but only lanes 0-6 are stored below.
__m512i pm49 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack241 = _mm512_permutex2var_ps(dat870, pm49, dat875);
__m512i pm50 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack242 = _mm512_permutex2var_ps(dat870, pm50, dat875);
__m512 pack243 = _mm512_permutex2var_ps(dat871, pm49, dat876);
__m512 pack244 = _mm512_permutex2var_ps(dat871, pm50, dat876);
__m512 pack245 = _mm512_permutex2var_ps(dat872, pm49, dat877);
__m512 pack246 = _mm512_permutex2var_ps(dat872, pm50, dat877);
__m512 pack247 = _mm512_permutex2var_ps(dat873, pm49, dat878);
__m512 pack248 = _mm512_permutex2var_ps(dat873, pm50, dat878);
__m512 pack249 = _mm512_permutex2var_ps(dat874, pm49, dat879);
__m512 pack250 = _mm512_permutex2var_ps(dat874, pm50, dat879);
// Elementwise max(0, x) on every packed vector, i.e. negative lanes become 0
// (ReLU-style clamp).
pack241 = _mm512_max_ps(_mm512_setzero_ps(), pack241);
pack242 = _mm512_max_ps(_mm512_setzero_ps(), pack242);
pack243 = _mm512_max_ps(_mm512_setzero_ps(), pack243);
pack244 = _mm512_max_ps(_mm512_setzero_ps(), pack244);
pack245 = _mm512_max_ps(_mm512_setzero_ps(), pack245);
pack246 = _mm512_max_ps(_mm512_setzero_ps(), pack246);
pack247 = _mm512_max_ps(_mm512_setzero_ps(), pack247);
pack248 = _mm512_max_ps(_mm512_setzero_ps(), pack248);
pack249 = _mm512_max_ps(_mm512_setzero_ps(), pack249);
pack250 = _mm512_max_ps(_mm512_setzero_ps(), pack250);
// Masked unaligned stores with mask 127 (0x7F): only lanes 0-6 of each packed
// vector are written. The pm49 packs go to offsets 0, 448, 896, 1344 and 1792;
// the pm50 packs go to the same offsets plus 50240. The 0*t30 term
// contributes nothing to the address.
// NOTE(review): offsets look like byte offsets (4*toW17 per float column,
// 448*toH17 per row) -- confirm against the declaration of datPtr2.
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack241);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack242);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack243);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack244);
_mm512_mask_storeu_ps(datPtr2+896+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack245);
_mm512_mask_storeu_ps(datPtr2+51136+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack246);
_mm512_mask_storeu_ps(datPtr2+1344+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack247);
_mm512_mask_storeu_ps(datPtr2+51584+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack248);
_mm512_mask_storeu_ps(datPtr2+1792+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack249);
_mm512_mask_storeu_ps(datPtr2+52032+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t30, 127, pack250);
// Unrolled step with t31 fixed at 0: load sixteen 16-float vectors from sfPtr3.
// They form four groups whose base offsets (256, 2167040, 4333824, 6500608) are 2166784 apart;
// inside each group the order is Re, Im, Re, Im at a spacing of 64.
// Offsets are in units of sfPtr3's pointee type, which is declared outside this view.
// NOTE(review): sfRe*/sfIm* presumably hold real/imaginary parts of spectral data - confirm
// against the code that fills sfPtr3.
ptrdiff_t t31 = 0;
// Group 0 (base offset 256).
__m512 sfRe289 = _mm512_loadu_ps(sfPtr3+256+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm289 = _mm512_loadu_ps(sfPtr3+320+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe293 = _mm512_loadu_ps(sfPtr3+384+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm293 = _mm512_loadu_ps(sfPtr3+448+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
// Group 1 (base offset 2167040).
__m512 sfRe290 = _mm512_loadu_ps(sfPtr3+2167040+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm290 = _mm512_loadu_ps(sfPtr3+2167104+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe294 = _mm512_loadu_ps(sfPtr3+2167168+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm294 = _mm512_loadu_ps(sfPtr3+2167232+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
// Group 2 (base offset 4333824).
__m512 sfRe291 = _mm512_loadu_ps(sfPtr3+4333824+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm291 = _mm512_loadu_ps(sfPtr3+4333888+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe295 = _mm512_loadu_ps(sfPtr3+4333952+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm295 = _mm512_loadu_ps(sfPtr3+4334016+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
// Group 3 (base offset 6500608).
__m512 sfRe292 = _mm512_loadu_ps(sfPtr3+6500608+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm292 = _mm512_loadu_ps(sfPtr3+6500672+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfRe296 = _mm512_loadu_ps(sfPtr3+6500736+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
__m512 sfIm296 = _mm512_loadu_ps(sfPtr3+6500800+8667136*i9+24576*j5+1536*k41+768*r18+256*t31);
// Arithmetic for the t31 step. NOTE(review): presumably an inverse transform, going by the
// ifft* names - confirm against the generator. Every operation appears twice, once for the
// set sfRe289..sfIm292 and once for the set sfRe293..sfIm296; the two sets never mix here.
//
// Stage 1: only the first Re/Im pair of each set is permuted and recombined; the other three
// pairs enter stage 2 unchanged. Mask 65021 (0xFDFD) excludes lanes 1 and 9, which therefore
// keep the first operand of the masked fmadd/fnmadd as-is.
__m512i ifft5105 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5106 = _mm512_permutexvar_ps(ifft5105, sfRe289);
__m512 ifft5197 = _mm512_permutexvar_ps(ifft5105, sfRe293);
__m512i ifft5107 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5108 = _mm512_permutexvar_ps(ifft5107, sfRe289);
__m512 ifft5198 = _mm512_permutexvar_ps(ifft5107, sfRe293);
__m512 ifft5109 = _mm512_permutexvar_ps(ifft5105, sfIm289);
__m512 ifft5199 = _mm512_permutexvar_ps(ifft5105, sfIm293);
__m512 ifft5110 = _mm512_permutexvar_ps(ifft5107, sfIm289);
__m512 ifft5200 = _mm512_permutexvar_ps(ifft5107, sfIm293);
__m512 ifft5111 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5112 = _mm512_mask_fmadd_ps(ifft5110, 65021, ifft5111, ifft5106);
__m512 ifft5201 = _mm512_mask_fmadd_ps(ifft5200, 65021, ifft5111, ifft5197);
__m512 ifft5113 = _mm512_mask_fnmadd_ps(ifft5109, 65021, ifft5111, ifft5108);
__m512 ifft5202 = _mm512_mask_fnmadd_ps(ifft5199, 65021, ifft5111, ifft5198);
// Stage 2: butterfly over adjacent lanes. shuffle_ps with 177 swaps neighbours (0<->1, 2<->3)
// and the sign vector is +1 on even lanes, -1 on odd lanes, so even lanes receive the pair's
// sum and odd lanes receive (even - odd).
__m512 ifft5114 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5115 = _mm512_fmadd_ps(ifft5112, ifft5114, _mm512_shuffle_ps(ifft5112, ifft5112, 177));
__m512 ifft5203 = _mm512_fmadd_ps(ifft5201, ifft5114, _mm512_shuffle_ps(ifft5201, ifft5201, 177));
__m512 ifft5116 = _mm512_fmadd_ps(ifft5113, ifft5114, _mm512_shuffle_ps(ifft5113, ifft5113, 177));
__m512 ifft5204 = _mm512_fmadd_ps(ifft5202, ifft5114, _mm512_shuffle_ps(ifft5202, ifft5202, 177));
__m512 ifft5117 = _mm512_fmadd_ps(sfRe290, ifft5114, _mm512_shuffle_ps(sfRe290, sfRe290, 177));
__m512 ifft5205 = _mm512_fmadd_ps(sfRe294, ifft5114, _mm512_shuffle_ps(sfRe294, sfRe294, 177));
__m512 ifft5118 = _mm512_fmadd_ps(sfIm290, ifft5114, _mm512_shuffle_ps(sfIm290, sfIm290, 177));
__m512 ifft5206 = _mm512_fmadd_ps(sfIm294, ifft5114, _mm512_shuffle_ps(sfIm294, sfIm294, 177));
__m512 ifft5119 = _mm512_fmadd_ps(sfRe291, ifft5114, _mm512_shuffle_ps(sfRe291, sfRe291, 177));
__m512 ifft5207 = _mm512_fmadd_ps(sfRe295, ifft5114, _mm512_shuffle_ps(sfRe295, sfRe295, 177));
__m512 ifft5120 = _mm512_fmadd_ps(sfIm291, ifft5114, _mm512_shuffle_ps(sfIm291, sfIm291, 177));
__m512 ifft5208 = _mm512_fmadd_ps(sfIm295, ifft5114, _mm512_shuffle_ps(sfIm295, sfIm295, 177));
__m512 ifft5121 = _mm512_fmadd_ps(sfRe292, ifft5114, _mm512_shuffle_ps(sfRe292, sfRe292, 177));
__m512 ifft5209 = _mm512_fmadd_ps(sfRe296, ifft5114, _mm512_shuffle_ps(sfRe296, sfRe296, 177));
__m512 ifft5122 = _mm512_fmadd_ps(sfIm292, ifft5114, _mm512_shuffle_ps(sfIm292, sfIm292, 177));
__m512 ifft5210 = _mm512_fmadd_ps(sfIm296, ifft5114, _mm512_shuffle_ps(sfIm296, sfIm296, 177));
// Stage 3: per-lane complex multiply of each (Re-derived, Im-derived) pair by the factor
// (ifft5123 + i*ifft5132): re' = re*c - im*s, im' = im*c + re*s. The factor differs from 1
// only in lanes 3, 5, 7 (and 11, 13, 15) of each vector.
__m512 ifft5123 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5124 = _mm512_mul_ps(ifft5115, ifft5123);
__m512 ifft5211 = _mm512_mul_ps(ifft5203, ifft5123);
__m512 ifft5125 = _mm512_mul_ps(ifft5116, ifft5123);
__m512 ifft5212 = _mm512_mul_ps(ifft5204, ifft5123);
__m512 ifft5126 = _mm512_mul_ps(ifft5117, ifft5123);
__m512 ifft5213 = _mm512_mul_ps(ifft5205, ifft5123);
__m512 ifft5127 = _mm512_mul_ps(ifft5118, ifft5123);
__m512 ifft5214 = _mm512_mul_ps(ifft5206, ifft5123);
__m512 ifft5128 = _mm512_mul_ps(ifft5119, ifft5123);
__m512 ifft5215 = _mm512_mul_ps(ifft5207, ifft5123);
__m512 ifft5129 = _mm512_mul_ps(ifft5120, ifft5123);
__m512 ifft5216 = _mm512_mul_ps(ifft5208, ifft5123);
__m512 ifft5130 = _mm512_mul_ps(ifft5121, ifft5123);
__m512 ifft5217 = _mm512_mul_ps(ifft5209, ifft5123);
__m512 ifft5131 = _mm512_mul_ps(ifft5122, ifft5123);
__m512 ifft5218 = _mm512_mul_ps(ifft5210, ifft5123);
__m512 ifft5132 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5133 = _mm512_fnmadd_ps(ifft5116, ifft5132, ifft5124);
__m512 ifft5219 = _mm512_fnmadd_ps(ifft5204, ifft5132, ifft5211);
__m512 ifft5134 = _mm512_fmadd_ps(ifft5115, ifft5132, ifft5125);
__m512 ifft5220 = _mm512_fmadd_ps(ifft5203, ifft5132, ifft5212);
__m512 ifft5135 = _mm512_fnmadd_ps(ifft5118, ifft5132, ifft5126);
__m512 ifft5221 = _mm512_fnmadd_ps(ifft5206, ifft5132, ifft5213);
__m512 ifft5136 = _mm512_fmadd_ps(ifft5117, ifft5132, ifft5127);
__m512 ifft5222 = _mm512_fmadd_ps(ifft5205, ifft5132, ifft5214);
__m512 ifft5137 = _mm512_fnmadd_ps(ifft5120, ifft5132, ifft5128);
__m512 ifft5223 = _mm512_fnmadd_ps(ifft5208, ifft5132, ifft5215);
__m512 ifft5138 = _mm512_fmadd_ps(ifft5119, ifft5132, ifft5129);
__m512 ifft5224 = _mm512_fmadd_ps(ifft5207, ifft5132, ifft5216);
__m512 ifft5139 = _mm512_fnmadd_ps(ifft5122, ifft5132, ifft5130);
__m512 ifft5225 = _mm512_fnmadd_ps(ifft5210, ifft5132, ifft5217);
__m512 ifft5140 = _mm512_fmadd_ps(ifft5121, ifft5132, ifft5131);
__m512 ifft5226 = _mm512_fmadd_ps(ifft5209, ifft5132, ifft5218);
// Stage 4: butterfly at lane distance 2. shuffle_ps with 78 swaps the two halves of every
// 128-bit group; signs are +1 on lanes 0-1 and -1 on lanes 2-3 of each group.
__m512 ifft5141 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5142 = _mm512_fmadd_ps(ifft5133, ifft5141, _mm512_shuffle_ps(ifft5133, ifft5133, 78));
__m512 ifft5227 = _mm512_fmadd_ps(ifft5219, ifft5141, _mm512_shuffle_ps(ifft5219, ifft5219, 78));
__m512 ifft5143 = _mm512_fmadd_ps(ifft5134, ifft5141, _mm512_shuffle_ps(ifft5134, ifft5134, 78));
__m512 ifft5228 = _mm512_fmadd_ps(ifft5220, ifft5141, _mm512_shuffle_ps(ifft5220, ifft5220, 78));
__m512 ifft5144 = _mm512_fmadd_ps(ifft5135, ifft5141, _mm512_shuffle_ps(ifft5135, ifft5135, 78));
__m512 ifft5229 = _mm512_fmadd_ps(ifft5221, ifft5141, _mm512_shuffle_ps(ifft5221, ifft5221, 78));
__m512 ifft5145 = _mm512_fmadd_ps(ifft5136, ifft5141, _mm512_shuffle_ps(ifft5136, ifft5136, 78));
__m512 ifft5230 = _mm512_fmadd_ps(ifft5222, ifft5141, _mm512_shuffle_ps(ifft5222, ifft5222, 78));
__m512 ifft5146 = _mm512_fmadd_ps(ifft5137, ifft5141, _mm512_shuffle_ps(ifft5137, ifft5137, 78));
__m512 ifft5231 = _mm512_fmadd_ps(ifft5223, ifft5141, _mm512_shuffle_ps(ifft5223, ifft5223, 78));
__m512 ifft5147 = _mm512_fmadd_ps(ifft5138, ifft5141, _mm512_shuffle_ps(ifft5138, ifft5138, 78));
__m512 ifft5232 = _mm512_fmadd_ps(ifft5224, ifft5141, _mm512_shuffle_ps(ifft5224, ifft5224, 78));
__m512 ifft5148 = _mm512_fmadd_ps(ifft5139, ifft5141, _mm512_shuffle_ps(ifft5139, ifft5139, 78));
__m512 ifft5233 = _mm512_fmadd_ps(ifft5225, ifft5141, _mm512_shuffle_ps(ifft5225, ifft5225, 78));
__m512 ifft5149 = _mm512_fmadd_ps(ifft5140, ifft5141, _mm512_shuffle_ps(ifft5140, ifft5140, 78));
__m512 ifft5234 = _mm512_fmadd_ps(ifft5226, ifft5141, _mm512_shuffle_ps(ifft5226, ifft5226, 78));
// Stage 5: in lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) replace the pair (a, b) by (-b, a),
// which is a multiplication by i; all other lanes pass through unchanged.
__m512 ifft5150 = _mm512_mask_sub_ps(ifft5142, 49344, _mm512_setzero_ps(), ifft5143);
__m512 ifft5235 = _mm512_mask_sub_ps(ifft5227, 49344, _mm512_setzero_ps(), ifft5228);
__m512 ifft5151 = _mm512_mask_mov_ps(ifft5143, 49344, ifft5142);
__m512 ifft5236 = _mm512_mask_mov_ps(ifft5228, 49344, ifft5227);
__m512 ifft5152 = _mm512_mask_sub_ps(ifft5144, 49344, _mm512_setzero_ps(), ifft5145);
__m512 ifft5237 = _mm512_mask_sub_ps(ifft5229, 49344, _mm512_setzero_ps(), ifft5230);
__m512 ifft5153 = _mm512_mask_mov_ps(ifft5145, 49344, ifft5144);
__m512 ifft5238 = _mm512_mask_mov_ps(ifft5230, 49344, ifft5229);
__m512 ifft5154 = _mm512_mask_sub_ps(ifft5146, 49344, _mm512_setzero_ps(), ifft5147);
__m512 ifft5239 = _mm512_mask_sub_ps(ifft5231, 49344, _mm512_setzero_ps(), ifft5232);
__m512 ifft5155 = _mm512_mask_mov_ps(ifft5147, 49344, ifft5146);
__m512 ifft5240 = _mm512_mask_mov_ps(ifft5232, 49344, ifft5231);
__m512 ifft5156 = _mm512_mask_sub_ps(ifft5148, 49344, _mm512_setzero_ps(), ifft5149);
__m512 ifft5241 = _mm512_mask_sub_ps(ifft5233, 49344, _mm512_setzero_ps(), ifft5234);
__m512 ifft5157 = _mm512_mask_mov_ps(ifft5149, 49344, ifft5148);
__m512 ifft5242 = _mm512_mask_mov_ps(ifft5234, 49344, ifft5233);
// Stage 6: butterfly at lane distance 4. shuffle_f32x4 with 177 swaps neighbouring 128-bit
// groups; signs are +1 on lanes 0-3 / 8-11 and -1 on lanes 4-7 / 12-15.
// ifft5164 and ifft5248 use fnmsub, i.e. the negated result, unlike every other line here.
__m512 ifft5158 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5159 = _mm512_fmadd_ps(ifft5150, ifft5158, _mm512_shuffle_f32x4(ifft5150, ifft5150, 177));
__m512 ifft5243 = _mm512_fmadd_ps(ifft5235, ifft5158, _mm512_shuffle_f32x4(ifft5235, ifft5235, 177));
__m512 ifft5160 = _mm512_fmadd_ps(ifft5151, ifft5158, _mm512_shuffle_f32x4(ifft5151, ifft5151, 177));
__m512 ifft5244 = _mm512_fmadd_ps(ifft5236, ifft5158, _mm512_shuffle_f32x4(ifft5236, ifft5236, 177));
__m512 ifft5161 = _mm512_fmadd_ps(ifft5152, ifft5158, _mm512_shuffle_f32x4(ifft5152, ifft5152, 177));
__m512 ifft5245 = _mm512_fmadd_ps(ifft5237, ifft5158, _mm512_shuffle_f32x4(ifft5237, ifft5237, 177));
__m512 ifft5162 = _mm512_fmadd_ps(ifft5153, ifft5158, _mm512_shuffle_f32x4(ifft5153, ifft5153, 177));
__m512 ifft5246 = _mm512_fmadd_ps(ifft5238, ifft5158, _mm512_shuffle_f32x4(ifft5238, ifft5238, 177));
__m512 ifft5163 = _mm512_fmadd_ps(ifft5154, ifft5158, _mm512_shuffle_f32x4(ifft5154, ifft5154, 177));
__m512 ifft5247 = _mm512_fmadd_ps(ifft5239, ifft5158, _mm512_shuffle_f32x4(ifft5239, ifft5239, 177));
__m512 ifft5164 = _mm512_fnmsub_ps(ifft5155, ifft5158, _mm512_shuffle_f32x4(ifft5155, ifft5155, 177));
__m512 ifft5248 = _mm512_fnmsub_ps(ifft5240, ifft5158, _mm512_shuffle_f32x4(ifft5240, ifft5240, 177));
__m512 ifft5165 = _mm512_fmadd_ps(ifft5156, ifft5158, _mm512_shuffle_f32x4(ifft5156, ifft5156, 177));
__m512 ifft5249 = _mm512_fmadd_ps(ifft5241, ifft5158, _mm512_shuffle_f32x4(ifft5241, ifft5241, 177));
__m512 ifft5166 = _mm512_fmadd_ps(ifft5157, ifft5158, _mm512_shuffle_f32x4(ifft5157, ifft5157, 177));
__m512 ifft5250 = _mm512_fmadd_ps(ifft5242, ifft5158, _mm512_shuffle_f32x4(ifft5242, ifft5242, 177));
// Stage 7: combine the eight vectors of each set with one another (no more lane shuffles).
// Scale factors are folded in here: 1.5625e-02 is 1/64 and 3.125e-02 is 1/32.
__m512 ifft5167 = _mm512_add_ps(ifft5159, ifft5160);
__m512 ifft5251 = _mm512_add_ps(ifft5243, ifft5244);
__m512 ifft5168 = _mm512_sub_ps(ifft5159, ifft5160);
__m512 ifft5252 = _mm512_sub_ps(ifft5243, ifft5244);
__m512 ifft5169 = _mm512_sub_ps(ifft5161, ifft5165);
__m512 ifft5253 = _mm512_sub_ps(ifft5245, ifft5249);
__m512 ifft5170 = _mm512_add_ps(ifft5162, ifft5166);
__m512 ifft5254 = _mm512_add_ps(ifft5246, ifft5250);
__m512 ifft5171 = _mm512_add_ps(ifft5161, ifft5165);
__m512 ifft5255 = _mm512_add_ps(ifft5245, ifft5249);
__m512 ifft5172 = _mm512_sub_ps(ifft5162, ifft5166);
__m512 ifft5256 = _mm512_sub_ps(ifft5246, ifft5250);
__m512 ifft5173 = _mm512_mul_ps(ifft5163, _mm512_set1_ps(3.125e-02f));
__m512 ifft5257 = _mm512_mul_ps(ifft5247, _mm512_set1_ps(3.125e-02f));
__m512 ifft5174 = _mm512_mul_ps(ifft5164, _mm512_set1_ps(3.125e-02f));
__m512 ifft5258 = _mm512_mul_ps(ifft5248, _mm512_set1_ps(3.125e-02f));
__m512 ifft5175 = _mm512_fmadd_ps(ifft5167, _mm512_set1_ps(1.5625e-02f), ifft5173);
__m512 ifft5259 = _mm512_fmadd_ps(ifft5251, _mm512_set1_ps(1.5625e-02f), ifft5257);
__m512 ifft5176 = _mm512_fmsub_ps(ifft5167, _mm512_set1_ps(1.5625e-02f), ifft5173);
__m512 ifft5260 = _mm512_fmsub_ps(ifft5251, _mm512_set1_ps(1.5625e-02f), ifft5257);
__m512 ifft5177 = _mm512_fmadd_ps(ifft5168, _mm512_set1_ps(1.5625e-02f), ifft5174);
__m512 ifft5261 = _mm512_fmadd_ps(ifft5252, _mm512_set1_ps(1.5625e-02f), ifft5258);
__m512 ifft5178 = _mm512_fmsub_ps(ifft5168, _mm512_set1_ps(1.5625e-02f), ifft5174);
__m512 ifft5262 = _mm512_fmsub_ps(ifft5252, _mm512_set1_ps(1.5625e-02f), ifft5258);
__m512 ifft5179 = _mm512_add_ps(ifft5169, ifft5170);
__m512 ifft5263 = _mm512_add_ps(ifft5253, ifft5254);
__m512 ifft5180 = _mm512_sub_ps(ifft5169, ifft5170);
__m512 ifft5264 = _mm512_sub_ps(ifft5253, ifft5254);
__m512 ifft5181 = _mm512_fnmadd_ps(ifft5179, _mm512_set1_ps(7.0710677e-01f), ifft5171);
__m512 ifft5265 = _mm512_fnmadd_ps(ifft5263, _mm512_set1_ps(7.0710677e-01f), ifft5255);
__m512 ifft5182 = _mm512_fmadd_ps(ifft5179, _mm512_set1_ps(7.0710677e-01f), ifft5171);
__m512 ifft5266 = _mm512_fmadd_ps(ifft5263, _mm512_set1_ps(7.0710677e-01f), ifft5255);
__m512 ifft5183 = _mm512_fmadd_ps(ifft5180, _mm512_set1_ps(7.0710677e-01f), ifft5172);
__m512 ifft5267 = _mm512_fmadd_ps(ifft5264, _mm512_set1_ps(7.0710677e-01f), ifft5256);
__m512 ifft5184 = _mm512_fmsub_ps(ifft5180, _mm512_set1_ps(7.0710677e-01f), ifft5172);
__m512 ifft5268 = _mm512_fmsub_ps(ifft5264, _mm512_set1_ps(7.0710677e-01f), ifft5256);
__m512 ifft5185 = _mm512_add_ps(ifft5181, ifft5182);
__m512 ifft5269 = _mm512_add_ps(ifft5265, ifft5266);
__m512 ifft5186 = _mm512_sub_ps(ifft5181, ifft5182);
__m512 ifft5270 = _mm512_sub_ps(ifft5265, ifft5266);
__m512 ifft5187 = _mm512_add_ps(ifft5183, ifft5184);
__m512 ifft5271 = _mm512_add_ps(ifft5267, ifft5268);
__m512 ifft5188 = _mm512_sub_ps(ifft5183, ifft5184);
__m512 ifft5272 = _mm512_sub_ps(ifft5267, ifft5268);
// Final outputs: eight vectors per set. The statements after this span consume only
// ifft5189, ifft5191, ifft5273 and ifft5275; the rest are explicitly discarded there.
__m512 ifft5189 = _mm512_fmadd_ps(ifft5185, _mm512_set1_ps(1.5625e-02f), ifft5175);
__m512 ifft5273 = _mm512_fmadd_ps(ifft5269, _mm512_set1_ps(1.5625e-02f), ifft5259);
__m512 ifft5190 = _mm512_fnmadd_ps(ifft5185, _mm512_set1_ps(1.5625e-02f), ifft5175);
__m512 ifft5274 = _mm512_fnmadd_ps(ifft5269, _mm512_set1_ps(1.5625e-02f), ifft5259);
__m512 ifft5191 = _mm512_fmadd_ps(ifft5187, _mm512_set1_ps(1.5625e-02f), ifft5177);
__m512 ifft5275 = _mm512_fmadd_ps(ifft5271, _mm512_set1_ps(1.5625e-02f), ifft5261);
__m512 ifft5192 = _mm512_fnmadd_ps(ifft5187, _mm512_set1_ps(1.5625e-02f), ifft5177);
__m512 ifft5276 = _mm512_fnmadd_ps(ifft5271, _mm512_set1_ps(1.5625e-02f), ifft5261);
__m512 ifft5193 = _mm512_fnmadd_ps(ifft5188, _mm512_set1_ps(1.5625e-02f), ifft5176);
__m512 ifft5277 = _mm512_fnmadd_ps(ifft5272, _mm512_set1_ps(1.5625e-02f), ifft5260);
__m512 ifft5194 = _mm512_fmadd_ps(ifft5188, _mm512_set1_ps(1.5625e-02f), ifft5176);
__m512 ifft5278 = _mm512_fmadd_ps(ifft5272, _mm512_set1_ps(1.5625e-02f), ifft5260);
__m512 ifft5195 = _mm512_fmadd_ps(ifft5186, _mm512_set1_ps(1.5625e-02f), ifft5178);
__m512 ifft5279 = _mm512_fmadd_ps(ifft5270, _mm512_set1_ps(1.5625e-02f), ifft5262);
__m512 ifft5196 = _mm512_fnmadd_ps(ifft5186, _mm512_set1_ps(1.5625e-02f), ifft5178);
__m512 ifft5280 = _mm512_fnmadd_ps(ifft5270, _mm512_set1_ps(1.5625e-02f), ifft5262);
// Epilogue of the t31 step: keep four of the sixteen result vectors, pack, clamp, store.
__m512 dat880 = ifft5189;
__m512 dat882 = ifft5273;
__m512 dat881 = ifft5191;
__m512 dat883 = ifft5275;
// The other twelve results are not needed in this step; the (void) casts mark them as
// intentionally unused.
(void)ifft5193;
(void)ifft5277;
(void)ifft5195;
(void)ifft5279;
(void)ifft5190;
(void)ifft5274;
(void)ifft5192;
(void)ifft5276;
(void)ifft5194;
(void)ifft5278;
(void)ifft5196;
(void)ifft5280;
// pm51: result lanes 0-4 come from lanes 0-4 of the first operand, lanes 5-9 from lanes 0-4
// of the second operand (indices 16-20).
// pm52: result lanes 0-4 come from lanes 8-12 of the second operand (indices 24-28),
// lanes 5-9 from lanes 8-12 of the first operand.
// Result lanes 10-15 are never stored (the stores below use mask 1023).
__m512i pm51 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack251 = _mm512_permutex2var_ps(dat880, pm51, dat882);
__m512i pm52 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack252 = _mm512_permutex2var_ps(dat880, pm52, dat882);
__m512 pack253 = _mm512_permutex2var_ps(dat881, pm51, dat883);
__m512 pack254 = _mm512_permutex2var_ps(dat881, pm52, dat883);
// Clamp negative values to zero (max with 0).
pack251 = _mm512_max_ps(_mm512_setzero_ps(), pack251);
pack252 = _mm512_max_ps(_mm512_setzero_ps(), pack252);
pack253 = _mm512_max_ps(_mm512_setzero_ps(), pack253);
pack254 = _mm512_max_ps(_mm512_setzero_ps(), pack254);
// Masked stores write lanes 0-9 only (1023 = 0x3FF). The t31 term has coefficient 0, so it
// contributes nothing. pack253/pack254 go 448 past pack251/pack252, which equals the toH17
// coefficient; the pm52 results go 50240 past the pm51 results.
_mm512_mask_storeu_ps(datPtr2+1820+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack251);
_mm512_mask_storeu_ps(datPtr2+52060+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack252);
_mm512_mask_storeu_ps(datPtr2+2268+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack253);
_mm512_mask_storeu_ps(datPtr2+52508+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+0*t31, 1023, pack254);
// Unrolled step with t32 fixed at 0: load sixteen 16-float vectors from sfPtr3.
// They form four groups whose base offsets (512, 2167296, 4334080, 6500864) are 2166784 apart;
// inside each group the order is Re, Im, Re, Im at a spacing of 64.
// Offsets are in units of sfPtr3's pointee type, which is declared outside this view.
// NOTE(review): sfRe*/sfIm* presumably hold real/imaginary parts of spectral data - confirm
// against the code that fills sfPtr3.
ptrdiff_t t32 = 0;
// Group 0 (base offset 512).
__m512 sfRe297 = _mm512_loadu_ps(sfPtr3+512+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm297 = _mm512_loadu_ps(sfPtr3+576+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe301 = _mm512_loadu_ps(sfPtr3+640+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm301 = _mm512_loadu_ps(sfPtr3+704+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
// Group 1 (base offset 2167296).
__m512 sfRe298 = _mm512_loadu_ps(sfPtr3+2167296+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm298 = _mm512_loadu_ps(sfPtr3+2167360+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe302 = _mm512_loadu_ps(sfPtr3+2167424+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm302 = _mm512_loadu_ps(sfPtr3+2167488+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
// Group 2 (base offset 4334080).
__m512 sfRe299 = _mm512_loadu_ps(sfPtr3+4334080+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm299 = _mm512_loadu_ps(sfPtr3+4334144+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe303 = _mm512_loadu_ps(sfPtr3+4334208+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm303 = _mm512_loadu_ps(sfPtr3+4334272+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
// Group 3 (base offset 6500864).
__m512 sfRe300 = _mm512_loadu_ps(sfPtr3+6500864+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm300 = _mm512_loadu_ps(sfPtr3+6500928+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfRe304 = _mm512_loadu_ps(sfPtr3+6500992+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
__m512 sfIm304 = _mm512_loadu_ps(sfPtr3+6501056+8667136*i9+24576*j5+1536*k41+768*r18+256*t32);
// Arithmetic for the t32 step. NOTE(review): presumably an inverse transform, going by the
// ifft* names - confirm against the generator. Every operation appears twice, once for the
// set sfRe297..sfIm300 and once for the set sfRe301..sfIm304; the two sets never mix here.
//
// Stage 1: only the first Re/Im pair of each set is permuted and recombined; the other three
// pairs enter stage 2 unchanged. Mask 65021 (0xFDFD) excludes lanes 1 and 9, which therefore
// keep the first operand of the masked fmadd/fnmadd as-is.
__m512i ifft5281 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5282 = _mm512_permutexvar_ps(ifft5281, sfRe297);
__m512 ifft5373 = _mm512_permutexvar_ps(ifft5281, sfRe301);
__m512i ifft5283 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5284 = _mm512_permutexvar_ps(ifft5283, sfRe297);
__m512 ifft5374 = _mm512_permutexvar_ps(ifft5283, sfRe301);
__m512 ifft5285 = _mm512_permutexvar_ps(ifft5281, sfIm297);
__m512 ifft5375 = _mm512_permutexvar_ps(ifft5281, sfIm301);
__m512 ifft5286 = _mm512_permutexvar_ps(ifft5283, sfIm297);
__m512 ifft5376 = _mm512_permutexvar_ps(ifft5283, sfIm301);
__m512 ifft5287 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5288 = _mm512_mask_fmadd_ps(ifft5286, 65021, ifft5287, ifft5282);
__m512 ifft5377 = _mm512_mask_fmadd_ps(ifft5376, 65021, ifft5287, ifft5373);
__m512 ifft5289 = _mm512_mask_fnmadd_ps(ifft5285, 65021, ifft5287, ifft5284);
__m512 ifft5378 = _mm512_mask_fnmadd_ps(ifft5375, 65021, ifft5287, ifft5374);
// Stage 2: butterfly over adjacent lanes. shuffle_ps with 177 swaps neighbours (0<->1, 2<->3)
// and the sign vector is +1 on even lanes, -1 on odd lanes, so even lanes receive the pair's
// sum and odd lanes receive (even - odd).
__m512 ifft5290 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5291 = _mm512_fmadd_ps(ifft5288, ifft5290, _mm512_shuffle_ps(ifft5288, ifft5288, 177));
__m512 ifft5379 = _mm512_fmadd_ps(ifft5377, ifft5290, _mm512_shuffle_ps(ifft5377, ifft5377, 177));
__m512 ifft5292 = _mm512_fmadd_ps(ifft5289, ifft5290, _mm512_shuffle_ps(ifft5289, ifft5289, 177));
__m512 ifft5380 = _mm512_fmadd_ps(ifft5378, ifft5290, _mm512_shuffle_ps(ifft5378, ifft5378, 177));
__m512 ifft5293 = _mm512_fmadd_ps(sfRe298, ifft5290, _mm512_shuffle_ps(sfRe298, sfRe298, 177));
__m512 ifft5381 = _mm512_fmadd_ps(sfRe302, ifft5290, _mm512_shuffle_ps(sfRe302, sfRe302, 177));
__m512 ifft5294 = _mm512_fmadd_ps(sfIm298, ifft5290, _mm512_shuffle_ps(sfIm298, sfIm298, 177));
__m512 ifft5382 = _mm512_fmadd_ps(sfIm302, ifft5290, _mm512_shuffle_ps(sfIm302, sfIm302, 177));
__m512 ifft5295 = _mm512_fmadd_ps(sfRe299, ifft5290, _mm512_shuffle_ps(sfRe299, sfRe299, 177));
__m512 ifft5383 = _mm512_fmadd_ps(sfRe303, ifft5290, _mm512_shuffle_ps(sfRe303, sfRe303, 177));
__m512 ifft5296 = _mm512_fmadd_ps(sfIm299, ifft5290, _mm512_shuffle_ps(sfIm299, sfIm299, 177));
__m512 ifft5384 = _mm512_fmadd_ps(sfIm303, ifft5290, _mm512_shuffle_ps(sfIm303, sfIm303, 177));
__m512 ifft5297 = _mm512_fmadd_ps(sfRe300, ifft5290, _mm512_shuffle_ps(sfRe300, sfRe300, 177));
__m512 ifft5385 = _mm512_fmadd_ps(sfRe304, ifft5290, _mm512_shuffle_ps(sfRe304, sfRe304, 177));
__m512 ifft5298 = _mm512_fmadd_ps(sfIm300, ifft5290, _mm512_shuffle_ps(sfIm300, sfIm300, 177));
__m512 ifft5386 = _mm512_fmadd_ps(sfIm304, ifft5290, _mm512_shuffle_ps(sfIm304, sfIm304, 177));
// Stage 3: per-lane complex multiply of each (Re-derived, Im-derived) pair by the factor
// (ifft5299 + i*ifft5308): re' = re*c - im*s, im' = im*c + re*s. The factor differs from 1
// only in lanes 3, 5, 7 (and 11, 13, 15) of each vector.
__m512 ifft5299 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5300 = _mm512_mul_ps(ifft5291, ifft5299);
__m512 ifft5387 = _mm512_mul_ps(ifft5379, ifft5299);
__m512 ifft5301 = _mm512_mul_ps(ifft5292, ifft5299);
__m512 ifft5388 = _mm512_mul_ps(ifft5380, ifft5299);
__m512 ifft5302 = _mm512_mul_ps(ifft5293, ifft5299);
__m512 ifft5389 = _mm512_mul_ps(ifft5381, ifft5299);
__m512 ifft5303 = _mm512_mul_ps(ifft5294, ifft5299);
__m512 ifft5390 = _mm512_mul_ps(ifft5382, ifft5299);
__m512 ifft5304 = _mm512_mul_ps(ifft5295, ifft5299);
__m512 ifft5391 = _mm512_mul_ps(ifft5383, ifft5299);
__m512 ifft5305 = _mm512_mul_ps(ifft5296, ifft5299);
__m512 ifft5392 = _mm512_mul_ps(ifft5384, ifft5299);
__m512 ifft5306 = _mm512_mul_ps(ifft5297, ifft5299);
__m512 ifft5393 = _mm512_mul_ps(ifft5385, ifft5299);
__m512 ifft5307 = _mm512_mul_ps(ifft5298, ifft5299);
__m512 ifft5394 = _mm512_mul_ps(ifft5386, ifft5299);
__m512 ifft5308 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5309 = _mm512_fnmadd_ps(ifft5292, ifft5308, ifft5300);
__m512 ifft5395 = _mm512_fnmadd_ps(ifft5380, ifft5308, ifft5387);
__m512 ifft5310 = _mm512_fmadd_ps(ifft5291, ifft5308, ifft5301);
__m512 ifft5396 = _mm512_fmadd_ps(ifft5379, ifft5308, ifft5388);
__m512 ifft5311 = _mm512_fnmadd_ps(ifft5294, ifft5308, ifft5302);
__m512 ifft5397 = _mm512_fnmadd_ps(ifft5382, ifft5308, ifft5389);
__m512 ifft5312 = _mm512_fmadd_ps(ifft5293, ifft5308, ifft5303);
__m512 ifft5398 = _mm512_fmadd_ps(ifft5381, ifft5308, ifft5390);
__m512 ifft5313 = _mm512_fnmadd_ps(ifft5296, ifft5308, ifft5304);
__m512 ifft5399 = _mm512_fnmadd_ps(ifft5384, ifft5308, ifft5391);
__m512 ifft5314 = _mm512_fmadd_ps(ifft5295, ifft5308, ifft5305);
__m512 ifft5400 = _mm512_fmadd_ps(ifft5383, ifft5308, ifft5392);
__m512 ifft5315 = _mm512_fnmadd_ps(ifft5298, ifft5308, ifft5306);
__m512 ifft5401 = _mm512_fnmadd_ps(ifft5386, ifft5308, ifft5393);
__m512 ifft5316 = _mm512_fmadd_ps(ifft5297, ifft5308, ifft5307);
__m512 ifft5402 = _mm512_fmadd_ps(ifft5385, ifft5308, ifft5394);
// Stage 4: butterfly at lane distance 2. shuffle_ps with 78 swaps the two halves of every
// 128-bit group; signs are +1 on lanes 0-1 and -1 on lanes 2-3 of each group.
__m512 ifft5317 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5318 = _mm512_fmadd_ps(ifft5309, ifft5317, _mm512_shuffle_ps(ifft5309, ifft5309, 78));
__m512 ifft5403 = _mm512_fmadd_ps(ifft5395, ifft5317, _mm512_shuffle_ps(ifft5395, ifft5395, 78));
__m512 ifft5319 = _mm512_fmadd_ps(ifft5310, ifft5317, _mm512_shuffle_ps(ifft5310, ifft5310, 78));
__m512 ifft5404 = _mm512_fmadd_ps(ifft5396, ifft5317, _mm512_shuffle_ps(ifft5396, ifft5396, 78));
__m512 ifft5320 = _mm512_fmadd_ps(ifft5311, ifft5317, _mm512_shuffle_ps(ifft5311, ifft5311, 78));
__m512 ifft5405 = _mm512_fmadd_ps(ifft5397, ifft5317, _mm512_shuffle_ps(ifft5397, ifft5397, 78));
__m512 ifft5321 = _mm512_fmadd_ps(ifft5312, ifft5317, _mm512_shuffle_ps(ifft5312, ifft5312, 78));
__m512 ifft5406 = _mm512_fmadd_ps(ifft5398, ifft5317, _mm512_shuffle_ps(ifft5398, ifft5398, 78));
__m512 ifft5322 = _mm512_fmadd_ps(ifft5313, ifft5317, _mm512_shuffle_ps(ifft5313, ifft5313, 78));
__m512 ifft5407 = _mm512_fmadd_ps(ifft5399, ifft5317, _mm512_shuffle_ps(ifft5399, ifft5399, 78));
__m512 ifft5323 = _mm512_fmadd_ps(ifft5314, ifft5317, _mm512_shuffle_ps(ifft5314, ifft5314, 78));
__m512 ifft5408 = _mm512_fmadd_ps(ifft5400, ifft5317, _mm512_shuffle_ps(ifft5400, ifft5400, 78));
__m512 ifft5324 = _mm512_fmadd_ps(ifft5315, ifft5317, _mm512_shuffle_ps(ifft5315, ifft5315, 78));
__m512 ifft5409 = _mm512_fmadd_ps(ifft5401, ifft5317, _mm512_shuffle_ps(ifft5401, ifft5401, 78));
__m512 ifft5325 = _mm512_fmadd_ps(ifft5316, ifft5317, _mm512_shuffle_ps(ifft5316, ifft5316, 78));
__m512 ifft5410 = _mm512_fmadd_ps(ifft5402, ifft5317, _mm512_shuffle_ps(ifft5402, ifft5402, 78));
// Stage 5: in lanes 6, 7, 14, 15 (mask 49344 = 0xC0C0) replace the pair (a, b) by (-b, a),
// which is a multiplication by i; all other lanes pass through unchanged.
__m512 ifft5326 = _mm512_mask_sub_ps(ifft5318, 49344, _mm512_setzero_ps(), ifft5319);
__m512 ifft5411 = _mm512_mask_sub_ps(ifft5403, 49344, _mm512_setzero_ps(), ifft5404);
__m512 ifft5327 = _mm512_mask_mov_ps(ifft5319, 49344, ifft5318);
__m512 ifft5412 = _mm512_mask_mov_ps(ifft5404, 49344, ifft5403);
__m512 ifft5328 = _mm512_mask_sub_ps(ifft5320, 49344, _mm512_setzero_ps(), ifft5321);
__m512 ifft5413 = _mm512_mask_sub_ps(ifft5405, 49344, _mm512_setzero_ps(), ifft5406);
__m512 ifft5329 = _mm512_mask_mov_ps(ifft5321, 49344, ifft5320);
__m512 ifft5414 = _mm512_mask_mov_ps(ifft5406, 49344, ifft5405);
__m512 ifft5330 = _mm512_mask_sub_ps(ifft5322, 49344, _mm512_setzero_ps(), ifft5323);
__m512 ifft5415 = _mm512_mask_sub_ps(ifft5407, 49344, _mm512_setzero_ps(), ifft5408);
__m512 ifft5331 = _mm512_mask_mov_ps(ifft5323, 49344, ifft5322);
__m512 ifft5416 = _mm512_mask_mov_ps(ifft5408, 49344, ifft5407);
__m512 ifft5332 = _mm512_mask_sub_ps(ifft5324, 49344, _mm512_setzero_ps(), ifft5325);
__m512 ifft5417 = _mm512_mask_sub_ps(ifft5409, 49344, _mm512_setzero_ps(), ifft5410);
__m512 ifft5333 = _mm512_mask_mov_ps(ifft5325, 49344, ifft5324);
__m512 ifft5418 = _mm512_mask_mov_ps(ifft5410, 49344, ifft5409);
// Stage 6: butterfly at lane distance 4. shuffle_f32x4 with 177 swaps neighbouring 128-bit
// groups; signs are +1 on lanes 0-3 / 8-11 and -1 on lanes 4-7 / 12-15.
// ifft5340 and ifft5424 use fnmsub, i.e. the negated result, unlike every other line here.
__m512 ifft5334 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5335 = _mm512_fmadd_ps(ifft5326, ifft5334, _mm512_shuffle_f32x4(ifft5326, ifft5326, 177));
__m512 ifft5419 = _mm512_fmadd_ps(ifft5411, ifft5334, _mm512_shuffle_f32x4(ifft5411, ifft5411, 177));
__m512 ifft5336 = _mm512_fmadd_ps(ifft5327, ifft5334, _mm512_shuffle_f32x4(ifft5327, ifft5327, 177));
__m512 ifft5420 = _mm512_fmadd_ps(ifft5412, ifft5334, _mm512_shuffle_f32x4(ifft5412, ifft5412, 177));
__m512 ifft5337 = _mm512_fmadd_ps(ifft5328, ifft5334, _mm512_shuffle_f32x4(ifft5328, ifft5328, 177));
__m512 ifft5421 = _mm512_fmadd_ps(ifft5413, ifft5334, _mm512_shuffle_f32x4(ifft5413, ifft5413, 177));
__m512 ifft5338 = _mm512_fmadd_ps(ifft5329, ifft5334, _mm512_shuffle_f32x4(ifft5329, ifft5329, 177));
__m512 ifft5422 = _mm512_fmadd_ps(ifft5414, ifft5334, _mm512_shuffle_f32x4(ifft5414, ifft5414, 177));
__m512 ifft5339 = _mm512_fmadd_ps(ifft5330, ifft5334, _mm512_shuffle_f32x4(ifft5330, ifft5330, 177));
__m512 ifft5423 = _mm512_fmadd_ps(ifft5415, ifft5334, _mm512_shuffle_f32x4(ifft5415, ifft5415, 177));
__m512 ifft5340 = _mm512_fnmsub_ps(ifft5331, ifft5334, _mm512_shuffle_f32x4(ifft5331, ifft5331, 177));
__m512 ifft5424 = _mm512_fnmsub_ps(ifft5416, ifft5334, _mm512_shuffle_f32x4(ifft5416, ifft5416, 177));
__m512 ifft5341 = _mm512_fmadd_ps(ifft5332, ifft5334, _mm512_shuffle_f32x4(ifft5332, ifft5332, 177));
__m512 ifft5425 = _mm512_fmadd_ps(ifft5417, ifft5334, _mm512_shuffle_f32x4(ifft5417, ifft5417, 177));
__m512 ifft5342 = _mm512_fmadd_ps(ifft5333, ifft5334, _mm512_shuffle_f32x4(ifft5333, ifft5333, 177));
__m512 ifft5426 = _mm512_fmadd_ps(ifft5418, ifft5334, _mm512_shuffle_f32x4(ifft5418, ifft5418, 177));
// Stage 7: combine the eight vectors of each set with one another (no more lane shuffles).
// Scale factors are folded in here: 1.5625e-02 is 1/64 and 3.125e-02 is 1/32.
__m512 ifft5343 = _mm512_add_ps(ifft5335, ifft5336);
__m512 ifft5427 = _mm512_add_ps(ifft5419, ifft5420);
__m512 ifft5344 = _mm512_sub_ps(ifft5335, ifft5336);
__m512 ifft5428 = _mm512_sub_ps(ifft5419, ifft5420);
__m512 ifft5345 = _mm512_sub_ps(ifft5337, ifft5341);
__m512 ifft5429 = _mm512_sub_ps(ifft5421, ifft5425);
__m512 ifft5346 = _mm512_add_ps(ifft5338, ifft5342);
__m512 ifft5430 = _mm512_add_ps(ifft5422, ifft5426);
__m512 ifft5347 = _mm512_add_ps(ifft5337, ifft5341);
__m512 ifft5431 = _mm512_add_ps(ifft5421, ifft5425);
__m512 ifft5348 = _mm512_sub_ps(ifft5338, ifft5342);
__m512 ifft5432 = _mm512_sub_ps(ifft5422, ifft5426);
__m512 ifft5349 = _mm512_mul_ps(ifft5339, _mm512_set1_ps(3.125e-02f));
__m512 ifft5433 = _mm512_mul_ps(ifft5423, _mm512_set1_ps(3.125e-02f));
__m512 ifft5350 = _mm512_mul_ps(ifft5340, _mm512_set1_ps(3.125e-02f));
__m512 ifft5434 = _mm512_mul_ps(ifft5424, _mm512_set1_ps(3.125e-02f));
__m512 ifft5351 = _mm512_fmadd_ps(ifft5343, _mm512_set1_ps(1.5625e-02f), ifft5349);
__m512 ifft5435 = _mm512_fmadd_ps(ifft5427, _mm512_set1_ps(1.5625e-02f), ifft5433);
__m512 ifft5352 = _mm512_fmsub_ps(ifft5343, _mm512_set1_ps(1.5625e-02f), ifft5349);
__m512 ifft5436 = _mm512_fmsub_ps(ifft5427, _mm512_set1_ps(1.5625e-02f), ifft5433);
__m512 ifft5353 = _mm512_fmadd_ps(ifft5344, _mm512_set1_ps(1.5625e-02f), ifft5350);
__m512 ifft5437 = _mm512_fmadd_ps(ifft5428, _mm512_set1_ps(1.5625e-02f), ifft5434);
__m512 ifft5354 = _mm512_fmsub_ps(ifft5344, _mm512_set1_ps(1.5625e-02f), ifft5350);
__m512 ifft5438 = _mm512_fmsub_ps(ifft5428, _mm512_set1_ps(1.5625e-02f), ifft5434);
__m512 ifft5355 = _mm512_add_ps(ifft5345, ifft5346);
__m512 ifft5439 = _mm512_add_ps(ifft5429, ifft5430);
__m512 ifft5356 = _mm512_sub_ps(ifft5345, ifft5346);
__m512 ifft5440 = _mm512_sub_ps(ifft5429, ifft5430);
__m512 ifft5357 = _mm512_fnmadd_ps(ifft5355, _mm512_set1_ps(7.0710677e-01f), ifft5347);
__m512 ifft5441 = _mm512_fnmadd_ps(ifft5439, _mm512_set1_ps(7.0710677e-01f), ifft5431);
__m512 ifft5358 = _mm512_fmadd_ps(ifft5355, _mm512_set1_ps(7.0710677e-01f), ifft5347);
__m512 ifft5442 = _mm512_fmadd_ps(ifft5439, _mm512_set1_ps(7.0710677e-01f), ifft5431);
__m512 ifft5359 = _mm512_fmadd_ps(ifft5356, _mm512_set1_ps(7.0710677e-01f), ifft5348);
__m512 ifft5443 = _mm512_fmadd_ps(ifft5440, _mm512_set1_ps(7.0710677e-01f), ifft5432);
__m512 ifft5360 = _mm512_fmsub_ps(ifft5356, _mm512_set1_ps(7.0710677e-01f), ifft5348);
__m512 ifft5444 = _mm512_fmsub_ps(ifft5440, _mm512_set1_ps(7.0710677e-01f), ifft5432);
__m512 ifft5361 = _mm512_add_ps(ifft5357, ifft5358);
__m512 ifft5445 = _mm512_add_ps(ifft5441, ifft5442);
__m512 ifft5362 = _mm512_sub_ps(ifft5357, ifft5358);
__m512 ifft5446 = _mm512_sub_ps(ifft5441, ifft5442);
__m512 ifft5363 = _mm512_add_ps(ifft5359, ifft5360);
__m512 ifft5447 = _mm512_add_ps(ifft5443, ifft5444);
__m512 ifft5364 = _mm512_sub_ps(ifft5359, ifft5360);
__m512 ifft5448 = _mm512_sub_ps(ifft5443, ifft5444);
// Final outputs: eight vectors per set. The statements after this span consume only
// ifft5365, ifft5367, ifft5449 and ifft5451; the rest are explicitly discarded there.
__m512 ifft5365 = _mm512_fmadd_ps(ifft5361, _mm512_set1_ps(1.5625e-02f), ifft5351);
__m512 ifft5449 = _mm512_fmadd_ps(ifft5445, _mm512_set1_ps(1.5625e-02f), ifft5435);
__m512 ifft5366 = _mm512_fnmadd_ps(ifft5361, _mm512_set1_ps(1.5625e-02f), ifft5351);
__m512 ifft5450 = _mm512_fnmadd_ps(ifft5445, _mm512_set1_ps(1.5625e-02f), ifft5435);
__m512 ifft5367 = _mm512_fmadd_ps(ifft5363, _mm512_set1_ps(1.5625e-02f), ifft5353);
__m512 ifft5451 = _mm512_fmadd_ps(ifft5447, _mm512_set1_ps(1.5625e-02f), ifft5437);
__m512 ifft5368 = _mm512_fnmadd_ps(ifft5363, _mm512_set1_ps(1.5625e-02f), ifft5353);
__m512 ifft5452 = _mm512_fnmadd_ps(ifft5447, _mm512_set1_ps(1.5625e-02f), ifft5437);
__m512 ifft5369 = _mm512_fnmadd_ps(ifft5364, _mm512_set1_ps(1.5625e-02f), ifft5352);
__m512 ifft5453 = _mm512_fnmadd_ps(ifft5448, _mm512_set1_ps(1.5625e-02f), ifft5436);
__m512 ifft5370 = _mm512_fmadd_ps(ifft5364, _mm512_set1_ps(1.5625e-02f), ifft5352);
__m512 ifft5454 = _mm512_fmadd_ps(ifft5448, _mm512_set1_ps(1.5625e-02f), ifft5436);
__m512 ifft5371 = _mm512_fmadd_ps(ifft5362, _mm512_set1_ps(1.5625e-02f), ifft5354);
__m512 ifft5455 = _mm512_fmadd_ps(ifft5446, _mm512_set1_ps(1.5625e-02f), ifft5438);
__m512 ifft5372 = _mm512_fnmadd_ps(ifft5362, _mm512_set1_ps(1.5625e-02f), ifft5354);
__m512 ifft5456 = _mm512_fnmadd_ps(ifft5446, _mm512_set1_ps(1.5625e-02f), ifft5438);
// Epilogue of the t32 step: keep four of the sixteen result vectors, pack, clamp, store.
__m512 dat884 = ifft5365;
__m512 dat886 = ifft5449;
__m512 dat885 = ifft5367;
__m512 dat887 = ifft5451;
// The other twelve results are not needed in this step; the (void) casts mark them as
// intentionally unused.
(void)ifft5369;
(void)ifft5453;
(void)ifft5371;
(void)ifft5455;
(void)ifft5366;
(void)ifft5450;
(void)ifft5368;
(void)ifft5452;
(void)ifft5370;
(void)ifft5454;
(void)ifft5372;
(void)ifft5456;
// pm53: result lanes 0-4 come from lanes 0-4 of the first operand, lanes 5-9 from lanes 0-4
// of the second operand (indices 16-20).
// pm54: result lanes 0-4 come from lanes 8-12 of the second operand (indices 24-28),
// lanes 5-9 from lanes 8-12 of the first operand.
// Result lanes 10-15 are never stored (the stores below use mask 1023).
__m512i pm53 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack255 = _mm512_permutex2var_ps(dat884, pm53, dat886);
__m512i pm54 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack256 = _mm512_permutex2var_ps(dat884, pm54, dat886);
__m512 pack257 = _mm512_permutex2var_ps(dat885, pm53, dat887);
__m512 pack258 = _mm512_permutex2var_ps(dat885, pm54, dat887);
// Clamp negative values to zero (max with 0).
pack255 = _mm512_max_ps(_mm512_setzero_ps(), pack255);
pack256 = _mm512_max_ps(_mm512_setzero_ps(), pack256);
pack257 = _mm512_max_ps(_mm512_setzero_ps(), pack257);
pack258 = _mm512_max_ps(_mm512_setzero_ps(), pack258);
// Masked stores write lanes 0-9 only (1023 = 0x3FF). t32 is 0 in this step, so the 40*t32
// term contributes nothing. pack257/pack258 go 448 past pack255/pack256, which equals the
// toH17 coefficient; the pm54 results go 50240 past the pm53 results.
_mm512_mask_storeu_ps(datPtr2+1860+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack255);
_mm512_mask_storeu_ps(datPtr2+52100+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack256);
_mm512_mask_storeu_ps(datPtr2+2308+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack257);
_mm512_mask_storeu_ps(datPtr2+52548+3215360*i9+200960*k41+100480*r18+448*toH17+4*toW17+40*t32, 1023, pack258);
}
}
if (j5 >= last2) return;
++j5;
rel6 = 1;
}
if (rel6 < 4) {
ptrdiff_t toH18 = base6+5;
ptrdiff_t toW18 = -10+30*rel6;
ptrdiff_t jj18 = 3-rel6+j5;
for (; j5 <= jj18; toW18 += 30) {
ptrdiff_t k42 = 16*w21;
for (; k42 != 16; ++k42) {
ptrdiff_t r19 = 0;
for (; r19 != 2; ++r19) {
ptrdiff_t t33 = 0;
for (; t33 < 3; ++t33) {
__m512 sfRe305 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm305 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe309 = _mm512_loadu_ps(sfPtr3+128+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm309 = _mm512_loadu_ps(sfPtr3+192+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe306 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm306 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe310 = _mm512_loadu_ps(sfPtr3+2166912+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm310 = _mm512_loadu_ps(sfPtr3+2166976+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe307 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm307 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe311 = _mm512_loadu_ps(sfPtr3+4333696+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm311 = _mm512_loadu_ps(sfPtr3+4333760+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe308 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm308 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfRe312 = _mm512_loadu_ps(sfPtr3+6500480+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512 sfIm312 = _mm512_loadu_ps(sfPtr3+6500544+8667136*i9+24576*j5+1536*k42+768*r19+256*t33);
__m512i ifft5457 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5458 = _mm512_permutexvar_ps(ifft5457, sfRe305);
__m512 ifft5549 = _mm512_permutexvar_ps(ifft5457, sfRe309);
__m512i ifft5459 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5460 = _mm512_permutexvar_ps(ifft5459, sfRe305);
__m512 ifft5550 = _mm512_permutexvar_ps(ifft5459, sfRe309);
__m512 ifft5461 = _mm512_permutexvar_ps(ifft5457, sfIm305);
__m512 ifft5551 = _mm512_permutexvar_ps(ifft5457, sfIm309);
__m512 ifft5462 = _mm512_permutexvar_ps(ifft5459, sfIm305);
__m512 ifft5552 = _mm512_permutexvar_ps(ifft5459, sfIm309);
__m512 ifft5463 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5464 = _mm512_mask_fmadd_ps(ifft5462, 65021, ifft5463, ifft5458);
__m512 ifft5553 = _mm512_mask_fmadd_ps(ifft5552, 65021, ifft5463, ifft5549);
__m512 ifft5465 = _mm512_mask_fnmadd_ps(ifft5461, 65021, ifft5463, ifft5460);
__m512 ifft5554 = _mm512_mask_fnmadd_ps(ifft5551, 65021, ifft5463, ifft5550);
__m512 ifft5466 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5467 = _mm512_fmadd_ps(ifft5464, ifft5466, _mm512_shuffle_ps(ifft5464, ifft5464, 177));
__m512 ifft5555 = _mm512_fmadd_ps(ifft5553, ifft5466, _mm512_shuffle_ps(ifft5553, ifft5553, 177));
__m512 ifft5468 = _mm512_fmadd_ps(ifft5465, ifft5466, _mm512_shuffle_ps(ifft5465, ifft5465, 177));
__m512 ifft5556 = _mm512_fmadd_ps(ifft5554, ifft5466, _mm512_shuffle_ps(ifft5554, ifft5554, 177));
__m512 ifft5469 = _mm512_fmadd_ps(sfRe306, ifft5466, _mm512_shuffle_ps(sfRe306, sfRe306, 177));
__m512 ifft5557 = _mm512_fmadd_ps(sfRe310, ifft5466, _mm512_shuffle_ps(sfRe310, sfRe310, 177));
__m512 ifft5470 = _mm512_fmadd_ps(sfIm306, ifft5466, _mm512_shuffle_ps(sfIm306, sfIm306, 177));
__m512 ifft5558 = _mm512_fmadd_ps(sfIm310, ifft5466, _mm512_shuffle_ps(sfIm310, sfIm310, 177));
__m512 ifft5471 = _mm512_fmadd_ps(sfRe307, ifft5466, _mm512_shuffle_ps(sfRe307, sfRe307, 177));
__m512 ifft5559 = _mm512_fmadd_ps(sfRe311, ifft5466, _mm512_shuffle_ps(sfRe311, sfRe311, 177));
__m512 ifft5472 = _mm512_fmadd_ps(sfIm307, ifft5466, _mm512_shuffle_ps(sfIm307, sfIm307, 177));
__m512 ifft5560 = _mm512_fmadd_ps(sfIm311, ifft5466, _mm512_shuffle_ps(sfIm311, sfIm311, 177));
__m512 ifft5473 = _mm512_fmadd_ps(sfRe308, ifft5466, _mm512_shuffle_ps(sfRe308, sfRe308, 177));
__m512 ifft5561 = _mm512_fmadd_ps(sfRe312, ifft5466, _mm512_shuffle_ps(sfRe312, sfRe312, 177));
__m512 ifft5474 = _mm512_fmadd_ps(sfIm308, ifft5466, _mm512_shuffle_ps(sfIm308, sfIm308, 177));
__m512 ifft5562 = _mm512_fmadd_ps(sfIm312, ifft5466, _mm512_shuffle_ps(sfIm312, sfIm312, 177));
__m512 ifft5475 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5476 = _mm512_mul_ps(ifft5467, ifft5475);
__m512 ifft5563 = _mm512_mul_ps(ifft5555, ifft5475);
__m512 ifft5477 = _mm512_mul_ps(ifft5468, ifft5475);
__m512 ifft5564 = _mm512_mul_ps(ifft5556, ifft5475);
__m512 ifft5478 = _mm512_mul_ps(ifft5469, ifft5475);
__m512 ifft5565 = _mm512_mul_ps(ifft5557, ifft5475);
__m512 ifft5479 = _mm512_mul_ps(ifft5470, ifft5475);
__m512 ifft5566 = _mm512_mul_ps(ifft5558, ifft5475);
__m512 ifft5480 = _mm512_mul_ps(ifft5471, ifft5475);
__m512 ifft5567 = _mm512_mul_ps(ifft5559, ifft5475);
__m512 ifft5481 = _mm512_mul_ps(ifft5472, ifft5475);
__m512 ifft5568 = _mm512_mul_ps(ifft5560, ifft5475);
__m512 ifft5482 = _mm512_mul_ps(ifft5473, ifft5475);
__m512 ifft5569 = _mm512_mul_ps(ifft5561, ifft5475);
__m512 ifft5483 = _mm512_mul_ps(ifft5474, ifft5475);
__m512 ifft5570 = _mm512_mul_ps(ifft5562, ifft5475);
__m512 ifft5484 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5485 = _mm512_fnmadd_ps(ifft5468, ifft5484, ifft5476);
__m512 ifft5571 = _mm512_fnmadd_ps(ifft5556, ifft5484, ifft5563);
__m512 ifft5486 = _mm512_fmadd_ps(ifft5467, ifft5484, ifft5477);
__m512 ifft5572 = _mm512_fmadd_ps(ifft5555, ifft5484, ifft5564);
__m512 ifft5487 = _mm512_fnmadd_ps(ifft5470, ifft5484, ifft5478);
__m512 ifft5573 = _mm512_fnmadd_ps(ifft5558, ifft5484, ifft5565);
__m512 ifft5488 = _mm512_fmadd_ps(ifft5469, ifft5484, ifft5479);
__m512 ifft5574 = _mm512_fmadd_ps(ifft5557, ifft5484, ifft5566);
__m512 ifft5489 = _mm512_fnmadd_ps(ifft5472, ifft5484, ifft5480);
__m512 ifft5575 = _mm512_fnmadd_ps(ifft5560, ifft5484, ifft5567);
__m512 ifft5490 = _mm512_fmadd_ps(ifft5471, ifft5484, ifft5481);
__m512 ifft5576 = _mm512_fmadd_ps(ifft5559, ifft5484, ifft5568);
__m512 ifft5491 = _mm512_fnmadd_ps(ifft5474, ifft5484, ifft5482);
__m512 ifft5577 = _mm512_fnmadd_ps(ifft5562, ifft5484, ifft5569);
__m512 ifft5492 = _mm512_fmadd_ps(ifft5473, ifft5484, ifft5483);
__m512 ifft5578 = _mm512_fmadd_ps(ifft5561, ifft5484, ifft5570);
__m512 ifft5493 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5494 = _mm512_fmadd_ps(ifft5485, ifft5493, _mm512_shuffle_ps(ifft5485, ifft5485, 78));
__m512 ifft5579 = _mm512_fmadd_ps(ifft5571, ifft5493, _mm512_shuffle_ps(ifft5571, ifft5571, 78));
__m512 ifft5495 = _mm512_fmadd_ps(ifft5486, ifft5493, _mm512_shuffle_ps(ifft5486, ifft5486, 78));
__m512 ifft5580 = _mm512_fmadd_ps(ifft5572, ifft5493, _mm512_shuffle_ps(ifft5572, ifft5572, 78));
__m512 ifft5496 = _mm512_fmadd_ps(ifft5487, ifft5493, _mm512_shuffle_ps(ifft5487, ifft5487, 78));
__m512 ifft5581 = _mm512_fmadd_ps(ifft5573, ifft5493, _mm512_shuffle_ps(ifft5573, ifft5573, 78));
__m512 ifft5497 = _mm512_fmadd_ps(ifft5488, ifft5493, _mm512_shuffle_ps(ifft5488, ifft5488, 78));
__m512 ifft5582 = _mm512_fmadd_ps(ifft5574, ifft5493, _mm512_shuffle_ps(ifft5574, ifft5574, 78));
__m512 ifft5498 = _mm512_fmadd_ps(ifft5489, ifft5493, _mm512_shuffle_ps(ifft5489, ifft5489, 78));
__m512 ifft5583 = _mm512_fmadd_ps(ifft5575, ifft5493, _mm512_shuffle_ps(ifft5575, ifft5575, 78));
__m512 ifft5499 = _mm512_fmadd_ps(ifft5490, ifft5493, _mm512_shuffle_ps(ifft5490, ifft5490, 78));
__m512 ifft5584 = _mm512_fmadd_ps(ifft5576, ifft5493, _mm512_shuffle_ps(ifft5576, ifft5576, 78));
__m512 ifft5500 = _mm512_fmadd_ps(ifft5491, ifft5493, _mm512_shuffle_ps(ifft5491, ifft5491, 78));
__m512 ifft5585 = _mm512_fmadd_ps(ifft5577, ifft5493, _mm512_shuffle_ps(ifft5577, ifft5577, 78));
__m512 ifft5501 = _mm512_fmadd_ps(ifft5492, ifft5493, _mm512_shuffle_ps(ifft5492, ifft5492, 78));
__m512 ifft5586 = _mm512_fmadd_ps(ifft5578, ifft5493, _mm512_shuffle_ps(ifft5578, ifft5578, 78));
__m512 ifft5502 = _mm512_mask_sub_ps(ifft5494, 49344, _mm512_setzero_ps(), ifft5495);
__m512 ifft5587 = _mm512_mask_sub_ps(ifft5579, 49344, _mm512_setzero_ps(), ifft5580);
__m512 ifft5503 = _mm512_mask_mov_ps(ifft5495, 49344, ifft5494);
__m512 ifft5588 = _mm512_mask_mov_ps(ifft5580, 49344, ifft5579);
__m512 ifft5504 = _mm512_mask_sub_ps(ifft5496, 49344, _mm512_setzero_ps(), ifft5497);
__m512 ifft5589 = _mm512_mask_sub_ps(ifft5581, 49344, _mm512_setzero_ps(), ifft5582);
__m512 ifft5505 = _mm512_mask_mov_ps(ifft5497, 49344, ifft5496);
__m512 ifft5590 = _mm512_mask_mov_ps(ifft5582, 49344, ifft5581);
__m512 ifft5506 = _mm512_mask_sub_ps(ifft5498, 49344, _mm512_setzero_ps(), ifft5499);
__m512 ifft5591 = _mm512_mask_sub_ps(ifft5583, 49344, _mm512_setzero_ps(), ifft5584);
__m512 ifft5507 = _mm512_mask_mov_ps(ifft5499, 49344, ifft5498);
__m512 ifft5592 = _mm512_mask_mov_ps(ifft5584, 49344, ifft5583);
__m512 ifft5508 = _mm512_mask_sub_ps(ifft5500, 49344, _mm512_setzero_ps(), ifft5501);
__m512 ifft5593 = _mm512_mask_sub_ps(ifft5585, 49344, _mm512_setzero_ps(), ifft5586);
__m512 ifft5509 = _mm512_mask_mov_ps(ifft5501, 49344, ifft5500);
__m512 ifft5594 = _mm512_mask_mov_ps(ifft5586, 49344, ifft5585);
__m512 ifft5510 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5511 = _mm512_fmadd_ps(ifft5502, ifft5510, _mm512_shuffle_f32x4(ifft5502, ifft5502, 177));
__m512 ifft5595 = _mm512_fmadd_ps(ifft5587, ifft5510, _mm512_shuffle_f32x4(ifft5587, ifft5587, 177));
__m512 ifft5512 = _mm512_fmadd_ps(ifft5503, ifft5510, _mm512_shuffle_f32x4(ifft5503, ifft5503, 177));
__m512 ifft5596 = _mm512_fmadd_ps(ifft5588, ifft5510, _mm512_shuffle_f32x4(ifft5588, ifft5588, 177));
__m512 ifft5513 = _mm512_fmadd_ps(ifft5504, ifft5510, _mm512_shuffle_f32x4(ifft5504, ifft5504, 177));
__m512 ifft5597 = _mm512_fmadd_ps(ifft5589, ifft5510, _mm512_shuffle_f32x4(ifft5589, ifft5589, 177));
__m512 ifft5514 = _mm512_fmadd_ps(ifft5505, ifft5510, _mm512_shuffle_f32x4(ifft5505, ifft5505, 177));
__m512 ifft5598 = _mm512_fmadd_ps(ifft5590, ifft5510, _mm512_shuffle_f32x4(ifft5590, ifft5590, 177));
__m512 ifft5515 = _mm512_fmadd_ps(ifft5506, ifft5510, _mm512_shuffle_f32x4(ifft5506, ifft5506, 177));
__m512 ifft5599 = _mm512_fmadd_ps(ifft5591, ifft5510, _mm512_shuffle_f32x4(ifft5591, ifft5591, 177));
__m512 ifft5516 = _mm512_fnmsub_ps(ifft5507, ifft5510, _mm512_shuffle_f32x4(ifft5507, ifft5507, 177));
__m512 ifft5600 = _mm512_fnmsub_ps(ifft5592, ifft5510, _mm512_shuffle_f32x4(ifft5592, ifft5592, 177));
__m512 ifft5517 = _mm512_fmadd_ps(ifft5508, ifft5510, _mm512_shuffle_f32x4(ifft5508, ifft5508, 177));
__m512 ifft5601 = _mm512_fmadd_ps(ifft5593, ifft5510, _mm512_shuffle_f32x4(ifft5593, ifft5593, 177));
__m512 ifft5518 = _mm512_fmadd_ps(ifft5509, ifft5510, _mm512_shuffle_f32x4(ifft5509, ifft5509, 177));
__m512 ifft5602 = _mm512_fmadd_ps(ifft5594, ifft5510, _mm512_shuffle_f32x4(ifft5594, ifft5594, 177));
__m512 ifft5519 = _mm512_add_ps(ifft5511, ifft5512);
__m512 ifft5603 = _mm512_add_ps(ifft5595, ifft5596);
__m512 ifft5520 = _mm512_sub_ps(ifft5511, ifft5512);
__m512 ifft5604 = _mm512_sub_ps(ifft5595, ifft5596);
__m512 ifft5521 = _mm512_sub_ps(ifft5513, ifft5517);
__m512 ifft5605 = _mm512_sub_ps(ifft5597, ifft5601);
__m512 ifft5522 = _mm512_add_ps(ifft5514, ifft5518);
__m512 ifft5606 = _mm512_add_ps(ifft5598, ifft5602);
__m512 ifft5523 = _mm512_add_ps(ifft5513, ifft5517);
__m512 ifft5607 = _mm512_add_ps(ifft5597, ifft5601);
__m512 ifft5524 = _mm512_sub_ps(ifft5514, ifft5518);
__m512 ifft5608 = _mm512_sub_ps(ifft5598, ifft5602);
__m512 ifft5525 = _mm512_mul_ps(ifft5515, _mm512_set1_ps(3.125e-02f));
__m512 ifft5609 = _mm512_mul_ps(ifft5599, _mm512_set1_ps(3.125e-02f));
__m512 ifft5526 = _mm512_mul_ps(ifft5516, _mm512_set1_ps(3.125e-02f));
__m512 ifft5610 = _mm512_mul_ps(ifft5600, _mm512_set1_ps(3.125e-02f));
__m512 ifft5527 = _mm512_fmadd_ps(ifft5519, _mm512_set1_ps(1.5625e-02f), ifft5525);
__m512 ifft5611 = _mm512_fmadd_ps(ifft5603, _mm512_set1_ps(1.5625e-02f), ifft5609);
__m512 ifft5528 = _mm512_fmsub_ps(ifft5519, _mm512_set1_ps(1.5625e-02f), ifft5525);
__m512 ifft5612 = _mm512_fmsub_ps(ifft5603, _mm512_set1_ps(1.5625e-02f), ifft5609);
__m512 ifft5529 = _mm512_fmadd_ps(ifft5520, _mm512_set1_ps(1.5625e-02f), ifft5526);
__m512 ifft5613 = _mm512_fmadd_ps(ifft5604, _mm512_set1_ps(1.5625e-02f), ifft5610);
__m512 ifft5530 = _mm512_fmsub_ps(ifft5520, _mm512_set1_ps(1.5625e-02f), ifft5526);
__m512 ifft5614 = _mm512_fmsub_ps(ifft5604, _mm512_set1_ps(1.5625e-02f), ifft5610);
__m512 ifft5531 = _mm512_add_ps(ifft5521, ifft5522);
__m512 ifft5615 = _mm512_add_ps(ifft5605, ifft5606);
__m512 ifft5532 = _mm512_sub_ps(ifft5521, ifft5522);
__m512 ifft5616 = _mm512_sub_ps(ifft5605, ifft5606);
__m512 ifft5533 = _mm512_fnmadd_ps(ifft5531, _mm512_set1_ps(7.0710677e-01f), ifft5523);
__m512 ifft5617 = _mm512_fnmadd_ps(ifft5615, _mm512_set1_ps(7.0710677e-01f), ifft5607);
__m512 ifft5534 = _mm512_fmadd_ps(ifft5531, _mm512_set1_ps(7.0710677e-01f), ifft5523);
__m512 ifft5618 = _mm512_fmadd_ps(ifft5615, _mm512_set1_ps(7.0710677e-01f), ifft5607);
__m512 ifft5535 = _mm512_fmadd_ps(ifft5532, _mm512_set1_ps(7.0710677e-01f), ifft5524);
__m512 ifft5619 = _mm512_fmadd_ps(ifft5616, _mm512_set1_ps(7.0710677e-01f), ifft5608);
__m512 ifft5536 = _mm512_fmsub_ps(ifft5532, _mm512_set1_ps(7.0710677e-01f), ifft5524);
__m512 ifft5620 = _mm512_fmsub_ps(ifft5616, _mm512_set1_ps(7.0710677e-01f), ifft5608);
__m512 ifft5537 = _mm512_add_ps(ifft5533, ifft5534);
__m512 ifft5621 = _mm512_add_ps(ifft5617, ifft5618);
__m512 ifft5538 = _mm512_sub_ps(ifft5533, ifft5534);
__m512 ifft5622 = _mm512_sub_ps(ifft5617, ifft5618);
__m512 ifft5539 = _mm512_add_ps(ifft5535, ifft5536);
__m512 ifft5623 = _mm512_add_ps(ifft5619, ifft5620);
__m512 ifft5540 = _mm512_sub_ps(ifft5535, ifft5536);
__m512 ifft5624 = _mm512_sub_ps(ifft5619, ifft5620);
__m512 ifft5541 = _mm512_fmadd_ps(ifft5537, _mm512_set1_ps(1.5625e-02f), ifft5527);
__m512 ifft5625 = _mm512_fmadd_ps(ifft5621, _mm512_set1_ps(1.5625e-02f), ifft5611);
__m512 ifft5542 = _mm512_fnmadd_ps(ifft5537, _mm512_set1_ps(1.5625e-02f), ifft5527);
__m512 ifft5626 = _mm512_fnmadd_ps(ifft5621, _mm512_set1_ps(1.5625e-02f), ifft5611);
__m512 ifft5543 = _mm512_fmadd_ps(ifft5539, _mm512_set1_ps(1.5625e-02f), ifft5529);
__m512 ifft5627 = _mm512_fmadd_ps(ifft5623, _mm512_set1_ps(1.5625e-02f), ifft5613);
__m512 ifft5544 = _mm512_fnmadd_ps(ifft5539, _mm512_set1_ps(1.5625e-02f), ifft5529);
__m512 ifft5628 = _mm512_fnmadd_ps(ifft5623, _mm512_set1_ps(1.5625e-02f), ifft5613);
__m512 ifft5545 = _mm512_fnmadd_ps(ifft5540, _mm512_set1_ps(1.5625e-02f), ifft5528);
__m512 ifft5629 = _mm512_fnmadd_ps(ifft5624, _mm512_set1_ps(1.5625e-02f), ifft5612);
__m512 ifft5546 = _mm512_fmadd_ps(ifft5540, _mm512_set1_ps(1.5625e-02f), ifft5528);
__m512 ifft5630 = _mm512_fmadd_ps(ifft5624, _mm512_set1_ps(1.5625e-02f), ifft5612);
__m512 ifft5547 = _mm512_fmadd_ps(ifft5538, _mm512_set1_ps(1.5625e-02f), ifft5530);
__m512 ifft5631 = _mm512_fmadd_ps(ifft5622, _mm512_set1_ps(1.5625e-02f), ifft5614);
__m512 ifft5548 = _mm512_fnmadd_ps(ifft5538, _mm512_set1_ps(1.5625e-02f), ifft5530);
__m512 ifft5632 = _mm512_fnmadd_ps(ifft5622, _mm512_set1_ps(1.5625e-02f), ifft5614);
__m512 dat888 = ifft5541;
__m512 dat890 = ifft5625;
__m512 dat889 = ifft5543;
__m512 dat891 = ifft5627;
(void)ifft5545;
(void)ifft5629;
(void)ifft5547;
(void)ifft5631;
(void)ifft5542;
(void)ifft5626;
(void)ifft5544;
(void)ifft5628;
(void)ifft5546;
(void)ifft5630;
(void)ifft5548;
(void)ifft5632;
__m512i pm55 = _mm512_set_epi32(16, 4, 3, 2, 1, 0, 20, 19, 18, 17, 16, 4, 3, 2, 1, 0);
__m512 pack259 = _mm512_permutex2var_ps(dat888, pm55, dat890);
__m512i pm56 = _mm512_set_epi32(8, 28, 27, 26, 25, 24, 12, 11, 10, 9, 8, 28, 27, 26, 25, 24);
__m512 pack260 = _mm512_permutex2var_ps(dat888, pm56, dat890);
__m512 pack261 = _mm512_permutex2var_ps(dat889, pm55, dat891);
__m512 pack262 = _mm512_permutex2var_ps(dat889, pm56, dat891);
pack259 = _mm512_max_ps(_mm512_setzero_ps(), pack259);
pack260 = _mm512_max_ps(_mm512_setzero_ps(), pack260);
pack261 = _mm512_max_ps(_mm512_setzero_ps(), pack261);
pack262 = _mm512_max_ps(_mm512_setzero_ps(), pack262);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack259);
_mm512_mask_storeu_ps(datPtr2+50240+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack260);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack261);
_mm512_mask_storeu_ps(datPtr2+50688+3215360*i9+200960*k42+100480*r19+448*toH18+4*toW18+40*t33, 1023, pack262);
}
}
}
if (j5 >= last2) return;
++j5;
}
rel6 = 4;
}
ptrdiff_t toH19 = base6+5;
ptrdiff_t toW19 = 110;
ptrdiff_t k43 = 16*w21;
for (; k43 != 16; ++k43) {
ptrdiff_t r20 = 0;
for (; r20 != 2; ++r20) {
ptrdiff_t t34 = 0;
__m512 sfRe313 = _mm512_loadu_ps(sfPtr3+0+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm313 = _mm512_loadu_ps(sfPtr3+64+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe314 = _mm512_loadu_ps(sfPtr3+2166784+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm314 = _mm512_loadu_ps(sfPtr3+2166848+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe315 = _mm512_loadu_ps(sfPtr3+4333568+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm315 = _mm512_loadu_ps(sfPtr3+4333632+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfRe316 = _mm512_loadu_ps(sfPtr3+6500352+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512 sfIm316 = _mm512_loadu_ps(sfPtr3+6500416+8667136*i9+24576*j5+256*k43+128*r20+0*t34);
__m512i ifft5633 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5634 = _mm512_permutexvar_ps(ifft5633, sfRe313);
__m512i ifft5635 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5636 = _mm512_permutexvar_ps(ifft5635, sfRe313);
__m512 ifft5637 = _mm512_permutexvar_ps(ifft5633, sfIm313);
__m512 ifft5638 = _mm512_permutexvar_ps(ifft5635, sfIm313);
__m512 ifft5639 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5640 = _mm512_mask_fmadd_ps(ifft5638, 65021, ifft5639, ifft5634);
__m512 ifft5641 = _mm512_mask_fnmadd_ps(ifft5637, 65021, ifft5639, ifft5636);
__m512 ifft5642 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5643 = _mm512_fmadd_ps(ifft5640, ifft5642, _mm512_shuffle_ps(ifft5640, ifft5640, 177));
__m512 ifft5644 = _mm512_fmadd_ps(ifft5641, ifft5642, _mm512_shuffle_ps(ifft5641, ifft5641, 177));
__m512 ifft5645 = _mm512_fmadd_ps(sfRe314, ifft5642, _mm512_shuffle_ps(sfRe314, sfRe314, 177));
__m512 ifft5646 = _mm512_fmadd_ps(sfIm314, ifft5642, _mm512_shuffle_ps(sfIm314, sfIm314, 177));
__m512 ifft5647 = _mm512_fmadd_ps(sfRe315, ifft5642, _mm512_shuffle_ps(sfRe315, sfRe315, 177));
__m512 ifft5648 = _mm512_fmadd_ps(sfIm315, ifft5642, _mm512_shuffle_ps(sfIm315, sfIm315, 177));
__m512 ifft5649 = _mm512_fmadd_ps(sfRe316, ifft5642, _mm512_shuffle_ps(sfRe316, sfRe316, 177));
__m512 ifft5650 = _mm512_fmadd_ps(sfIm316, ifft5642, _mm512_shuffle_ps(sfIm316, sfIm316, 177));
__m512 ifft5651 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5652 = _mm512_mul_ps(ifft5643, ifft5651);
__m512 ifft5653 = _mm512_mul_ps(ifft5644, ifft5651);
__m512 ifft5654 = _mm512_mul_ps(ifft5645, ifft5651);
__m512 ifft5655 = _mm512_mul_ps(ifft5646, ifft5651);
__m512 ifft5656 = _mm512_mul_ps(ifft5647, ifft5651);
__m512 ifft5657 = _mm512_mul_ps(ifft5648, ifft5651);
__m512 ifft5658 = _mm512_mul_ps(ifft5649, ifft5651);
__m512 ifft5659 = _mm512_mul_ps(ifft5650, ifft5651);
__m512 ifft5660 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5661 = _mm512_fnmadd_ps(ifft5644, ifft5660, ifft5652);
__m512 ifft5662 = _mm512_fmadd_ps(ifft5643, ifft5660, ifft5653);
__m512 ifft5663 = _mm512_fnmadd_ps(ifft5646, ifft5660, ifft5654);
__m512 ifft5664 = _mm512_fmadd_ps(ifft5645, ifft5660, ifft5655);
__m512 ifft5665 = _mm512_fnmadd_ps(ifft5648, ifft5660, ifft5656);
__m512 ifft5666 = _mm512_fmadd_ps(ifft5647, ifft5660, ifft5657);
__m512 ifft5667 = _mm512_fnmadd_ps(ifft5650, ifft5660, ifft5658);
__m512 ifft5668 = _mm512_fmadd_ps(ifft5649, ifft5660, ifft5659);
__m512 ifft5669 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5670 = _mm512_fmadd_ps(ifft5661, ifft5669, _mm512_shuffle_ps(ifft5661, ifft5661, 78));
__m512 ifft5671 = _mm512_fmadd_ps(ifft5662, ifft5669, _mm512_shuffle_ps(ifft5662, ifft5662, 78));
__m512 ifft5672 = _mm512_fmadd_ps(ifft5663, ifft5669, _mm512_shuffle_ps(ifft5663, ifft5663, 78));
__m512 ifft5673 = _mm512_fmadd_ps(ifft5664, ifft5669, _mm512_shuffle_ps(ifft5664, ifft5664, 78));
__m512 ifft5674 = _mm512_fmadd_ps(ifft5665, ifft5669, _mm512_shuffle_ps(ifft5665, ifft5665, 78));
__m512 ifft5675 = _mm512_fmadd_ps(ifft5666, ifft5669, _mm512_shuffle_ps(ifft5666, ifft5666, 78));
__m512 ifft5676 = _mm512_fmadd_ps(ifft5667, ifft5669, _mm512_shuffle_ps(ifft5667, ifft5667, 78));
__m512 ifft5677 = _mm512_fmadd_ps(ifft5668, ifft5669, _mm512_shuffle_ps(ifft5668, ifft5668, 78));
__m512 ifft5678 = _mm512_mask_sub_ps(ifft5670, 49344, _mm512_setzero_ps(), ifft5671);
__m512 ifft5679 = _mm512_mask_mov_ps(ifft5671, 49344, ifft5670);
__m512 ifft5680 = _mm512_mask_sub_ps(ifft5672, 49344, _mm512_setzero_ps(), ifft5673);
__m512 ifft5681 = _mm512_mask_mov_ps(ifft5673, 49344, ifft5672);
__m512 ifft5682 = _mm512_mask_sub_ps(ifft5674, 49344, _mm512_setzero_ps(), ifft5675);
__m512 ifft5683 = _mm512_mask_mov_ps(ifft5675, 49344, ifft5674);
__m512 ifft5684 = _mm512_mask_sub_ps(ifft5676, 49344, _mm512_setzero_ps(), ifft5677);
__m512 ifft5685 = _mm512_mask_mov_ps(ifft5677, 49344, ifft5676);
__m512 ifft5686 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5687 = _mm512_fmadd_ps(ifft5678, ifft5686, _mm512_shuffle_f32x4(ifft5678, ifft5678, 177));
__m512 ifft5688 = _mm512_fmadd_ps(ifft5679, ifft5686, _mm512_shuffle_f32x4(ifft5679, ifft5679, 177));
__m512 ifft5689 = _mm512_fmadd_ps(ifft5680, ifft5686, _mm512_shuffle_f32x4(ifft5680, ifft5680, 177));
__m512 ifft5690 = _mm512_fmadd_ps(ifft5681, ifft5686, _mm512_shuffle_f32x4(ifft5681, ifft5681, 177));
__m512 ifft5691 = _mm512_fmadd_ps(ifft5682, ifft5686, _mm512_shuffle_f32x4(ifft5682, ifft5682, 177));
__m512 ifft5692 = _mm512_fnmsub_ps(ifft5683, ifft5686, _mm512_shuffle_f32x4(ifft5683, ifft5683, 177));
__m512 ifft5693 = _mm512_fmadd_ps(ifft5684, ifft5686, _mm512_shuffle_f32x4(ifft5684, ifft5684, 177));
__m512 ifft5694 = _mm512_fmadd_ps(ifft5685, ifft5686, _mm512_shuffle_f32x4(ifft5685, ifft5685, 177));
__m512 ifft5695 = _mm512_add_ps(ifft5687, ifft5688);
__m512 ifft5696 = _mm512_sub_ps(ifft5687, ifft5688);
__m512 ifft5697 = _mm512_sub_ps(ifft5689, ifft5693);
__m512 ifft5698 = _mm512_add_ps(ifft5690, ifft5694);
__m512 ifft5699 = _mm512_add_ps(ifft5689, ifft5693);
__m512 ifft5700 = _mm512_sub_ps(ifft5690, ifft5694);
__m512 ifft5701 = _mm512_mul_ps(ifft5691, _mm512_set1_ps(3.125e-02f));
__m512 ifft5702 = _mm512_mul_ps(ifft5692, _mm512_set1_ps(3.125e-02f));
__m512 ifft5703 = _mm512_fmadd_ps(ifft5695, _mm512_set1_ps(1.5625e-02f), ifft5701);
__m512 ifft5704 = _mm512_fmsub_ps(ifft5695, _mm512_set1_ps(1.5625e-02f), ifft5701);
__m512 ifft5705 = _mm512_fmadd_ps(ifft5696, _mm512_set1_ps(1.5625e-02f), ifft5702);
__m512 ifft5706 = _mm512_fmsub_ps(ifft5696, _mm512_set1_ps(1.5625e-02f), ifft5702);
__m512 ifft5707 = _mm512_add_ps(ifft5697, ifft5698);
__m512 ifft5708 = _mm512_sub_ps(ifft5697, ifft5698);
__m512 ifft5709 = _mm512_fnmadd_ps(ifft5707, _mm512_set1_ps(7.0710677e-01f), ifft5699);
__m512 ifft5710 = _mm512_fmadd_ps(ifft5707, _mm512_set1_ps(7.0710677e-01f), ifft5699);
__m512 ifft5711 = _mm512_fmadd_ps(ifft5708, _mm512_set1_ps(7.0710677e-01f), ifft5700);
__m512 ifft5712 = _mm512_fmsub_ps(ifft5708, _mm512_set1_ps(7.0710677e-01f), ifft5700);
__m512 ifft5713 = _mm512_add_ps(ifft5709, ifft5710);
__m512 ifft5714 = _mm512_sub_ps(ifft5709, ifft5710);
__m512 ifft5715 = _mm512_add_ps(ifft5711, ifft5712);
__m512 ifft5716 = _mm512_sub_ps(ifft5711, ifft5712);
__m512 ifft5717 = _mm512_fmadd_ps(ifft5713, _mm512_set1_ps(1.5625e-02f), ifft5703);
__m512 ifft5718 = _mm512_fnmadd_ps(ifft5713, _mm512_set1_ps(1.5625e-02f), ifft5703);
__m512 ifft5719 = _mm512_fmadd_ps(ifft5715, _mm512_set1_ps(1.5625e-02f), ifft5705);
__m512 ifft5720 = _mm512_fnmadd_ps(ifft5715, _mm512_set1_ps(1.5625e-02f), ifft5705);
__m512 ifft5721 = _mm512_fnmadd_ps(ifft5716, _mm512_set1_ps(1.5625e-02f), ifft5704);
__m512 ifft5722 = _mm512_fmadd_ps(ifft5716, _mm512_set1_ps(1.5625e-02f), ifft5704);
__m512 ifft5723 = _mm512_fmadd_ps(ifft5714, _mm512_set1_ps(1.5625e-02f), ifft5706);
__m512 ifft5724 = _mm512_fnmadd_ps(ifft5714, _mm512_set1_ps(1.5625e-02f), ifft5706);
__m512 dat892 = ifft5717;
__m512 dat893 = ifft5719;
(void)ifft5721;
(void)ifft5723;
(void)ifft5718;
(void)ifft5720;
(void)ifft5722;
(void)ifft5724;
dat892 = _mm512_max_ps(_mm512_setzero_ps(), dat892);
dat893 = _mm512_max_ps(_mm512_setzero_ps(), dat893);
_mm512_mask_storeu_ps(datPtr2+0+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 3, dat892);
_mm512_mask_storeu_ps(datPtr2+50208+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 768, dat892);
_mm512_mask_storeu_ps(datPtr2+448+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 3, dat893);
_mm512_mask_storeu_ps(datPtr2+50656+3215360*i9+200960*k43+100480*r20+448*toH19+4*toW19+0*t34, 768, dat893);
}
}
if (j5 >= last2) return;
++j5;
}

/*
 * Launch the ConsumeSums1 stage on a thread team.
 *
 * Builds a task descriptor that points at
 * ResNeXt50StriderConsumeSums1Callee1, attaches the tensor pointer table
 * as the task's opaque argument, declares three dimensions with extents
 * 1 x 44 x 1, and submits it through ResNeXt50ThreaderDo1.
 *
 * team17   - thread team handed to ResNeXt50ThreaderDo1.
 * tensors7 - table of tensor base pointers, passed through unchanged via
 *            the task's any1 field.
 *
 * NOTE(review): the extents presumably describe the iteration space that
 * the threader divides among workers, and the call presumably blocks until
 * all of it has been processed -- confirm against ResNeXt50ThreaderDo1.
 */
static void ResNeXt50StriderConsumeSums1(ResNeXt50ThreaderTeam1* team17, char** tensors7) {
    ResNeXt50ThreaderTask1 job;

    /* Shape: three dimensions, only the middle one has extent > 1. */
    job.nd1 = 3;
    job.hull1[0] = 1;
    job.hull1[1] = 44;
    job.hull1[2] = 1;

    /* Work function and the opaque argument stored alongside it. */
    job.callee1 = ResNeXt50StriderConsumeSums1Callee1;
    job.any1 = tensors7;

    ResNeXt50ThreaderDo1(team17, &job);
}

// Threader callee that repacks filter weights and biases for one work item.
//
// task60->any1 is read as an array of byte pointers:
//   [0] source weights (read 3 floats at a time, 36 bytes per k106 step),
//   [1] source biases,
//   [2] per-channel (multiplier, addend) float pairs (8 bytes per channel),
//   [3] destination: the first 1024 bytes receive the adjusted biases, the
//       bytes from +1024 onward receive the transformed weights as fp16.
// pt35[1] selects the work item g19; this call handles i37 = 2*g19 and
// 2*g19+1. NOTE(review): the range of pt35[1] is not visible here; presumably
// [0, 16) given hull1[1] = 16 in ResNeXt50StriderArrangeFilts2 -- confirm.
//
// For every (i37, j30, k106) two 3x3 weight tiles are processed (channel
// offsets 0+2*j30 and 1+2*j30 inside i37): each is scaled by that channel's
// multiplier, pushed through the fft* butterfly/shuffle sequence, converted to
// half precision and scattered into the destination with masked stores.
// NOTE(review): the fft* names suggest a forward real FFT of the zero-padded
// tile; that interpretation is not verified in this block.
static void ResNeXt50StriderArrangeFilts2Callee1(ResNeXt50ThreaderTask1* task60, int64_t* pt35) {
char** tensors58 = task60->any1;
// b55 and e17 are fixed at 0 in this callee; only g19 varies between calls.
ptrdiff_t b55 = 0;
ptrdiff_t g19 = pt35[1];
ptrdiff_t e17 = 0;
// Bias output and weight output share tensors58[3]; weights start 1024 bytes in.
char*restrict bfPtr8 = tensors58[3]+1024*e17;
char*restrict wfPtr8 = tensors58[3]+1024+51904512*e17;
char*restrict wtPtr10 = tensors58[0]+14256*e17;
char*restrict biasPtr10 = tensors58[1];
char*restrict bnPtr11 = tensors58[2];
// Two consecutive i37 values per work item.
// NOTE(review): i37 presumably indexes a convolution group -- confirm.
ptrdiff_t i37 = 2*g19;
ptrdiff_t ii24 = i37+1;
for (; i37 <= ii24; ++i37) {
// With b55 == 0 this always starts at 0, so j30 runs 0..3.
ptrdiff_t j30 = 4*b55;
if (j30 < 4) {
for (; j30 != 4; ++j30) {
// Broadcast the multiplier (first float of each 2-float pair) for the two
// channels 8*i37+2*j30 and 8*i37+2*j30+1.
__m512 postMul30 = _mm512_set1_ps(((float*)bnPtr11+(ptrdiff_t)2*(0+8*i37+2*j30))[0]);
__m512 postMul31 = _mm512_set1_ps(((float*)bnPtr11+(ptrdiff_t)2*(1+8*i37+2*j30))[0]);
// Eight 36-byte (9-float) tiles per channel.
// NOTE(review): k106 presumably is the input channel within the group -- confirm.
for (ptrdiff_t k106 = 0; k106 < 8; ++k106) {
// ---- First channel of the pair (offset 0+2*j30) ----
// Mask 7 loads 3 floats into lanes 0..2 and zeroes the rest; the three loads
// are 12 bytes apart, i.e. three consecutive 3-float rows.
__m512 wt321 = _mm512_maskz_loadu_ps(7, wtPtr10+0+2304*i37+576*j30+36*k106);
__m512 wt322 = _mm512_maskz_loadu_ps(7, wtPtr10+12+2304*i37+576*j30+36*k106);
__m512 wt323 = _mm512_maskz_loadu_ps(7, wtPtr10+24+2304*i37+576*j30+36*k106);
// Fold the per-channel multiplier into the weights.
wt321 = _mm512_mul_ps(postMul30, wt321);
wt322 = _mm512_mul_ps(postMul30, wt322);
wt323 = _mm512_mul_ps(postMul30, wt323);
// First butterfly stage across rows; every row other than the three loaded
// ones is an explicit zero vector.
__m512 fft7057 = _mm512_add_ps(wt321, _mm512_setzero_ps());
__m512 fft7145 = _mm512_add_ps(wt322, _mm512_setzero_ps());
__m512 fft7058 = _mm512_sub_ps(wt321, _mm512_setzero_ps());
__m512 fft7146 = _mm512_sub_ps(wt322, _mm512_setzero_ps());
__m512 fft7059 = _mm512_add_ps(wt323, _mm512_setzero_ps());
__m512 fft7147 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7060 = _mm512_sub_ps(wt323, _mm512_setzero_ps());
__m512 fft7148 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7061 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7149 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7062 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7150 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7063 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7151 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7064 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7152 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
// Second and third butterfly stages across rows.
__m512 fft7065 = _mm512_add_ps(fft7057, fft7061);
__m512 fft7153 = _mm512_add_ps(fft7145, fft7149);
__m512 fft7066 = _mm512_sub_ps(fft7057, fft7061);
__m512 fft7154 = _mm512_sub_ps(fft7145, fft7149);
__m512 fft7067 = _mm512_add_ps(fft7059, fft7063);
__m512 fft7155 = _mm512_add_ps(fft7147, fft7151);
__m512 fft7068 = _mm512_sub_ps(fft7063, fft7059);
__m512 fft7156 = _mm512_sub_ps(fft7151, fft7147);
__m512 fft7069 = _mm512_sub_ps(fft7060, fft7064);
__m512 fft7157 = _mm512_sub_ps(fft7148, fft7152);
__m512 fft7070 = _mm512_add_ps(fft7060, fft7064);
__m512 fft7158 = _mm512_add_ps(fft7148, fft7152);
__m512 fft7071 = _mm512_add_ps(fft7065, fft7067);
__m512 fft7159 = _mm512_add_ps(fft7153, fft7155);
__m512 fft7072 = _mm512_sub_ps(fft7065, fft7067);
__m512 fft7160 = _mm512_sub_ps(fft7153, fft7155);
// 7.0710677e-01f is sqrt(1/2) rounded to float.
__m512 fft7073 = _mm512_fmadd_ps(fft7069, _mm512_set1_ps(7.0710677e-01f), fft7058);
__m512 fft7161 = _mm512_fmadd_ps(fft7157, _mm512_set1_ps(7.0710677e-01f), fft7146);
__m512 fft7074 = _mm512_fnmsub_ps(fft7070, _mm512_set1_ps(7.0710677e-01f), fft7062);
__m512 fft7162 = _mm512_fnmsub_ps(fft7158, _mm512_set1_ps(7.0710677e-01f), fft7150);
__m512 fft7075 = _mm512_fnmadd_ps(fft7069, _mm512_set1_ps(7.0710677e-01f), fft7058);
__m512 fft7163 = _mm512_fnmadd_ps(fft7157, _mm512_set1_ps(7.0710677e-01f), fft7146);
__m512 fft7076 = _mm512_fnmadd_ps(fft7070, _mm512_set1_ps(7.0710677e-01f), fft7062);
__m512 fft7164 = _mm512_fnmadd_ps(fft7158, _mm512_set1_ps(7.0710677e-01f), fft7150);
// Within-vector stage 1: x*sign + swap(lower 256 bits, upper 256 bits).
// Sign is +1 for lanes 0..7 and -1 for lanes 8..15.
__m512 fft7077 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7078 = _mm512_fmadd_ps(fft7071, fft7077, _mm512_shuffle_f32x4(fft7071, fft7071, 78));
__m512 fft7165 = _mm512_fmadd_ps(fft7159, fft7077, _mm512_shuffle_f32x4(fft7159, fft7159, 78));
__m512 fft7079 = _mm512_fmadd_ps(fft7072, fft7077, _mm512_shuffle_f32x4(fft7072, fft7072, 78));
__m512 fft7166 = _mm512_fmadd_ps(fft7160, fft7077, _mm512_shuffle_f32x4(fft7160, fft7160, 78));
__m512 fft7080 = _mm512_fmadd_ps(fft7073, fft7077, _mm512_shuffle_f32x4(fft7073, fft7073, 78));
__m512 fft7167 = _mm512_fmadd_ps(fft7161, fft7077, _mm512_shuffle_f32x4(fft7161, fft7161, 78));
__m512 fft7081 = _mm512_fmadd_ps(fft7074, fft7077, _mm512_shuffle_f32x4(fft7074, fft7074, 78));
__m512 fft7168 = _mm512_fmadd_ps(fft7162, fft7077, _mm512_shuffle_f32x4(fft7162, fft7162, 78));
__m512 fft7082 = _mm512_fmadd_ps(fft7066, fft7077, _mm512_shuffle_f32x4(fft7066, fft7066, 78));
__m512 fft7169 = _mm512_fmadd_ps(fft7154, fft7077, _mm512_shuffle_f32x4(fft7154, fft7154, 78));
__m512 fft7083 = _mm512_fmadd_ps(fft7068, fft7077, _mm512_shuffle_f32x4(fft7068, fft7068, 78));
__m512 fft7170 = _mm512_fmadd_ps(fft7156, fft7077, _mm512_shuffle_f32x4(fft7156, fft7156, 78));
__m512 fft7084 = _mm512_fmadd_ps(fft7075, fft7077, _mm512_shuffle_f32x4(fft7075, fft7075, 78));
__m512 fft7171 = _mm512_fmadd_ps(fft7163, fft7077, _mm512_shuffle_f32x4(fft7163, fft7163, 78));
__m512 fft7085 = _mm512_fmadd_ps(fft7076, fft7077, _mm512_shuffle_f32x4(fft7076, fft7076, 78));
__m512 fft7172 = _mm512_fmadd_ps(fft7164, fft7077, _mm512_shuffle_f32x4(fft7164, fft7164, 78));
// Per-lane coefficient multiply: fft7086 and fft7095 hold two coefficient
// tables; each output is a*fft7086 +/- b*fft7095 where (a, b) are paired vectors.
// NOTE(review): presumably a complex twiddle multiplication -- confirm.
__m512 fft7086 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7087 = _mm512_mul_ps(fft7078, fft7086);
__m512 fft7173 = _mm512_mul_ps(fft7165, fft7086);
__m512 fft7088 = _mm512_mul_ps(fft7079, fft7086);
__m512 fft7174 = _mm512_mul_ps(fft7166, fft7086);
__m512 fft7089 = _mm512_mul_ps(fft7080, fft7086);
__m512 fft7175 = _mm512_mul_ps(fft7167, fft7086);
__m512 fft7090 = _mm512_mul_ps(fft7081, fft7086);
__m512 fft7176 = _mm512_mul_ps(fft7168, fft7086);
__m512 fft7091 = _mm512_mul_ps(fft7082, fft7086);
__m512 fft7177 = _mm512_mul_ps(fft7169, fft7086);
__m512 fft7092 = _mm512_mul_ps(fft7083, fft7086);
__m512 fft7178 = _mm512_mul_ps(fft7170, fft7086);
__m512 fft7093 = _mm512_mul_ps(fft7084, fft7086);
__m512 fft7179 = _mm512_mul_ps(fft7171, fft7086);
__m512 fft7094 = _mm512_mul_ps(fft7085, fft7086);
__m512 fft7180 = _mm512_mul_ps(fft7172, fft7086);
__m512 fft7095 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft7096 = _mm512_fmadd_ps(fft7079, fft7095, fft7087);
__m512 fft7181 = _mm512_fmadd_ps(fft7166, fft7095, fft7173);
__m512 fft7097 = _mm512_fnmadd_ps(fft7078, fft7095, fft7088);
__m512 fft7182 = _mm512_fnmadd_ps(fft7165, fft7095, fft7174);
__m512 fft7098 = _mm512_fmadd_ps(fft7081, fft7095, fft7089);
__m512 fft7183 = _mm512_fmadd_ps(fft7168, fft7095, fft7175);
__m512 fft7099 = _mm512_fnmadd_ps(fft7080, fft7095, fft7090);
__m512 fft7184 = _mm512_fnmadd_ps(fft7167, fft7095, fft7176);
__m512 fft7100 = _mm512_fmadd_ps(fft7083, fft7095, fft7091);
__m512 fft7185 = _mm512_fmadd_ps(fft7170, fft7095, fft7177);
__m512 fft7101 = _mm512_fnmadd_ps(fft7082, fft7095, fft7092);
__m512 fft7186 = _mm512_fnmadd_ps(fft7169, fft7095, fft7178);
__m512 fft7102 = _mm512_fmadd_ps(fft7085, fft7095, fft7093);
__m512 fft7187 = _mm512_fmadd_ps(fft7172, fft7095, fft7179);
__m512 fft7103 = _mm512_fnmadd_ps(fft7084, fft7095, fft7094);
__m512 fft7188 = _mm512_fnmadd_ps(fft7171, fft7095, fft7180);
// Within-vector stage 2: x*sign + swap of adjacent 128-bit lanes (imm 177).
__m512 fft7104 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7105 = _mm512_fmadd_ps(fft7096, fft7104, _mm512_shuffle_f32x4(fft7096, fft7096, 177));
__m512 fft7189 = _mm512_fmadd_ps(fft7181, fft7104, _mm512_shuffle_f32x4(fft7181, fft7181, 177));
__m512 fft7106 = _mm512_fmadd_ps(fft7097, fft7104, _mm512_shuffle_f32x4(fft7097, fft7097, 177));
__m512 fft7190 = _mm512_fmadd_ps(fft7182, fft7104, _mm512_shuffle_f32x4(fft7182, fft7182, 177));
__m512 fft7107 = _mm512_fmadd_ps(fft7098, fft7104, _mm512_shuffle_f32x4(fft7098, fft7098, 177));
__m512 fft7191 = _mm512_fmadd_ps(fft7183, fft7104, _mm512_shuffle_f32x4(fft7183, fft7183, 177));
__m512 fft7108 = _mm512_fmadd_ps(fft7099, fft7104, _mm512_shuffle_f32x4(fft7099, fft7099, 177));
__m512 fft7192 = _mm512_fmadd_ps(fft7184, fft7104, _mm512_shuffle_f32x4(fft7184, fft7184, 177));
__m512 fft7109 = _mm512_fmadd_ps(fft7100, fft7104, _mm512_shuffle_f32x4(fft7100, fft7100, 177));
__m512 fft7193 = _mm512_fmadd_ps(fft7185, fft7104, _mm512_shuffle_f32x4(fft7185, fft7185, 177));
__m512 fft7110 = _mm512_fmadd_ps(fft7101, fft7104, _mm512_shuffle_f32x4(fft7101, fft7101, 177));
__m512 fft7194 = _mm512_fmadd_ps(fft7186, fft7104, _mm512_shuffle_f32x4(fft7186, fft7186, 177));
__m512 fft7111 = _mm512_fmadd_ps(fft7102, fft7104, _mm512_shuffle_f32x4(fft7102, fft7102, 177));
__m512 fft7195 = _mm512_fmadd_ps(fft7187, fft7104, _mm512_shuffle_f32x4(fft7187, fft7187, 177));
__m512 fft7112 = _mm512_fmadd_ps(fft7103, fft7104, _mm512_shuffle_f32x4(fft7103, fft7103, 177));
__m512 fft7196 = _mm512_fmadd_ps(fft7188, fft7104, _mm512_shuffle_f32x4(fft7188, fft7188, 177));
// Mask 49344 == 0xC0C0 (lanes 6, 7, 14, 15): in those lanes the paired vectors
// are exchanged, with the second result taking the negated partner (0 - x).
__m512 fft7113 = _mm512_mask_mov_ps(fft7105, 49344, fft7106);
__m512 fft7197 = _mm512_mask_mov_ps(fft7189, 49344, fft7190);
__m512 fft7114 = _mm512_mask_sub_ps(fft7106, 49344, _mm512_setzero_ps(), fft7105);
__m512 fft7198 = _mm512_mask_sub_ps(fft7190, 49344, _mm512_setzero_ps(), fft7189);
__m512 fft7115 = _mm512_mask_mov_ps(fft7107, 49344, fft7108);
__m512 fft7199 = _mm512_mask_mov_ps(fft7191, 49344, fft7192);
__m512 fft7116 = _mm512_mask_sub_ps(fft7108, 49344, _mm512_setzero_ps(), fft7107);
__m512 fft7200 = _mm512_mask_sub_ps(fft7192, 49344, _mm512_setzero_ps(), fft7191);
__m512 fft7117 = _mm512_mask_mov_ps(fft7109, 49344, fft7110);
__m512 fft7201 = _mm512_mask_mov_ps(fft7193, 49344, fft7194);
__m512 fft7118 = _mm512_mask_sub_ps(fft7110, 49344, _mm512_setzero_ps(), fft7109);
__m512 fft7202 = _mm512_mask_sub_ps(fft7194, 49344, _mm512_setzero_ps(), fft7193);
__m512 fft7119 = _mm512_mask_mov_ps(fft7111, 49344, fft7112);
__m512 fft7203 = _mm512_mask_mov_ps(fft7195, 49344, fft7196);
__m512 fft7120 = _mm512_mask_sub_ps(fft7112, 49344, _mm512_setzero_ps(), fft7111);
__m512 fft7204 = _mm512_mask_sub_ps(fft7196, 49344, _mm512_setzero_ps(), fft7195);
// Within-vector stage 3: x*sign + swap of the two 64-bit halves inside each
// 128-bit lane (_mm512_shuffle_ps imm 78).
__m512 fft7121 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7122 = _mm512_fmadd_ps(fft7113, fft7121, _mm512_shuffle_ps(fft7113, fft7113, 78));
__m512 fft7205 = _mm512_fmadd_ps(fft7197, fft7121, _mm512_shuffle_ps(fft7197, fft7197, 78));
__m512 fft7123 = _mm512_fmadd_ps(fft7114, fft7121, _mm512_shuffle_ps(fft7114, fft7114, 78));
__m512 fft7206 = _mm512_fmadd_ps(fft7198, fft7121, _mm512_shuffle_ps(fft7198, fft7198, 78));
__m512 fft7124 = _mm512_fmadd_ps(fft7115, fft7121, _mm512_shuffle_ps(fft7115, fft7115, 78));
__m512 fft7207 = _mm512_fmadd_ps(fft7199, fft7121, _mm512_shuffle_ps(fft7199, fft7199, 78));
__m512 fft7125 = _mm512_fmadd_ps(fft7116, fft7121, _mm512_shuffle_ps(fft7116, fft7116, 78));
__m512 fft7208 = _mm512_fmadd_ps(fft7200, fft7121, _mm512_shuffle_ps(fft7200, fft7200, 78));
__m512 fft7126 = _mm512_fmadd_ps(fft7117, fft7121, _mm512_shuffle_ps(fft7117, fft7117, 78));
__m512 fft7209 = _mm512_fmadd_ps(fft7201, fft7121, _mm512_shuffle_ps(fft7201, fft7201, 78));
__m512 fft7127 = _mm512_fmadd_ps(fft7118, fft7121, _mm512_shuffle_ps(fft7118, fft7118, 78));
__m512 fft7210 = _mm512_fmadd_ps(fft7202, fft7121, _mm512_shuffle_ps(fft7202, fft7202, 78));
__m512 fft7128 = _mm512_fmadd_ps(fft7119, fft7121, _mm512_shuffle_ps(fft7119, fft7119, 78));
__m512 fft7211 = _mm512_fmadd_ps(fft7203, fft7121, _mm512_shuffle_ps(fft7203, fft7203, 78));
__m512 fft7129 = _mm512_fmadd_ps(fft7120, fft7121, _mm512_shuffle_ps(fft7120, fft7120, 78));
__m512 fft7212 = _mm512_fmadd_ps(fft7204, fft7121, _mm512_shuffle_ps(fft7204, fft7204, 78));
// Extra fix-up applied only to the first vector pair of each set
// (fft7122/fft7123 and fft7205/fft7206): permute, combine with signs, blend via
// masks 0x5555 / 0xA8A8 / 0x5656, then halve the lanes selected by 0xFCFC.
// NOTE(review): presumably the real-FFT post-processing step -- confirm.
__m512i fft7130 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7131 = _mm512_permutexvar_ps(fft7130, fft7122);
__m512 fft7213 = _mm512_permutexvar_ps(fft7130, fft7205);
__m512i fft7132 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7133 = _mm512_permutexvar_ps(fft7132, fft7122);
__m512 fft7214 = _mm512_permutexvar_ps(fft7132, fft7205);
__m512 fft7134 = _mm512_permutexvar_ps(fft7130, fft7123);
__m512 fft7215 = _mm512_permutexvar_ps(fft7130, fft7206);
__m512 fft7135 = _mm512_permutexvar_ps(fft7132, fft7123);
__m512 fft7216 = _mm512_permutexvar_ps(fft7132, fft7206);
__m512 fft7136 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft7137 = _mm512_fmadd_ps(fft7131, fft7136, fft7133);
__m512 fft7217 = _mm512_fmadd_ps(fft7213, fft7136, fft7214);
__m512 fft7138 = _mm512_fnmadd_ps(fft7135, fft7136, fft7134);
__m512 fft7218 = _mm512_fnmadd_ps(fft7216, fft7136, fft7215);
__m512 fft7139 = _mm512_mask_mov_ps(fft7135, 21845, fft7137);
__m512 fft7219 = _mm512_mask_mov_ps(fft7216, 21845, fft7217);
__m512 fft7140 = _mm512_mask_mov_ps(fft7131, 43176, fft7137);
__m512 fft7220 = _mm512_mask_mov_ps(fft7213, 43176, fft7217);
__m512 fft7141 = _mm512_mask_mov_ps(fft7139, 43176, fft7138);
__m512 fft7221 = _mm512_mask_mov_ps(fft7219, 43176, fft7218);
__m512 fft7142 = _mm512_mask_mov_ps(fft7140, 22102, fft7138);
__m512 fft7222 = _mm512_mask_mov_ps(fft7220, 22102, fft7218);
__m512 fft7143 = _mm512_mask_mul_ps(fft7141, 64764, fft7141, _mm512_set1_ps(5e-01f));
__m512 fft7223 = _mm512_mask_mul_ps(fft7221, 64764, fft7221, _mm512_set1_ps(5e-01f));
__m512 fft7144 = _mm512_mask_mul_ps(fft7142, 64764, fft7142, _mm512_set1_ps(5e-01f));
__m512 fft7224 = _mm512_mask_mul_ps(fft7222, 64764, fft7222, _mm512_set1_ps(5e-01f));
// Name the 16 result vectors: wf81..wf88 come from the first butterfly set,
// wf89..wf96 from the second.
__m512 wf81 = fft7143;
__m512 wf89 = fft7223;
__m512 wf82 = fft7144;
__m512 wf90 = fft7224;
__m512 wf83 = fft7124;
__m512 wf91 = fft7207;
__m512 wf84 = fft7125;
__m512 wf92 = fft7208;
__m512 wf85 = fft7126;
__m512 wf93 = fft7209;
__m512 wf86 = fft7127;
__m512 wf94 = fft7210;
__m512 wf87 = fft7128;
__m512 wf95 = fft7211;
__m512 wf88 = fft7129;
__m512 wf96 = fft7212;
// Split the channel offset 0+2*j30 into three destination coordinates
// (strides 1024, 64 and 16 bytes in the stores below). f46 is always 0 here.
ptrdiff_t c28 = (size_t)(0+2*j30)/4;
ptrdiff_t m43 = (size_t)(0+2*j30)%4/2;
ptrdiff_t f46 = (size_t)(0+2*j30)%2;
// eo43 gathers even-indexed lanes into the low half and odd-indexed lanes into
// the high half; it is applied to every vector except wf81/82/89/90.
__m512i eo43 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each store pair below: convert two float vectors to fp16, pack them into one
// 512-bit register, then write 32-bit lanes 0-3 and 8-11 (mask 3855 == 0x0F0F)
// at the first address and lanes 4-7 and 12-15 (mask 61680 == 0xF0F0) at the
// second. The second offset is 262144 bytes further on minus 16, the 16
// cancelling the position of lane 4 inside the register.
wf83 = _mm512_permutexvar_ps(eo43, wf83);
wf84 = _mm512_permutexvar_ps(eo43, wf84);
__m512i wfs25 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf83, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs25 = _mm512_inserti64x4(wfs25, _mm512_cvtps_ph(wf84, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+2048+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs25);
_mm512_mask_storeu_epi32(wfPtr8+264176+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs25);
wf91 = _mm512_permutexvar_ps(eo43, wf91);
wf92 = _mm512_permutexvar_ps(eo43, wf92);
__m512i wfs26 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf91, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs26 = _mm512_inserti64x4(wfs26, _mm512_cvtps_ph(wf92, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+526336+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs26);
_mm512_mask_storeu_epi32(wfPtr8+788464+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs26);
wf85 = _mm512_permutexvar_ps(eo43, wf85);
wf86 = _mm512_permutexvar_ps(eo43, wf86);
__m512i wfs27 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf85, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs27 = _mm512_inserti64x4(wfs27, _mm512_cvtps_ph(wf86, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+4096+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs27);
_mm512_mask_storeu_epi32(wfPtr8+266224+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs27);
wf93 = _mm512_permutexvar_ps(eo43, wf93);
wf94 = _mm512_permutexvar_ps(eo43, wf94);
__m512i wfs28 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf93, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs28 = _mm512_inserti64x4(wfs28, _mm512_cvtps_ph(wf94, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+528384+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs28);
_mm512_mask_storeu_epi32(wfPtr8+790512+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs28);
wf87 = _mm512_permutexvar_ps(eo43, wf87);
wf88 = _mm512_permutexvar_ps(eo43, wf88);
__m512i wfs29 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf87, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs29 = _mm512_inserti64x4(wfs29, _mm512_cvtps_ph(wf88, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+6144+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs29);
_mm512_mask_storeu_epi32(wfPtr8+268272+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs29);
wf95 = _mm512_permutexvar_ps(eo43, wf95);
wf96 = _mm512_permutexvar_ps(eo43, wf96);
__m512i wfs30 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf95, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs30 = _mm512_inserti64x4(wfs30, _mm512_cvtps_ph(wf96, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+530432+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs30);
_mm512_mask_storeu_epi32(wfPtr8+792560+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs30);
// wf81/wf82 and wf89/wf90 (the fixed-up pairs) are stored without the eo43
// permutation.
__m512i wfs31 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf81, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs31 = _mm512_inserti64x4(wfs31, _mm512_cvtps_ph(wf82, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+0+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs31);
_mm512_mask_storeu_epi32(wfPtr8+262128+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs31);
__m512i wfs32 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf89, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs32 = _mm512_inserti64x4(wfs32, _mm512_cvtps_ph(wf90, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+524288+8192*i37+1024*c28+128*k106+64*m43+16*f46, 3855, wfs32);
_mm512_mask_storeu_epi32(wfPtr8+786416+8192*i37+1024*c28+128*k106+64*m43+16*f46, 61680, wfs32);
// ---- Second channel of the pair (offset 1+2*j30) ----
// Same sequence as above, reading weights 288 bytes further on and scaling by
// postMul31.
__m512 wt324 = _mm512_maskz_loadu_ps(7, wtPtr10+288+2304*i37+576*j30+36*k106);
__m512 wt325 = _mm512_maskz_loadu_ps(7, wtPtr10+300+2304*i37+576*j30+36*k106);
__m512 wt326 = _mm512_maskz_loadu_ps(7, wtPtr10+312+2304*i37+576*j30+36*k106);
wt324 = _mm512_mul_ps(postMul31, wt324);
wt325 = _mm512_mul_ps(postMul31, wt325);
wt326 = _mm512_mul_ps(postMul31, wt326);
// Butterfly stages across rows (zero vectors stand in for the absent rows).
__m512 fft7225 = _mm512_add_ps(wt324, _mm512_setzero_ps());
__m512 fft7313 = _mm512_add_ps(wt325, _mm512_setzero_ps());
__m512 fft7226 = _mm512_sub_ps(wt324, _mm512_setzero_ps());
__m512 fft7314 = _mm512_sub_ps(wt325, _mm512_setzero_ps());
__m512 fft7227 = _mm512_add_ps(wt326, _mm512_setzero_ps());
__m512 fft7315 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7228 = _mm512_sub_ps(wt326, _mm512_setzero_ps());
__m512 fft7316 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7229 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7317 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7230 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7318 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7231 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7319 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7232 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7320 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft7233 = _mm512_add_ps(fft7225, fft7229);
__m512 fft7321 = _mm512_add_ps(fft7313, fft7317);
__m512 fft7234 = _mm512_sub_ps(fft7225, fft7229);
__m512 fft7322 = _mm512_sub_ps(fft7313, fft7317);
__m512 fft7235 = _mm512_add_ps(fft7227, fft7231);
__m512 fft7323 = _mm512_add_ps(fft7315, fft7319);
__m512 fft7236 = _mm512_sub_ps(fft7231, fft7227);
__m512 fft7324 = _mm512_sub_ps(fft7319, fft7315);
__m512 fft7237 = _mm512_sub_ps(fft7228, fft7232);
__m512 fft7325 = _mm512_sub_ps(fft7316, fft7320);
__m512 fft7238 = _mm512_add_ps(fft7228, fft7232);
__m512 fft7326 = _mm512_add_ps(fft7316, fft7320);
__m512 fft7239 = _mm512_add_ps(fft7233, fft7235);
__m512 fft7327 = _mm512_add_ps(fft7321, fft7323);
__m512 fft7240 = _mm512_sub_ps(fft7233, fft7235);
__m512 fft7328 = _mm512_sub_ps(fft7321, fft7323);
__m512 fft7241 = _mm512_fmadd_ps(fft7237, _mm512_set1_ps(7.0710677e-01f), fft7226);
__m512 fft7329 = _mm512_fmadd_ps(fft7325, _mm512_set1_ps(7.0710677e-01f), fft7314);
__m512 fft7242 = _mm512_fnmsub_ps(fft7238, _mm512_set1_ps(7.0710677e-01f), fft7230);
__m512 fft7330 = _mm512_fnmsub_ps(fft7326, _mm512_set1_ps(7.0710677e-01f), fft7318);
__m512 fft7243 = _mm512_fnmadd_ps(fft7237, _mm512_set1_ps(7.0710677e-01f), fft7226);
__m512 fft7331 = _mm512_fnmadd_ps(fft7325, _mm512_set1_ps(7.0710677e-01f), fft7314);
__m512 fft7244 = _mm512_fnmadd_ps(fft7238, _mm512_set1_ps(7.0710677e-01f), fft7230);
__m512 fft7332 = _mm512_fnmadd_ps(fft7326, _mm512_set1_ps(7.0710677e-01f), fft7318);
// Within-vector stage 1 (256-bit half swap).
__m512 fft7245 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7246 = _mm512_fmadd_ps(fft7239, fft7245, _mm512_shuffle_f32x4(fft7239, fft7239, 78));
__m512 fft7333 = _mm512_fmadd_ps(fft7327, fft7245, _mm512_shuffle_f32x4(fft7327, fft7327, 78));
__m512 fft7247 = _mm512_fmadd_ps(fft7240, fft7245, _mm512_shuffle_f32x4(fft7240, fft7240, 78));
__m512 fft7334 = _mm512_fmadd_ps(fft7328, fft7245, _mm512_shuffle_f32x4(fft7328, fft7328, 78));
__m512 fft7248 = _mm512_fmadd_ps(fft7241, fft7245, _mm512_shuffle_f32x4(fft7241, fft7241, 78));
__m512 fft7335 = _mm512_fmadd_ps(fft7329, fft7245, _mm512_shuffle_f32x4(fft7329, fft7329, 78));
__m512 fft7249 = _mm512_fmadd_ps(fft7242, fft7245, _mm512_shuffle_f32x4(fft7242, fft7242, 78));
__m512 fft7336 = _mm512_fmadd_ps(fft7330, fft7245, _mm512_shuffle_f32x4(fft7330, fft7330, 78));
__m512 fft7250 = _mm512_fmadd_ps(fft7234, fft7245, _mm512_shuffle_f32x4(fft7234, fft7234, 78));
__m512 fft7337 = _mm512_fmadd_ps(fft7322, fft7245, _mm512_shuffle_f32x4(fft7322, fft7322, 78));
__m512 fft7251 = _mm512_fmadd_ps(fft7236, fft7245, _mm512_shuffle_f32x4(fft7236, fft7236, 78));
__m512 fft7338 = _mm512_fmadd_ps(fft7324, fft7245, _mm512_shuffle_f32x4(fft7324, fft7324, 78));
__m512 fft7252 = _mm512_fmadd_ps(fft7243, fft7245, _mm512_shuffle_f32x4(fft7243, fft7243, 78));
__m512 fft7339 = _mm512_fmadd_ps(fft7331, fft7245, _mm512_shuffle_f32x4(fft7331, fft7331, 78));
__m512 fft7253 = _mm512_fmadd_ps(fft7244, fft7245, _mm512_shuffle_f32x4(fft7244, fft7244, 78));
__m512 fft7340 = _mm512_fmadd_ps(fft7332, fft7245, _mm512_shuffle_f32x4(fft7332, fft7332, 78));
// Per-lane coefficient multiply (tables fft7254 and fft7263).
__m512 fft7254 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7255 = _mm512_mul_ps(fft7246, fft7254);
__m512 fft7341 = _mm512_mul_ps(fft7333, fft7254);
__m512 fft7256 = _mm512_mul_ps(fft7247, fft7254);
__m512 fft7342 = _mm512_mul_ps(fft7334, fft7254);
__m512 fft7257 = _mm512_mul_ps(fft7248, fft7254);
__m512 fft7343 = _mm512_mul_ps(fft7335, fft7254);
__m512 fft7258 = _mm512_mul_ps(fft7249, fft7254);
__m512 fft7344 = _mm512_mul_ps(fft7336, fft7254);
__m512 fft7259 = _mm512_mul_ps(fft7250, fft7254);
__m512 fft7345 = _mm512_mul_ps(fft7337, fft7254);
__m512 fft7260 = _mm512_mul_ps(fft7251, fft7254);
__m512 fft7346 = _mm512_mul_ps(fft7338, fft7254);
__m512 fft7261 = _mm512_mul_ps(fft7252, fft7254);
__m512 fft7347 = _mm512_mul_ps(fft7339, fft7254);
__m512 fft7262 = _mm512_mul_ps(fft7253, fft7254);
__m512 fft7348 = _mm512_mul_ps(fft7340, fft7254);
__m512 fft7263 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft7264 = _mm512_fmadd_ps(fft7247, fft7263, fft7255);
__m512 fft7349 = _mm512_fmadd_ps(fft7334, fft7263, fft7341);
__m512 fft7265 = _mm512_fnmadd_ps(fft7246, fft7263, fft7256);
__m512 fft7350 = _mm512_fnmadd_ps(fft7333, fft7263, fft7342);
__m512 fft7266 = _mm512_fmadd_ps(fft7249, fft7263, fft7257);
__m512 fft7351 = _mm512_fmadd_ps(fft7336, fft7263, fft7343);
__m512 fft7267 = _mm512_fnmadd_ps(fft7248, fft7263, fft7258);
__m512 fft7352 = _mm512_fnmadd_ps(fft7335, fft7263, fft7344);
__m512 fft7268 = _mm512_fmadd_ps(fft7251, fft7263, fft7259);
__m512 fft7353 = _mm512_fmadd_ps(fft7338, fft7263, fft7345);
__m512 fft7269 = _mm512_fnmadd_ps(fft7250, fft7263, fft7260);
__m512 fft7354 = _mm512_fnmadd_ps(fft7337, fft7263, fft7346);
__m512 fft7270 = _mm512_fmadd_ps(fft7253, fft7263, fft7261);
__m512 fft7355 = _mm512_fmadd_ps(fft7340, fft7263, fft7347);
__m512 fft7271 = _mm512_fnmadd_ps(fft7252, fft7263, fft7262);
__m512 fft7356 = _mm512_fnmadd_ps(fft7339, fft7263, fft7348);
// Within-vector stage 2 (adjacent 128-bit lane swap).
__m512 fft7272 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7273 = _mm512_fmadd_ps(fft7264, fft7272, _mm512_shuffle_f32x4(fft7264, fft7264, 177));
__m512 fft7357 = _mm512_fmadd_ps(fft7349, fft7272, _mm512_shuffle_f32x4(fft7349, fft7349, 177));
__m512 fft7274 = _mm512_fmadd_ps(fft7265, fft7272, _mm512_shuffle_f32x4(fft7265, fft7265, 177));
__m512 fft7358 = _mm512_fmadd_ps(fft7350, fft7272, _mm512_shuffle_f32x4(fft7350, fft7350, 177));
__m512 fft7275 = _mm512_fmadd_ps(fft7266, fft7272, _mm512_shuffle_f32x4(fft7266, fft7266, 177));
__m512 fft7359 = _mm512_fmadd_ps(fft7351, fft7272, _mm512_shuffle_f32x4(fft7351, fft7351, 177));
__m512 fft7276 = _mm512_fmadd_ps(fft7267, fft7272, _mm512_shuffle_f32x4(fft7267, fft7267, 177));
__m512 fft7360 = _mm512_fmadd_ps(fft7352, fft7272, _mm512_shuffle_f32x4(fft7352, fft7352, 177));
__m512 fft7277 = _mm512_fmadd_ps(fft7268, fft7272, _mm512_shuffle_f32x4(fft7268, fft7268, 177));
__m512 fft7361 = _mm512_fmadd_ps(fft7353, fft7272, _mm512_shuffle_f32x4(fft7353, fft7353, 177));
__m512 fft7278 = _mm512_fmadd_ps(fft7269, fft7272, _mm512_shuffle_f32x4(fft7269, fft7269, 177));
__m512 fft7362 = _mm512_fmadd_ps(fft7354, fft7272, _mm512_shuffle_f32x4(fft7354, fft7354, 177));
__m512 fft7279 = _mm512_fmadd_ps(fft7270, fft7272, _mm512_shuffle_f32x4(fft7270, fft7270, 177));
__m512 fft7363 = _mm512_fmadd_ps(fft7355, fft7272, _mm512_shuffle_f32x4(fft7355, fft7355, 177));
__m512 fft7280 = _mm512_fmadd_ps(fft7271, fft7272, _mm512_shuffle_f32x4(fft7271, fft7271, 177));
__m512 fft7364 = _mm512_fmadd_ps(fft7356, fft7272, _mm512_shuffle_f32x4(fft7356, fft7356, 177));
// Exchange (with negation) in lanes 6, 7, 14, 15 (mask 0xC0C0).
__m512 fft7281 = _mm512_mask_mov_ps(fft7273, 49344, fft7274);
__m512 fft7365 = _mm512_mask_mov_ps(fft7357, 49344, fft7358);
__m512 fft7282 = _mm512_mask_sub_ps(fft7274, 49344, _mm512_setzero_ps(), fft7273);
__m512 fft7366 = _mm512_mask_sub_ps(fft7358, 49344, _mm512_setzero_ps(), fft7357);
__m512 fft7283 = _mm512_mask_mov_ps(fft7275, 49344, fft7276);
__m512 fft7367 = _mm512_mask_mov_ps(fft7359, 49344, fft7360);
__m512 fft7284 = _mm512_mask_sub_ps(fft7276, 49344, _mm512_setzero_ps(), fft7275);
__m512 fft7368 = _mm512_mask_sub_ps(fft7360, 49344, _mm512_setzero_ps(), fft7359);
__m512 fft7285 = _mm512_mask_mov_ps(fft7277, 49344, fft7278);
__m512 fft7369 = _mm512_mask_mov_ps(fft7361, 49344, fft7362);
__m512 fft7286 = _mm512_mask_sub_ps(fft7278, 49344, _mm512_setzero_ps(), fft7277);
__m512 fft7370 = _mm512_mask_sub_ps(fft7362, 49344, _mm512_setzero_ps(), fft7361);
__m512 fft7287 = _mm512_mask_mov_ps(fft7279, 49344, fft7280);
__m512 fft7371 = _mm512_mask_mov_ps(fft7363, 49344, fft7364);
__m512 fft7288 = _mm512_mask_sub_ps(fft7280, 49344, _mm512_setzero_ps(), fft7279);
__m512 fft7372 = _mm512_mask_sub_ps(fft7364, 49344, _mm512_setzero_ps(), fft7363);
// Within-vector stage 3 (64-bit half swap inside each 128-bit lane).
__m512 fft7289 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7290 = _mm512_fmadd_ps(fft7281, fft7289, _mm512_shuffle_ps(fft7281, fft7281, 78));
__m512 fft7373 = _mm512_fmadd_ps(fft7365, fft7289, _mm512_shuffle_ps(fft7365, fft7365, 78));
__m512 fft7291 = _mm512_fmadd_ps(fft7282, fft7289, _mm512_shuffle_ps(fft7282, fft7282, 78));
__m512 fft7374 = _mm512_fmadd_ps(fft7366, fft7289, _mm512_shuffle_ps(fft7366, fft7366, 78));
__m512 fft7292 = _mm512_fmadd_ps(fft7283, fft7289, _mm512_shuffle_ps(fft7283, fft7283, 78));
__m512 fft7375 = _mm512_fmadd_ps(fft7367, fft7289, _mm512_shuffle_ps(fft7367, fft7367, 78));
__m512 fft7293 = _mm512_fmadd_ps(fft7284, fft7289, _mm512_shuffle_ps(fft7284, fft7284, 78));
__m512 fft7376 = _mm512_fmadd_ps(fft7368, fft7289, _mm512_shuffle_ps(fft7368, fft7368, 78));
__m512 fft7294 = _mm512_fmadd_ps(fft7285, fft7289, _mm512_shuffle_ps(fft7285, fft7285, 78));
__m512 fft7377 = _mm512_fmadd_ps(fft7369, fft7289, _mm512_shuffle_ps(fft7369, fft7369, 78));
__m512 fft7295 = _mm512_fmadd_ps(fft7286, fft7289, _mm512_shuffle_ps(fft7286, fft7286, 78));
__m512 fft7378 = _mm512_fmadd_ps(fft7370, fft7289, _mm512_shuffle_ps(fft7370, fft7370, 78));
__m512 fft7296 = _mm512_fmadd_ps(fft7287, fft7289, _mm512_shuffle_ps(fft7287, fft7287, 78));
__m512 fft7379 = _mm512_fmadd_ps(fft7371, fft7289, _mm512_shuffle_ps(fft7371, fft7371, 78));
__m512 fft7297 = _mm512_fmadd_ps(fft7288, fft7289, _mm512_shuffle_ps(fft7288, fft7288, 78));
__m512 fft7380 = _mm512_fmadd_ps(fft7372, fft7289, _mm512_shuffle_ps(fft7372, fft7372, 78));
// Fix-up of the first vector pair of each set (permute, signed combine, masked
// blends, halving under mask 0xFCFC).
__m512i fft7298 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7299 = _mm512_permutexvar_ps(fft7298, fft7290);
__m512 fft7381 = _mm512_permutexvar_ps(fft7298, fft7373);
__m512i fft7300 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7301 = _mm512_permutexvar_ps(fft7300, fft7290);
__m512 fft7382 = _mm512_permutexvar_ps(fft7300, fft7373);
__m512 fft7302 = _mm512_permutexvar_ps(fft7298, fft7291);
__m512 fft7383 = _mm512_permutexvar_ps(fft7298, fft7374);
__m512 fft7303 = _mm512_permutexvar_ps(fft7300, fft7291);
__m512 fft7384 = _mm512_permutexvar_ps(fft7300, fft7374);
__m512 fft7304 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft7305 = _mm512_fmadd_ps(fft7299, fft7304, fft7301);
__m512 fft7385 = _mm512_fmadd_ps(fft7381, fft7304, fft7382);
__m512 fft7306 = _mm512_fnmadd_ps(fft7303, fft7304, fft7302);
__m512 fft7386 = _mm512_fnmadd_ps(fft7384, fft7304, fft7383);
__m512 fft7307 = _mm512_mask_mov_ps(fft7303, 21845, fft7305);
__m512 fft7387 = _mm512_mask_mov_ps(fft7384, 21845, fft7385);
__m512 fft7308 = _mm512_mask_mov_ps(fft7299, 43176, fft7305);
__m512 fft7388 = _mm512_mask_mov_ps(fft7381, 43176, fft7385);
__m512 fft7309 = _mm512_mask_mov_ps(fft7307, 43176, fft7306);
__m512 fft7389 = _mm512_mask_mov_ps(fft7387, 43176, fft7386);
__m512 fft7310 = _mm512_mask_mov_ps(fft7308, 22102, fft7306);
__m512 fft7390 = _mm512_mask_mov_ps(fft7388, 22102, fft7386);
__m512 fft7311 = _mm512_mask_mul_ps(fft7309, 64764, fft7309, _mm512_set1_ps(5e-01f));
__m512 fft7391 = _mm512_mask_mul_ps(fft7389, 64764, fft7389, _mm512_set1_ps(5e-01f));
__m512 fft7312 = _mm512_mask_mul_ps(fft7310, 64764, fft7310, _mm512_set1_ps(5e-01f));
__m512 fft7392 = _mm512_mask_mul_ps(fft7390, 64764, fft7390, _mm512_set1_ps(5e-01f));
// Name the 16 result vectors for the second channel.
__m512 wf97 = fft7311;
__m512 wf105 = fft7391;
__m512 wf98 = fft7312;
__m512 wf106 = fft7392;
__m512 wf99 = fft7292;
__m512 wf107 = fft7375;
__m512 wf100 = fft7293;
__m512 wf108 = fft7376;
__m512 wf101 = fft7294;
__m512 wf109 = fft7377;
__m512 wf102 = fft7295;
__m512 wf110 = fft7378;
__m512 wf103 = fft7296;
__m512 wf111 = fft7379;
__m512 wf104 = fft7297;
__m512 wf112 = fft7380;
// Destination coordinates for channel offset 1+2*j30; f47 is always 1 here, so
// these stores land 16 bytes after the first channel's.
ptrdiff_t c29 = (size_t)(1+2*j30)/4;
ptrdiff_t m44 = (size_t)(1+2*j30)%4/2;
ptrdiff_t f47 = (size_t)(1+2*j30)%2;
__m512i eo44 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// fp16 conversion and masked scatter, same layout as for the first channel.
wf99 = _mm512_permutexvar_ps(eo44, wf99);
wf100 = _mm512_permutexvar_ps(eo44, wf100);
__m512i wfs33 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf99, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs33 = _mm512_inserti64x4(wfs33, _mm512_cvtps_ph(wf100, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+2048+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs33);
_mm512_mask_storeu_epi32(wfPtr8+264176+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs33);
wf107 = _mm512_permutexvar_ps(eo44, wf107);
wf108 = _mm512_permutexvar_ps(eo44, wf108);
__m512i wfs34 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf107, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs34 = _mm512_inserti64x4(wfs34, _mm512_cvtps_ph(wf108, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+526336+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs34);
_mm512_mask_storeu_epi32(wfPtr8+788464+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs34);
wf101 = _mm512_permutexvar_ps(eo44, wf101);
wf102 = _mm512_permutexvar_ps(eo44, wf102);
__m512i wfs35 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf101, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs35 = _mm512_inserti64x4(wfs35, _mm512_cvtps_ph(wf102, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+4096+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs35);
_mm512_mask_storeu_epi32(wfPtr8+266224+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs35);
wf109 = _mm512_permutexvar_ps(eo44, wf109);
wf110 = _mm512_permutexvar_ps(eo44, wf110);
__m512i wfs36 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf109, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs36 = _mm512_inserti64x4(wfs36, _mm512_cvtps_ph(wf110, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+528384+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs36);
_mm512_mask_storeu_epi32(wfPtr8+790512+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs36);
wf103 = _mm512_permutexvar_ps(eo44, wf103);
wf104 = _mm512_permutexvar_ps(eo44, wf104);
__m512i wfs37 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf103, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs37 = _mm512_inserti64x4(wfs37, _mm512_cvtps_ph(wf104, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+6144+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs37);
_mm512_mask_storeu_epi32(wfPtr8+268272+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs37);
wf111 = _mm512_permutexvar_ps(eo44, wf111);
wf112 = _mm512_permutexvar_ps(eo44, wf112);
__m512i wfs38 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf111, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs38 = _mm512_inserti64x4(wfs38, _mm512_cvtps_ph(wf112, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+530432+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs38);
_mm512_mask_storeu_epi32(wfPtr8+792560+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs38);
__m512i wfs39 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf97, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs39 = _mm512_inserti64x4(wfs39, _mm512_cvtps_ph(wf98, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+0+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs39);
_mm512_mask_storeu_epi32(wfPtr8+262128+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs39);
__m512i wfs40 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf105, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs40 = _mm512_inserti64x4(wfs40, _mm512_cvtps_ph(wf106, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr8+524288+8192*i37+1024*c29+128*k106+64*m44+16*f47, 3855, wfs40);
_mm512_mask_storeu_epi32(wfPtr8+786416+8192*i37+1024*c29+128*k106+64*m44+16*f47, 61680, wfs40);
}
// Bias for the two channels of this j30: out = (bias*mul + add) * 64, written
// as 2 floats. e17 is fixed at 0 above, so the branch is always taken.
// NOTE(review): the factor 64 presumably cancels a 1/64 scale applied when the
// sums are consumed elsewhere -- confirm.
__m512 bias4 = _mm512_setzero_ps();
if (!e17) {
// Mask 3: load 2 floats, zero the other lanes.
bias4 = _mm512_maskz_loadu_ps(3, biasPtr10-0+32*i37+8*j30);
// Even indices pick the multipliers, odd indices the addends, out of the
// interleaved (mul, add) pairs loaded into mas9 (mask 15: 4 floats).
__m512i pmMul20 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd20 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas9 = _mm512_maskz_loadu_ps(15, bnPtr11+(ptrdiff_t)8*(0+8*i37+2*j30));
__m512 postMul32 = _mm512_permutexvar_ps(pmMul20, mas9);
__m512 postAdd20 = _mm512_permutexvar_ps(pmAdd20, mas9);
bias4 = _mm512_fmadd_ps(bias4, postMul32, postAdd20);
bias4 = _mm512_mul_ps(bias4, _mm512_set1_ps(6.4e+01f));
}
// Only lanes 0 and 1 are written (mask 3).
_mm512_mask_storeu_ps(bfPtr8-0+32*i37+8*j30, 3, bias4);
}
}
}
}

/*
 * Packages ResNeXt50StriderArrangeFilts2Callee1 into a threader task and
 * submits it to the given team through ResNeXt50ThreaderDo1.
 *
 * team42    - thread team forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors57 - array of tensor base pointers; stored in the task's `any1`
 *             slot so the callee can retrieve it.
 *
 * The task is described with nd1 = 3 and hull1 = {1, 16, 1}.
 * NOTE(review): presumably hull1 holds the per-dimension extents of the
 * work grid that the threader iterates over -- confirm against
 * ResNeXt50ThreaderDo1.
 */
static void ResNeXt50StriderArrangeFilts2(ResNeXt50ThreaderTeam1* team42, char** tensors57) {
	ResNeXt50ThreaderTask1 job;

	/* Shape of the task: three entries in hull1. */
	job.nd1 = 3;
	job.hull1[2] = 1;
	job.hull1[1] = 16;
	job.hull1[0] = 1;

	/* Payload and entry point handed to the callee. */
	job.any1 = tensors57;
	job.callee1 = ResNeXt50StriderArrangeFilts2Callee1;

	ResNeXt50ThreaderDo1(team42, &job);
}

static void ResNeXt50StriderArrangeDats2Callee1(ResNeXt50ThreaderTask1* task62, int64_t* pt36) {
char** tensors60 = task62->any1;
ptrdiff_t s26 = 0;
ptrdiff_t c30 = 0;
ptrdiff_t g20 = pt36[2];
ptrdiff_t e18 = 0;
char*restrict datPtr18 = tensors60[0]-228+4992768*e18;
char*restrict dfPtr8 = tensors60[1]+207618048*e18;
ptrdiff_t i38 = 1*g20;
ptrdiff_t j31 = 3*c30;
ptrdiff_t rel19 = j31-0;
ptrdiff_t base19 = 0;
if (rel19 < 1) {
ptrdiff_t h40 = base19+0;
ptrdiff_t w49 = 0;
ptrdiff_t k107 = 8*s26;
ptrdiff_t kk34 = k107+7;
for (; k107 <= kk34; ++k107) {
ptrdiff_t b56 = 0;
ptrdiff_t m45 = (size_t)b56/2;
ptrdiff_t f48 = (size_t)b56%2;
__m512 dat1689 = _mm512_maskz_loadu_ps(65534, datPtr18+224+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1690 = _mm512_maskz_loadu_ps(65534, datPtr18+448+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1691 = _mm512_maskz_loadu_ps(65534, datPtr18+672+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1692 = _mm512_maskz_loadu_ps(65534, datPtr18+896+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1693 = _mm512_maskz_loadu_ps(65534, datPtr18+1120+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1694 = _mm512_maskz_loadu_ps(65534, datPtr18+1344+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1695 = _mm512_maskz_loadu_ps(65534, datPtr18+1568+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1696 = _mm512_maskz_loadu_ps(65534, datPtr18+1792+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1697 = _mm512_maskz_loadu_ps(65534, datPtr18+2016+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1698 = _mm512_maskz_loadu_ps(65534, datPtr18+2240+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1699 = _mm512_maskz_loadu_ps(65534, datPtr18+2464+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1700 = _mm512_maskz_loadu_ps(65534, datPtr18+2688+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1701 = _mm512_maskz_loadu_ps(65534, datPtr18+2912+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1702 = _mm512_maskz_loadu_ps(65534, datPtr18+3136+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 dat1703 = _mm512_maskz_loadu_ps(65534, datPtr18+3360+100864*i38+12608*k107+224*h40+4*w49+0*b56);
__m512 fft7393 = _mm512_add_ps(_mm512_setzero_ps(), dat1696);
__m512 fft7481 = _mm512_add_ps(dat1689, dat1697);
__m512 fft7394 = _mm512_sub_ps(_mm512_setzero_ps(), dat1696);
__m512 fft7482 = _mm512_sub_ps(dat1689, dat1697);
__m512 fft7395 = _mm512_add_ps(dat1690, dat1698);
__m512 fft7483 = _mm512_add_ps(dat1691, dat1699);
__m512 fft7396 = _mm512_sub_ps(dat1690, dat1698);
__m512 fft7484 = _mm512_sub_ps(dat1691, dat1699);
__m512 fft7397 = _mm512_add_ps(dat1692, dat1700);
__m512 fft7485 = _mm512_add_ps(dat1693, dat1701);
__m512 fft7398 = _mm512_sub_ps(dat1692, dat1700);
__m512 fft7486 = _mm512_sub_ps(dat1693, dat1701);
__m512 fft7399 = _mm512_add_ps(dat1694, dat1702);
__m512 fft7487 = _mm512_add_ps(dat1695, dat1703);
__m512 fft7400 = _mm512_sub_ps(dat1694, dat1702);
__m512 fft7488 = _mm512_sub_ps(dat1695, dat1703);
__m512 fft7401 = _mm512_add_ps(fft7393, fft7397);
__m512 fft7489 = _mm512_add_ps(fft7481, fft7485);
__m512 fft7402 = _mm512_sub_ps(fft7393, fft7397);
__m512 fft7490 = _mm512_sub_ps(fft7481, fft7485);
__m512 fft7403 = _mm512_add_ps(fft7395, fft7399);
__m512 fft7491 = _mm512_add_ps(fft7483, fft7487);
__m512 fft7404 = _mm512_sub_ps(fft7399, fft7395);
__m512 fft7492 = _mm512_sub_ps(fft7487, fft7483);
__m512 fft7405 = _mm512_sub_ps(fft7396, fft7400);
__m512 fft7493 = _mm512_sub_ps(fft7484, fft7488);
__m512 fft7406 = _mm512_add_ps(fft7396, fft7400);
__m512 fft7494 = _mm512_add_ps(fft7484, fft7488);
__m512 fft7407 = _mm512_add_ps(fft7401, fft7403);
__m512 fft7495 = _mm512_add_ps(fft7489, fft7491);
__m512 fft7408 = _mm512_sub_ps(fft7401, fft7403);
__m512 fft7496 = _mm512_sub_ps(fft7489, fft7491);
__m512 fft7409 = _mm512_fmadd_ps(fft7405, _mm512_set1_ps(7.0710677e-01f), fft7394);
__m512 fft7497 = _mm512_fmadd_ps(fft7493, _mm512_set1_ps(7.0710677e-01f), fft7482);
__m512 fft7410 = _mm512_fnmsub_ps(fft7406, _mm512_set1_ps(7.0710677e-01f), fft7398);
__m512 fft7498 = _mm512_fnmsub_ps(fft7494, _mm512_set1_ps(7.0710677e-01f), fft7486);
__m512 fft7411 = _mm512_fnmadd_ps(fft7405, _mm512_set1_ps(7.0710677e-01f), fft7394);
__m512 fft7499 = _mm512_fnmadd_ps(fft7493, _mm512_set1_ps(7.0710677e-01f), fft7482);
__m512 fft7412 = _mm512_fnmadd_ps(fft7406, _mm512_set1_ps(7.0710677e-01f), fft7398);
__m512 fft7500 = _mm512_fnmadd_ps(fft7494, _mm512_set1_ps(7.0710677e-01f), fft7486);
__m512 fft7413 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7414 = _mm512_fmadd_ps(fft7407, fft7413, _mm512_shuffle_f32x4(fft7407, fft7407, 78));
__m512 fft7501 = _mm512_fmadd_ps(fft7495, fft7413, _mm512_shuffle_f32x4(fft7495, fft7495, 78));
__m512 fft7415 = _mm512_fmadd_ps(fft7408, fft7413, _mm512_shuffle_f32x4(fft7408, fft7408, 78));
__m512 fft7502 = _mm512_fmadd_ps(fft7496, fft7413, _mm512_shuffle_f32x4(fft7496, fft7496, 78));
__m512 fft7416 = _mm512_fmadd_ps(fft7409, fft7413, _mm512_shuffle_f32x4(fft7409, fft7409, 78));
__m512 fft7503 = _mm512_fmadd_ps(fft7497, fft7413, _mm512_shuffle_f32x4(fft7497, fft7497, 78));
__m512 fft7417 = _mm512_fmadd_ps(fft7410, fft7413, _mm512_shuffle_f32x4(fft7410, fft7410, 78));
__m512 fft7504 = _mm512_fmadd_ps(fft7498, fft7413, _mm512_shuffle_f32x4(fft7498, fft7498, 78));
__m512 fft7418 = _mm512_fmadd_ps(fft7402, fft7413, _mm512_shuffle_f32x4(fft7402, fft7402, 78));
__m512 fft7505 = _mm512_fmadd_ps(fft7490, fft7413, _mm512_shuffle_f32x4(fft7490, fft7490, 78));
__m512 fft7419 = _mm512_fmadd_ps(fft7404, fft7413, _mm512_shuffle_f32x4(fft7404, fft7404, 78));
__m512 fft7506 = _mm512_fmadd_ps(fft7492, fft7413, _mm512_shuffle_f32x4(fft7492, fft7492, 78));
__m512 fft7420 = _mm512_fmadd_ps(fft7411, fft7413, _mm512_shuffle_f32x4(fft7411, fft7411, 78));
__m512 fft7507 = _mm512_fmadd_ps(fft7499, fft7413, _mm512_shuffle_f32x4(fft7499, fft7499, 78));
__m512 fft7421 = _mm512_fmadd_ps(fft7412, fft7413, _mm512_shuffle_f32x4(fft7412, fft7412, 78));
__m512 fft7508 = _mm512_fmadd_ps(fft7500, fft7413, _mm512_shuffle_f32x4(fft7500, fft7500, 78));
__m512 fft7422 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7423 = _mm512_mul_ps(fft7414, fft7422);
__m512 fft7509 = _mm512_mul_ps(fft7501, fft7422);
__m512 fft7424 = _mm512_mul_ps(fft7415, fft7422);
__m512 fft7510 = _mm512_mul_ps(fft7502, fft7422);
__m512 fft7425 = _mm512_mul_ps(fft7416, fft7422);
__m512 fft7511 = _mm512_mul_ps(fft7503, fft7422);
__m512 fft7426 = _mm512_mul_ps(fft7417, fft7422);
__m512 fft7512 = _mm512_mul_ps(fft7504, fft7422);
__m512 fft7427 = _mm512_mul_ps(fft7418, fft7422);
__m512 fft7513 = _mm512_mul_ps(fft7505, fft7422);
__m512 fft7428 = _mm512_mul_ps(fft7419, fft7422);
__m512 fft7514 = _mm512_mul_ps(fft7506, fft7422);
__m512 fft7429 = _mm512_mul_ps(fft7420, fft7422);
__m512 fft7515 = _mm512_mul_ps(fft7507, fft7422);
__m512 fft7430 = _mm512_mul_ps(fft7421, fft7422);
__m512 fft7516 = _mm512_mul_ps(fft7508, fft7422);
__m512 fft7431 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft7432 = _mm512_fmadd_ps(fft7415, fft7431, fft7423);
__m512 fft7517 = _mm512_fmadd_ps(fft7502, fft7431, fft7509);
__m512 fft7433 = _mm512_fnmadd_ps(fft7414, fft7431, fft7424);
__m512 fft7518 = _mm512_fnmadd_ps(fft7501, fft7431, fft7510);
__m512 fft7434 = _mm512_fmadd_ps(fft7417, fft7431, fft7425);
__m512 fft7519 = _mm512_fmadd_ps(fft7504, fft7431, fft7511);
__m512 fft7435 = _mm512_fnmadd_ps(fft7416, fft7431, fft7426);
__m512 fft7520 = _mm512_fnmadd_ps(fft7503, fft7431, fft7512);
__m512 fft7436 = _mm512_fmadd_ps(fft7419, fft7431, fft7427);
__m512 fft7521 = _mm512_fmadd_ps(fft7506, fft7431, fft7513);
__m512 fft7437 = _mm512_fnmadd_ps(fft7418, fft7431, fft7428);
__m512 fft7522 = _mm512_fnmadd_ps(fft7505, fft7431, fft7514);
__m512 fft7438 = _mm512_fmadd_ps(fft7421, fft7431, fft7429);
__m512 fft7523 = _mm512_fmadd_ps(fft7508, fft7431, fft7515);
__m512 fft7439 = _mm512_fnmadd_ps(fft7420, fft7431, fft7430);
__m512 fft7524 = _mm512_fnmadd_ps(fft7507, fft7431, fft7516);
__m512 fft7440 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7441 = _mm512_fmadd_ps(fft7432, fft7440, _mm512_shuffle_f32x4(fft7432, fft7432, 177));
__m512 fft7525 = _mm512_fmadd_ps(fft7517, fft7440, _mm512_shuffle_f32x4(fft7517, fft7517, 177));
__m512 fft7442 = _mm512_fmadd_ps(fft7433, fft7440, _mm512_shuffle_f32x4(fft7433, fft7433, 177));
__m512 fft7526 = _mm512_fmadd_ps(fft7518, fft7440, _mm512_shuffle_f32x4(fft7518, fft7518, 177));
__m512 fft7443 = _mm512_fmadd_ps(fft7434, fft7440, _mm512_shuffle_f32x4(fft7434, fft7434, 177));
__m512 fft7527 = _mm512_fmadd_ps(fft7519, fft7440, _mm512_shuffle_f32x4(fft7519, fft7519, 177));
__m512 fft7444 = _mm512_fmadd_ps(fft7435, fft7440, _mm512_shuffle_f32x4(fft7435, fft7435, 177));
__m512 fft7528 = _mm512_fmadd_ps(fft7520, fft7440, _mm512_shuffle_f32x4(fft7520, fft7520, 177));
__m512 fft7445 = _mm512_fmadd_ps(fft7436, fft7440, _mm512_shuffle_f32x4(fft7436, fft7436, 177));
__m512 fft7529 = _mm512_fmadd_ps(fft7521, fft7440, _mm512_shuffle_f32x4(fft7521, fft7521, 177));
__m512 fft7446 = _mm512_fmadd_ps(fft7437, fft7440, _mm512_shuffle_f32x4(fft7437, fft7437, 177));
__m512 fft7530 = _mm512_fmadd_ps(fft7522, fft7440, _mm512_shuffle_f32x4(fft7522, fft7522, 177));
__m512 fft7447 = _mm512_fmadd_ps(fft7438, fft7440, _mm512_shuffle_f32x4(fft7438, fft7438, 177));
__m512 fft7531 = _mm512_fmadd_ps(fft7523, fft7440, _mm512_shuffle_f32x4(fft7523, fft7523, 177));
__m512 fft7448 = _mm512_fmadd_ps(fft7439, fft7440, _mm512_shuffle_f32x4(fft7439, fft7439, 177));
__m512 fft7532 = _mm512_fmadd_ps(fft7524, fft7440, _mm512_shuffle_f32x4(fft7524, fft7524, 177));
__m512 fft7449 = _mm512_mask_mov_ps(fft7441, 49344, fft7442);
__m512 fft7533 = _mm512_mask_mov_ps(fft7525, 49344, fft7526);
__m512 fft7450 = _mm512_mask_sub_ps(fft7442, 49344, _mm512_setzero_ps(), fft7441);
__m512 fft7534 = _mm512_mask_sub_ps(fft7526, 49344, _mm512_setzero_ps(), fft7525);
__m512 fft7451 = _mm512_mask_mov_ps(fft7443, 49344, fft7444);
__m512 fft7535 = _mm512_mask_mov_ps(fft7527, 49344, fft7528);
__m512 fft7452 = _mm512_mask_sub_ps(fft7444, 49344, _mm512_setzero_ps(), fft7443);
__m512 fft7536 = _mm512_mask_sub_ps(fft7528, 49344, _mm512_setzero_ps(), fft7527);
__m512 fft7453 = _mm512_mask_mov_ps(fft7445, 49344, fft7446);
__m512 fft7537 = _mm512_mask_mov_ps(fft7529, 49344, fft7530);
__m512 fft7454 = _mm512_mask_sub_ps(fft7446, 49344, _mm512_setzero_ps(), fft7445);
__m512 fft7538 = _mm512_mask_sub_ps(fft7530, 49344, _mm512_setzero_ps(), fft7529);
__m512 fft7455 = _mm512_mask_mov_ps(fft7447, 49344, fft7448);
__m512 fft7539 = _mm512_mask_mov_ps(fft7531, 49344, fft7532);
__m512 fft7456 = _mm512_mask_sub_ps(fft7448, 49344, _mm512_setzero_ps(), fft7447);
__m512 fft7540 = _mm512_mask_sub_ps(fft7532, 49344, _mm512_setzero_ps(), fft7531);
__m512 fft7457 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7458 = _mm512_fmadd_ps(fft7449, fft7457, _mm512_shuffle_ps(fft7449, fft7449, 78));
__m512 fft7541 = _mm512_fmadd_ps(fft7533, fft7457, _mm512_shuffle_ps(fft7533, fft7533, 78));
__m512 fft7459 = _mm512_fmadd_ps(fft7450, fft7457, _mm512_shuffle_ps(fft7450, fft7450, 78));
__m512 fft7542 = _mm512_fmadd_ps(fft7534, fft7457, _mm512_shuffle_ps(fft7534, fft7534, 78));
__m512 fft7460 = _mm512_fmadd_ps(fft7451, fft7457, _mm512_shuffle_ps(fft7451, fft7451, 78));
__m512 fft7543 = _mm512_fmadd_ps(fft7535, fft7457, _mm512_shuffle_ps(fft7535, fft7535, 78));
__m512 fft7461 = _mm512_fmadd_ps(fft7452, fft7457, _mm512_shuffle_ps(fft7452, fft7452, 78));
__m512 fft7544 = _mm512_fmadd_ps(fft7536, fft7457, _mm512_shuffle_ps(fft7536, fft7536, 78));
__m512 fft7462 = _mm512_fmadd_ps(fft7453, fft7457, _mm512_shuffle_ps(fft7453, fft7453, 78));
__m512 fft7545 = _mm512_fmadd_ps(fft7537, fft7457, _mm512_shuffle_ps(fft7537, fft7537, 78));
__m512 fft7463 = _mm512_fmadd_ps(fft7454, fft7457, _mm512_shuffle_ps(fft7454, fft7454, 78));
__m512 fft7546 = _mm512_fmadd_ps(fft7538, fft7457, _mm512_shuffle_ps(fft7538, fft7538, 78));
__m512 fft7464 = _mm512_fmadd_ps(fft7455, fft7457, _mm512_shuffle_ps(fft7455, fft7455, 78));
__m512 fft7547 = _mm512_fmadd_ps(fft7539, fft7457, _mm512_shuffle_ps(fft7539, fft7539, 78));
__m512 fft7465 = _mm512_fmadd_ps(fft7456, fft7457, _mm512_shuffle_ps(fft7456, fft7456, 78));
__m512 fft7548 = _mm512_fmadd_ps(fft7540, fft7457, _mm512_shuffle_ps(fft7540, fft7540, 78));
__m512i fft7466 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7467 = _mm512_permutexvar_ps(fft7466, fft7458);
__m512 fft7549 = _mm512_permutexvar_ps(fft7466, fft7541);
__m512i fft7468 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7469 = _mm512_permutexvar_ps(fft7468, fft7458);
__m512 fft7550 = _mm512_permutexvar_ps(fft7468, fft7541);
__m512 fft7470 = _mm512_permutexvar_ps(fft7466, fft7459);
__m512 fft7551 = _mm512_permutexvar_ps(fft7466, fft7542);
__m512 fft7471 = _mm512_permutexvar_ps(fft7468, fft7459);
__m512 fft7552 = _mm512_permutexvar_ps(fft7468, fft7542);
__m512 fft7472 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft7473 = _mm512_fmadd_ps(fft7467, fft7472, fft7469);
__m512 fft7553 = _mm512_fmadd_ps(fft7549, fft7472, fft7550);
__m512 fft7474 = _mm512_fnmadd_ps(fft7471, fft7472, fft7470);
__m512 fft7554 = _mm512_fnmadd_ps(fft7552, fft7472, fft7551);
__m512 fft7475 = _mm512_mask_mov_ps(fft7471, 21845, fft7473);
__m512 fft7555 = _mm512_mask_mov_ps(fft7552, 21845, fft7553);
__m512 fft7476 = _mm512_mask_mov_ps(fft7467, 43176, fft7473);
__m512 fft7556 = _mm512_mask_mov_ps(fft7549, 43176, fft7553);
__m512 fft7477 = _mm512_mask_mov_ps(fft7475, 43176, fft7474);
__m512 fft7557 = _mm512_mask_mov_ps(fft7555, 43176, fft7554);
__m512 fft7478 = _mm512_mask_mov_ps(fft7476, 22102, fft7474);
__m512 fft7558 = _mm512_mask_mov_ps(fft7556, 22102, fft7554);
__m512 fft7479 = _mm512_mask_mul_ps(fft7477, 64764, fft7477, _mm512_set1_ps(5e-01f));
__m512 fft7559 = _mm512_mask_mul_ps(fft7557, 64764, fft7557, _mm512_set1_ps(5e-01f));
__m512 fft7480 = _mm512_mask_mul_ps(fft7478, 64764, fft7478, _mm512_set1_ps(5e-01f));
__m512 fft7560 = _mm512_mask_mul_ps(fft7558, 64764, fft7558, _mm512_set1_ps(5e-01f));
__m512 df661 = fft7479;
__m512 df669 = fft7559;
__m512 df662 = fft7480;
__m512 df670 = fft7560;
__m512 df663 = fft7460;
__m512 df671 = fft7543;
__m512 df664 = fft7461;
__m512 df672 = fft7544;
__m512 df665 = fft7462;
__m512 df673 = fft7545;
__m512 df666 = fft7463;
__m512 df674 = fft7546;
__m512 df667 = fft7464;
__m512 df675 = fft7547;
__m512 df668 = fft7465;
__m512 df676 = fft7548;
__m512i eo45 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df663 = _mm512_permutexvar_ps(eo45, df663);
df664 = _mm512_permutexvar_ps(eo45, df664);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df663);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df664);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df663);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df664);
df671 = _mm512_permutexvar_ps(eo45, df671);
df672 = _mm512_permutexvar_ps(eo45, df672);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df671);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df672);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df671);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df672);
df665 = _mm512_permutexvar_ps(eo45, df665);
df666 = _mm512_permutexvar_ps(eo45, df666);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df665);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df666);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df665);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df666);
df673 = _mm512_permutexvar_ps(eo45, df673);
df674 = _mm512_permutexvar_ps(eo45, df674);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df673);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df674);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df673);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df674);
df667 = _mm512_permutexvar_ps(eo45, df667);
df668 = _mm512_permutexvar_ps(eo45, df668);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df667);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df668);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df667);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df668);
df675 = _mm512_permutexvar_ps(eo45, df675);
df676 = _mm512_permutexvar_ps(eo45, df676);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df675);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df676);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df675);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df676);
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df661);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df662);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df661);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df662);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df669);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k107+128*m45+32*f48, 255, df670);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df669);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k107+128*m45+32*f48, 65280, df670);
for (ptrdiff_t b57 = 1; b57 < 3; ++b57) {
ptrdiff_t m46 = (size_t)b57/2;
ptrdiff_t f49 = (size_t)b57%2;
__m512 dat1704 = _mm512_maskz_loadu_ps(65535, datPtr18+224+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1705 = _mm512_maskz_loadu_ps(65535, datPtr18+448+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1706 = _mm512_maskz_loadu_ps(65535, datPtr18+672+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1707 = _mm512_maskz_loadu_ps(65535, datPtr18+896+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1708 = _mm512_maskz_loadu_ps(65535, datPtr18+1120+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1709 = _mm512_maskz_loadu_ps(65535, datPtr18+1344+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1710 = _mm512_maskz_loadu_ps(65535, datPtr18+1568+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1711 = _mm512_maskz_loadu_ps(65535, datPtr18+1792+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1712 = _mm512_maskz_loadu_ps(65535, datPtr18+2016+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1713 = _mm512_maskz_loadu_ps(65535, datPtr18+2240+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1714 = _mm512_maskz_loadu_ps(65535, datPtr18+2464+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1715 = _mm512_maskz_loadu_ps(65535, datPtr18+2688+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1716 = _mm512_maskz_loadu_ps(65535, datPtr18+2912+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1717 = _mm512_maskz_loadu_ps(65535, datPtr18+3136+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 dat1718 = _mm512_maskz_loadu_ps(65535, datPtr18+3360+100864*i38+12608*k107+224*h40+4*w49+56*b57);
__m512 fft7561 = _mm512_add_ps(_mm512_setzero_ps(), dat1711);
__m512 fft7649 = _mm512_add_ps(dat1704, dat1712);
__m512 fft7562 = _mm512_sub_ps(_mm512_setzero_ps(), dat1711);
__m512 fft7650 = _mm512_sub_ps(dat1704, dat1712);
__m512 fft7563 = _mm512_add_ps(dat1705, dat1713);
__m512 fft7651 = _mm512_add_ps(dat1706, dat1714);
__m512 fft7564 = _mm512_sub_ps(dat1705, dat1713);
__m512 fft7652 = _mm512_sub_ps(dat1706, dat1714);
__m512 fft7565 = _mm512_add_ps(dat1707, dat1715);
__m512 fft7653 = _mm512_add_ps(dat1708, dat1716);
__m512 fft7566 = _mm512_sub_ps(dat1707, dat1715);
__m512 fft7654 = _mm512_sub_ps(dat1708, dat1716);
__m512 fft7567 = _mm512_add_ps(dat1709, dat1717);
__m512 fft7655 = _mm512_add_ps(dat1710, dat1718);
__m512 fft7568 = _mm512_sub_ps(dat1709, dat1717);
__m512 fft7656 = _mm512_sub_ps(dat1710, dat1718);
__m512 fft7569 = _mm512_add_ps(fft7561, fft7565);
__m512 fft7657 = _mm512_add_ps(fft7649, fft7653);
__m512 fft7570 = _mm512_sub_ps(fft7561, fft7565);
__m512 fft7658 = _mm512_sub_ps(fft7649, fft7653);
__m512 fft7571 = _mm512_add_ps(fft7563, fft7567);
__m512 fft7659 = _mm512_add_ps(fft7651, fft7655);
__m512 fft7572 = _mm512_sub_ps(fft7567, fft7563);
__m512 fft7660 = _mm512_sub_ps(fft7655, fft7651);
__m512 fft7573 = _mm512_sub_ps(fft7564, fft7568);
__m512 fft7661 = _mm512_sub_ps(fft7652, fft7656);
__m512 fft7574 = _mm512_add_ps(fft7564, fft7568);
__m512 fft7662 = _mm512_add_ps(fft7652, fft7656);
__m512 fft7575 = _mm512_add_ps(fft7569, fft7571);
__m512 fft7663 = _mm512_add_ps(fft7657, fft7659);
__m512 fft7576 = _mm512_sub_ps(fft7569, fft7571);
__m512 fft7664 = _mm512_sub_ps(fft7657, fft7659);
__m512 fft7577 = _mm512_fmadd_ps(fft7573, _mm512_set1_ps(7.0710677e-01f), fft7562);
__m512 fft7665 = _mm512_fmadd_ps(fft7661, _mm512_set1_ps(7.0710677e-01f), fft7650);
__m512 fft7578 = _mm512_fnmsub_ps(fft7574, _mm512_set1_ps(7.0710677e-01f), fft7566);
__m512 fft7666 = _mm512_fnmsub_ps(fft7662, _mm512_set1_ps(7.0710677e-01f), fft7654);
__m512 fft7579 = _mm512_fnmadd_ps(fft7573, _mm512_set1_ps(7.0710677e-01f), fft7562);
__m512 fft7667 = _mm512_fnmadd_ps(fft7661, _mm512_set1_ps(7.0710677e-01f), fft7650);
__m512 fft7580 = _mm512_fnmadd_ps(fft7574, _mm512_set1_ps(7.0710677e-01f), fft7566);
__m512 fft7668 = _mm512_fnmadd_ps(fft7662, _mm512_set1_ps(7.0710677e-01f), fft7654);
__m512 fft7581 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7582 = _mm512_fmadd_ps(fft7575, fft7581, _mm512_shuffle_f32x4(fft7575, fft7575, 78));
__m512 fft7669 = _mm512_fmadd_ps(fft7663, fft7581, _mm512_shuffle_f32x4(fft7663, fft7663, 78));
__m512 fft7583 = _mm512_fmadd_ps(fft7576, fft7581, _mm512_shuffle_f32x4(fft7576, fft7576, 78));
__m512 fft7670 = _mm512_fmadd_ps(fft7664, fft7581, _mm512_shuffle_f32x4(fft7664, fft7664, 78));
__m512 fft7584 = _mm512_fmadd_ps(fft7577, fft7581, _mm512_shuffle_f32x4(fft7577, fft7577, 78));
__m512 fft7671 = _mm512_fmadd_ps(fft7665, fft7581, _mm512_shuffle_f32x4(fft7665, fft7665, 78));
__m512 fft7585 = _mm512_fmadd_ps(fft7578, fft7581, _mm512_shuffle_f32x4(fft7578, fft7578, 78));
__m512 fft7672 = _mm512_fmadd_ps(fft7666, fft7581, _mm512_shuffle_f32x4(fft7666, fft7666, 78));
__m512 fft7586 = _mm512_fmadd_ps(fft7570, fft7581, _mm512_shuffle_f32x4(fft7570, fft7570, 78));
__m512 fft7673 = _mm512_fmadd_ps(fft7658, fft7581, _mm512_shuffle_f32x4(fft7658, fft7658, 78));
__m512 fft7587 = _mm512_fmadd_ps(fft7572, fft7581, _mm512_shuffle_f32x4(fft7572, fft7572, 78));
__m512 fft7674 = _mm512_fmadd_ps(fft7660, fft7581, _mm512_shuffle_f32x4(fft7660, fft7660, 78));
__m512 fft7588 = _mm512_fmadd_ps(fft7579, fft7581, _mm512_shuffle_f32x4(fft7579, fft7579, 78));
__m512 fft7675 = _mm512_fmadd_ps(fft7667, fft7581, _mm512_shuffle_f32x4(fft7667, fft7667, 78));
__m512 fft7589 = _mm512_fmadd_ps(fft7580, fft7581, _mm512_shuffle_f32x4(fft7580, fft7580, 78));
__m512 fft7676 = _mm512_fmadd_ps(fft7668, fft7581, _mm512_shuffle_f32x4(fft7668, fft7668, 78));
__m512 fft7590 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7591 = _mm512_mul_ps(fft7582, fft7590);
__m512 fft7677 = _mm512_mul_ps(fft7669, fft7590);
__m512 fft7592 = _mm512_mul_ps(fft7583, fft7590);
__m512 fft7678 = _mm512_mul_ps(fft7670, fft7590);
__m512 fft7593 = _mm512_mul_ps(fft7584, fft7590);
__m512 fft7679 = _mm512_mul_ps(fft7671, fft7590);
__m512 fft7594 = _mm512_mul_ps(fft7585, fft7590);
__m512 fft7680 = _mm512_mul_ps(fft7672, fft7590);
__m512 fft7595 = _mm512_mul_ps(fft7586, fft7590);
__m512 fft7681 = _mm512_mul_ps(fft7673, fft7590);
__m512 fft7596 = _mm512_mul_ps(fft7587, fft7590);
__m512 fft7682 = _mm512_mul_ps(fft7674, fft7590);
__m512 fft7597 = _mm512_mul_ps(fft7588, fft7590);
__m512 fft7683 = _mm512_mul_ps(fft7675, fft7590);
__m512 fft7598 = _mm512_mul_ps(fft7589, fft7590);
__m512 fft7684 = _mm512_mul_ps(fft7676, fft7590);
__m512 fft7599 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft7600 = _mm512_fmadd_ps(fft7583, fft7599, fft7591);
__m512 fft7685 = _mm512_fmadd_ps(fft7670, fft7599, fft7677);
__m512 fft7601 = _mm512_fnmadd_ps(fft7582, fft7599, fft7592);
__m512 fft7686 = _mm512_fnmadd_ps(fft7669, fft7599, fft7678);
__m512 fft7602 = _mm512_fmadd_ps(fft7585, fft7599, fft7593);
__m512 fft7687 = _mm512_fmadd_ps(fft7672, fft7599, fft7679);
__m512 fft7603 = _mm512_fnmadd_ps(fft7584, fft7599, fft7594);
__m512 fft7688 = _mm512_fnmadd_ps(fft7671, fft7599, fft7680);
__m512 fft7604 = _mm512_fmadd_ps(fft7587, fft7599, fft7595);
__m512 fft7689 = _mm512_fmadd_ps(fft7674, fft7599, fft7681);
__m512 fft7605 = _mm512_fnmadd_ps(fft7586, fft7599, fft7596);
__m512 fft7690 = _mm512_fnmadd_ps(fft7673, fft7599, fft7682);
__m512 fft7606 = _mm512_fmadd_ps(fft7589, fft7599, fft7597);
__m512 fft7691 = _mm512_fmadd_ps(fft7676, fft7599, fft7683);
__m512 fft7607 = _mm512_fnmadd_ps(fft7588, fft7599, fft7598);
__m512 fft7692 = _mm512_fnmadd_ps(fft7675, fft7599, fft7684);
__m512 fft7608 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7609 = _mm512_fmadd_ps(fft7600, fft7608, _mm512_shuffle_f32x4(fft7600, fft7600, 177));
__m512 fft7693 = _mm512_fmadd_ps(fft7685, fft7608, _mm512_shuffle_f32x4(fft7685, fft7685, 177));
__m512 fft7610 = _mm512_fmadd_ps(fft7601, fft7608, _mm512_shuffle_f32x4(fft7601, fft7601, 177));
__m512 fft7694 = _mm512_fmadd_ps(fft7686, fft7608, _mm512_shuffle_f32x4(fft7686, fft7686, 177));
__m512 fft7611 = _mm512_fmadd_ps(fft7602, fft7608, _mm512_shuffle_f32x4(fft7602, fft7602, 177));
__m512 fft7695 = _mm512_fmadd_ps(fft7687, fft7608, _mm512_shuffle_f32x4(fft7687, fft7687, 177));
__m512 fft7612 = _mm512_fmadd_ps(fft7603, fft7608, _mm512_shuffle_f32x4(fft7603, fft7603, 177));
__m512 fft7696 = _mm512_fmadd_ps(fft7688, fft7608, _mm512_shuffle_f32x4(fft7688, fft7688, 177));
__m512 fft7613 = _mm512_fmadd_ps(fft7604, fft7608, _mm512_shuffle_f32x4(fft7604, fft7604, 177));
__m512 fft7697 = _mm512_fmadd_ps(fft7689, fft7608, _mm512_shuffle_f32x4(fft7689, fft7689, 177));
__m512 fft7614 = _mm512_fmadd_ps(fft7605, fft7608, _mm512_shuffle_f32x4(fft7605, fft7605, 177));
__m512 fft7698 = _mm512_fmadd_ps(fft7690, fft7608, _mm512_shuffle_f32x4(fft7690, fft7690, 177));
__m512 fft7615 = _mm512_fmadd_ps(fft7606, fft7608, _mm512_shuffle_f32x4(fft7606, fft7606, 177));
__m512 fft7699 = _mm512_fmadd_ps(fft7691, fft7608, _mm512_shuffle_f32x4(fft7691, fft7691, 177));
__m512 fft7616 = _mm512_fmadd_ps(fft7607, fft7608, _mm512_shuffle_f32x4(fft7607, fft7607, 177));
__m512 fft7700 = _mm512_fmadd_ps(fft7692, fft7608, _mm512_shuffle_f32x4(fft7692, fft7692, 177));
__m512 fft7617 = _mm512_mask_mov_ps(fft7609, 49344, fft7610);
__m512 fft7701 = _mm512_mask_mov_ps(fft7693, 49344, fft7694);
__m512 fft7618 = _mm512_mask_sub_ps(fft7610, 49344, _mm512_setzero_ps(), fft7609);
__m512 fft7702 = _mm512_mask_sub_ps(fft7694, 49344, _mm512_setzero_ps(), fft7693);
__m512 fft7619 = _mm512_mask_mov_ps(fft7611, 49344, fft7612);
__m512 fft7703 = _mm512_mask_mov_ps(fft7695, 49344, fft7696);
__m512 fft7620 = _mm512_mask_sub_ps(fft7612, 49344, _mm512_setzero_ps(), fft7611);
__m512 fft7704 = _mm512_mask_sub_ps(fft7696, 49344, _mm512_setzero_ps(), fft7695);
__m512 fft7621 = _mm512_mask_mov_ps(fft7613, 49344, fft7614);
__m512 fft7705 = _mm512_mask_mov_ps(fft7697, 49344, fft7698);
__m512 fft7622 = _mm512_mask_sub_ps(fft7614, 49344, _mm512_setzero_ps(), fft7613);
__m512 fft7706 = _mm512_mask_sub_ps(fft7698, 49344, _mm512_setzero_ps(), fft7697);
__m512 fft7623 = _mm512_mask_mov_ps(fft7615, 49344, fft7616);
__m512 fft7707 = _mm512_mask_mov_ps(fft7699, 49344, fft7700);
__m512 fft7624 = _mm512_mask_sub_ps(fft7616, 49344, _mm512_setzero_ps(), fft7615);
__m512 fft7708 = _mm512_mask_sub_ps(fft7700, 49344, _mm512_setzero_ps(), fft7699);
__m512 fft7625 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7626 = _mm512_fmadd_ps(fft7617, fft7625, _mm512_shuffle_ps(fft7617, fft7617, 78));
__m512 fft7709 = _mm512_fmadd_ps(fft7701, fft7625, _mm512_shuffle_ps(fft7701, fft7701, 78));
__m512 fft7627 = _mm512_fmadd_ps(fft7618, fft7625, _mm512_shuffle_ps(fft7618, fft7618, 78));
__m512 fft7710 = _mm512_fmadd_ps(fft7702, fft7625, _mm512_shuffle_ps(fft7702, fft7702, 78));
__m512 fft7628 = _mm512_fmadd_ps(fft7619, fft7625, _mm512_shuffle_ps(fft7619, fft7619, 78));
__m512 fft7711 = _mm512_fmadd_ps(fft7703, fft7625, _mm512_shuffle_ps(fft7703, fft7703, 78));
__m512 fft7629 = _mm512_fmadd_ps(fft7620, fft7625, _mm512_shuffle_ps(fft7620, fft7620, 78));
__m512 fft7712 = _mm512_fmadd_ps(fft7704, fft7625, _mm512_shuffle_ps(fft7704, fft7704, 78));
__m512 fft7630 = _mm512_fmadd_ps(fft7621, fft7625, _mm512_shuffle_ps(fft7621, fft7621, 78));
__m512 fft7713 = _mm512_fmadd_ps(fft7705, fft7625, _mm512_shuffle_ps(fft7705, fft7705, 78));
__m512 fft7631 = _mm512_fmadd_ps(fft7622, fft7625, _mm512_shuffle_ps(fft7622, fft7622, 78));
__m512 fft7714 = _mm512_fmadd_ps(fft7706, fft7625, _mm512_shuffle_ps(fft7706, fft7706, 78));
__m512 fft7632 = _mm512_fmadd_ps(fft7623, fft7625, _mm512_shuffle_ps(fft7623, fft7623, 78));
__m512 fft7715 = _mm512_fmadd_ps(fft7707, fft7625, _mm512_shuffle_ps(fft7707, fft7707, 78));
__m512 fft7633 = _mm512_fmadd_ps(fft7624, fft7625, _mm512_shuffle_ps(fft7624, fft7624, 78));
__m512 fft7716 = _mm512_fmadd_ps(fft7708, fft7625, _mm512_shuffle_ps(fft7708, fft7708, 78));
// Permute/merge/halve stage applied to fft7626/fft7627 and fft7709/fft7710
// (all four are produced above this point).
// NOTE(review): this looks like the final recombination step of a real-input
// FFT (inferred from the fft* names and the 0.5 scale) -- confirm against the
// generator before relying on that reading.
//
// _mm512_set_epi32/_mm512_set_ps list their arguments from lane 15 down to
// lane 0. With fft7634 as the index, destination lanes {0,1},{2,3},{4,5},{6,7}
// read source lanes 2,4,8,12 and lanes {8,9},...,{14,15} read 3,5,9,13.
__m512i fft7634 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7635 = _mm512_permutexvar_ps(fft7634, fft7626);
__m512 fft7717 = _mm512_permutexvar_ps(fft7634, fft7709);
// With fft7636 as the index, destination lane pairs read source lanes
// 0,6,14,10 (low half) and 1,7,15,11 (high half).
__m512i fft7636 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7637 = _mm512_permutexvar_ps(fft7636, fft7626);
__m512 fft7718 = _mm512_permutexvar_ps(fft7636, fft7709);
__m512 fft7638 = _mm512_permutexvar_ps(fft7634, fft7627);
__m512 fft7719 = _mm512_permutexvar_ps(fft7634, fft7710);
__m512 fft7639 = _mm512_permutexvar_ps(fft7636, fft7627);
__m512 fft7720 = _mm512_permutexvar_ps(fft7636, fft7710);
// Coefficient vector: 0 in lanes 0,1,8,9; +1 in the remaining even lanes;
// -1 in the remaining odd lanes.
__m512 fft7640 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
// fmadd(a,b,c) = a*b + c; fnmadd(a,b,c) = -(a*b) + c.
__m512 fft7641 = _mm512_fmadd_ps(fft7635, fft7640, fft7637);
__m512 fft7721 = _mm512_fmadd_ps(fft7717, fft7640, fft7718);
__m512 fft7642 = _mm512_fnmadd_ps(fft7639, fft7640, fft7638);
__m512 fft7722 = _mm512_fnmadd_ps(fft7720, fft7640, fft7719);
// Lane-wise merges: mask_mov(src, k, a) takes a where the mask bit is set
// and src elsewhere.
//   21845 = 0x5555 -> even lanes
//   43176 = 0xA8A8 -> lanes 3,5,7,11,13,15
//   22102 = 0x5656 -> lanes 1,2,4,6,9,10,12,14
__m512 fft7643 = _mm512_mask_mov_ps(fft7639, 21845, fft7641);
__m512 fft7723 = _mm512_mask_mov_ps(fft7720, 21845, fft7721);
__m512 fft7644 = _mm512_mask_mov_ps(fft7635, 43176, fft7641);
__m512 fft7724 = _mm512_mask_mov_ps(fft7717, 43176, fft7721);
__m512 fft7645 = _mm512_mask_mov_ps(fft7643, 43176, fft7642);
__m512 fft7725 = _mm512_mask_mov_ps(fft7723, 43176, fft7722);
__m512 fft7646 = _mm512_mask_mov_ps(fft7644, 22102, fft7642);
__m512 fft7726 = _mm512_mask_mov_ps(fft7724, 22102, fft7722);
// Multiply by 0.5 in lanes 2-7 and 10-15 (64764 = 0xFCFC); lanes 0,1,8,9
// pass through unchanged because the source operand is the same vector.
__m512 fft7647 = _mm512_mask_mul_ps(fft7645, 64764, fft7645, _mm512_set1_ps(5e-01f));
__m512 fft7727 = _mm512_mask_mul_ps(fft7725, 64764, fft7725, _mm512_set1_ps(5e-01f));
__m512 fft7648 = _mm512_mask_mul_ps(fft7646, 64764, fft7646, _mm512_set1_ps(5e-01f));
__m512 fft7728 = _mm512_mask_mul_ps(fft7726, 64764, fft7726, _mm512_set1_ps(5e-01f));
// Write the sixteen result vectors of this iteration to the dfPtr8 buffer.
// df677..df684 alias one set of eight results and df685..df692 the other;
// df677/df678/df685/df686 come from the permute/merge/halve stage, the rest
// directly from the preceding butterfly stage.
__m512 df677 = fft7647;
__m512 df685 = fft7727;
__m512 df678 = fft7648;
__m512 df686 = fft7728;
__m512 df679 = fft7628;
__m512 df687 = fft7711;
__m512 df680 = fft7629;
__m512 df688 = fft7712;
__m512 df681 = fft7630;
__m512 df689 = fft7713;
__m512 df682 = fft7631;
__m512 df690 = fft7714;
__m512 df683 = fft7632;
__m512 df691 = fft7715;
__m512 df684 = fft7633;
__m512 df692 = fft7716;
// Even/odd de-interleave index (arguments run from lane 15 down to lane 0):
// destination lanes 0-7 read source lanes 0,2,...,14 and lanes 8-15 read
// source lanes 1,3,...,15. It is applied to every vector except
// df677/df678/df685/df686, which are stored as they are.
__m512i eo46 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df679 = _mm512_permutexvar_ps(eo46, df679);
df680 = _mm512_permutexvar_ps(eo46, df680);
// Each vector is written with two masked stores: mask 255 (0x00FF) writes
// lanes 0-7, mask 65280 (0xFF00) writes lanes 8-15. The second store uses a
// base offset that is 1048544 larger than the first.
// NOTE(review): if dfPtr8 is a byte pointer, lanes 8-15 land exactly 1048576
// bytes after lanes 0-7 (1048544 + 32) -- the pointer type is declared
// outside this excerpt, confirm there.
// m46 and f49 are set earlier in this scope.
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df679);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df680);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df679);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df680);
df687 = _mm512_permutexvar_ps(eo46, df687);
df688 = _mm512_permutexvar_ps(eo46, df688);
// The second result set uses the same layout shifted by a further 2097152.
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df687);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df688);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df687);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df688);
df681 = _mm512_permutexvar_ps(eo46, df681);
df682 = _mm512_permutexvar_ps(eo46, df682);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df681);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df682);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df681);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df682);
df689 = _mm512_permutexvar_ps(eo46, df689);
df690 = _mm512_permutexvar_ps(eo46, df690);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df689);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df690);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df689);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df690);
df683 = _mm512_permutexvar_ps(eo46, df683);
df684 = _mm512_permutexvar_ps(eo46, df684);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df683);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df684);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df683);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df684);
df691 = _mm512_permutexvar_ps(eo46, df691);
df692 = _mm512_permutexvar_ps(eo46, df692);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df691);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df692);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df691);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df692);
// Unpermuted vectors go to the lowest constant offsets of each result set
// (0/64 and 2097152/2097216).
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df677);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df678);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df677);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df678);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df685);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k107+128*m46+32*f49, 255, df686);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df685);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k107+128*m46+32*f49, 65280, df686);
}
// Unrolled iteration with the constant index b58 = 3.
// The cast binds tighter than / and %, so m47 = 3/2 = 1 and f50 = 3%2 = 1;
// both are used only in the dfPtr8 store addresses further down.
ptrdiff_t b58 = 3;
ptrdiff_t m47 = (size_t)b58/2;
ptrdiff_t f50 = (size_t)b58%2;
// Load 15 vectors from datPtr18 at constant offsets 392 + 224*r, r = 0..14.
// Mask 32767 (0x7FFF) loads lanes 0-14 and zeroes lane 15. The trailing
// 0*b58 term contributes nothing to the address.
// NOTE(review): presumably one load per input row with a zero-padded last
// column; the unit of the 224 step depends on datPtr18's pointer type, which
// is declared outside this excerpt -- confirm there.
__m512 dat1719 = _mm512_maskz_loadu_ps(32767, datPtr18+392+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1720 = _mm512_maskz_loadu_ps(32767, datPtr18+616+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1721 = _mm512_maskz_loadu_ps(32767, datPtr18+840+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1722 = _mm512_maskz_loadu_ps(32767, datPtr18+1064+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1723 = _mm512_maskz_loadu_ps(32767, datPtr18+1288+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1724 = _mm512_maskz_loadu_ps(32767, datPtr18+1512+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1725 = _mm512_maskz_loadu_ps(32767, datPtr18+1736+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1726 = _mm512_maskz_loadu_ps(32767, datPtr18+1960+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1727 = _mm512_maskz_loadu_ps(32767, datPtr18+2184+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1728 = _mm512_maskz_loadu_ps(32767, datPtr18+2408+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1729 = _mm512_maskz_loadu_ps(32767, datPtr18+2632+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1730 = _mm512_maskz_loadu_ps(32767, datPtr18+2856+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1731 = _mm512_maskz_loadu_ps(32767, datPtr18+3080+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1732 = _mm512_maskz_loadu_ps(32767, datPtr18+3304+100864*i38+12608*k107+224*h40+4*w49+0*b58);
__m512 dat1733 = _mm512_maskz_loadu_ps(32767, datPtr18+3528+100864*i38+12608*k107+224*h40+4*w49+0*b58);
// Add/subtract butterfly network across the loaded vectors, emitted as two
// interleaved chains of eight inputs each:
//   chain A (fft7729..fft7748): 0, dat1720, dat1722, ..., dat1732
//   chain B (fft7817..fft7836): dat1719, dat1721, ..., dat1733
// Chain A uses an all-zero vector as its first input (only 15 vectors were
// loaded), so its first sum/difference reduce to +dat1726 and -dat1726.
// NOTE(review): presumably the column pass of a 16-point real FFT split into
// even and odd rows (inferred from the fft* names and the constant
// 0.70710677 ~= sqrt(1/2)) -- confirm against the generator.
//
// First level: pair input n with input n+4 inside each chain.
__m512 fft7729 = _mm512_add_ps(_mm512_setzero_ps(), dat1726);
__m512 fft7817 = _mm512_add_ps(dat1719, dat1727);
__m512 fft7730 = _mm512_sub_ps(_mm512_setzero_ps(), dat1726);
__m512 fft7818 = _mm512_sub_ps(dat1719, dat1727);
__m512 fft7731 = _mm512_add_ps(dat1720, dat1728);
__m512 fft7819 = _mm512_add_ps(dat1721, dat1729);
__m512 fft7732 = _mm512_sub_ps(dat1720, dat1728);
__m512 fft7820 = _mm512_sub_ps(dat1721, dat1729);
__m512 fft7733 = _mm512_add_ps(dat1722, dat1730);
__m512 fft7821 = _mm512_add_ps(dat1723, dat1731);
__m512 fft7734 = _mm512_sub_ps(dat1722, dat1730);
__m512 fft7822 = _mm512_sub_ps(dat1723, dat1731);
__m512 fft7735 = _mm512_add_ps(dat1724, dat1732);
__m512 fft7823 = _mm512_add_ps(dat1725, dat1733);
__m512 fft7736 = _mm512_sub_ps(dat1724, dat1732);
__m512 fft7824 = _mm512_sub_ps(dat1725, dat1733);
// Second level: combine the first-level sums and differences.
// Note the reversed operand order in fft7740/fft7828 (later minus earlier).
__m512 fft7737 = _mm512_add_ps(fft7729, fft7733);
__m512 fft7825 = _mm512_add_ps(fft7817, fft7821);
__m512 fft7738 = _mm512_sub_ps(fft7729, fft7733);
__m512 fft7826 = _mm512_sub_ps(fft7817, fft7821);
__m512 fft7739 = _mm512_add_ps(fft7731, fft7735);
__m512 fft7827 = _mm512_add_ps(fft7819, fft7823);
__m512 fft7740 = _mm512_sub_ps(fft7735, fft7731);
__m512 fft7828 = _mm512_sub_ps(fft7823, fft7819);
__m512 fft7741 = _mm512_sub_ps(fft7732, fft7736);
__m512 fft7829 = _mm512_sub_ps(fft7820, fft7824);
__m512 fft7742 = _mm512_add_ps(fft7732, fft7736);
__m512 fft7830 = _mm512_add_ps(fft7820, fft7824);
// Third level. With c = 0.70710677:
//   fmadd(a,c,b)  =  a*c + b
//   fnmsub(a,c,b) = -(a*c) - b
//   fnmadd(a,c,b) = -(a*c) + b
__m512 fft7743 = _mm512_add_ps(fft7737, fft7739);
__m512 fft7831 = _mm512_add_ps(fft7825, fft7827);
__m512 fft7744 = _mm512_sub_ps(fft7737, fft7739);
__m512 fft7832 = _mm512_sub_ps(fft7825, fft7827);
__m512 fft7745 = _mm512_fmadd_ps(fft7741, _mm512_set1_ps(7.0710677e-01f), fft7730);
__m512 fft7833 = _mm512_fmadd_ps(fft7829, _mm512_set1_ps(7.0710677e-01f), fft7818);
__m512 fft7746 = _mm512_fnmsub_ps(fft7742, _mm512_set1_ps(7.0710677e-01f), fft7734);
__m512 fft7834 = _mm512_fnmsub_ps(fft7830, _mm512_set1_ps(7.0710677e-01f), fft7822);
__m512 fft7747 = _mm512_fnmadd_ps(fft7741, _mm512_set1_ps(7.0710677e-01f), fft7730);
__m512 fft7835 = _mm512_fnmadd_ps(fft7829, _mm512_set1_ps(7.0710677e-01f), fft7818);
__m512 fft7748 = _mm512_fnmadd_ps(fft7742, _mm512_set1_ps(7.0710677e-01f), fft7734);
__m512 fft7836 = _mm512_fnmadd_ps(fft7830, _mm512_set1_ps(7.0710677e-01f), fft7822);
// Butterfly between the two 256-bit halves of each vector.
// fft7749 is +1 in lanes 0-7 and -1 in lanes 8-15 (set_ps arguments run from
// lane 15 down to lane 0). _mm512_shuffle_f32x4(x, x, 78) swaps the low and
// high 256-bit halves of x, so for each input x:
//   lanes 0-7  : x[i] + x[i+8]
//   lanes 8-15 : x[i-8] - x[i]
__m512 fft7749 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7750 = _mm512_fmadd_ps(fft7743, fft7749, _mm512_shuffle_f32x4(fft7743, fft7743, 78));
__m512 fft7837 = _mm512_fmadd_ps(fft7831, fft7749, _mm512_shuffle_f32x4(fft7831, fft7831, 78));
__m512 fft7751 = _mm512_fmadd_ps(fft7744, fft7749, _mm512_shuffle_f32x4(fft7744, fft7744, 78));
__m512 fft7838 = _mm512_fmadd_ps(fft7832, fft7749, _mm512_shuffle_f32x4(fft7832, fft7832, 78));
__m512 fft7752 = _mm512_fmadd_ps(fft7745, fft7749, _mm512_shuffle_f32x4(fft7745, fft7745, 78));
__m512 fft7839 = _mm512_fmadd_ps(fft7833, fft7749, _mm512_shuffle_f32x4(fft7833, fft7833, 78));
__m512 fft7753 = _mm512_fmadd_ps(fft7746, fft7749, _mm512_shuffle_f32x4(fft7746, fft7746, 78));
__m512 fft7840 = _mm512_fmadd_ps(fft7834, fft7749, _mm512_shuffle_f32x4(fft7834, fft7834, 78));
__m512 fft7754 = _mm512_fmadd_ps(fft7738, fft7749, _mm512_shuffle_f32x4(fft7738, fft7738, 78));
__m512 fft7841 = _mm512_fmadd_ps(fft7826, fft7749, _mm512_shuffle_f32x4(fft7826, fft7826, 78));
__m512 fft7755 = _mm512_fmadd_ps(fft7740, fft7749, _mm512_shuffle_f32x4(fft7740, fft7740, 78));
__m512 fft7842 = _mm512_fmadd_ps(fft7828, fft7749, _mm512_shuffle_f32x4(fft7828, fft7828, 78));
__m512 fft7756 = _mm512_fmadd_ps(fft7747, fft7749, _mm512_shuffle_f32x4(fft7747, fft7747, 78));
__m512 fft7843 = _mm512_fmadd_ps(fft7835, fft7749, _mm512_shuffle_f32x4(fft7835, fft7835, 78));
__m512 fft7757 = _mm512_fmadd_ps(fft7748, fft7749, _mm512_shuffle_f32x4(fft7748, fft7748, 78));
__m512 fft7844 = _mm512_fmadd_ps(fft7836, fft7749, _mm512_shuffle_f32x4(fft7836, fft7836, 78));
// Per-lane rotation of consecutive vector pairs (a, b) = (fft7750, fft7751),
// (fft7752, fft7753), ... and likewise for the fft7837.. chain.
// With p = fft7758 and q = fft7767 each pair becomes
//   a' = a*p + b*q
//   b' = b*p - a*q
// which equals (a + i*b) * (p - i*q) if a pair is read as real/imaginary
// parts (NOTE(review): that reading is inferred, not stated in the code).
// Lane values (set_ps arguments run from lane 15 down to lane 0), with
// c = 0.70710677:
//   lanes 0-9   : p = 1,  q = 0  (pass-through)
//   lanes 10,11 : p = c,  q = c
//   lanes 12,13 : p = 0,  q = 1
//   lanes 14,15 : p = -c, q = c
__m512 fft7758 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
// First the products with p ...
__m512 fft7759 = _mm512_mul_ps(fft7750, fft7758);
__m512 fft7845 = _mm512_mul_ps(fft7837, fft7758);
__m512 fft7760 = _mm512_mul_ps(fft7751, fft7758);
__m512 fft7846 = _mm512_mul_ps(fft7838, fft7758);
__m512 fft7761 = _mm512_mul_ps(fft7752, fft7758);
__m512 fft7847 = _mm512_mul_ps(fft7839, fft7758);
__m512 fft7762 = _mm512_mul_ps(fft7753, fft7758);
__m512 fft7848 = _mm512_mul_ps(fft7840, fft7758);
__m512 fft7763 = _mm512_mul_ps(fft7754, fft7758);
__m512 fft7849 = _mm512_mul_ps(fft7841, fft7758);
__m512 fft7764 = _mm512_mul_ps(fft7755, fft7758);
__m512 fft7850 = _mm512_mul_ps(fft7842, fft7758);
__m512 fft7765 = _mm512_mul_ps(fft7756, fft7758);
__m512 fft7851 = _mm512_mul_ps(fft7843, fft7758);
__m512 fft7766 = _mm512_mul_ps(fft7757, fft7758);
__m512 fft7852 = _mm512_mul_ps(fft7844, fft7758);
__m512 fft7767 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
// ... then the cross terms with q are fused in: fmadd adds partner*q,
// fnmadd subtracts partner*q.
__m512 fft7768 = _mm512_fmadd_ps(fft7751, fft7767, fft7759);
__m512 fft7853 = _mm512_fmadd_ps(fft7838, fft7767, fft7845);
__m512 fft7769 = _mm512_fnmadd_ps(fft7750, fft7767, fft7760);
__m512 fft7854 = _mm512_fnmadd_ps(fft7837, fft7767, fft7846);
__m512 fft7770 = _mm512_fmadd_ps(fft7753, fft7767, fft7761);
__m512 fft7855 = _mm512_fmadd_ps(fft7840, fft7767, fft7847);
__m512 fft7771 = _mm512_fnmadd_ps(fft7752, fft7767, fft7762);
__m512 fft7856 = _mm512_fnmadd_ps(fft7839, fft7767, fft7848);
__m512 fft7772 = _mm512_fmadd_ps(fft7755, fft7767, fft7763);
__m512 fft7857 = _mm512_fmadd_ps(fft7842, fft7767, fft7849);
__m512 fft7773 = _mm512_fnmadd_ps(fft7754, fft7767, fft7764);
__m512 fft7858 = _mm512_fnmadd_ps(fft7841, fft7767, fft7850);
__m512 fft7774 = _mm512_fmadd_ps(fft7757, fft7767, fft7765);
__m512 fft7859 = _mm512_fmadd_ps(fft7844, fft7767, fft7851);
__m512 fft7775 = _mm512_fnmadd_ps(fft7756, fft7767, fft7766);
__m512 fft7860 = _mm512_fnmadd_ps(fft7843, fft7767, fft7852);
// Butterfly between adjacent 128-bit lanes, followed by a fix-up of lanes
// 6,7,14,15.
// fft7776 is +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
// _mm512_shuffle_f32x4(x, x, 177) swaps each pair of neighbouring 128-bit
// lanes (0<->1, 2<->3), so for each input x:
//   lanes 0-3, 8-11  : x[i] + x[i+4]
//   lanes 4-7, 12-15 : x[i-4] - x[i]
__m512 fft7776 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7777 = _mm512_fmadd_ps(fft7768, fft7776, _mm512_shuffle_f32x4(fft7768, fft7768, 177));
__m512 fft7861 = _mm512_fmadd_ps(fft7853, fft7776, _mm512_shuffle_f32x4(fft7853, fft7853, 177));
__m512 fft7778 = _mm512_fmadd_ps(fft7769, fft7776, _mm512_shuffle_f32x4(fft7769, fft7769, 177));
__m512 fft7862 = _mm512_fmadd_ps(fft7854, fft7776, _mm512_shuffle_f32x4(fft7854, fft7854, 177));
__m512 fft7779 = _mm512_fmadd_ps(fft7770, fft7776, _mm512_shuffle_f32x4(fft7770, fft7770, 177));
__m512 fft7863 = _mm512_fmadd_ps(fft7855, fft7776, _mm512_shuffle_f32x4(fft7855, fft7855, 177));
__m512 fft7780 = _mm512_fmadd_ps(fft7771, fft7776, _mm512_shuffle_f32x4(fft7771, fft7771, 177));
__m512 fft7864 = _mm512_fmadd_ps(fft7856, fft7776, _mm512_shuffle_f32x4(fft7856, fft7856, 177));
__m512 fft7781 = _mm512_fmadd_ps(fft7772, fft7776, _mm512_shuffle_f32x4(fft7772, fft7772, 177));
__m512 fft7865 = _mm512_fmadd_ps(fft7857, fft7776, _mm512_shuffle_f32x4(fft7857, fft7857, 177));
__m512 fft7782 = _mm512_fmadd_ps(fft7773, fft7776, _mm512_shuffle_f32x4(fft7773, fft7773, 177));
__m512 fft7866 = _mm512_fmadd_ps(fft7858, fft7776, _mm512_shuffle_f32x4(fft7858, fft7858, 177));
__m512 fft7783 = _mm512_fmadd_ps(fft7774, fft7776, _mm512_shuffle_f32x4(fft7774, fft7774, 177));
__m512 fft7867 = _mm512_fmadd_ps(fft7859, fft7776, _mm512_shuffle_f32x4(fft7859, fft7859, 177));
__m512 fft7784 = _mm512_fmadd_ps(fft7775, fft7776, _mm512_shuffle_f32x4(fft7775, fft7775, 177));
__m512 fft7868 = _mm512_fmadd_ps(fft7860, fft7776, _mm512_shuffle_f32x4(fft7860, fft7860, 177));
// Fix-up on consecutive pairs (a, b), mask 49344 = 0xC0C0 = lanes 6,7,14,15:
//   new a = b   in the masked lanes, a elsewhere   (mask_mov)
//   new b = 0-a in the masked lanes, b elsewhere   (mask_sub)
// i.e. (a, b) -> (b, -a) in those four lanes only.
__m512 fft7785 = _mm512_mask_mov_ps(fft7777, 49344, fft7778);
__m512 fft7869 = _mm512_mask_mov_ps(fft7861, 49344, fft7862);
__m512 fft7786 = _mm512_mask_sub_ps(fft7778, 49344, _mm512_setzero_ps(), fft7777);
__m512 fft7870 = _mm512_mask_sub_ps(fft7862, 49344, _mm512_setzero_ps(), fft7861);
__m512 fft7787 = _mm512_mask_mov_ps(fft7779, 49344, fft7780);
__m512 fft7871 = _mm512_mask_mov_ps(fft7863, 49344, fft7864);
__m512 fft7788 = _mm512_mask_sub_ps(fft7780, 49344, _mm512_setzero_ps(), fft7779);
__m512 fft7872 = _mm512_mask_sub_ps(fft7864, 49344, _mm512_setzero_ps(), fft7863);
__m512 fft7789 = _mm512_mask_mov_ps(fft7781, 49344, fft7782);
__m512 fft7873 = _mm512_mask_mov_ps(fft7865, 49344, fft7866);
__m512 fft7790 = _mm512_mask_sub_ps(fft7782, 49344, _mm512_setzero_ps(), fft7781);
__m512 fft7874 = _mm512_mask_sub_ps(fft7866, 49344, _mm512_setzero_ps(), fft7865);
__m512 fft7791 = _mm512_mask_mov_ps(fft7783, 49344, fft7784);
__m512 fft7875 = _mm512_mask_mov_ps(fft7867, 49344, fft7868);
__m512 fft7792 = _mm512_mask_sub_ps(fft7784, 49344, _mm512_setzero_ps(), fft7783);
__m512 fft7876 = _mm512_mask_sub_ps(fft7868, 49344, _mm512_setzero_ps(), fft7867);
// Butterfly between the two 64-bit pairs inside every 128-bit lane.
// Within each group of four lanes fft7793 is +1 in elements 0,1 and -1 in
// elements 2,3. _mm512_shuffle_ps(x, x, 78) swaps elements {0,1} with {2,3}
// in each 128-bit lane, so for each input x and each group of four lanes:
//   elements 0,1 : x[j] + x[j+2]
//   elements 2,3 : x[j-2] - x[j]
__m512 fft7793 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7794 = _mm512_fmadd_ps(fft7785, fft7793, _mm512_shuffle_ps(fft7785, fft7785, 78));
__m512 fft7877 = _mm512_fmadd_ps(fft7869, fft7793, _mm512_shuffle_ps(fft7869, fft7869, 78));
__m512 fft7795 = _mm512_fmadd_ps(fft7786, fft7793, _mm512_shuffle_ps(fft7786, fft7786, 78));
__m512 fft7878 = _mm512_fmadd_ps(fft7870, fft7793, _mm512_shuffle_ps(fft7870, fft7870, 78));
__m512 fft7796 = _mm512_fmadd_ps(fft7787, fft7793, _mm512_shuffle_ps(fft7787, fft7787, 78));
__m512 fft7879 = _mm512_fmadd_ps(fft7871, fft7793, _mm512_shuffle_ps(fft7871, fft7871, 78));
__m512 fft7797 = _mm512_fmadd_ps(fft7788, fft7793, _mm512_shuffle_ps(fft7788, fft7788, 78));
__m512 fft7880 = _mm512_fmadd_ps(fft7872, fft7793, _mm512_shuffle_ps(fft7872, fft7872, 78));
__m512 fft7798 = _mm512_fmadd_ps(fft7789, fft7793, _mm512_shuffle_ps(fft7789, fft7789, 78));
__m512 fft7881 = _mm512_fmadd_ps(fft7873, fft7793, _mm512_shuffle_ps(fft7873, fft7873, 78));
__m512 fft7799 = _mm512_fmadd_ps(fft7790, fft7793, _mm512_shuffle_ps(fft7790, fft7790, 78));
__m512 fft7882 = _mm512_fmadd_ps(fft7874, fft7793, _mm512_shuffle_ps(fft7874, fft7874, 78));
__m512 fft7800 = _mm512_fmadd_ps(fft7791, fft7793, _mm512_shuffle_ps(fft7791, fft7791, 78));
__m512 fft7883 = _mm512_fmadd_ps(fft7875, fft7793, _mm512_shuffle_ps(fft7875, fft7875, 78));
__m512 fft7801 = _mm512_fmadd_ps(fft7792, fft7793, _mm512_shuffle_ps(fft7792, fft7792, 78));
__m512 fft7884 = _mm512_fmadd_ps(fft7876, fft7793, _mm512_shuffle_ps(fft7876, fft7876, 78));
// Permute/merge/halve stage applied only to the first vector pair of each
// chain (fft7794/fft7795 and fft7877/fft7878); the remaining butterfly
// outputs skip this stage.
// NOTE(review): this looks like the final recombination step of a real-input
// FFT (inferred from the fft* names and the 0.5 scale) -- confirm against the
// generator before relying on that reading.
//
// _mm512_set_epi32/_mm512_set_ps list their arguments from lane 15 down to
// lane 0. With fft7802 as the index, destination lanes {0,1},{2,3},{4,5},{6,7}
// read source lanes 2,4,8,12 and lanes {8,9},...,{14,15} read 3,5,9,13.
__m512i fft7802 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7803 = _mm512_permutexvar_ps(fft7802, fft7794);
__m512 fft7885 = _mm512_permutexvar_ps(fft7802, fft7877);
// With fft7804 as the index, destination lane pairs read source lanes
// 0,6,14,10 (low half) and 1,7,15,11 (high half).
__m512i fft7804 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7805 = _mm512_permutexvar_ps(fft7804, fft7794);
__m512 fft7886 = _mm512_permutexvar_ps(fft7804, fft7877);
__m512 fft7806 = _mm512_permutexvar_ps(fft7802, fft7795);
__m512 fft7887 = _mm512_permutexvar_ps(fft7802, fft7878);
__m512 fft7807 = _mm512_permutexvar_ps(fft7804, fft7795);
__m512 fft7888 = _mm512_permutexvar_ps(fft7804, fft7878);
// Coefficient vector: 0 in lanes 0,1,8,9; +1 in the remaining even lanes;
// -1 in the remaining odd lanes.
__m512 fft7808 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
// fmadd(a,b,c) = a*b + c; fnmadd(a,b,c) = -(a*b) + c.
__m512 fft7809 = _mm512_fmadd_ps(fft7803, fft7808, fft7805);
__m512 fft7889 = _mm512_fmadd_ps(fft7885, fft7808, fft7886);
__m512 fft7810 = _mm512_fnmadd_ps(fft7807, fft7808, fft7806);
__m512 fft7890 = _mm512_fnmadd_ps(fft7888, fft7808, fft7887);
// Lane-wise merges: mask_mov(src, k, a) takes a where the mask bit is set
// and src elsewhere.
//   21845 = 0x5555 -> even lanes
//   43176 = 0xA8A8 -> lanes 3,5,7,11,13,15
//   22102 = 0x5656 -> lanes 1,2,4,6,9,10,12,14
__m512 fft7811 = _mm512_mask_mov_ps(fft7807, 21845, fft7809);
__m512 fft7891 = _mm512_mask_mov_ps(fft7888, 21845, fft7889);
__m512 fft7812 = _mm512_mask_mov_ps(fft7803, 43176, fft7809);
__m512 fft7892 = _mm512_mask_mov_ps(fft7885, 43176, fft7889);
__m512 fft7813 = _mm512_mask_mov_ps(fft7811, 43176, fft7810);
__m512 fft7893 = _mm512_mask_mov_ps(fft7891, 43176, fft7890);
__m512 fft7814 = _mm512_mask_mov_ps(fft7812, 22102, fft7810);
__m512 fft7894 = _mm512_mask_mov_ps(fft7892, 22102, fft7890);
// Multiply by 0.5 in lanes 2-7 and 10-15 (64764 = 0xFCFC); lanes 0,1,8,9
// pass through unchanged because the source operand is the same vector.
__m512 fft7815 = _mm512_mask_mul_ps(fft7813, 64764, fft7813, _mm512_set1_ps(5e-01f));
__m512 fft7895 = _mm512_mask_mul_ps(fft7893, 64764, fft7893, _mm512_set1_ps(5e-01f));
__m512 fft7816 = _mm512_mask_mul_ps(fft7814, 64764, fft7814, _mm512_set1_ps(5e-01f));
__m512 fft7896 = _mm512_mask_mul_ps(fft7894, 64764, fft7894, _mm512_set1_ps(5e-01f));
// Write the sixteen result vectors of the b58 = 3 iteration to the dfPtr8
// buffer. df693..df700 alias one set of eight results and df701..df708 the
// other; df693/df694/df701/df702 come from the permute/merge/halve stage,
// the rest directly from the last butterfly stage.
__m512 df693 = fft7815;
__m512 df701 = fft7895;
__m512 df694 = fft7816;
__m512 df702 = fft7896;
__m512 df695 = fft7796;
__m512 df703 = fft7879;
__m512 df696 = fft7797;
__m512 df704 = fft7880;
__m512 df697 = fft7798;
__m512 df705 = fft7881;
__m512 df698 = fft7799;
__m512 df706 = fft7882;
__m512 df699 = fft7800;
__m512 df707 = fft7883;
__m512 df700 = fft7801;
__m512 df708 = fft7884;
// Even/odd de-interleave index (arguments run from lane 15 down to lane 0):
// destination lanes 0-7 read source lanes 0,2,...,14 and lanes 8-15 read
// source lanes 1,3,...,15. It is applied to every vector except
// df693/df694/df701/df702, which are stored as they are.
__m512i eo47 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df695 = _mm512_permutexvar_ps(eo47, df695);
df696 = _mm512_permutexvar_ps(eo47, df696);
// Each vector is written with two masked stores: mask 255 (0x00FF) writes
// lanes 0-7, mask 65280 (0xFF00) writes lanes 8-15. The second store uses a
// base offset that is 1048544 larger than the first.
// NOTE(review): if dfPtr8 is a byte pointer, lanes 8-15 land exactly 1048576
// bytes after lanes 0-7 (1048544 + 32) -- the pointer type is declared
// outside this excerpt, confirm there.
// m47 and f50 are both 1 here (derived from b58 = 3).
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df695);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df696);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df695);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df696);
df703 = _mm512_permutexvar_ps(eo47, df703);
df704 = _mm512_permutexvar_ps(eo47, df704);
// The second result set uses the same layout shifted by a further 2097152.
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df703);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df704);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df703);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df704);
df697 = _mm512_permutexvar_ps(eo47, df697);
df698 = _mm512_permutexvar_ps(eo47, df698);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df697);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df698);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df697);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df698);
df705 = _mm512_permutexvar_ps(eo47, df705);
df706 = _mm512_permutexvar_ps(eo47, df706);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df705);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df706);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df705);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df706);
df699 = _mm512_permutexvar_ps(eo47, df699);
df700 = _mm512_permutexvar_ps(eo47, df700);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df699);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df700);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df699);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df700);
df707 = _mm512_permutexvar_ps(eo47, df707);
df708 = _mm512_permutexvar_ps(eo47, df708);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df707);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df708);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df707);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df708);
// Unpermuted vectors go to the lowest constant offsets of each result set
// (0/64 and 2097152/2097216).
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df693);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df694);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df693);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df694);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df701);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k107+128*m47+32*f50, 255, df702);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df701);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k107+128*m47+32*f50, 65280, df702);
// Unrolled iteration with the constant index b59 = 4.
// The cast binds tighter than / and %, so m48 = 4/2 = 2 and f51 = 4%2 = 0.
ptrdiff_t b59 = 4;
ptrdiff_t m48 = (size_t)b59/2;
ptrdiff_t f51 = (size_t)b59%2;
// Load 16 vectors from datPtr18 at constant offsets 3136 + 224*r, r = 0..15.
// Mask 65534 (0xFFFE) loads lanes 1-15 and zeroes lane 0. The trailing
// 0*b59 term contributes nothing to the address.
// NOTE(review): presumably one load per input row with a zero-padded first
// column; the unit of the 224 step depends on datPtr18's pointer type, which
// is declared outside this excerpt -- confirm there.
__m512 dat1734 = _mm512_maskz_loadu_ps(65534, datPtr18+3136+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1735 = _mm512_maskz_loadu_ps(65534, datPtr18+3360+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1736 = _mm512_maskz_loadu_ps(65534, datPtr18+3584+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1737 = _mm512_maskz_loadu_ps(65534, datPtr18+3808+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1738 = _mm512_maskz_loadu_ps(65534, datPtr18+4032+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1739 = _mm512_maskz_loadu_ps(65534, datPtr18+4256+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1740 = _mm512_maskz_loadu_ps(65534, datPtr18+4480+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1741 = _mm512_maskz_loadu_ps(65534, datPtr18+4704+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1742 = _mm512_maskz_loadu_ps(65534, datPtr18+4928+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1743 = _mm512_maskz_loadu_ps(65534, datPtr18+5152+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1744 = _mm512_maskz_loadu_ps(65534, datPtr18+5376+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1745 = _mm512_maskz_loadu_ps(65534, datPtr18+5600+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1746 = _mm512_maskz_loadu_ps(65534, datPtr18+5824+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1747 = _mm512_maskz_loadu_ps(65534, datPtr18+6048+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1748 = _mm512_maskz_loadu_ps(65534, datPtr18+6272+100864*i38+12608*k107+224*h40+4*w49+0*b59);
__m512 dat1749 = _mm512_maskz_loadu_ps(65534, datPtr18+6496+100864*i38+12608*k107+224*h40+4*w49+0*b59);
// Add/subtract butterfly network across the 16 loaded vectors, emitted as
// two interleaved chains of eight inputs each:
//   chain A (fft7897..fft7916): dat1734, dat1736, ..., dat1748
//   chain B (fft7985..fft8004): dat1735, dat1737, ..., dat1749
// NOTE(review): presumably the column pass of a 16-point real FFT split into
// even and odd rows (inferred from the fft* names and the constant
// 0.70710677 ~= sqrt(1/2)) -- confirm against the generator.
//
// First level: pair input n with input n+4 inside each chain.
__m512 fft7897 = _mm512_add_ps(dat1734, dat1742);
__m512 fft7985 = _mm512_add_ps(dat1735, dat1743);
__m512 fft7898 = _mm512_sub_ps(dat1734, dat1742);
__m512 fft7986 = _mm512_sub_ps(dat1735, dat1743);
__m512 fft7899 = _mm512_add_ps(dat1736, dat1744);
__m512 fft7987 = _mm512_add_ps(dat1737, dat1745);
__m512 fft7900 = _mm512_sub_ps(dat1736, dat1744);
__m512 fft7988 = _mm512_sub_ps(dat1737, dat1745);
__m512 fft7901 = _mm512_add_ps(dat1738, dat1746);
__m512 fft7989 = _mm512_add_ps(dat1739, dat1747);
__m512 fft7902 = _mm512_sub_ps(dat1738, dat1746);
__m512 fft7990 = _mm512_sub_ps(dat1739, dat1747);
__m512 fft7903 = _mm512_add_ps(dat1740, dat1748);
__m512 fft7991 = _mm512_add_ps(dat1741, dat1749);
__m512 fft7904 = _mm512_sub_ps(dat1740, dat1748);
__m512 fft7992 = _mm512_sub_ps(dat1741, dat1749);
// Second level: combine the first-level sums and differences.
// Note the reversed operand order in fft7908/fft7996 (later minus earlier).
__m512 fft7905 = _mm512_add_ps(fft7897, fft7901);
__m512 fft7993 = _mm512_add_ps(fft7985, fft7989);
__m512 fft7906 = _mm512_sub_ps(fft7897, fft7901);
__m512 fft7994 = _mm512_sub_ps(fft7985, fft7989);
__m512 fft7907 = _mm512_add_ps(fft7899, fft7903);
__m512 fft7995 = _mm512_add_ps(fft7987, fft7991);
__m512 fft7908 = _mm512_sub_ps(fft7903, fft7899);
__m512 fft7996 = _mm512_sub_ps(fft7991, fft7987);
__m512 fft7909 = _mm512_sub_ps(fft7900, fft7904);
__m512 fft7997 = _mm512_sub_ps(fft7988, fft7992);
__m512 fft7910 = _mm512_add_ps(fft7900, fft7904);
__m512 fft7998 = _mm512_add_ps(fft7988, fft7992);
// Third level. With c = 0.70710677:
//   fmadd(a,c,b)  =  a*c + b
//   fnmsub(a,c,b) = -(a*c) - b
//   fnmadd(a,c,b) = -(a*c) + b
__m512 fft7911 = _mm512_add_ps(fft7905, fft7907);
__m512 fft7999 = _mm512_add_ps(fft7993, fft7995);
__m512 fft7912 = _mm512_sub_ps(fft7905, fft7907);
__m512 fft8000 = _mm512_sub_ps(fft7993, fft7995);
__m512 fft7913 = _mm512_fmadd_ps(fft7909, _mm512_set1_ps(7.0710677e-01f), fft7898);
__m512 fft8001 = _mm512_fmadd_ps(fft7997, _mm512_set1_ps(7.0710677e-01f), fft7986);
__m512 fft7914 = _mm512_fnmsub_ps(fft7910, _mm512_set1_ps(7.0710677e-01f), fft7902);
__m512 fft8002 = _mm512_fnmsub_ps(fft7998, _mm512_set1_ps(7.0710677e-01f), fft7990);
__m512 fft7915 = _mm512_fnmadd_ps(fft7909, _mm512_set1_ps(7.0710677e-01f), fft7898);
__m512 fft8003 = _mm512_fnmadd_ps(fft7997, _mm512_set1_ps(7.0710677e-01f), fft7986);
__m512 fft7916 = _mm512_fnmadd_ps(fft7910, _mm512_set1_ps(7.0710677e-01f), fft7902);
__m512 fft8004 = _mm512_fnmadd_ps(fft7998, _mm512_set1_ps(7.0710677e-01f), fft7990);
// Butterfly between the two 256-bit halves of each vector.
// fft7917 is +1 in lanes 0-7 and -1 in lanes 8-15 (set_ps arguments run from
// lane 15 down to lane 0). _mm512_shuffle_f32x4(x, x, 78) swaps the low and
// high 256-bit halves of x, so for each input x:
//   lanes 0-7  : x[i] + x[i+8]
//   lanes 8-15 : x[i-8] - x[i]
__m512 fft7917 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft7918 = _mm512_fmadd_ps(fft7911, fft7917, _mm512_shuffle_f32x4(fft7911, fft7911, 78));
__m512 fft8005 = _mm512_fmadd_ps(fft7999, fft7917, _mm512_shuffle_f32x4(fft7999, fft7999, 78));
__m512 fft7919 = _mm512_fmadd_ps(fft7912, fft7917, _mm512_shuffle_f32x4(fft7912, fft7912, 78));
__m512 fft8006 = _mm512_fmadd_ps(fft8000, fft7917, _mm512_shuffle_f32x4(fft8000, fft8000, 78));
__m512 fft7920 = _mm512_fmadd_ps(fft7913, fft7917, _mm512_shuffle_f32x4(fft7913, fft7913, 78));
__m512 fft8007 = _mm512_fmadd_ps(fft8001, fft7917, _mm512_shuffle_f32x4(fft8001, fft8001, 78));
__m512 fft7921 = _mm512_fmadd_ps(fft7914, fft7917, _mm512_shuffle_f32x4(fft7914, fft7914, 78));
__m512 fft8008 = _mm512_fmadd_ps(fft8002, fft7917, _mm512_shuffle_f32x4(fft8002, fft8002, 78));
__m512 fft7922 = _mm512_fmadd_ps(fft7906, fft7917, _mm512_shuffle_f32x4(fft7906, fft7906, 78));
__m512 fft8009 = _mm512_fmadd_ps(fft7994, fft7917, _mm512_shuffle_f32x4(fft7994, fft7994, 78));
__m512 fft7923 = _mm512_fmadd_ps(fft7908, fft7917, _mm512_shuffle_f32x4(fft7908, fft7908, 78));
__m512 fft8010 = _mm512_fmadd_ps(fft7996, fft7917, _mm512_shuffle_f32x4(fft7996, fft7996, 78));
__m512 fft7924 = _mm512_fmadd_ps(fft7915, fft7917, _mm512_shuffle_f32x4(fft7915, fft7915, 78));
__m512 fft8011 = _mm512_fmadd_ps(fft8003, fft7917, _mm512_shuffle_f32x4(fft8003, fft8003, 78));
__m512 fft7925 = _mm512_fmadd_ps(fft7916, fft7917, _mm512_shuffle_f32x4(fft7916, fft7916, 78));
__m512 fft8012 = _mm512_fmadd_ps(fft8004, fft7917, _mm512_shuffle_f32x4(fft8004, fft8004, 78));
// Per-lane rotation of consecutive vector pairs (a, b) = (fft7918, fft7919),
// (fft7920, fft7921), ... and likewise for the fft8005.. chain.
// With p = fft7926 and q = fft7935 each pair becomes
//   a' = a*p + b*q
//   b' = b*p - a*q
// which equals (a + i*b) * (p - i*q) if a pair is read as real/imaginary
// parts (NOTE(review): that reading is inferred, not stated in the code).
// Lane values (set_ps arguments run from lane 15 down to lane 0), with
// c = 0.70710677:
//   lanes 0-9   : p = 1,  q = 0  (pass-through)
//   lanes 10,11 : p = c,  q = c
//   lanes 12,13 : p = 0,  q = 1
//   lanes 14,15 : p = -c, q = c
__m512 fft7926 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
// First the products with p ...
__m512 fft7927 = _mm512_mul_ps(fft7918, fft7926);
__m512 fft8013 = _mm512_mul_ps(fft8005, fft7926);
__m512 fft7928 = _mm512_mul_ps(fft7919, fft7926);
__m512 fft8014 = _mm512_mul_ps(fft8006, fft7926);
__m512 fft7929 = _mm512_mul_ps(fft7920, fft7926);
__m512 fft8015 = _mm512_mul_ps(fft8007, fft7926);
__m512 fft7930 = _mm512_mul_ps(fft7921, fft7926);
__m512 fft8016 = _mm512_mul_ps(fft8008, fft7926);
__m512 fft7931 = _mm512_mul_ps(fft7922, fft7926);
__m512 fft8017 = _mm512_mul_ps(fft8009, fft7926);
__m512 fft7932 = _mm512_mul_ps(fft7923, fft7926);
__m512 fft8018 = _mm512_mul_ps(fft8010, fft7926);
__m512 fft7933 = _mm512_mul_ps(fft7924, fft7926);
__m512 fft8019 = _mm512_mul_ps(fft8011, fft7926);
__m512 fft7934 = _mm512_mul_ps(fft7925, fft7926);
__m512 fft8020 = _mm512_mul_ps(fft8012, fft7926);
__m512 fft7935 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
// ... then the cross terms with q are fused in: fmadd adds partner*q,
// fnmadd subtracts partner*q.
__m512 fft7936 = _mm512_fmadd_ps(fft7919, fft7935, fft7927);
__m512 fft8021 = _mm512_fmadd_ps(fft8006, fft7935, fft8013);
__m512 fft7937 = _mm512_fnmadd_ps(fft7918, fft7935, fft7928);
__m512 fft8022 = _mm512_fnmadd_ps(fft8005, fft7935, fft8014);
__m512 fft7938 = _mm512_fmadd_ps(fft7921, fft7935, fft7929);
__m512 fft8023 = _mm512_fmadd_ps(fft8008, fft7935, fft8015);
__m512 fft7939 = _mm512_fnmadd_ps(fft7920, fft7935, fft7930);
__m512 fft8024 = _mm512_fnmadd_ps(fft8007, fft7935, fft8016);
__m512 fft7940 = _mm512_fmadd_ps(fft7923, fft7935, fft7931);
__m512 fft8025 = _mm512_fmadd_ps(fft8010, fft7935, fft8017);
__m512 fft7941 = _mm512_fnmadd_ps(fft7922, fft7935, fft7932);
__m512 fft8026 = _mm512_fnmadd_ps(fft8009, fft7935, fft8018);
__m512 fft7942 = _mm512_fmadd_ps(fft7925, fft7935, fft7933);
__m512 fft8027 = _mm512_fmadd_ps(fft8012, fft7935, fft8019);
__m512 fft7943 = _mm512_fnmadd_ps(fft7924, fft7935, fft7934);
__m512 fft8028 = _mm512_fnmadd_ps(fft8011, fft7935, fft8020);
// Butterfly between adjacent 128-bit lanes, followed by a fix-up of lanes
// 6,7,14,15.
// fft7944 is +1 in lanes 0-3 and 8-11, -1 in lanes 4-7 and 12-15.
// _mm512_shuffle_f32x4(x, x, 177) swaps each pair of neighbouring 128-bit
// lanes (0<->1, 2<->3), so for each input x:
//   lanes 0-3, 8-11  : x[i] + x[i+4]
//   lanes 4-7, 12-15 : x[i-4] - x[i]
__m512 fft7944 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft7945 = _mm512_fmadd_ps(fft7936, fft7944, _mm512_shuffle_f32x4(fft7936, fft7936, 177));
__m512 fft8029 = _mm512_fmadd_ps(fft8021, fft7944, _mm512_shuffle_f32x4(fft8021, fft8021, 177));
__m512 fft7946 = _mm512_fmadd_ps(fft7937, fft7944, _mm512_shuffle_f32x4(fft7937, fft7937, 177));
__m512 fft8030 = _mm512_fmadd_ps(fft8022, fft7944, _mm512_shuffle_f32x4(fft8022, fft8022, 177));
__m512 fft7947 = _mm512_fmadd_ps(fft7938, fft7944, _mm512_shuffle_f32x4(fft7938, fft7938, 177));
__m512 fft8031 = _mm512_fmadd_ps(fft8023, fft7944, _mm512_shuffle_f32x4(fft8023, fft8023, 177));
__m512 fft7948 = _mm512_fmadd_ps(fft7939, fft7944, _mm512_shuffle_f32x4(fft7939, fft7939, 177));
__m512 fft8032 = _mm512_fmadd_ps(fft8024, fft7944, _mm512_shuffle_f32x4(fft8024, fft8024, 177));
__m512 fft7949 = _mm512_fmadd_ps(fft7940, fft7944, _mm512_shuffle_f32x4(fft7940, fft7940, 177));
__m512 fft8033 = _mm512_fmadd_ps(fft8025, fft7944, _mm512_shuffle_f32x4(fft8025, fft8025, 177));
__m512 fft7950 = _mm512_fmadd_ps(fft7941, fft7944, _mm512_shuffle_f32x4(fft7941, fft7941, 177));
__m512 fft8034 = _mm512_fmadd_ps(fft8026, fft7944, _mm512_shuffle_f32x4(fft8026, fft8026, 177));
__m512 fft7951 = _mm512_fmadd_ps(fft7942, fft7944, _mm512_shuffle_f32x4(fft7942, fft7942, 177));
__m512 fft8035 = _mm512_fmadd_ps(fft8027, fft7944, _mm512_shuffle_f32x4(fft8027, fft8027, 177));
__m512 fft7952 = _mm512_fmadd_ps(fft7943, fft7944, _mm512_shuffle_f32x4(fft7943, fft7943, 177));
__m512 fft8036 = _mm512_fmadd_ps(fft8028, fft7944, _mm512_shuffle_f32x4(fft8028, fft8028, 177));
// Fix-up on consecutive pairs (a, b), mask 49344 = 0xC0C0 = lanes 6,7,14,15:
//   new a = b   in the masked lanes, a elsewhere   (mask_mov)
//   new b = 0-a in the masked lanes, b elsewhere   (mask_sub)
// i.e. (a, b) -> (b, -a) in those four lanes only.
__m512 fft7953 = _mm512_mask_mov_ps(fft7945, 49344, fft7946);
__m512 fft8037 = _mm512_mask_mov_ps(fft8029, 49344, fft8030);
__m512 fft7954 = _mm512_mask_sub_ps(fft7946, 49344, _mm512_setzero_ps(), fft7945);
__m512 fft8038 = _mm512_mask_sub_ps(fft8030, 49344, _mm512_setzero_ps(), fft8029);
__m512 fft7955 = _mm512_mask_mov_ps(fft7947, 49344, fft7948);
__m512 fft8039 = _mm512_mask_mov_ps(fft8031, 49344, fft8032);
__m512 fft7956 = _mm512_mask_sub_ps(fft7948, 49344, _mm512_setzero_ps(), fft7947);
__m512 fft8040 = _mm512_mask_sub_ps(fft8032, 49344, _mm512_setzero_ps(), fft8031);
__m512 fft7957 = _mm512_mask_mov_ps(fft7949, 49344, fft7950);
__m512 fft8041 = _mm512_mask_mov_ps(fft8033, 49344, fft8034);
__m512 fft7958 = _mm512_mask_sub_ps(fft7950, 49344, _mm512_setzero_ps(), fft7949);
__m512 fft8042 = _mm512_mask_sub_ps(fft8034, 49344, _mm512_setzero_ps(), fft8033);
__m512 fft7959 = _mm512_mask_mov_ps(fft7951, 49344, fft7952);
__m512 fft8043 = _mm512_mask_mov_ps(fft8035, 49344, fft8036);
__m512 fft7960 = _mm512_mask_sub_ps(fft7952, 49344, _mm512_setzero_ps(), fft7951);
__m512 fft8044 = _mm512_mask_sub_ps(fft8036, 49344, _mm512_setzero_ps(), fft8035);
__m512 fft7961 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft7962 = _mm512_fmadd_ps(fft7953, fft7961, _mm512_shuffle_ps(fft7953, fft7953, 78));
__m512 fft8045 = _mm512_fmadd_ps(fft8037, fft7961, _mm512_shuffle_ps(fft8037, fft8037, 78));
__m512 fft7963 = _mm512_fmadd_ps(fft7954, fft7961, _mm512_shuffle_ps(fft7954, fft7954, 78));
__m512 fft8046 = _mm512_fmadd_ps(fft8038, fft7961, _mm512_shuffle_ps(fft8038, fft8038, 78));
__m512 fft7964 = _mm512_fmadd_ps(fft7955, fft7961, _mm512_shuffle_ps(fft7955, fft7955, 78));
__m512 fft8047 = _mm512_fmadd_ps(fft8039, fft7961, _mm512_shuffle_ps(fft8039, fft8039, 78));
__m512 fft7965 = _mm512_fmadd_ps(fft7956, fft7961, _mm512_shuffle_ps(fft7956, fft7956, 78));
__m512 fft8048 = _mm512_fmadd_ps(fft8040, fft7961, _mm512_shuffle_ps(fft8040, fft8040, 78));
__m512 fft7966 = _mm512_fmadd_ps(fft7957, fft7961, _mm512_shuffle_ps(fft7957, fft7957, 78));
__m512 fft8049 = _mm512_fmadd_ps(fft8041, fft7961, _mm512_shuffle_ps(fft8041, fft8041, 78));
__m512 fft7967 = _mm512_fmadd_ps(fft7958, fft7961, _mm512_shuffle_ps(fft7958, fft7958, 78));
__m512 fft8050 = _mm512_fmadd_ps(fft8042, fft7961, _mm512_shuffle_ps(fft8042, fft8042, 78));
__m512 fft7968 = _mm512_fmadd_ps(fft7959, fft7961, _mm512_shuffle_ps(fft7959, fft7959, 78));
__m512 fft8051 = _mm512_fmadd_ps(fft8043, fft7961, _mm512_shuffle_ps(fft8043, fft8043, 78));
__m512 fft7969 = _mm512_fmadd_ps(fft7960, fft7961, _mm512_shuffle_ps(fft7960, fft7960, 78));
__m512 fft8052 = _mm512_fmadd_ps(fft8044, fft7961, _mm512_shuffle_ps(fft8044, fft8044, 78));
__m512i fft7970 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft7971 = _mm512_permutexvar_ps(fft7970, fft7962);
__m512 fft8053 = _mm512_permutexvar_ps(fft7970, fft8045);
__m512i fft7972 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft7973 = _mm512_permutexvar_ps(fft7972, fft7962);
__m512 fft8054 = _mm512_permutexvar_ps(fft7972, fft8045);
__m512 fft7974 = _mm512_permutexvar_ps(fft7970, fft7963);
__m512 fft8055 = _mm512_permutexvar_ps(fft7970, fft8046);
__m512 fft7975 = _mm512_permutexvar_ps(fft7972, fft7963);
__m512 fft8056 = _mm512_permutexvar_ps(fft7972, fft8046);
__m512 fft7976 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft7977 = _mm512_fmadd_ps(fft7971, fft7976, fft7973);
__m512 fft8057 = _mm512_fmadd_ps(fft8053, fft7976, fft8054);
__m512 fft7978 = _mm512_fnmadd_ps(fft7975, fft7976, fft7974);
__m512 fft8058 = _mm512_fnmadd_ps(fft8056, fft7976, fft8055);
__m512 fft7979 = _mm512_mask_mov_ps(fft7975, 21845, fft7977);
__m512 fft8059 = _mm512_mask_mov_ps(fft8056, 21845, fft8057);
__m512 fft7980 = _mm512_mask_mov_ps(fft7971, 43176, fft7977);
__m512 fft8060 = _mm512_mask_mov_ps(fft8053, 43176, fft8057);
__m512 fft7981 = _mm512_mask_mov_ps(fft7979, 43176, fft7978);
__m512 fft8061 = _mm512_mask_mov_ps(fft8059, 43176, fft8058);
__m512 fft7982 = _mm512_mask_mov_ps(fft7980, 22102, fft7978);
__m512 fft8062 = _mm512_mask_mov_ps(fft8060, 22102, fft8058);
__m512 fft7983 = _mm512_mask_mul_ps(fft7981, 64764, fft7981, _mm512_set1_ps(5e-01f));
__m512 fft8063 = _mm512_mask_mul_ps(fft8061, 64764, fft8061, _mm512_set1_ps(5e-01f));
__m512 fft7984 = _mm512_mask_mul_ps(fft7982, 64764, fft7982, _mm512_set1_ps(5e-01f));
__m512 fft8064 = _mm512_mask_mul_ps(fft8062, 64764, fft8062, _mm512_set1_ps(5e-01f));
__m512 df709 = fft7983;
__m512 df717 = fft8063;
__m512 df710 = fft7984;
__m512 df718 = fft8064;
__m512 df711 = fft7964;
__m512 df719 = fft8047;
__m512 df712 = fft7965;
__m512 df720 = fft8048;
__m512 df713 = fft7966;
__m512 df721 = fft8049;
__m512 df714 = fft7967;
__m512 df722 = fft8050;
__m512 df715 = fft7968;
__m512 df723 = fft8051;
__m512 df716 = fft7969;
__m512 df724 = fft8052;
__m512i eo48 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df711 = _mm512_permutexvar_ps(eo48, df711);
df712 = _mm512_permutexvar_ps(eo48, df712);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df711);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df712);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df711);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df712);
df719 = _mm512_permutexvar_ps(eo48, df719);
df720 = _mm512_permutexvar_ps(eo48, df720);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df719);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df720);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df719);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df720);
df713 = _mm512_permutexvar_ps(eo48, df713);
df714 = _mm512_permutexvar_ps(eo48, df714);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df713);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df714);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df713);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df714);
df721 = _mm512_permutexvar_ps(eo48, df721);
df722 = _mm512_permutexvar_ps(eo48, df722);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df721);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df722);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df721);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df722);
df715 = _mm512_permutexvar_ps(eo48, df715);
df716 = _mm512_permutexvar_ps(eo48, df716);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df715);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df716);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df715);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df716);
df723 = _mm512_permutexvar_ps(eo48, df723);
df724 = _mm512_permutexvar_ps(eo48, df724);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df723);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df724);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df723);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df724);
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df709);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df710);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df709);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df710);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df717);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k107+128*m48+32*f51, 255, df718);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df717);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k107+128*m48+32*f51, 65280, df718);
// Tile with constant index b60 = 5, so m49 = 5/2 = 2 and f52 = 5%2 = 1.
// m49 and f52 are consumed further down by the dfPtr8 store offsets
// (128*m49 + 32*f52).
ptrdiff_t b60 = 5;
ptrdiff_t m49 = (size_t)b60/2;
ptrdiff_t f52 = (size_t)b60%2;
// Load 16 vectors from datPtr18. Mask 65535 (0xFFFF) enables all 16 lanes,
// so no lane is zeroed by the maskz form. Successive loads step the constant
// offset by 224 (3192, 3416, ..., 6552). The trailing 0*b60 term is always
// zero, so b60 does not influence the load address.
// NOTE(review): datPtr18 is declared outside this view; if it is a byte
// pointer, 224 is 56 floats -- presumably one input row. Confirm.
__m512 dat1750 = _mm512_maskz_loadu_ps(65535, datPtr18+3192+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1751 = _mm512_maskz_loadu_ps(65535, datPtr18+3416+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1752 = _mm512_maskz_loadu_ps(65535, datPtr18+3640+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1753 = _mm512_maskz_loadu_ps(65535, datPtr18+3864+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1754 = _mm512_maskz_loadu_ps(65535, datPtr18+4088+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1755 = _mm512_maskz_loadu_ps(65535, datPtr18+4312+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1756 = _mm512_maskz_loadu_ps(65535, datPtr18+4536+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1757 = _mm512_maskz_loadu_ps(65535, datPtr18+4760+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1758 = _mm512_maskz_loadu_ps(65535, datPtr18+4984+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1759 = _mm512_maskz_loadu_ps(65535, datPtr18+5208+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1760 = _mm512_maskz_loadu_ps(65535, datPtr18+5432+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1761 = _mm512_maskz_loadu_ps(65535, datPtr18+5656+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1762 = _mm512_maskz_loadu_ps(65535, datPtr18+5880+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1763 = _mm512_maskz_loadu_ps(65535, datPtr18+6104+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1764 = _mm512_maskz_loadu_ps(65535, datPtr18+6328+100864*i38+12608*k107+224*h40+4*w49+0*b60);
__m512 dat1765 = _mm512_maskz_loadu_ps(65535, datPtr18+6552+100864*i38+12608*k107+224*h40+4*w49+0*b60);
// Element-wise (per-lane) butterflies across the 16 loaded vectors. Two
// independent chains are interleaved line by line: the even-numbered loads
// (dat1750, dat1752, ..., dat1764) feed fft8065..fft8084 and the odd-numbered
// loads (dat1751, dat1753, ..., dat1765) feed fft8153..fft8172.
// Calling one chain's eight inputs a0..a7 in load order, the first stage
// forms the sums s_k = a_k + a_(k+4) and differences d_k = a_k - a_(k+4)
// for k = 0..3.
__m512 fft8065 = _mm512_add_ps(dat1750, dat1758);
__m512 fft8153 = _mm512_add_ps(dat1751, dat1759);
__m512 fft8066 = _mm512_sub_ps(dat1750, dat1758);
__m512 fft8154 = _mm512_sub_ps(dat1751, dat1759);
__m512 fft8067 = _mm512_add_ps(dat1752, dat1760);
__m512 fft8155 = _mm512_add_ps(dat1753, dat1761);
__m512 fft8068 = _mm512_sub_ps(dat1752, dat1760);
__m512 fft8156 = _mm512_sub_ps(dat1753, dat1761);
__m512 fft8069 = _mm512_add_ps(dat1754, dat1762);
__m512 fft8157 = _mm512_add_ps(dat1755, dat1763);
__m512 fft8070 = _mm512_sub_ps(dat1754, dat1762);
__m512 fft8158 = _mm512_sub_ps(dat1755, dat1763);
__m512 fft8071 = _mm512_add_ps(dat1756, dat1764);
__m512 fft8159 = _mm512_add_ps(dat1757, dat1765);
__m512 fft8072 = _mm512_sub_ps(dat1756, dat1764);
__m512 fft8160 = _mm512_sub_ps(dat1757, dat1765);
// Second stage, in order: s0+s2, s0-s2, s1+s3, s3-s1, d1-d3, d1+d3.
__m512 fft8073 = _mm512_add_ps(fft8065, fft8069);
__m512 fft8161 = _mm512_add_ps(fft8153, fft8157);
__m512 fft8074 = _mm512_sub_ps(fft8065, fft8069);
__m512 fft8162 = _mm512_sub_ps(fft8153, fft8157);
__m512 fft8075 = _mm512_add_ps(fft8067, fft8071);
__m512 fft8163 = _mm512_add_ps(fft8155, fft8159);
__m512 fft8076 = _mm512_sub_ps(fft8071, fft8067);
__m512 fft8164 = _mm512_sub_ps(fft8159, fft8155);
__m512 fft8077 = _mm512_sub_ps(fft8068, fft8072);
__m512 fft8165 = _mm512_sub_ps(fft8156, fft8160);
__m512 fft8078 = _mm512_add_ps(fft8068, fft8072);
__m512 fft8166 = _mm512_add_ps(fft8156, fft8160);
// Final stage, with c = 7.0710677e-01f (approximately 1/sqrt(2)):
//   fft8079 = (s0+s2) + (s1+s3)
//   fft8080 = (s0+s2) - (s1+s3)
//   fft8081 =  d0 + c*(d1-d3)
//   fft8082 = -d2 - c*(d1+d3)    (fnmsub computes -(a*b) - c)
//   fft8083 =  d0 - c*(d1-d3)    (fnmadd computes -(a*b) + c)
//   fft8084 =  d2 - c*(d1+d3)
// Together with fft8074 (s0-s2) and fft8076 (s3-s1) these equal bins 0..4 of
// an 8-point DFT of the real sequence a0..a7: bin 0 = fft8079,
// bin 1 = fft8081 + i*fft8082, bin 2 = fft8074 + i*fft8076,
// bin 3 = fft8083 + i*fft8084, bin 4 = fft8080. The odd chain
// (fft8167..fft8172 with fft8162, fft8164) is computed the same way.
__m512 fft8079 = _mm512_add_ps(fft8073, fft8075);
__m512 fft8167 = _mm512_add_ps(fft8161, fft8163);
__m512 fft8080 = _mm512_sub_ps(fft8073, fft8075);
__m512 fft8168 = _mm512_sub_ps(fft8161, fft8163);
__m512 fft8081 = _mm512_fmadd_ps(fft8077, _mm512_set1_ps(7.0710677e-01f), fft8066);
__m512 fft8169 = _mm512_fmadd_ps(fft8165, _mm512_set1_ps(7.0710677e-01f), fft8154);
__m512 fft8082 = _mm512_fnmsub_ps(fft8078, _mm512_set1_ps(7.0710677e-01f), fft8070);
__m512 fft8170 = _mm512_fnmsub_ps(fft8166, _mm512_set1_ps(7.0710677e-01f), fft8158);
__m512 fft8083 = _mm512_fnmadd_ps(fft8077, _mm512_set1_ps(7.0710677e-01f), fft8066);
__m512 fft8171 = _mm512_fnmadd_ps(fft8165, _mm512_set1_ps(7.0710677e-01f), fft8154);
__m512 fft8084 = _mm512_fnmadd_ps(fft8078, _mm512_set1_ps(7.0710677e-01f), fft8070);
__m512 fft8172 = _mm512_fnmadd_ps(fft8166, _mm512_set1_ps(7.0710677e-01f), fft8158);
// In-register stage across lanes j and j+8. fft8085 is +1 in lanes 0-7 and
// -1 in lanes 8-15 (_mm512_set_ps lists lane 15 first).
// _mm512_shuffle_f32x4(x, x, 78) swaps the two 256-bit halves of x, so each
// result holds (low half + high half) in lanes 0-7 and (low half - high half)
// in lanes 8-15.
__m512 fft8085 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8086 = _mm512_fmadd_ps(fft8079, fft8085, _mm512_shuffle_f32x4(fft8079, fft8079, 78));
__m512 fft8173 = _mm512_fmadd_ps(fft8167, fft8085, _mm512_shuffle_f32x4(fft8167, fft8167, 78));
__m512 fft8087 = _mm512_fmadd_ps(fft8080, fft8085, _mm512_shuffle_f32x4(fft8080, fft8080, 78));
__m512 fft8174 = _mm512_fmadd_ps(fft8168, fft8085, _mm512_shuffle_f32x4(fft8168, fft8168, 78));
__m512 fft8088 = _mm512_fmadd_ps(fft8081, fft8085, _mm512_shuffle_f32x4(fft8081, fft8081, 78));
__m512 fft8175 = _mm512_fmadd_ps(fft8169, fft8085, _mm512_shuffle_f32x4(fft8169, fft8169, 78));
__m512 fft8089 = _mm512_fmadd_ps(fft8082, fft8085, _mm512_shuffle_f32x4(fft8082, fft8082, 78));
__m512 fft8176 = _mm512_fmadd_ps(fft8170, fft8085, _mm512_shuffle_f32x4(fft8170, fft8170, 78));
__m512 fft8090 = _mm512_fmadd_ps(fft8074, fft8085, _mm512_shuffle_f32x4(fft8074, fft8074, 78));
__m512 fft8177 = _mm512_fmadd_ps(fft8162, fft8085, _mm512_shuffle_f32x4(fft8162, fft8162, 78));
__m512 fft8091 = _mm512_fmadd_ps(fft8076, fft8085, _mm512_shuffle_f32x4(fft8076, fft8076, 78));
__m512 fft8178 = _mm512_fmadd_ps(fft8164, fft8085, _mm512_shuffle_f32x4(fft8164, fft8164, 78));
__m512 fft8092 = _mm512_fmadd_ps(fft8083, fft8085, _mm512_shuffle_f32x4(fft8083, fft8083, 78));
__m512 fft8179 = _mm512_fmadd_ps(fft8171, fft8085, _mm512_shuffle_f32x4(fft8171, fft8171, 78));
__m512 fft8093 = _mm512_fmadd_ps(fft8084, fft8085, _mm512_shuffle_f32x4(fft8084, fft8084, 78));
__m512 fft8180 = _mm512_fmadd_ps(fft8172, fft8085, _mm512_shuffle_f32x4(fft8172, fft8172, 78));
// Per-lane rotation of each consecutive result pair (p, q) = (fft8086,
// fft8087), (fft8088, fft8089), ... With A = fft8094 and B = fft8103 the two
// steps below produce p*A + q*B and q*A - p*B, i.e. the real and imaginary
// parts of (p + i*q) * (A - i*B). Lanes 0-9 have A = 1, B = 0 (pair left
// unchanged); lanes 10-11 use (A, B) = (c, c), lanes 12-13 use (0, 1) and
// lanes 14-15 use (-c, c), with c = 7.0710677e-01f.
__m512 fft8094 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8095 = _mm512_mul_ps(fft8086, fft8094);
__m512 fft8181 = _mm512_mul_ps(fft8173, fft8094);
__m512 fft8096 = _mm512_mul_ps(fft8087, fft8094);
__m512 fft8182 = _mm512_mul_ps(fft8174, fft8094);
__m512 fft8097 = _mm512_mul_ps(fft8088, fft8094);
__m512 fft8183 = _mm512_mul_ps(fft8175, fft8094);
__m512 fft8098 = _mm512_mul_ps(fft8089, fft8094);
__m512 fft8184 = _mm512_mul_ps(fft8176, fft8094);
__m512 fft8099 = _mm512_mul_ps(fft8090, fft8094);
__m512 fft8185 = _mm512_mul_ps(fft8177, fft8094);
__m512 fft8100 = _mm512_mul_ps(fft8091, fft8094);
__m512 fft8186 = _mm512_mul_ps(fft8178, fft8094);
__m512 fft8101 = _mm512_mul_ps(fft8092, fft8094);
__m512 fft8187 = _mm512_mul_ps(fft8179, fft8094);
__m512 fft8102 = _mm512_mul_ps(fft8093, fft8094);
__m512 fft8188 = _mm512_mul_ps(fft8180, fft8094);
__m512 fft8103 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8104 = _mm512_fmadd_ps(fft8087, fft8103, fft8095);
__m512 fft8189 = _mm512_fmadd_ps(fft8174, fft8103, fft8181);
__m512 fft8105 = _mm512_fnmadd_ps(fft8086, fft8103, fft8096);
__m512 fft8190 = _mm512_fnmadd_ps(fft8173, fft8103, fft8182);
__m512 fft8106 = _mm512_fmadd_ps(fft8089, fft8103, fft8097);
__m512 fft8191 = _mm512_fmadd_ps(fft8176, fft8103, fft8183);
__m512 fft8107 = _mm512_fnmadd_ps(fft8088, fft8103, fft8098);
__m512 fft8192 = _mm512_fnmadd_ps(fft8175, fft8103, fft8184);
__m512 fft8108 = _mm512_fmadd_ps(fft8091, fft8103, fft8099);
__m512 fft8193 = _mm512_fmadd_ps(fft8178, fft8103, fft8185);
__m512 fft8109 = _mm512_fnmadd_ps(fft8090, fft8103, fft8100);
__m512 fft8194 = _mm512_fnmadd_ps(fft8177, fft8103, fft8186);
__m512 fft8110 = _mm512_fmadd_ps(fft8093, fft8103, fft8101);
__m512 fft8195 = _mm512_fmadd_ps(fft8180, fft8103, fft8187);
__m512 fft8111 = _mm512_fnmadd_ps(fft8092, fft8103, fft8102);
__m512 fft8196 = _mm512_fnmadd_ps(fft8179, fft8103, fft8188);
// In-register stage across lanes j and j+4. fft8112 is +1 in lanes 0-3 and
// 8-11, -1 in lanes 4-7 and 12-15. _mm512_shuffle_f32x4(x, x, 177) swaps
// adjacent 128-bit quarters, so inside each 256-bit half the result is
// (first quarter + second quarter) followed by (first quarter - second
// quarter).
__m512 fft8112 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8113 = _mm512_fmadd_ps(fft8104, fft8112, _mm512_shuffle_f32x4(fft8104, fft8104, 177));
__m512 fft8197 = _mm512_fmadd_ps(fft8189, fft8112, _mm512_shuffle_f32x4(fft8189, fft8189, 177));
__m512 fft8114 = _mm512_fmadd_ps(fft8105, fft8112, _mm512_shuffle_f32x4(fft8105, fft8105, 177));
__m512 fft8198 = _mm512_fmadd_ps(fft8190, fft8112, _mm512_shuffle_f32x4(fft8190, fft8190, 177));
__m512 fft8115 = _mm512_fmadd_ps(fft8106, fft8112, _mm512_shuffle_f32x4(fft8106, fft8106, 177));
__m512 fft8199 = _mm512_fmadd_ps(fft8191, fft8112, _mm512_shuffle_f32x4(fft8191, fft8191, 177));
__m512 fft8116 = _mm512_fmadd_ps(fft8107, fft8112, _mm512_shuffle_f32x4(fft8107, fft8107, 177));
__m512 fft8200 = _mm512_fmadd_ps(fft8192, fft8112, _mm512_shuffle_f32x4(fft8192, fft8192, 177));
__m512 fft8117 = _mm512_fmadd_ps(fft8108, fft8112, _mm512_shuffle_f32x4(fft8108, fft8108, 177));
__m512 fft8201 = _mm512_fmadd_ps(fft8193, fft8112, _mm512_shuffle_f32x4(fft8193, fft8193, 177));
__m512 fft8118 = _mm512_fmadd_ps(fft8109, fft8112, _mm512_shuffle_f32x4(fft8109, fft8109, 177));
__m512 fft8202 = _mm512_fmadd_ps(fft8194, fft8112, _mm512_shuffle_f32x4(fft8194, fft8194, 177));
__m512 fft8119 = _mm512_fmadd_ps(fft8110, fft8112, _mm512_shuffle_f32x4(fft8110, fft8110, 177));
__m512 fft8203 = _mm512_fmadd_ps(fft8195, fft8112, _mm512_shuffle_f32x4(fft8195, fft8195, 177));
__m512 fft8120 = _mm512_fmadd_ps(fft8111, fft8112, _mm512_shuffle_f32x4(fft8111, fft8111, 177));
__m512 fft8204 = _mm512_fmadd_ps(fft8196, fft8112, _mm512_shuffle_f32x4(fft8196, fft8196, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes each pair
// (p, q) becomes (q, -p), which equals (p + i*q) * (-i); every other lane
// keeps (p, q) unchanged.
__m512 fft8121 = _mm512_mask_mov_ps(fft8113, 49344, fft8114);
__m512 fft8205 = _mm512_mask_mov_ps(fft8197, 49344, fft8198);
__m512 fft8122 = _mm512_mask_sub_ps(fft8114, 49344, _mm512_setzero_ps(), fft8113);
__m512 fft8206 = _mm512_mask_sub_ps(fft8198, 49344, _mm512_setzero_ps(), fft8197);
__m512 fft8123 = _mm512_mask_mov_ps(fft8115, 49344, fft8116);
__m512 fft8207 = _mm512_mask_mov_ps(fft8199, 49344, fft8200);
__m512 fft8124 = _mm512_mask_sub_ps(fft8116, 49344, _mm512_setzero_ps(), fft8115);
__m512 fft8208 = _mm512_mask_sub_ps(fft8200, 49344, _mm512_setzero_ps(), fft8199);
__m512 fft8125 = _mm512_mask_mov_ps(fft8117, 49344, fft8118);
__m512 fft8209 = _mm512_mask_mov_ps(fft8201, 49344, fft8202);
__m512 fft8126 = _mm512_mask_sub_ps(fft8118, 49344, _mm512_setzero_ps(), fft8117);
__m512 fft8210 = _mm512_mask_sub_ps(fft8202, 49344, _mm512_setzero_ps(), fft8201);
__m512 fft8127 = _mm512_mask_mov_ps(fft8119, 49344, fft8120);
__m512 fft8211 = _mm512_mask_mov_ps(fft8203, 49344, fft8204);
__m512 fft8128 = _mm512_mask_sub_ps(fft8120, 49344, _mm512_setzero_ps(), fft8119);
__m512 fft8212 = _mm512_mask_sub_ps(fft8204, 49344, _mm512_setzero_ps(), fft8203);
// In-register stage across lanes j and j+2. fft8129 is +1 in lanes 0-1 and
// -1 in lanes 2-3 of every 4-lane group. _mm512_shuffle_ps(x, x, 78) swaps
// the two 2-lane pairs inside each 128-bit quarter, giving
// (pair0 + pair1, pair0 - pair1) per quarter.
__m512 fft8129 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8130 = _mm512_fmadd_ps(fft8121, fft8129, _mm512_shuffle_ps(fft8121, fft8121, 78));
__m512 fft8213 = _mm512_fmadd_ps(fft8205, fft8129, _mm512_shuffle_ps(fft8205, fft8205, 78));
__m512 fft8131 = _mm512_fmadd_ps(fft8122, fft8129, _mm512_shuffle_ps(fft8122, fft8122, 78));
__m512 fft8214 = _mm512_fmadd_ps(fft8206, fft8129, _mm512_shuffle_ps(fft8206, fft8206, 78));
__m512 fft8132 = _mm512_fmadd_ps(fft8123, fft8129, _mm512_shuffle_ps(fft8123, fft8123, 78));
__m512 fft8215 = _mm512_fmadd_ps(fft8207, fft8129, _mm512_shuffle_ps(fft8207, fft8207, 78));
__m512 fft8133 = _mm512_fmadd_ps(fft8124, fft8129, _mm512_shuffle_ps(fft8124, fft8124, 78));
__m512 fft8216 = _mm512_fmadd_ps(fft8208, fft8129, _mm512_shuffle_ps(fft8208, fft8208, 78));
__m512 fft8134 = _mm512_fmadd_ps(fft8125, fft8129, _mm512_shuffle_ps(fft8125, fft8125, 78));
__m512 fft8217 = _mm512_fmadd_ps(fft8209, fft8129, _mm512_shuffle_ps(fft8209, fft8209, 78));
__m512 fft8135 = _mm512_fmadd_ps(fft8126, fft8129, _mm512_shuffle_ps(fft8126, fft8126, 78));
__m512 fft8218 = _mm512_fmadd_ps(fft8210, fft8129, _mm512_shuffle_ps(fft8210, fft8210, 78));
__m512 fft8136 = _mm512_fmadd_ps(fft8127, fft8129, _mm512_shuffle_ps(fft8127, fft8127, 78));
__m512 fft8219 = _mm512_fmadd_ps(fft8211, fft8129, _mm512_shuffle_ps(fft8211, fft8211, 78));
__m512 fft8137 = _mm512_fmadd_ps(fft8128, fft8129, _mm512_shuffle_ps(fft8128, fft8128, 78));
__m512 fft8220 = _mm512_fmadd_ps(fft8212, fft8129, _mm512_shuffle_ps(fft8212, fft8212, 78));
// Only the first pair of each chain (fft8130/fft8131 and fft8213/fft8214)
// goes through the extra pass below; the other six vectors per chain
// (fft8132..fft8137, fft8215..fft8220) skip it. fft8138 and fft8140 are
// gather indices for _mm512_permutexvar_ps; each index is duplicated into
// two adjacent destination lanes.
__m512i fft8138 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8139 = _mm512_permutexvar_ps(fft8138, fft8130);
__m512 fft8221 = _mm512_permutexvar_ps(fft8138, fft8213);
__m512i fft8140 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8141 = _mm512_permutexvar_ps(fft8140, fft8130);
__m512 fft8222 = _mm512_permutexvar_ps(fft8140, fft8213);
__m512 fft8142 = _mm512_permutexvar_ps(fft8138, fft8131);
__m512 fft8223 = _mm512_permutexvar_ps(fft8138, fft8214);
__m512 fft8143 = _mm512_permutexvar_ps(fft8140, fft8131);
__m512 fft8224 = _mm512_permutexvar_ps(fft8140, fft8214);
// fft8144 is 0 in lanes 0, 1, 8 and 9; elsewhere it is +1 in even lanes and
// -1 in odd lanes. fft8145 = fft8139*fft8144 + fft8141 and
// fft8146 = fft8142 - fft8143*fft8144 (likewise for the fft8225/fft8226 chain).
__m512 fft8144 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8145 = _mm512_fmadd_ps(fft8139, fft8144, fft8141);
__m512 fft8225 = _mm512_fmadd_ps(fft8221, fft8144, fft8222);
__m512 fft8146 = _mm512_fnmadd_ps(fft8143, fft8144, fft8142);
__m512 fft8226 = _mm512_fnmadd_ps(fft8224, fft8144, fft8223);
// Lane blends. 21845 = 0x5555 (even lanes); 43176 = 0xA8A8 (lanes 3, 5, 7,
// 11, 13, 15); 22102 = 0x5656 (lanes 1, 2, 4, 6, 9, 10, 12, 14).
__m512 fft8147 = _mm512_mask_mov_ps(fft8143, 21845, fft8145);
__m512 fft8227 = _mm512_mask_mov_ps(fft8224, 21845, fft8225);
__m512 fft8148 = _mm512_mask_mov_ps(fft8139, 43176, fft8145);
__m512 fft8228 = _mm512_mask_mov_ps(fft8221, 43176, fft8225);
__m512 fft8149 = _mm512_mask_mov_ps(fft8147, 43176, fft8146);
__m512 fft8229 = _mm512_mask_mov_ps(fft8227, 43176, fft8226);
__m512 fft8150 = _mm512_mask_mov_ps(fft8148, 22102, fft8146);
__m512 fft8230 = _mm512_mask_mov_ps(fft8228, 22102, fft8226);
// Halve every lane except 0, 1, 8 and 9 (mask 64764 = 0xFCFC).
// NOTE(review): this permute/blend/halve pass looks like the step that
// separates transforms that were packed together in one vector -- confirm
// against the generator.
__m512 fft8151 = _mm512_mask_mul_ps(fft8149, 64764, fft8149, _mm512_set1_ps(5e-01f));
__m512 fft8231 = _mm512_mask_mul_ps(fft8229, 64764, fft8229, _mm512_set1_ps(5e-01f));
__m512 fft8152 = _mm512_mask_mul_ps(fft8150, 64764, fft8150, _mm512_set1_ps(5e-01f));
__m512 fft8232 = _mm512_mask_mul_ps(fft8230, 64764, fft8230, _mm512_set1_ps(5e-01f));
// Name the 16 results df725..df740. df725/df726 and df733/df734 are the
// masked-halved vectors fft8151/fft8152 and fft8231/fft8232; df727..df732 and
// df735..df740 are fft8132..fft8137 and fft8215..fft8220 unchanged.
__m512 df725 = fft8151;
__m512 df733 = fft8231;
__m512 df726 = fft8152;
__m512 df734 = fft8232;
__m512 df727 = fft8132;
__m512 df735 = fft8215;
__m512 df728 = fft8133;
__m512 df736 = fft8216;
__m512 df729 = fft8134;
__m512 df737 = fft8217;
__m512 df730 = fft8135;
__m512 df738 = fft8218;
__m512 df731 = fft8136;
__m512 df739 = fft8219;
__m512 df732 = fft8137;
__m512 df740 = fft8220;
// eo49 gathers the even-indexed lanes (0, 2, ..., 14) into lanes 0-7 and the
// odd-indexed lanes (1, 3, ..., 15) into lanes 8-15.
__m512i eo49 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each vector is written with two masked stores: mask 255 writes lanes 0-7
// and mask 65280 (0xFF00) writes lanes 8-15. A masked store places lane n at
// element n of the destination, so the data of the second store begins
// 8 floats past the address passed in.
// NOTE(review): dfPtr8 is declared outside this view; if it is a byte
// pointer, that shift is 32 bytes, so e.g. offset 1056736 lands at
// 8192 + 1048576 -- the same slot as the first store, one 1048576-byte
// region higher. Confirm.
df727 = _mm512_permutexvar_ps(eo49, df727);
df728 = _mm512_permutexvar_ps(eo49, df728);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df727);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df728);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df727);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df728);
df735 = _mm512_permutexvar_ps(eo49, df735);
df736 = _mm512_permutexvar_ps(eo49, df736);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df735);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df736);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df735);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df736);
df729 = _mm512_permutexvar_ps(eo49, df729);
df730 = _mm512_permutexvar_ps(eo49, df730);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df729);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df730);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df729);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df730);
df737 = _mm512_permutexvar_ps(eo49, df737);
df738 = _mm512_permutexvar_ps(eo49, df738);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df737);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df738);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df737);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df738);
df731 = _mm512_permutexvar_ps(eo49, df731);
df732 = _mm512_permutexvar_ps(eo49, df732);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df731);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df732);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df731);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df732);
df739 = _mm512_permutexvar_ps(eo49, df739);
df740 = _mm512_permutexvar_ps(eo49, df740);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df739);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df740);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df739);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df740);
// df725, df726, df733 and df734 are stored without the eo49 permutation.
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df725);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df726);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df725);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df726);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df733);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k107+128*m49+32*f52, 255, df734);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df733);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k107+128*m49+32*f52, 65280, df734);
}
++j31;
rel19 = 1;
}
if (rel19 < 2) {
ptrdiff_t h41 = base19+14;
ptrdiff_t w50 = 28;
ptrdiff_t k108 = 8*s26;
ptrdiff_t kk35 = k108+7;
for (; k108 <= kk35; ++k108) {
ptrdiff_t b61 = 0;
ptrdiff_t m50 = (size_t)b61/2;
ptrdiff_t f53 = (size_t)b61%2;
__m512 dat1766 = _mm512_maskz_loadu_ps(65535, datPtr18+0+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1767 = _mm512_maskz_loadu_ps(65535, datPtr18+224+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1768 = _mm512_maskz_loadu_ps(65535, datPtr18+448+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1769 = _mm512_maskz_loadu_ps(65535, datPtr18+672+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1770 = _mm512_maskz_loadu_ps(65535, datPtr18+896+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1771 = _mm512_maskz_loadu_ps(65535, datPtr18+1120+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1772 = _mm512_maskz_loadu_ps(65535, datPtr18+1344+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1773 = _mm512_maskz_loadu_ps(65535, datPtr18+1568+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1774 = _mm512_maskz_loadu_ps(65535, datPtr18+1792+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1775 = _mm512_maskz_loadu_ps(65535, datPtr18+2016+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1776 = _mm512_maskz_loadu_ps(65535, datPtr18+2240+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1777 = _mm512_maskz_loadu_ps(65535, datPtr18+2464+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1778 = _mm512_maskz_loadu_ps(65535, datPtr18+2688+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1779 = _mm512_maskz_loadu_ps(65535, datPtr18+2912+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1780 = _mm512_maskz_loadu_ps(65535, datPtr18+3136+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 dat1781 = _mm512_maskz_loadu_ps(65535, datPtr18+3360+100864*i38+12608*k108+224*h41+4*w50+0*b61);
__m512 fft8233 = _mm512_add_ps(dat1766, dat1774);
__m512 fft8321 = _mm512_add_ps(dat1767, dat1775);
__m512 fft8234 = _mm512_sub_ps(dat1766, dat1774);
__m512 fft8322 = _mm512_sub_ps(dat1767, dat1775);
__m512 fft8235 = _mm512_add_ps(dat1768, dat1776);
__m512 fft8323 = _mm512_add_ps(dat1769, dat1777);
__m512 fft8236 = _mm512_sub_ps(dat1768, dat1776);
__m512 fft8324 = _mm512_sub_ps(dat1769, dat1777);
__m512 fft8237 = _mm512_add_ps(dat1770, dat1778);
__m512 fft8325 = _mm512_add_ps(dat1771, dat1779);
__m512 fft8238 = _mm512_sub_ps(dat1770, dat1778);
__m512 fft8326 = _mm512_sub_ps(dat1771, dat1779);
__m512 fft8239 = _mm512_add_ps(dat1772, dat1780);
__m512 fft8327 = _mm512_add_ps(dat1773, dat1781);
__m512 fft8240 = _mm512_sub_ps(dat1772, dat1780);
__m512 fft8328 = _mm512_sub_ps(dat1773, dat1781);
__m512 fft8241 = _mm512_add_ps(fft8233, fft8237);
__m512 fft8329 = _mm512_add_ps(fft8321, fft8325);
__m512 fft8242 = _mm512_sub_ps(fft8233, fft8237);
__m512 fft8330 = _mm512_sub_ps(fft8321, fft8325);
__m512 fft8243 = _mm512_add_ps(fft8235, fft8239);
__m512 fft8331 = _mm512_add_ps(fft8323, fft8327);
__m512 fft8244 = _mm512_sub_ps(fft8239, fft8235);
__m512 fft8332 = _mm512_sub_ps(fft8327, fft8323);
__m512 fft8245 = _mm512_sub_ps(fft8236, fft8240);
__m512 fft8333 = _mm512_sub_ps(fft8324, fft8328);
__m512 fft8246 = _mm512_add_ps(fft8236, fft8240);
__m512 fft8334 = _mm512_add_ps(fft8324, fft8328);
__m512 fft8247 = _mm512_add_ps(fft8241, fft8243);
__m512 fft8335 = _mm512_add_ps(fft8329, fft8331);
__m512 fft8248 = _mm512_sub_ps(fft8241, fft8243);
__m512 fft8336 = _mm512_sub_ps(fft8329, fft8331);
__m512 fft8249 = _mm512_fmadd_ps(fft8245, _mm512_set1_ps(7.0710677e-01f), fft8234);
__m512 fft8337 = _mm512_fmadd_ps(fft8333, _mm512_set1_ps(7.0710677e-01f), fft8322);
__m512 fft8250 = _mm512_fnmsub_ps(fft8246, _mm512_set1_ps(7.0710677e-01f), fft8238);
__m512 fft8338 = _mm512_fnmsub_ps(fft8334, _mm512_set1_ps(7.0710677e-01f), fft8326);
__m512 fft8251 = _mm512_fnmadd_ps(fft8245, _mm512_set1_ps(7.0710677e-01f), fft8234);
__m512 fft8339 = _mm512_fnmadd_ps(fft8333, _mm512_set1_ps(7.0710677e-01f), fft8322);
__m512 fft8252 = _mm512_fnmadd_ps(fft8246, _mm512_set1_ps(7.0710677e-01f), fft8238);
__m512 fft8340 = _mm512_fnmadd_ps(fft8334, _mm512_set1_ps(7.0710677e-01f), fft8326);
// In-register combine stages for sixteen vectors computed above this point.
// Two parallel sets (fft82xx and fft83xx) go through identical steps on
// alternating lines.
//
// Step 1: shuffle immediate 78 swaps the two 256-bit halves of a vector.
// fft8253 is +1 in lanes 0..7 and -1 in lanes 8..15, so lanes 0..7 receive
// (low half + high half) and lanes 8..15 receive (low half - high half).
__m512 fft8253 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8254 = _mm512_fmadd_ps(fft8247, fft8253, _mm512_shuffle_f32x4(fft8247, fft8247, 78));
__m512 fft8341 = _mm512_fmadd_ps(fft8335, fft8253, _mm512_shuffle_f32x4(fft8335, fft8335, 78));
__m512 fft8255 = _mm512_fmadd_ps(fft8248, fft8253, _mm512_shuffle_f32x4(fft8248, fft8248, 78));
__m512 fft8342 = _mm512_fmadd_ps(fft8336, fft8253, _mm512_shuffle_f32x4(fft8336, fft8336, 78));
__m512 fft8256 = _mm512_fmadd_ps(fft8249, fft8253, _mm512_shuffle_f32x4(fft8249, fft8249, 78));
__m512 fft8343 = _mm512_fmadd_ps(fft8337, fft8253, _mm512_shuffle_f32x4(fft8337, fft8337, 78));
__m512 fft8257 = _mm512_fmadd_ps(fft8250, fft8253, _mm512_shuffle_f32x4(fft8250, fft8250, 78));
__m512 fft8344 = _mm512_fmadd_ps(fft8338, fft8253, _mm512_shuffle_f32x4(fft8338, fft8338, 78));
__m512 fft8258 = _mm512_fmadd_ps(fft8242, fft8253, _mm512_shuffle_f32x4(fft8242, fft8242, 78));
__m512 fft8345 = _mm512_fmadd_ps(fft8330, fft8253, _mm512_shuffle_f32x4(fft8330, fft8330, 78));
__m512 fft8259 = _mm512_fmadd_ps(fft8244, fft8253, _mm512_shuffle_f32x4(fft8244, fft8244, 78));
__m512 fft8346 = _mm512_fmadd_ps(fft8332, fft8253, _mm512_shuffle_f32x4(fft8332, fft8332, 78));
__m512 fft8260 = _mm512_fmadd_ps(fft8251, fft8253, _mm512_shuffle_f32x4(fft8251, fft8251, 78));
__m512 fft8347 = _mm512_fmadd_ps(fft8339, fft8253, _mm512_shuffle_f32x4(fft8339, fft8339, 78));
__m512 fft8261 = _mm512_fmadd_ps(fft8252, fft8253, _mm512_shuffle_f32x4(fft8252, fft8252, 78));
__m512 fft8348 = _mm512_fmadd_ps(fft8340, fft8253, _mm512_shuffle_f32x4(fft8340, fft8340, 78));
// Step 2: per-lane rotation of consecutive pairs (X, Y) = (fft8254, fft8255), ...
// With c = fft8262 and s = fft8271 this computes X*c + Y*s and Y*c - X*s,
// which equals (X + iY)*(c - i*s) if each pair is read as (real, imaginary).
// Lanes 0..7 have c = 1, s = 0 and pass through unchanged.
__m512 fft8262 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8263 = _mm512_mul_ps(fft8254, fft8262);
__m512 fft8349 = _mm512_mul_ps(fft8341, fft8262);
__m512 fft8264 = _mm512_mul_ps(fft8255, fft8262);
__m512 fft8350 = _mm512_mul_ps(fft8342, fft8262);
__m512 fft8265 = _mm512_mul_ps(fft8256, fft8262);
__m512 fft8351 = _mm512_mul_ps(fft8343, fft8262);
__m512 fft8266 = _mm512_mul_ps(fft8257, fft8262);
__m512 fft8352 = _mm512_mul_ps(fft8344, fft8262);
__m512 fft8267 = _mm512_mul_ps(fft8258, fft8262);
__m512 fft8353 = _mm512_mul_ps(fft8345, fft8262);
__m512 fft8268 = _mm512_mul_ps(fft8259, fft8262);
__m512 fft8354 = _mm512_mul_ps(fft8346, fft8262);
__m512 fft8269 = _mm512_mul_ps(fft8260, fft8262);
__m512 fft8355 = _mm512_mul_ps(fft8347, fft8262);
__m512 fft8270 = _mm512_mul_ps(fft8261, fft8262);
__m512 fft8356 = _mm512_mul_ps(fft8348, fft8262);
__m512 fft8271 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8272 = _mm512_fmadd_ps(fft8255, fft8271, fft8263);
__m512 fft8357 = _mm512_fmadd_ps(fft8342, fft8271, fft8349);
__m512 fft8273 = _mm512_fnmadd_ps(fft8254, fft8271, fft8264);
__m512 fft8358 = _mm512_fnmadd_ps(fft8341, fft8271, fft8350);
__m512 fft8274 = _mm512_fmadd_ps(fft8257, fft8271, fft8265);
__m512 fft8359 = _mm512_fmadd_ps(fft8344, fft8271, fft8351);
__m512 fft8275 = _mm512_fnmadd_ps(fft8256, fft8271, fft8266);
__m512 fft8360 = _mm512_fnmadd_ps(fft8343, fft8271, fft8352);
__m512 fft8276 = _mm512_fmadd_ps(fft8259, fft8271, fft8267);
__m512 fft8361 = _mm512_fmadd_ps(fft8346, fft8271, fft8353);
__m512 fft8277 = _mm512_fnmadd_ps(fft8258, fft8271, fft8268);
__m512 fft8362 = _mm512_fnmadd_ps(fft8345, fft8271, fft8354);
__m512 fft8278 = _mm512_fmadd_ps(fft8261, fft8271, fft8269);
__m512 fft8363 = _mm512_fmadd_ps(fft8348, fft8271, fft8355);
__m512 fft8279 = _mm512_fnmadd_ps(fft8260, fft8271, fft8270);
__m512 fft8364 = _mm512_fnmadd_ps(fft8347, fft8271, fft8356);
// Step 3: shuffle immediate 177 swaps adjacent 128-bit lanes. fft8280 is +1 in
// lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15, giving sum/difference of
// the two 128-bit lanes inside each 256-bit half.
__m512 fft8280 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8281 = _mm512_fmadd_ps(fft8272, fft8280, _mm512_shuffle_f32x4(fft8272, fft8272, 177));
__m512 fft8365 = _mm512_fmadd_ps(fft8357, fft8280, _mm512_shuffle_f32x4(fft8357, fft8357, 177));
__m512 fft8282 = _mm512_fmadd_ps(fft8273, fft8280, _mm512_shuffle_f32x4(fft8273, fft8273, 177));
__m512 fft8366 = _mm512_fmadd_ps(fft8358, fft8280, _mm512_shuffle_f32x4(fft8358, fft8358, 177));
__m512 fft8283 = _mm512_fmadd_ps(fft8274, fft8280, _mm512_shuffle_f32x4(fft8274, fft8274, 177));
__m512 fft8367 = _mm512_fmadd_ps(fft8359, fft8280, _mm512_shuffle_f32x4(fft8359, fft8359, 177));
__m512 fft8284 = _mm512_fmadd_ps(fft8275, fft8280, _mm512_shuffle_f32x4(fft8275, fft8275, 177));
__m512 fft8368 = _mm512_fmadd_ps(fft8360, fft8280, _mm512_shuffle_f32x4(fft8360, fft8360, 177));
__m512 fft8285 = _mm512_fmadd_ps(fft8276, fft8280, _mm512_shuffle_f32x4(fft8276, fft8276, 177));
__m512 fft8369 = _mm512_fmadd_ps(fft8361, fft8280, _mm512_shuffle_f32x4(fft8361, fft8361, 177));
__m512 fft8286 = _mm512_fmadd_ps(fft8277, fft8280, _mm512_shuffle_f32x4(fft8277, fft8277, 177));
__m512 fft8370 = _mm512_fmadd_ps(fft8362, fft8280, _mm512_shuffle_f32x4(fft8362, fft8362, 177));
__m512 fft8287 = _mm512_fmadd_ps(fft8278, fft8280, _mm512_shuffle_f32x4(fft8278, fft8278, 177));
__m512 fft8371 = _mm512_fmadd_ps(fft8363, fft8280, _mm512_shuffle_f32x4(fft8363, fft8363, 177));
__m512 fft8288 = _mm512_fmadd_ps(fft8279, fft8280, _mm512_shuffle_f32x4(fft8279, fft8279, 177));
__m512 fft8372 = _mm512_fmadd_ps(fft8364, fft8280, _mm512_shuffle_f32x4(fft8364, fft8364, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes the
// pair (X, Y) becomes (Y, 0 - X); all other lanes are left as they were.
__m512 fft8289 = _mm512_mask_mov_ps(fft8281, 49344, fft8282);
__m512 fft8373 = _mm512_mask_mov_ps(fft8365, 49344, fft8366);
__m512 fft8290 = _mm512_mask_sub_ps(fft8282, 49344, _mm512_setzero_ps(), fft8281);
__m512 fft8374 = _mm512_mask_sub_ps(fft8366, 49344, _mm512_setzero_ps(), fft8365);
__m512 fft8291 = _mm512_mask_mov_ps(fft8283, 49344, fft8284);
__m512 fft8375 = _mm512_mask_mov_ps(fft8367, 49344, fft8368);
__m512 fft8292 = _mm512_mask_sub_ps(fft8284, 49344, _mm512_setzero_ps(), fft8283);
__m512 fft8376 = _mm512_mask_sub_ps(fft8368, 49344, _mm512_setzero_ps(), fft8367);
__m512 fft8293 = _mm512_mask_mov_ps(fft8285, 49344, fft8286);
__m512 fft8377 = _mm512_mask_mov_ps(fft8369, 49344, fft8370);
__m512 fft8294 = _mm512_mask_sub_ps(fft8286, 49344, _mm512_setzero_ps(), fft8285);
__m512 fft8378 = _mm512_mask_sub_ps(fft8370, 49344, _mm512_setzero_ps(), fft8369);
__m512 fft8295 = _mm512_mask_mov_ps(fft8287, 49344, fft8288);
__m512 fft8379 = _mm512_mask_mov_ps(fft8371, 49344, fft8372);
__m512 fft8296 = _mm512_mask_sub_ps(fft8288, 49344, _mm512_setzero_ps(), fft8287);
__m512 fft8380 = _mm512_mask_sub_ps(fft8372, 49344, _mm512_setzero_ps(), fft8371);
// Step 5: _mm512_shuffle_ps with immediate 78 swaps the two 64-bit halves of
// every 128-bit lane. fft8297 is +1 on elements 0,1 and -1 on elements 2,3 of
// each 128-bit lane, giving sum/difference of those halves.
__m512 fft8297 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8298 = _mm512_fmadd_ps(fft8289, fft8297, _mm512_shuffle_ps(fft8289, fft8289, 78));
__m512 fft8381 = _mm512_fmadd_ps(fft8373, fft8297, _mm512_shuffle_ps(fft8373, fft8373, 78));
__m512 fft8299 = _mm512_fmadd_ps(fft8290, fft8297, _mm512_shuffle_ps(fft8290, fft8290, 78));
__m512 fft8382 = _mm512_fmadd_ps(fft8374, fft8297, _mm512_shuffle_ps(fft8374, fft8374, 78));
__m512 fft8300 = _mm512_fmadd_ps(fft8291, fft8297, _mm512_shuffle_ps(fft8291, fft8291, 78));
__m512 fft8383 = _mm512_fmadd_ps(fft8375, fft8297, _mm512_shuffle_ps(fft8375, fft8375, 78));
__m512 fft8301 = _mm512_fmadd_ps(fft8292, fft8297, _mm512_shuffle_ps(fft8292, fft8292, 78));
__m512 fft8384 = _mm512_fmadd_ps(fft8376, fft8297, _mm512_shuffle_ps(fft8376, fft8376, 78));
__m512 fft8302 = _mm512_fmadd_ps(fft8293, fft8297, _mm512_shuffle_ps(fft8293, fft8293, 78));
__m512 fft8385 = _mm512_fmadd_ps(fft8377, fft8297, _mm512_shuffle_ps(fft8377, fft8377, 78));
__m512 fft8303 = _mm512_fmadd_ps(fft8294, fft8297, _mm512_shuffle_ps(fft8294, fft8294, 78));
__m512 fft8386 = _mm512_fmadd_ps(fft8378, fft8297, _mm512_shuffle_ps(fft8378, fft8378, 78));
__m512 fft8304 = _mm512_fmadd_ps(fft8295, fft8297, _mm512_shuffle_ps(fft8295, fft8295, 78));
__m512 fft8387 = _mm512_fmadd_ps(fft8379, fft8297, _mm512_shuffle_ps(fft8379, fft8379, 78));
__m512 fft8305 = _mm512_fmadd_ps(fft8296, fft8297, _mm512_shuffle_ps(fft8296, fft8296, 78));
__m512 fft8388 = _mm512_fmadd_ps(fft8380, fft8297, _mm512_shuffle_ps(fft8380, fft8380, 78));
// Step 6: only the first pair of each set (fft8298/fft8299 and fft8381/fft8382)
// gets this extra pass; fft8300..fft8305 and fft8383..fft8388 are left as is.
// Each input is permuted two ways (index vectors fft8306 and fft8308), then
// combined with fft8312 (0 in lanes 0,1,8,9; elsewhere +1 in even lanes and
// -1 in odd lanes).
__m512i fft8306 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8307 = _mm512_permutexvar_ps(fft8306, fft8298);
__m512 fft8389 = _mm512_permutexvar_ps(fft8306, fft8381);
__m512i fft8308 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8309 = _mm512_permutexvar_ps(fft8308, fft8298);
__m512 fft8390 = _mm512_permutexvar_ps(fft8308, fft8381);
__m512 fft8310 = _mm512_permutexvar_ps(fft8306, fft8299);
__m512 fft8391 = _mm512_permutexvar_ps(fft8306, fft8382);
__m512 fft8311 = _mm512_permutexvar_ps(fft8308, fft8299);
__m512 fft8392 = _mm512_permutexvar_ps(fft8308, fft8382);
__m512 fft8312 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8313 = _mm512_fmadd_ps(fft8307, fft8312, fft8309);
__m512 fft8393 = _mm512_fmadd_ps(fft8389, fft8312, fft8390);
__m512 fft8314 = _mm512_fnmadd_ps(fft8311, fft8312, fft8310);
__m512 fft8394 = _mm512_fnmadd_ps(fft8392, fft8312, fft8391);
// Lane-wise blends. Masks: 21845 = 0x5555 (even lanes),
// 43176 = 0xA8A8 (lanes 3,5,7,11,13,15), 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft8315 = _mm512_mask_mov_ps(fft8311, 21845, fft8313);
__m512 fft8395 = _mm512_mask_mov_ps(fft8392, 21845, fft8393);
__m512 fft8316 = _mm512_mask_mov_ps(fft8307, 43176, fft8313);
__m512 fft8396 = _mm512_mask_mov_ps(fft8389, 43176, fft8393);
__m512 fft8317 = _mm512_mask_mov_ps(fft8315, 43176, fft8314);
__m512 fft8397 = _mm512_mask_mov_ps(fft8395, 43176, fft8394);
__m512 fft8318 = _mm512_mask_mov_ps(fft8316, 22102, fft8314);
__m512 fft8398 = _mm512_mask_mov_ps(fft8396, 22102, fft8394);
// Mask 64764 = 0xFCFC: halve lanes 2..7 and 10..15; lanes 0,1,8,9 are unscaled.
__m512 fft8319 = _mm512_mask_mul_ps(fft8317, 64764, fft8317, _mm512_set1_ps(5e-01f));
__m512 fft8399 = _mm512_mask_mul_ps(fft8397, 64764, fft8397, _mm512_set1_ps(5e-01f));
__m512 fft8320 = _mm512_mask_mul_ps(fft8318, 64764, fft8318, _mm512_set1_ps(5e-01f));
__m512 fft8400 = _mm512_mask_mul_ps(fft8398, 64764, fft8398, _mm512_set1_ps(5e-01f));
// Write sixteen result vectors to dfPtr8. df741..df748 alias the fft82xx/fft83xx
// results of one set and df749..df756 those of the other set.
// i38, j31, k108, m50 and f53 are defined above this point.
__m512 df741 = fft8319;
__m512 df749 = fft8399;
__m512 df742 = fft8320;
__m512 df750 = fft8400;
__m512 df743 = fft8300;
__m512 df751 = fft8383;
__m512 df744 = fft8301;
__m512 df752 = fft8384;
__m512 df745 = fft8302;
__m512 df753 = fft8385;
__m512 df746 = fft8303;
__m512 df754 = fft8386;
__m512 df747 = fft8304;
__m512 df755 = fft8387;
__m512 df748 = fft8305;
__m512 df756 = fft8388;
// eo50 moves even-indexed lanes into lanes 0..7 and odd-indexed lanes into
// lanes 8..15. It is applied to every vector except df741, df742, df749, df750.
__m512i eo50 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each vector is written by two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 writes lanes 8..15 at an offset that is larger by 1048544.
// NOTE(review): 1048544 = 1048576 - 32; if dfPtr8 is a byte pointer, lane 8 of
// a masked store lands 32 bytes past the given address, so the two halves would
// sit exactly 1048576 bytes apart -- confirm against dfPtr8's declaration.
df743 = _mm512_permutexvar_ps(eo50, df743);
df744 = _mm512_permutexvar_ps(eo50, df744);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df743);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df744);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df743);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df744);
// The second set (df749..df756) uses base offsets 2097152 larger than the first.
df751 = _mm512_permutexvar_ps(eo50, df751);
df752 = _mm512_permutexvar_ps(eo50, df752);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df751);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df752);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df751);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df752);
df745 = _mm512_permutexvar_ps(eo50, df745);
df746 = _mm512_permutexvar_ps(eo50, df746);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df745);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df746);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df745);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df746);
df753 = _mm512_permutexvar_ps(eo50, df753);
df754 = _mm512_permutexvar_ps(eo50, df754);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df753);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df754);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df753);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df754);
df747 = _mm512_permutexvar_ps(eo50, df747);
df748 = _mm512_permutexvar_ps(eo50, df748);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df747);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df748);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df747);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df748);
df755 = _mm512_permutexvar_ps(eo50, df755);
df756 = _mm512_permutexvar_ps(eo50, df756);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df755);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df756);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df755);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df756);
// df741, df742, df749 and df750 are stored without the eo50 permutation.
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df741);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df742);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df741);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df742);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df749);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k108+128*m50+32*f53, 255, df750);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df749);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k108+128*m50+32*f53, 65280, df750);
// Section for b62 = 1. m51 = b62/2 = 0 and f54 = b62%2 = 1 are the sub-indices
// used later in the dfPtr8 store offsets (128*m51 + 32*f54).
ptrdiff_t b62 = 1;
ptrdiff_t m51 = (size_t)b62/2;
ptrdiff_t f54 = (size_t)b62%2;
// Sixteen masked loads from datPtr18, each 224 pointer units after the
// previous one, starting at offset 56. Mask 32767 (0x7FFF) loads lanes 0..14
// and zeroes lane 15. The 0*b62 term contributes nothing to the address.
// NOTE(review): 224 would be one row of 56 floats if datPtr18 is a byte
// pointer -- confirm against its declaration.
__m512 dat1782 = _mm512_maskz_loadu_ps(32767, datPtr18+56+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1783 = _mm512_maskz_loadu_ps(32767, datPtr18+280+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1784 = _mm512_maskz_loadu_ps(32767, datPtr18+504+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1785 = _mm512_maskz_loadu_ps(32767, datPtr18+728+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1786 = _mm512_maskz_loadu_ps(32767, datPtr18+952+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1787 = _mm512_maskz_loadu_ps(32767, datPtr18+1176+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1788 = _mm512_maskz_loadu_ps(32767, datPtr18+1400+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1789 = _mm512_maskz_loadu_ps(32767, datPtr18+1624+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1790 = _mm512_maskz_loadu_ps(32767, datPtr18+1848+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1791 = _mm512_maskz_loadu_ps(32767, datPtr18+2072+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1792 = _mm512_maskz_loadu_ps(32767, datPtr18+2296+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1793 = _mm512_maskz_loadu_ps(32767, datPtr18+2520+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1794 = _mm512_maskz_loadu_ps(32767, datPtr18+2744+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1795 = _mm512_maskz_loadu_ps(32767, datPtr18+2968+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1796 = _mm512_maskz_loadu_ps(32767, datPtr18+3192+100864*i38+12608*k108+224*h41+4*w50+0*b62);
__m512 dat1797 = _mm512_maskz_loadu_ps(32767, datPtr18+3416+100864*i38+12608*k108+224*h41+4*w50+0*b62);
// Lane-wise add/subtract butterflies across the sixteen loaded vectors.
// The loads form two interleaved sets handled on alternating lines:
// dat1782, dat1784, ..., dat1796 feed fft84xx and
// dat1783, dat1785, ..., dat1797 feed fft85xx.
// First level: each vector is paired with the one eight loads later.
__m512 fft8401 = _mm512_add_ps(dat1782, dat1790);
__m512 fft8489 = _mm512_add_ps(dat1783, dat1791);
__m512 fft8402 = _mm512_sub_ps(dat1782, dat1790);
__m512 fft8490 = _mm512_sub_ps(dat1783, dat1791);
__m512 fft8403 = _mm512_add_ps(dat1784, dat1792);
__m512 fft8491 = _mm512_add_ps(dat1785, dat1793);
__m512 fft8404 = _mm512_sub_ps(dat1784, dat1792);
__m512 fft8492 = _mm512_sub_ps(dat1785, dat1793);
__m512 fft8405 = _mm512_add_ps(dat1786, dat1794);
__m512 fft8493 = _mm512_add_ps(dat1787, dat1795);
__m512 fft8406 = _mm512_sub_ps(dat1786, dat1794);
__m512 fft8494 = _mm512_sub_ps(dat1787, dat1795);
__m512 fft8407 = _mm512_add_ps(dat1788, dat1796);
__m512 fft8495 = _mm512_add_ps(dat1789, dat1797);
__m512 fft8408 = _mm512_sub_ps(dat1788, dat1796);
__m512 fft8496 = _mm512_sub_ps(dat1789, dat1797);
// Second level: combine the first-level sums and differences. Note the
// reversed operand order in fft8412/fft8500 (fft8407 - fft8403).
__m512 fft8409 = _mm512_add_ps(fft8401, fft8405);
__m512 fft8497 = _mm512_add_ps(fft8489, fft8493);
__m512 fft8410 = _mm512_sub_ps(fft8401, fft8405);
__m512 fft8498 = _mm512_sub_ps(fft8489, fft8493);
__m512 fft8411 = _mm512_add_ps(fft8403, fft8407);
__m512 fft8499 = _mm512_add_ps(fft8491, fft8495);
__m512 fft8412 = _mm512_sub_ps(fft8407, fft8403);
__m512 fft8500 = _mm512_sub_ps(fft8495, fft8491);
__m512 fft8413 = _mm512_sub_ps(fft8404, fft8408);
__m512 fft8501 = _mm512_sub_ps(fft8492, fft8496);
__m512 fft8414 = _mm512_add_ps(fft8404, fft8408);
__m512 fft8502 = _mm512_add_ps(fft8492, fft8496);
// Third level: final sum/difference, plus terms scaled by 7.0710677e-01f
// (approximately sqrt(2)/2). fnmsub computes -(a*b) - c; fnmadd computes -(a*b) + c.
__m512 fft8415 = _mm512_add_ps(fft8409, fft8411);
__m512 fft8503 = _mm512_add_ps(fft8497, fft8499);
__m512 fft8416 = _mm512_sub_ps(fft8409, fft8411);
__m512 fft8504 = _mm512_sub_ps(fft8497, fft8499);
__m512 fft8417 = _mm512_fmadd_ps(fft8413, _mm512_set1_ps(7.0710677e-01f), fft8402);
__m512 fft8505 = _mm512_fmadd_ps(fft8501, _mm512_set1_ps(7.0710677e-01f), fft8490);
__m512 fft8418 = _mm512_fnmsub_ps(fft8414, _mm512_set1_ps(7.0710677e-01f), fft8406);
__m512 fft8506 = _mm512_fnmsub_ps(fft8502, _mm512_set1_ps(7.0710677e-01f), fft8494);
__m512 fft8419 = _mm512_fnmadd_ps(fft8413, _mm512_set1_ps(7.0710677e-01f), fft8402);
__m512 fft8507 = _mm512_fnmadd_ps(fft8501, _mm512_set1_ps(7.0710677e-01f), fft8490);
__m512 fft8420 = _mm512_fnmadd_ps(fft8414, _mm512_set1_ps(7.0710677e-01f), fft8406);
__m512 fft8508 = _mm512_fnmadd_ps(fft8502, _mm512_set1_ps(7.0710677e-01f), fft8494);
// In-register combine stages for the sixteen vectors fft8415..fft8420,
// fft8410, fft8412 and their fft85xx counterparts. Two parallel sets
// (fft84xx and fft85xx) go through identical steps on alternating lines.
//
// Step 1: shuffle immediate 78 swaps the two 256-bit halves of a vector.
// fft8421 is +1 in lanes 0..7 and -1 in lanes 8..15, so lanes 0..7 receive
// (low half + high half) and lanes 8..15 receive (low half - high half).
__m512 fft8421 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8422 = _mm512_fmadd_ps(fft8415, fft8421, _mm512_shuffle_f32x4(fft8415, fft8415, 78));
__m512 fft8509 = _mm512_fmadd_ps(fft8503, fft8421, _mm512_shuffle_f32x4(fft8503, fft8503, 78));
__m512 fft8423 = _mm512_fmadd_ps(fft8416, fft8421, _mm512_shuffle_f32x4(fft8416, fft8416, 78));
__m512 fft8510 = _mm512_fmadd_ps(fft8504, fft8421, _mm512_shuffle_f32x4(fft8504, fft8504, 78));
__m512 fft8424 = _mm512_fmadd_ps(fft8417, fft8421, _mm512_shuffle_f32x4(fft8417, fft8417, 78));
__m512 fft8511 = _mm512_fmadd_ps(fft8505, fft8421, _mm512_shuffle_f32x4(fft8505, fft8505, 78));
__m512 fft8425 = _mm512_fmadd_ps(fft8418, fft8421, _mm512_shuffle_f32x4(fft8418, fft8418, 78));
__m512 fft8512 = _mm512_fmadd_ps(fft8506, fft8421, _mm512_shuffle_f32x4(fft8506, fft8506, 78));
__m512 fft8426 = _mm512_fmadd_ps(fft8410, fft8421, _mm512_shuffle_f32x4(fft8410, fft8410, 78));
__m512 fft8513 = _mm512_fmadd_ps(fft8498, fft8421, _mm512_shuffle_f32x4(fft8498, fft8498, 78));
__m512 fft8427 = _mm512_fmadd_ps(fft8412, fft8421, _mm512_shuffle_f32x4(fft8412, fft8412, 78));
__m512 fft8514 = _mm512_fmadd_ps(fft8500, fft8421, _mm512_shuffle_f32x4(fft8500, fft8500, 78));
__m512 fft8428 = _mm512_fmadd_ps(fft8419, fft8421, _mm512_shuffle_f32x4(fft8419, fft8419, 78));
__m512 fft8515 = _mm512_fmadd_ps(fft8507, fft8421, _mm512_shuffle_f32x4(fft8507, fft8507, 78));
__m512 fft8429 = _mm512_fmadd_ps(fft8420, fft8421, _mm512_shuffle_f32x4(fft8420, fft8420, 78));
__m512 fft8516 = _mm512_fmadd_ps(fft8508, fft8421, _mm512_shuffle_f32x4(fft8508, fft8508, 78));
// Step 2: per-lane rotation of consecutive pairs (X, Y) = (fft8422, fft8423), ...
// With c = fft8430 and s = fft8439 this computes X*c + Y*s and Y*c - X*s,
// which equals (X + iY)*(c - i*s) if each pair is read as (real, imaginary).
// Lanes 0..7 have c = 1, s = 0 and pass through unchanged.
__m512 fft8430 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8431 = _mm512_mul_ps(fft8422, fft8430);
__m512 fft8517 = _mm512_mul_ps(fft8509, fft8430);
__m512 fft8432 = _mm512_mul_ps(fft8423, fft8430);
__m512 fft8518 = _mm512_mul_ps(fft8510, fft8430);
__m512 fft8433 = _mm512_mul_ps(fft8424, fft8430);
__m512 fft8519 = _mm512_mul_ps(fft8511, fft8430);
__m512 fft8434 = _mm512_mul_ps(fft8425, fft8430);
__m512 fft8520 = _mm512_mul_ps(fft8512, fft8430);
__m512 fft8435 = _mm512_mul_ps(fft8426, fft8430);
__m512 fft8521 = _mm512_mul_ps(fft8513, fft8430);
__m512 fft8436 = _mm512_mul_ps(fft8427, fft8430);
__m512 fft8522 = _mm512_mul_ps(fft8514, fft8430);
__m512 fft8437 = _mm512_mul_ps(fft8428, fft8430);
__m512 fft8523 = _mm512_mul_ps(fft8515, fft8430);
__m512 fft8438 = _mm512_mul_ps(fft8429, fft8430);
__m512 fft8524 = _mm512_mul_ps(fft8516, fft8430);
__m512 fft8439 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8440 = _mm512_fmadd_ps(fft8423, fft8439, fft8431);
__m512 fft8525 = _mm512_fmadd_ps(fft8510, fft8439, fft8517);
__m512 fft8441 = _mm512_fnmadd_ps(fft8422, fft8439, fft8432);
__m512 fft8526 = _mm512_fnmadd_ps(fft8509, fft8439, fft8518);
__m512 fft8442 = _mm512_fmadd_ps(fft8425, fft8439, fft8433);
__m512 fft8527 = _mm512_fmadd_ps(fft8512, fft8439, fft8519);
__m512 fft8443 = _mm512_fnmadd_ps(fft8424, fft8439, fft8434);
__m512 fft8528 = _mm512_fnmadd_ps(fft8511, fft8439, fft8520);
__m512 fft8444 = _mm512_fmadd_ps(fft8427, fft8439, fft8435);
__m512 fft8529 = _mm512_fmadd_ps(fft8514, fft8439, fft8521);
__m512 fft8445 = _mm512_fnmadd_ps(fft8426, fft8439, fft8436);
__m512 fft8530 = _mm512_fnmadd_ps(fft8513, fft8439, fft8522);
__m512 fft8446 = _mm512_fmadd_ps(fft8429, fft8439, fft8437);
__m512 fft8531 = _mm512_fmadd_ps(fft8516, fft8439, fft8523);
__m512 fft8447 = _mm512_fnmadd_ps(fft8428, fft8439, fft8438);
__m512 fft8532 = _mm512_fnmadd_ps(fft8515, fft8439, fft8524);
// Step 3: shuffle immediate 177 swaps adjacent 128-bit lanes. fft8448 is +1 in
// lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15, giving sum/difference of
// the two 128-bit lanes inside each 256-bit half.
__m512 fft8448 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8449 = _mm512_fmadd_ps(fft8440, fft8448, _mm512_shuffle_f32x4(fft8440, fft8440, 177));
__m512 fft8533 = _mm512_fmadd_ps(fft8525, fft8448, _mm512_shuffle_f32x4(fft8525, fft8525, 177));
__m512 fft8450 = _mm512_fmadd_ps(fft8441, fft8448, _mm512_shuffle_f32x4(fft8441, fft8441, 177));
__m512 fft8534 = _mm512_fmadd_ps(fft8526, fft8448, _mm512_shuffle_f32x4(fft8526, fft8526, 177));
__m512 fft8451 = _mm512_fmadd_ps(fft8442, fft8448, _mm512_shuffle_f32x4(fft8442, fft8442, 177));
__m512 fft8535 = _mm512_fmadd_ps(fft8527, fft8448, _mm512_shuffle_f32x4(fft8527, fft8527, 177));
__m512 fft8452 = _mm512_fmadd_ps(fft8443, fft8448, _mm512_shuffle_f32x4(fft8443, fft8443, 177));
__m512 fft8536 = _mm512_fmadd_ps(fft8528, fft8448, _mm512_shuffle_f32x4(fft8528, fft8528, 177));
__m512 fft8453 = _mm512_fmadd_ps(fft8444, fft8448, _mm512_shuffle_f32x4(fft8444, fft8444, 177));
__m512 fft8537 = _mm512_fmadd_ps(fft8529, fft8448, _mm512_shuffle_f32x4(fft8529, fft8529, 177));
__m512 fft8454 = _mm512_fmadd_ps(fft8445, fft8448, _mm512_shuffle_f32x4(fft8445, fft8445, 177));
__m512 fft8538 = _mm512_fmadd_ps(fft8530, fft8448, _mm512_shuffle_f32x4(fft8530, fft8530, 177));
__m512 fft8455 = _mm512_fmadd_ps(fft8446, fft8448, _mm512_shuffle_f32x4(fft8446, fft8446, 177));
__m512 fft8539 = _mm512_fmadd_ps(fft8531, fft8448, _mm512_shuffle_f32x4(fft8531, fft8531, 177));
__m512 fft8456 = _mm512_fmadd_ps(fft8447, fft8448, _mm512_shuffle_f32x4(fft8447, fft8447, 177));
__m512 fft8540 = _mm512_fmadd_ps(fft8532, fft8448, _mm512_shuffle_f32x4(fft8532, fft8532, 177));
// Step 4: mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes the
// pair (X, Y) becomes (Y, 0 - X); all other lanes are left as they were.
__m512 fft8457 = _mm512_mask_mov_ps(fft8449, 49344, fft8450);
__m512 fft8541 = _mm512_mask_mov_ps(fft8533, 49344, fft8534);
__m512 fft8458 = _mm512_mask_sub_ps(fft8450, 49344, _mm512_setzero_ps(), fft8449);
__m512 fft8542 = _mm512_mask_sub_ps(fft8534, 49344, _mm512_setzero_ps(), fft8533);
__m512 fft8459 = _mm512_mask_mov_ps(fft8451, 49344, fft8452);
__m512 fft8543 = _mm512_mask_mov_ps(fft8535, 49344, fft8536);
__m512 fft8460 = _mm512_mask_sub_ps(fft8452, 49344, _mm512_setzero_ps(), fft8451);
__m512 fft8544 = _mm512_mask_sub_ps(fft8536, 49344, _mm512_setzero_ps(), fft8535);
__m512 fft8461 = _mm512_mask_mov_ps(fft8453, 49344, fft8454);
__m512 fft8545 = _mm512_mask_mov_ps(fft8537, 49344, fft8538);
__m512 fft8462 = _mm512_mask_sub_ps(fft8454, 49344, _mm512_setzero_ps(), fft8453);
__m512 fft8546 = _mm512_mask_sub_ps(fft8538, 49344, _mm512_setzero_ps(), fft8537);
__m512 fft8463 = _mm512_mask_mov_ps(fft8455, 49344, fft8456);
__m512 fft8547 = _mm512_mask_mov_ps(fft8539, 49344, fft8540);
__m512 fft8464 = _mm512_mask_sub_ps(fft8456, 49344, _mm512_setzero_ps(), fft8455);
__m512 fft8548 = _mm512_mask_sub_ps(fft8540, 49344, _mm512_setzero_ps(), fft8539);
// Step 5: _mm512_shuffle_ps with immediate 78 swaps the two 64-bit halves of
// every 128-bit lane. fft8465 is +1 on elements 0,1 and -1 on elements 2,3 of
// each 128-bit lane, giving sum/difference of those halves.
__m512 fft8465 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8466 = _mm512_fmadd_ps(fft8457, fft8465, _mm512_shuffle_ps(fft8457, fft8457, 78));
__m512 fft8549 = _mm512_fmadd_ps(fft8541, fft8465, _mm512_shuffle_ps(fft8541, fft8541, 78));
__m512 fft8467 = _mm512_fmadd_ps(fft8458, fft8465, _mm512_shuffle_ps(fft8458, fft8458, 78));
__m512 fft8550 = _mm512_fmadd_ps(fft8542, fft8465, _mm512_shuffle_ps(fft8542, fft8542, 78));
__m512 fft8468 = _mm512_fmadd_ps(fft8459, fft8465, _mm512_shuffle_ps(fft8459, fft8459, 78));
__m512 fft8551 = _mm512_fmadd_ps(fft8543, fft8465, _mm512_shuffle_ps(fft8543, fft8543, 78));
__m512 fft8469 = _mm512_fmadd_ps(fft8460, fft8465, _mm512_shuffle_ps(fft8460, fft8460, 78));
__m512 fft8552 = _mm512_fmadd_ps(fft8544, fft8465, _mm512_shuffle_ps(fft8544, fft8544, 78));
__m512 fft8470 = _mm512_fmadd_ps(fft8461, fft8465, _mm512_shuffle_ps(fft8461, fft8461, 78));
__m512 fft8553 = _mm512_fmadd_ps(fft8545, fft8465, _mm512_shuffle_ps(fft8545, fft8545, 78));
__m512 fft8471 = _mm512_fmadd_ps(fft8462, fft8465, _mm512_shuffle_ps(fft8462, fft8462, 78));
__m512 fft8554 = _mm512_fmadd_ps(fft8546, fft8465, _mm512_shuffle_ps(fft8546, fft8546, 78));
__m512 fft8472 = _mm512_fmadd_ps(fft8463, fft8465, _mm512_shuffle_ps(fft8463, fft8463, 78));
__m512 fft8555 = _mm512_fmadd_ps(fft8547, fft8465, _mm512_shuffle_ps(fft8547, fft8547, 78));
__m512 fft8473 = _mm512_fmadd_ps(fft8464, fft8465, _mm512_shuffle_ps(fft8464, fft8464, 78));
__m512 fft8556 = _mm512_fmadd_ps(fft8548, fft8465, _mm512_shuffle_ps(fft8548, fft8548, 78));
// Step 6: only the first pair of each set (fft8466/fft8467 and fft8549/fft8550)
// gets this extra pass; fft8468..fft8473 and fft8551..fft8556 are left as is.
// Each input is permuted two ways (index vectors fft8474 and fft8476), then
// combined with fft8480 (0 in lanes 0,1,8,9; elsewhere +1 in even lanes and
// -1 in odd lanes).
__m512i fft8474 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8475 = _mm512_permutexvar_ps(fft8474, fft8466);
__m512 fft8557 = _mm512_permutexvar_ps(fft8474, fft8549);
__m512i fft8476 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8477 = _mm512_permutexvar_ps(fft8476, fft8466);
__m512 fft8558 = _mm512_permutexvar_ps(fft8476, fft8549);
__m512 fft8478 = _mm512_permutexvar_ps(fft8474, fft8467);
__m512 fft8559 = _mm512_permutexvar_ps(fft8474, fft8550);
__m512 fft8479 = _mm512_permutexvar_ps(fft8476, fft8467);
__m512 fft8560 = _mm512_permutexvar_ps(fft8476, fft8550);
__m512 fft8480 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8481 = _mm512_fmadd_ps(fft8475, fft8480, fft8477);
__m512 fft8561 = _mm512_fmadd_ps(fft8557, fft8480, fft8558);
__m512 fft8482 = _mm512_fnmadd_ps(fft8479, fft8480, fft8478);
__m512 fft8562 = _mm512_fnmadd_ps(fft8560, fft8480, fft8559);
// Lane-wise blends. Masks: 21845 = 0x5555 (even lanes),
// 43176 = 0xA8A8 (lanes 3,5,7,11,13,15), 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft8483 = _mm512_mask_mov_ps(fft8479, 21845, fft8481);
__m512 fft8563 = _mm512_mask_mov_ps(fft8560, 21845, fft8561);
__m512 fft8484 = _mm512_mask_mov_ps(fft8475, 43176, fft8481);
__m512 fft8564 = _mm512_mask_mov_ps(fft8557, 43176, fft8561);
__m512 fft8485 = _mm512_mask_mov_ps(fft8483, 43176, fft8482);
__m512 fft8565 = _mm512_mask_mov_ps(fft8563, 43176, fft8562);
__m512 fft8486 = _mm512_mask_mov_ps(fft8484, 22102, fft8482);
__m512 fft8566 = _mm512_mask_mov_ps(fft8564, 22102, fft8562);
// Mask 64764 = 0xFCFC: halve lanes 2..7 and 10..15; lanes 0,1,8,9 are unscaled.
__m512 fft8487 = _mm512_mask_mul_ps(fft8485, 64764, fft8485, _mm512_set1_ps(5e-01f));
__m512 fft8567 = _mm512_mask_mul_ps(fft8565, 64764, fft8565, _mm512_set1_ps(5e-01f));
__m512 fft8488 = _mm512_mask_mul_ps(fft8486, 64764, fft8486, _mm512_set1_ps(5e-01f));
__m512 fft8568 = _mm512_mask_mul_ps(fft8566, 64764, fft8566, _mm512_set1_ps(5e-01f));
// Write sixteen result vectors to dfPtr8. df757..df764 alias the fft84xx
// results and df765..df772 the fft85xx results.
// The store offsets use m51 and f54 (derived from b62) plus i38, j31, k108.
__m512 df757 = fft8487;
__m512 df765 = fft8567;
__m512 df758 = fft8488;
__m512 df766 = fft8568;
__m512 df759 = fft8468;
__m512 df767 = fft8551;
__m512 df760 = fft8469;
__m512 df768 = fft8552;
__m512 df761 = fft8470;
__m512 df769 = fft8553;
__m512 df762 = fft8471;
__m512 df770 = fft8554;
__m512 df763 = fft8472;
__m512 df771 = fft8555;
__m512 df764 = fft8473;
__m512 df772 = fft8556;
// eo51 moves even-indexed lanes into lanes 0..7 and odd-indexed lanes into
// lanes 8..15. It is applied to every vector except df757, df758, df765, df766.
__m512i eo51 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Each vector is written by two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 writes lanes 8..15 at an offset that is larger by 1048544.
// NOTE(review): 1048544 = 1048576 - 32; if dfPtr8 is a byte pointer, lane 8 of
// a masked store lands 32 bytes past the given address, so the two halves would
// sit exactly 1048576 bytes apart -- confirm against dfPtr8's declaration.
df759 = _mm512_permutexvar_ps(eo51, df759);
df760 = _mm512_permutexvar_ps(eo51, df760);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df759);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df760);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df759);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df760);
// The second set (df765..df772) uses base offsets 2097152 larger than the first.
df767 = _mm512_permutexvar_ps(eo51, df767);
df768 = _mm512_permutexvar_ps(eo51, df768);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df767);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df768);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df767);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df768);
df761 = _mm512_permutexvar_ps(eo51, df761);
df762 = _mm512_permutexvar_ps(eo51, df762);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df761);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df762);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df761);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df762);
df769 = _mm512_permutexvar_ps(eo51, df769);
df770 = _mm512_permutexvar_ps(eo51, df770);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df769);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df770);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df769);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df770);
df763 = _mm512_permutexvar_ps(eo51, df763);
df764 = _mm512_permutexvar_ps(eo51, df764);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df763);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df764);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df763);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df764);
df771 = _mm512_permutexvar_ps(eo51, df771);
df772 = _mm512_permutexvar_ps(eo51, df772);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df771);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df772);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df771);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df772);
// df757, df758, df765 and df766 are stored without the eo51 permutation.
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df757);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df758);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df757);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df758);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df765);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k108+128*m51+32*f54, 255, df766);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df765);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k108+128*m51+32*f54, 65280, df766);
// Section for b63 = 2. m52 = b63/2 = 1 and f55 = b63%2 = 0 are sub-indices
// derived from b63; their use lies beyond this group of statements.
ptrdiff_t b63 = 2;
ptrdiff_t m52 = (size_t)b63/2;
ptrdiff_t f55 = (size_t)b63%2;
// Sixteen masked loads from datPtr18, each 224 pointer units after the
// previous one, starting at offset 3024. Mask 65534 (0xFFFE) loads lanes 1..15
// and zeroes lane 0. The 0*b63 term contributes nothing to the address.
// NOTE(review): 224 would be one row of 56 floats if datPtr18 is a byte
// pointer -- confirm against its declaration.
__m512 dat1798 = _mm512_maskz_loadu_ps(65534, datPtr18+3024+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1799 = _mm512_maskz_loadu_ps(65534, datPtr18+3248+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1800 = _mm512_maskz_loadu_ps(65534, datPtr18+3472+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1801 = _mm512_maskz_loadu_ps(65534, datPtr18+3696+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1802 = _mm512_maskz_loadu_ps(65534, datPtr18+3920+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1803 = _mm512_maskz_loadu_ps(65534, datPtr18+4144+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1804 = _mm512_maskz_loadu_ps(65534, datPtr18+4368+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1805 = _mm512_maskz_loadu_ps(65534, datPtr18+4592+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1806 = _mm512_maskz_loadu_ps(65534, datPtr18+4816+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1807 = _mm512_maskz_loadu_ps(65534, datPtr18+5040+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1808 = _mm512_maskz_loadu_ps(65534, datPtr18+5264+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1809 = _mm512_maskz_loadu_ps(65534, datPtr18+5488+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1810 = _mm512_maskz_loadu_ps(65534, datPtr18+5712+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1811 = _mm512_maskz_loadu_ps(65534, datPtr18+5936+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1812 = _mm512_maskz_loadu_ps(65534, datPtr18+6160+100864*i38+12608*k108+224*h41+4*w50+0*b63);
__m512 dat1813 = _mm512_maskz_loadu_ps(65534, datPtr18+6384+100864*i38+12608*k108+224*h41+4*w50+0*b63);
// Lane-wise add/subtract butterflies across the sixteen loaded vectors.
// The loads form two interleaved sets handled on alternating lines:
// dat1798, dat1800, ..., dat1812 feed fft85xx and
// dat1799, dat1801, ..., dat1813 feed fft86xx.
// First level: each vector is paired with the one eight loads later.
__m512 fft8569 = _mm512_add_ps(dat1798, dat1806);
__m512 fft8657 = _mm512_add_ps(dat1799, dat1807);
__m512 fft8570 = _mm512_sub_ps(dat1798, dat1806);
__m512 fft8658 = _mm512_sub_ps(dat1799, dat1807);
__m512 fft8571 = _mm512_add_ps(dat1800, dat1808);
__m512 fft8659 = _mm512_add_ps(dat1801, dat1809);
__m512 fft8572 = _mm512_sub_ps(dat1800, dat1808);
__m512 fft8660 = _mm512_sub_ps(dat1801, dat1809);
__m512 fft8573 = _mm512_add_ps(dat1802, dat1810);
__m512 fft8661 = _mm512_add_ps(dat1803, dat1811);
__m512 fft8574 = _mm512_sub_ps(dat1802, dat1810);
__m512 fft8662 = _mm512_sub_ps(dat1803, dat1811);
__m512 fft8575 = _mm512_add_ps(dat1804, dat1812);
__m512 fft8663 = _mm512_add_ps(dat1805, dat1813);
__m512 fft8576 = _mm512_sub_ps(dat1804, dat1812);
__m512 fft8664 = _mm512_sub_ps(dat1805, dat1813);
// Second level: combine the first-level sums and differences. Note the
// reversed operand order in fft8580/fft8668 (fft8575 - fft8571).
__m512 fft8577 = _mm512_add_ps(fft8569, fft8573);
__m512 fft8665 = _mm512_add_ps(fft8657, fft8661);
__m512 fft8578 = _mm512_sub_ps(fft8569, fft8573);
__m512 fft8666 = _mm512_sub_ps(fft8657, fft8661);
__m512 fft8579 = _mm512_add_ps(fft8571, fft8575);
__m512 fft8667 = _mm512_add_ps(fft8659, fft8663);
__m512 fft8580 = _mm512_sub_ps(fft8575, fft8571);
__m512 fft8668 = _mm512_sub_ps(fft8663, fft8659);
__m512 fft8581 = _mm512_sub_ps(fft8572, fft8576);
__m512 fft8669 = _mm512_sub_ps(fft8660, fft8664);
__m512 fft8582 = _mm512_add_ps(fft8572, fft8576);
__m512 fft8670 = _mm512_add_ps(fft8660, fft8664);
// Third level: final sum/difference, plus terms scaled by 7.0710677e-01f
// (approximately sqrt(2)/2). fnmsub computes -(a*b) - c; fnmadd computes -(a*b) + c.
__m512 fft8583 = _mm512_add_ps(fft8577, fft8579);
__m512 fft8671 = _mm512_add_ps(fft8665, fft8667);
__m512 fft8584 = _mm512_sub_ps(fft8577, fft8579);
__m512 fft8672 = _mm512_sub_ps(fft8665, fft8667);
__m512 fft8585 = _mm512_fmadd_ps(fft8581, _mm512_set1_ps(7.0710677e-01f), fft8570);
__m512 fft8673 = _mm512_fmadd_ps(fft8669, _mm512_set1_ps(7.0710677e-01f), fft8658);
__m512 fft8586 = _mm512_fnmsub_ps(fft8582, _mm512_set1_ps(7.0710677e-01f), fft8574);
__m512 fft8674 = _mm512_fnmsub_ps(fft8670, _mm512_set1_ps(7.0710677e-01f), fft8662);
__m512 fft8587 = _mm512_fnmadd_ps(fft8581, _mm512_set1_ps(7.0710677e-01f), fft8570);
__m512 fft8675 = _mm512_fnmadd_ps(fft8669, _mm512_set1_ps(7.0710677e-01f), fft8658);
__m512 fft8588 = _mm512_fnmadd_ps(fft8582, _mm512_set1_ps(7.0710677e-01f), fft8574);
__m512 fft8676 = _mm512_fnmadd_ps(fft8670, _mm512_set1_ps(7.0710677e-01f), fft8662);
__m512 fft8589 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8590 = _mm512_fmadd_ps(fft8583, fft8589, _mm512_shuffle_f32x4(fft8583, fft8583, 78));
__m512 fft8677 = _mm512_fmadd_ps(fft8671, fft8589, _mm512_shuffle_f32x4(fft8671, fft8671, 78));
__m512 fft8591 = _mm512_fmadd_ps(fft8584, fft8589, _mm512_shuffle_f32x4(fft8584, fft8584, 78));
__m512 fft8678 = _mm512_fmadd_ps(fft8672, fft8589, _mm512_shuffle_f32x4(fft8672, fft8672, 78));
__m512 fft8592 = _mm512_fmadd_ps(fft8585, fft8589, _mm512_shuffle_f32x4(fft8585, fft8585, 78));
__m512 fft8679 = _mm512_fmadd_ps(fft8673, fft8589, _mm512_shuffle_f32x4(fft8673, fft8673, 78));
__m512 fft8593 = _mm512_fmadd_ps(fft8586, fft8589, _mm512_shuffle_f32x4(fft8586, fft8586, 78));
__m512 fft8680 = _mm512_fmadd_ps(fft8674, fft8589, _mm512_shuffle_f32x4(fft8674, fft8674, 78));
__m512 fft8594 = _mm512_fmadd_ps(fft8578, fft8589, _mm512_shuffle_f32x4(fft8578, fft8578, 78));
__m512 fft8681 = _mm512_fmadd_ps(fft8666, fft8589, _mm512_shuffle_f32x4(fft8666, fft8666, 78));
__m512 fft8595 = _mm512_fmadd_ps(fft8580, fft8589, _mm512_shuffle_f32x4(fft8580, fft8580, 78));
__m512 fft8682 = _mm512_fmadd_ps(fft8668, fft8589, _mm512_shuffle_f32x4(fft8668, fft8668, 78));
__m512 fft8596 = _mm512_fmadd_ps(fft8587, fft8589, _mm512_shuffle_f32x4(fft8587, fft8587, 78));
__m512 fft8683 = _mm512_fmadd_ps(fft8675, fft8589, _mm512_shuffle_f32x4(fft8675, fft8675, 78));
__m512 fft8597 = _mm512_fmadd_ps(fft8588, fft8589, _mm512_shuffle_f32x4(fft8588, fft8588, 78));
__m512 fft8684 = _mm512_fmadd_ps(fft8676, fft8589, _mm512_shuffle_f32x4(fft8676, fft8676, 78));
__m512 fft8598 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8599 = _mm512_mul_ps(fft8590, fft8598);
__m512 fft8685 = _mm512_mul_ps(fft8677, fft8598);
__m512 fft8600 = _mm512_mul_ps(fft8591, fft8598);
__m512 fft8686 = _mm512_mul_ps(fft8678, fft8598);
__m512 fft8601 = _mm512_mul_ps(fft8592, fft8598);
__m512 fft8687 = _mm512_mul_ps(fft8679, fft8598);
__m512 fft8602 = _mm512_mul_ps(fft8593, fft8598);
__m512 fft8688 = _mm512_mul_ps(fft8680, fft8598);
__m512 fft8603 = _mm512_mul_ps(fft8594, fft8598);
__m512 fft8689 = _mm512_mul_ps(fft8681, fft8598);
__m512 fft8604 = _mm512_mul_ps(fft8595, fft8598);
__m512 fft8690 = _mm512_mul_ps(fft8682, fft8598);
__m512 fft8605 = _mm512_mul_ps(fft8596, fft8598);
__m512 fft8691 = _mm512_mul_ps(fft8683, fft8598);
__m512 fft8606 = _mm512_mul_ps(fft8597, fft8598);
__m512 fft8692 = _mm512_mul_ps(fft8684, fft8598);
__m512 fft8607 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8608 = _mm512_fmadd_ps(fft8591, fft8607, fft8599);
__m512 fft8693 = _mm512_fmadd_ps(fft8678, fft8607, fft8685);
__m512 fft8609 = _mm512_fnmadd_ps(fft8590, fft8607, fft8600);
__m512 fft8694 = _mm512_fnmadd_ps(fft8677, fft8607, fft8686);
__m512 fft8610 = _mm512_fmadd_ps(fft8593, fft8607, fft8601);
__m512 fft8695 = _mm512_fmadd_ps(fft8680, fft8607, fft8687);
__m512 fft8611 = _mm512_fnmadd_ps(fft8592, fft8607, fft8602);
__m512 fft8696 = _mm512_fnmadd_ps(fft8679, fft8607, fft8688);
__m512 fft8612 = _mm512_fmadd_ps(fft8595, fft8607, fft8603);
__m512 fft8697 = _mm512_fmadd_ps(fft8682, fft8607, fft8689);
__m512 fft8613 = _mm512_fnmadd_ps(fft8594, fft8607, fft8604);
__m512 fft8698 = _mm512_fnmadd_ps(fft8681, fft8607, fft8690);
__m512 fft8614 = _mm512_fmadd_ps(fft8597, fft8607, fft8605);
__m512 fft8699 = _mm512_fmadd_ps(fft8684, fft8607, fft8691);
__m512 fft8615 = _mm512_fnmadd_ps(fft8596, fft8607, fft8606);
__m512 fft8700 = _mm512_fnmadd_ps(fft8683, fft8607, fft8692);
__m512 fft8616 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8617 = _mm512_fmadd_ps(fft8608, fft8616, _mm512_shuffle_f32x4(fft8608, fft8608, 177));
__m512 fft8701 = _mm512_fmadd_ps(fft8693, fft8616, _mm512_shuffle_f32x4(fft8693, fft8693, 177));
__m512 fft8618 = _mm512_fmadd_ps(fft8609, fft8616, _mm512_shuffle_f32x4(fft8609, fft8609, 177));
__m512 fft8702 = _mm512_fmadd_ps(fft8694, fft8616, _mm512_shuffle_f32x4(fft8694, fft8694, 177));
__m512 fft8619 = _mm512_fmadd_ps(fft8610, fft8616, _mm512_shuffle_f32x4(fft8610, fft8610, 177));
__m512 fft8703 = _mm512_fmadd_ps(fft8695, fft8616, _mm512_shuffle_f32x4(fft8695, fft8695, 177));
__m512 fft8620 = _mm512_fmadd_ps(fft8611, fft8616, _mm512_shuffle_f32x4(fft8611, fft8611, 177));
__m512 fft8704 = _mm512_fmadd_ps(fft8696, fft8616, _mm512_shuffle_f32x4(fft8696, fft8696, 177));
__m512 fft8621 = _mm512_fmadd_ps(fft8612, fft8616, _mm512_shuffle_f32x4(fft8612, fft8612, 177));
__m512 fft8705 = _mm512_fmadd_ps(fft8697, fft8616, _mm512_shuffle_f32x4(fft8697, fft8697, 177));
__m512 fft8622 = _mm512_fmadd_ps(fft8613, fft8616, _mm512_shuffle_f32x4(fft8613, fft8613, 177));
__m512 fft8706 = _mm512_fmadd_ps(fft8698, fft8616, _mm512_shuffle_f32x4(fft8698, fft8698, 177));
__m512 fft8623 = _mm512_fmadd_ps(fft8614, fft8616, _mm512_shuffle_f32x4(fft8614, fft8614, 177));
__m512 fft8707 = _mm512_fmadd_ps(fft8699, fft8616, _mm512_shuffle_f32x4(fft8699, fft8699, 177));
__m512 fft8624 = _mm512_fmadd_ps(fft8615, fft8616, _mm512_shuffle_f32x4(fft8615, fft8615, 177));
__m512 fft8708 = _mm512_fmadd_ps(fft8700, fft8616, _mm512_shuffle_f32x4(fft8700, fft8700, 177));
__m512 fft8625 = _mm512_mask_mov_ps(fft8617, 49344, fft8618);
__m512 fft8709 = _mm512_mask_mov_ps(fft8701, 49344, fft8702);
__m512 fft8626 = _mm512_mask_sub_ps(fft8618, 49344, _mm512_setzero_ps(), fft8617);
__m512 fft8710 = _mm512_mask_sub_ps(fft8702, 49344, _mm512_setzero_ps(), fft8701);
__m512 fft8627 = _mm512_mask_mov_ps(fft8619, 49344, fft8620);
__m512 fft8711 = _mm512_mask_mov_ps(fft8703, 49344, fft8704);
__m512 fft8628 = _mm512_mask_sub_ps(fft8620, 49344, _mm512_setzero_ps(), fft8619);
__m512 fft8712 = _mm512_mask_sub_ps(fft8704, 49344, _mm512_setzero_ps(), fft8703);
__m512 fft8629 = _mm512_mask_mov_ps(fft8621, 49344, fft8622);
__m512 fft8713 = _mm512_mask_mov_ps(fft8705, 49344, fft8706);
__m512 fft8630 = _mm512_mask_sub_ps(fft8622, 49344, _mm512_setzero_ps(), fft8621);
__m512 fft8714 = _mm512_mask_sub_ps(fft8706, 49344, _mm512_setzero_ps(), fft8705);
__m512 fft8631 = _mm512_mask_mov_ps(fft8623, 49344, fft8624);
__m512 fft8715 = _mm512_mask_mov_ps(fft8707, 49344, fft8708);
__m512 fft8632 = _mm512_mask_sub_ps(fft8624, 49344, _mm512_setzero_ps(), fft8623);
__m512 fft8716 = _mm512_mask_sub_ps(fft8708, 49344, _mm512_setzero_ps(), fft8707);
__m512 fft8633 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8634 = _mm512_fmadd_ps(fft8625, fft8633, _mm512_shuffle_ps(fft8625, fft8625, 78));
__m512 fft8717 = _mm512_fmadd_ps(fft8709, fft8633, _mm512_shuffle_ps(fft8709, fft8709, 78));
__m512 fft8635 = _mm512_fmadd_ps(fft8626, fft8633, _mm512_shuffle_ps(fft8626, fft8626, 78));
__m512 fft8718 = _mm512_fmadd_ps(fft8710, fft8633, _mm512_shuffle_ps(fft8710, fft8710, 78));
__m512 fft8636 = _mm512_fmadd_ps(fft8627, fft8633, _mm512_shuffle_ps(fft8627, fft8627, 78));
__m512 fft8719 = _mm512_fmadd_ps(fft8711, fft8633, _mm512_shuffle_ps(fft8711, fft8711, 78));
__m512 fft8637 = _mm512_fmadd_ps(fft8628, fft8633, _mm512_shuffle_ps(fft8628, fft8628, 78));
__m512 fft8720 = _mm512_fmadd_ps(fft8712, fft8633, _mm512_shuffle_ps(fft8712, fft8712, 78));
__m512 fft8638 = _mm512_fmadd_ps(fft8629, fft8633, _mm512_shuffle_ps(fft8629, fft8629, 78));
__m512 fft8721 = _mm512_fmadd_ps(fft8713, fft8633, _mm512_shuffle_ps(fft8713, fft8713, 78));
__m512 fft8639 = _mm512_fmadd_ps(fft8630, fft8633, _mm512_shuffle_ps(fft8630, fft8630, 78));
__m512 fft8722 = _mm512_fmadd_ps(fft8714, fft8633, _mm512_shuffle_ps(fft8714, fft8714, 78));
__m512 fft8640 = _mm512_fmadd_ps(fft8631, fft8633, _mm512_shuffle_ps(fft8631, fft8631, 78));
__m512 fft8723 = _mm512_fmadd_ps(fft8715, fft8633, _mm512_shuffle_ps(fft8715, fft8715, 78));
__m512 fft8641 = _mm512_fmadd_ps(fft8632, fft8633, _mm512_shuffle_ps(fft8632, fft8632, 78));
__m512 fft8724 = _mm512_fmadd_ps(fft8716, fft8633, _mm512_shuffle_ps(fft8716, fft8716, 78));
__m512i fft8642 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8643 = _mm512_permutexvar_ps(fft8642, fft8634);
__m512 fft8725 = _mm512_permutexvar_ps(fft8642, fft8717);
__m512i fft8644 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8645 = _mm512_permutexvar_ps(fft8644, fft8634);
__m512 fft8726 = _mm512_permutexvar_ps(fft8644, fft8717);
__m512 fft8646 = _mm512_permutexvar_ps(fft8642, fft8635);
__m512 fft8727 = _mm512_permutexvar_ps(fft8642, fft8718);
__m512 fft8647 = _mm512_permutexvar_ps(fft8644, fft8635);
__m512 fft8728 = _mm512_permutexvar_ps(fft8644, fft8718);
__m512 fft8648 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8649 = _mm512_fmadd_ps(fft8643, fft8648, fft8645);
__m512 fft8729 = _mm512_fmadd_ps(fft8725, fft8648, fft8726);
__m512 fft8650 = _mm512_fnmadd_ps(fft8647, fft8648, fft8646);
__m512 fft8730 = _mm512_fnmadd_ps(fft8728, fft8648, fft8727);
__m512 fft8651 = _mm512_mask_mov_ps(fft8647, 21845, fft8649);
__m512 fft8731 = _mm512_mask_mov_ps(fft8728, 21845, fft8729);
__m512 fft8652 = _mm512_mask_mov_ps(fft8643, 43176, fft8649);
__m512 fft8732 = _mm512_mask_mov_ps(fft8725, 43176, fft8729);
__m512 fft8653 = _mm512_mask_mov_ps(fft8651, 43176, fft8650);
__m512 fft8733 = _mm512_mask_mov_ps(fft8731, 43176, fft8730);
__m512 fft8654 = _mm512_mask_mov_ps(fft8652, 22102, fft8650);
__m512 fft8734 = _mm512_mask_mov_ps(fft8732, 22102, fft8730);
__m512 fft8655 = _mm512_mask_mul_ps(fft8653, 64764, fft8653, _mm512_set1_ps(5e-01f));
__m512 fft8735 = _mm512_mask_mul_ps(fft8733, 64764, fft8733, _mm512_set1_ps(5e-01f));
__m512 fft8656 = _mm512_mask_mul_ps(fft8654, 64764, fft8654, _mm512_set1_ps(5e-01f));
__m512 fft8736 = _mm512_mask_mul_ps(fft8734, 64764, fft8734, _mm512_set1_ps(5e-01f));
__m512 df773 = fft8655;
__m512 df781 = fft8735;
__m512 df774 = fft8656;
__m512 df782 = fft8736;
__m512 df775 = fft8636;
__m512 df783 = fft8719;
__m512 df776 = fft8637;
__m512 df784 = fft8720;
__m512 df777 = fft8638;
__m512 df785 = fft8721;
__m512 df778 = fft8639;
__m512 df786 = fft8722;
__m512 df779 = fft8640;
__m512 df787 = fft8723;
__m512 df780 = fft8641;
__m512 df788 = fft8724;
__m512i eo52 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df775 = _mm512_permutexvar_ps(eo52, df775);
df776 = _mm512_permutexvar_ps(eo52, df776);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df775);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df776);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df775);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df776);
df783 = _mm512_permutexvar_ps(eo52, df783);
df784 = _mm512_permutexvar_ps(eo52, df784);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df783);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df784);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df783);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df784);
df777 = _mm512_permutexvar_ps(eo52, df777);
df778 = _mm512_permutexvar_ps(eo52, df778);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df777);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df778);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df777);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df778);
df785 = _mm512_permutexvar_ps(eo52, df785);
df786 = _mm512_permutexvar_ps(eo52, df786);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df785);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df786);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df785);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df786);
df779 = _mm512_permutexvar_ps(eo52, df779);
df780 = _mm512_permutexvar_ps(eo52, df780);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df779);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df780);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df779);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df780);
df787 = _mm512_permutexvar_ps(eo52, df787);
df788 = _mm512_permutexvar_ps(eo52, df788);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df787);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df788);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df787);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df788);
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df773);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df774);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df773);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df774);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df781);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k108+128*m52+32*f55, 255, df782);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df781);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k108+128*m52+32*f55, 65280, df782);
// Loop over b64 = 3 and 4. Each iteration loads 16 full-width vectors from
// datPtr18, runs them through the add/sub/FMA network below (the generator
// names these temporaries "fft..."), and writes the results to dfPtr8 with
// masked stores.
// NOTE(review): the exact transform and the meaning of i38/j31/k108/h41/w50
// are defined outside this view -- confirm against the enclosing loops.
for (ptrdiff_t b64 = 3; b64 < 5; ++b64) {
// m53/f56 split b64 into quotient and remainder by 2: (1, 1) for b64 == 3
// and (2, 0) for b64 == 4. They only feed the store addresses below.
ptrdiff_t m53 = (size_t)b64/2;
ptrdiff_t f56 = (size_t)b64%2;
// 16 loads whose constant offsets are 224 apart (2912 .. 6272); the source
// offset advances by 56 per b64. Mask 65535 (0xFFFF) loads all 16 lanes.
__m512 dat1814 = _mm512_maskz_loadu_ps(65535, datPtr18+2912+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1815 = _mm512_maskz_loadu_ps(65535, datPtr18+3136+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1816 = _mm512_maskz_loadu_ps(65535, datPtr18+3360+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1817 = _mm512_maskz_loadu_ps(65535, datPtr18+3584+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1818 = _mm512_maskz_loadu_ps(65535, datPtr18+3808+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1819 = _mm512_maskz_loadu_ps(65535, datPtr18+4032+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1820 = _mm512_maskz_loadu_ps(65535, datPtr18+4256+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1821 = _mm512_maskz_loadu_ps(65535, datPtr18+4480+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1822 = _mm512_maskz_loadu_ps(65535, datPtr18+4704+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1823 = _mm512_maskz_loadu_ps(65535, datPtr18+4928+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1824 = _mm512_maskz_loadu_ps(65535, datPtr18+5152+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1825 = _mm512_maskz_loadu_ps(65535, datPtr18+5376+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1826 = _mm512_maskz_loadu_ps(65535, datPtr18+5600+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1827 = _mm512_maskz_loadu_ps(65535, datPtr18+5824+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1828 = _mm512_maskz_loadu_ps(65535, datPtr18+6048+100864*i38+12608*k108+224*h41+4*w50+56*b64);
__m512 dat1829 = _mm512_maskz_loadu_ps(65535, datPtr18+6272+100864*i38+12608*k108+224*h41+4*w50+56*b64);
// Cross-register level 1: sum and difference of load k and load k+8.
// Two independent chains are interleaved line by line from here on: one built
// from the even-numbered loads (dat1814, dat1816, ...) and one built from the
// odd-numbered loads (dat1815, dat1817, ...).
__m512 fft8737 = _mm512_add_ps(dat1814, dat1822);
__m512 fft8825 = _mm512_add_ps(dat1815, dat1823);
__m512 fft8738 = _mm512_sub_ps(dat1814, dat1822);
__m512 fft8826 = _mm512_sub_ps(dat1815, dat1823);
__m512 fft8739 = _mm512_add_ps(dat1816, dat1824);
__m512 fft8827 = _mm512_add_ps(dat1817, dat1825);
__m512 fft8740 = _mm512_sub_ps(dat1816, dat1824);
__m512 fft8828 = _mm512_sub_ps(dat1817, dat1825);
__m512 fft8741 = _mm512_add_ps(dat1818, dat1826);
__m512 fft8829 = _mm512_add_ps(dat1819, dat1827);
__m512 fft8742 = _mm512_sub_ps(dat1818, dat1826);
__m512 fft8830 = _mm512_sub_ps(dat1819, dat1827);
__m512 fft8743 = _mm512_add_ps(dat1820, dat1828);
__m512 fft8831 = _mm512_add_ps(dat1821, dat1829);
__m512 fft8744 = _mm512_sub_ps(dat1820, dat1828);
__m512 fft8832 = _mm512_sub_ps(dat1821, dat1829);
// Cross-register levels 2 and 3: further sums/differences of the level-1
// results. Note fft8748/fft8836 subtract with the operands in the opposite
// order from the neighbouring differences.
__m512 fft8745 = _mm512_add_ps(fft8737, fft8741);
__m512 fft8833 = _mm512_add_ps(fft8825, fft8829);
__m512 fft8746 = _mm512_sub_ps(fft8737, fft8741);
__m512 fft8834 = _mm512_sub_ps(fft8825, fft8829);
__m512 fft8747 = _mm512_add_ps(fft8739, fft8743);
__m512 fft8835 = _mm512_add_ps(fft8827, fft8831);
__m512 fft8748 = _mm512_sub_ps(fft8743, fft8739);
__m512 fft8836 = _mm512_sub_ps(fft8831, fft8827);
__m512 fft8749 = _mm512_sub_ps(fft8740, fft8744);
__m512 fft8837 = _mm512_sub_ps(fft8828, fft8832);
__m512 fft8750 = _mm512_add_ps(fft8740, fft8744);
__m512 fft8838 = _mm512_add_ps(fft8828, fft8832);
__m512 fft8751 = _mm512_add_ps(fft8745, fft8747);
__m512 fft8839 = _mm512_add_ps(fft8833, fft8835);
__m512 fft8752 = _mm512_sub_ps(fft8745, fft8747);
__m512 fft8840 = _mm512_sub_ps(fft8833, fft8835);
// Combine with the constant 0.70710677 (float approximation of sqrt(2)/2):
//   fmadd  ->  a*c + b      fnmadd -> -(a*c) + b      fnmsub -> -(a*c) - b
__m512 fft8753 = _mm512_fmadd_ps(fft8749, _mm512_set1_ps(7.0710677e-01f), fft8738);
__m512 fft8841 = _mm512_fmadd_ps(fft8837, _mm512_set1_ps(7.0710677e-01f), fft8826);
__m512 fft8754 = _mm512_fnmsub_ps(fft8750, _mm512_set1_ps(7.0710677e-01f), fft8742);
__m512 fft8842 = _mm512_fnmsub_ps(fft8838, _mm512_set1_ps(7.0710677e-01f), fft8830);
__m512 fft8755 = _mm512_fnmadd_ps(fft8749, _mm512_set1_ps(7.0710677e-01f), fft8738);
__m512 fft8843 = _mm512_fnmadd_ps(fft8837, _mm512_set1_ps(7.0710677e-01f), fft8826);
__m512 fft8756 = _mm512_fnmadd_ps(fft8750, _mm512_set1_ps(7.0710677e-01f), fft8742);
__m512 fft8844 = _mm512_fnmadd_ps(fft8838, _mm512_set1_ps(7.0710677e-01f), fft8830);
// In-register level A: shuffle_f32x4 with immediate 78 swaps the two 256-bit
// halves. With the sign vector (+1 in lanes 0..7, -1 in lanes 8..15) the low
// half becomes lo+hi and the high half becomes lo-hi.
__m512 fft8757 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8758 = _mm512_fmadd_ps(fft8751, fft8757, _mm512_shuffle_f32x4(fft8751, fft8751, 78));
__m512 fft8845 = _mm512_fmadd_ps(fft8839, fft8757, _mm512_shuffle_f32x4(fft8839, fft8839, 78));
__m512 fft8759 = _mm512_fmadd_ps(fft8752, fft8757, _mm512_shuffle_f32x4(fft8752, fft8752, 78));
__m512 fft8846 = _mm512_fmadd_ps(fft8840, fft8757, _mm512_shuffle_f32x4(fft8840, fft8840, 78));
__m512 fft8760 = _mm512_fmadd_ps(fft8753, fft8757, _mm512_shuffle_f32x4(fft8753, fft8753, 78));
__m512 fft8847 = _mm512_fmadd_ps(fft8841, fft8757, _mm512_shuffle_f32x4(fft8841, fft8841, 78));
__m512 fft8761 = _mm512_fmadd_ps(fft8754, fft8757, _mm512_shuffle_f32x4(fft8754, fft8754, 78));
__m512 fft8848 = _mm512_fmadd_ps(fft8842, fft8757, _mm512_shuffle_f32x4(fft8842, fft8842, 78));
__m512 fft8762 = _mm512_fmadd_ps(fft8746, fft8757, _mm512_shuffle_f32x4(fft8746, fft8746, 78));
__m512 fft8849 = _mm512_fmadd_ps(fft8834, fft8757, _mm512_shuffle_f32x4(fft8834, fft8834, 78));
__m512 fft8763 = _mm512_fmadd_ps(fft8748, fft8757, _mm512_shuffle_f32x4(fft8748, fft8748, 78));
__m512 fft8850 = _mm512_fmadd_ps(fft8836, fft8757, _mm512_shuffle_f32x4(fft8836, fft8836, 78));
__m512 fft8764 = _mm512_fmadd_ps(fft8755, fft8757, _mm512_shuffle_f32x4(fft8755, fft8755, 78));
__m512 fft8851 = _mm512_fmadd_ps(fft8843, fft8757, _mm512_shuffle_f32x4(fft8843, fft8843, 78));
__m512 fft8765 = _mm512_fmadd_ps(fft8756, fft8757, _mm512_shuffle_f32x4(fft8756, fft8756, 78));
__m512 fft8852 = _mm512_fmadd_ps(fft8844, fft8757, _mm512_shuffle_f32x4(fft8844, fft8844, 78));
// Per-lane coefficient step. Consecutive results are taken in pairs (x, y),
// e.g. (fft8758, fft8759), and replaced by
//   x*fft8766 + y*fft8775   and   y*fft8766 - x*fft8775.
// Lanes 0..7 have coefficients (1, 0) and pass through unchanged.
// NOTE(review): this has the shape of a complex multiply with x/y as real and
// imaginary parts -- presumably a twiddle rotation; confirm with the generator.
__m512 fft8766 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8767 = _mm512_mul_ps(fft8758, fft8766);
__m512 fft8853 = _mm512_mul_ps(fft8845, fft8766);
__m512 fft8768 = _mm512_mul_ps(fft8759, fft8766);
__m512 fft8854 = _mm512_mul_ps(fft8846, fft8766);
__m512 fft8769 = _mm512_mul_ps(fft8760, fft8766);
__m512 fft8855 = _mm512_mul_ps(fft8847, fft8766);
__m512 fft8770 = _mm512_mul_ps(fft8761, fft8766);
__m512 fft8856 = _mm512_mul_ps(fft8848, fft8766);
__m512 fft8771 = _mm512_mul_ps(fft8762, fft8766);
__m512 fft8857 = _mm512_mul_ps(fft8849, fft8766);
__m512 fft8772 = _mm512_mul_ps(fft8763, fft8766);
__m512 fft8858 = _mm512_mul_ps(fft8850, fft8766);
__m512 fft8773 = _mm512_mul_ps(fft8764, fft8766);
__m512 fft8859 = _mm512_mul_ps(fft8851, fft8766);
__m512 fft8774 = _mm512_mul_ps(fft8765, fft8766);
__m512 fft8860 = _mm512_mul_ps(fft8852, fft8766);
__m512 fft8775 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8776 = _mm512_fmadd_ps(fft8759, fft8775, fft8767);
__m512 fft8861 = _mm512_fmadd_ps(fft8846, fft8775, fft8853);
__m512 fft8777 = _mm512_fnmadd_ps(fft8758, fft8775, fft8768);
__m512 fft8862 = _mm512_fnmadd_ps(fft8845, fft8775, fft8854);
__m512 fft8778 = _mm512_fmadd_ps(fft8761, fft8775, fft8769);
__m512 fft8863 = _mm512_fmadd_ps(fft8848, fft8775, fft8855);
__m512 fft8779 = _mm512_fnmadd_ps(fft8760, fft8775, fft8770);
__m512 fft8864 = _mm512_fnmadd_ps(fft8847, fft8775, fft8856);
__m512 fft8780 = _mm512_fmadd_ps(fft8763, fft8775, fft8771);
__m512 fft8865 = _mm512_fmadd_ps(fft8850, fft8775, fft8857);
__m512 fft8781 = _mm512_fnmadd_ps(fft8762, fft8775, fft8772);
__m512 fft8866 = _mm512_fnmadd_ps(fft8849, fft8775, fft8858);
__m512 fft8782 = _mm512_fmadd_ps(fft8765, fft8775, fft8773);
__m512 fft8867 = _mm512_fmadd_ps(fft8852, fft8775, fft8859);
__m512 fft8783 = _mm512_fnmadd_ps(fft8764, fft8775, fft8774);
__m512 fft8868 = _mm512_fnmadd_ps(fft8851, fft8775, fft8860);
// In-register level B: shuffle_f32x4 with immediate 177 swaps each pair of
// adjacent 128-bit lanes; the sign vector turns that into sum (lanes 0..3,
// 8..11) and difference (lanes 4..7, 12..15) of the neighbouring 128-bit lanes.
__m512 fft8784 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8785 = _mm512_fmadd_ps(fft8776, fft8784, _mm512_shuffle_f32x4(fft8776, fft8776, 177));
__m512 fft8869 = _mm512_fmadd_ps(fft8861, fft8784, _mm512_shuffle_f32x4(fft8861, fft8861, 177));
__m512 fft8786 = _mm512_fmadd_ps(fft8777, fft8784, _mm512_shuffle_f32x4(fft8777, fft8777, 177));
__m512 fft8870 = _mm512_fmadd_ps(fft8862, fft8784, _mm512_shuffle_f32x4(fft8862, fft8862, 177));
__m512 fft8787 = _mm512_fmadd_ps(fft8778, fft8784, _mm512_shuffle_f32x4(fft8778, fft8778, 177));
__m512 fft8871 = _mm512_fmadd_ps(fft8863, fft8784, _mm512_shuffle_f32x4(fft8863, fft8863, 177));
__m512 fft8788 = _mm512_fmadd_ps(fft8779, fft8784, _mm512_shuffle_f32x4(fft8779, fft8779, 177));
__m512 fft8872 = _mm512_fmadd_ps(fft8864, fft8784, _mm512_shuffle_f32x4(fft8864, fft8864, 177));
__m512 fft8789 = _mm512_fmadd_ps(fft8780, fft8784, _mm512_shuffle_f32x4(fft8780, fft8780, 177));
__m512 fft8873 = _mm512_fmadd_ps(fft8865, fft8784, _mm512_shuffle_f32x4(fft8865, fft8865, 177));
__m512 fft8790 = _mm512_fmadd_ps(fft8781, fft8784, _mm512_shuffle_f32x4(fft8781, fft8781, 177));
__m512 fft8874 = _mm512_fmadd_ps(fft8866, fft8784, _mm512_shuffle_f32x4(fft8866, fft8866, 177));
__m512 fft8791 = _mm512_fmadd_ps(fft8782, fft8784, _mm512_shuffle_f32x4(fft8782, fft8782, 177));
__m512 fft8875 = _mm512_fmadd_ps(fft8867, fft8784, _mm512_shuffle_f32x4(fft8867, fft8867, 177));
__m512 fft8792 = _mm512_fmadd_ps(fft8783, fft8784, _mm512_shuffle_f32x4(fft8783, fft8783, 177));
__m512 fft8876 = _mm512_fmadd_ps(fft8868, fft8784, _mm512_shuffle_f32x4(fft8868, fft8868, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14, 15. In those lanes each pair
// (x, y) becomes (y, -x); all other lanes keep (x, y) unchanged.
__m512 fft8793 = _mm512_mask_mov_ps(fft8785, 49344, fft8786);
__m512 fft8877 = _mm512_mask_mov_ps(fft8869, 49344, fft8870);
__m512 fft8794 = _mm512_mask_sub_ps(fft8786, 49344, _mm512_setzero_ps(), fft8785);
__m512 fft8878 = _mm512_mask_sub_ps(fft8870, 49344, _mm512_setzero_ps(), fft8869);
__m512 fft8795 = _mm512_mask_mov_ps(fft8787, 49344, fft8788);
__m512 fft8879 = _mm512_mask_mov_ps(fft8871, 49344, fft8872);
__m512 fft8796 = _mm512_mask_sub_ps(fft8788, 49344, _mm512_setzero_ps(), fft8787);
__m512 fft8880 = _mm512_mask_sub_ps(fft8872, 49344, _mm512_setzero_ps(), fft8871);
__m512 fft8797 = _mm512_mask_mov_ps(fft8789, 49344, fft8790);
__m512 fft8881 = _mm512_mask_mov_ps(fft8873, 49344, fft8874);
__m512 fft8798 = _mm512_mask_sub_ps(fft8790, 49344, _mm512_setzero_ps(), fft8789);
__m512 fft8882 = _mm512_mask_sub_ps(fft8874, 49344, _mm512_setzero_ps(), fft8873);
__m512 fft8799 = _mm512_mask_mov_ps(fft8791, 49344, fft8792);
__m512 fft8883 = _mm512_mask_mov_ps(fft8875, 49344, fft8876);
__m512 fft8800 = _mm512_mask_sub_ps(fft8792, 49344, _mm512_setzero_ps(), fft8791);
__m512 fft8884 = _mm512_mask_sub_ps(fft8876, 49344, _mm512_setzero_ps(), fft8875);
// In-register level C: shuffle_ps with immediate 78 swaps the two 64-bit
// halves inside every 128-bit lane; with the sign vector this yields the sum
// in elements 0..1 and the difference in elements 2..3 of each 128-bit lane.
__m512 fft8801 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8802 = _mm512_fmadd_ps(fft8793, fft8801, _mm512_shuffle_ps(fft8793, fft8793, 78));
__m512 fft8885 = _mm512_fmadd_ps(fft8877, fft8801, _mm512_shuffle_ps(fft8877, fft8877, 78));
__m512 fft8803 = _mm512_fmadd_ps(fft8794, fft8801, _mm512_shuffle_ps(fft8794, fft8794, 78));
__m512 fft8886 = _mm512_fmadd_ps(fft8878, fft8801, _mm512_shuffle_ps(fft8878, fft8878, 78));
__m512 fft8804 = _mm512_fmadd_ps(fft8795, fft8801, _mm512_shuffle_ps(fft8795, fft8795, 78));
__m512 fft8887 = _mm512_fmadd_ps(fft8879, fft8801, _mm512_shuffle_ps(fft8879, fft8879, 78));
__m512 fft8805 = _mm512_fmadd_ps(fft8796, fft8801, _mm512_shuffle_ps(fft8796, fft8796, 78));
__m512 fft8888 = _mm512_fmadd_ps(fft8880, fft8801, _mm512_shuffle_ps(fft8880, fft8880, 78));
__m512 fft8806 = _mm512_fmadd_ps(fft8797, fft8801, _mm512_shuffle_ps(fft8797, fft8797, 78));
__m512 fft8889 = _mm512_fmadd_ps(fft8881, fft8801, _mm512_shuffle_ps(fft8881, fft8881, 78));
__m512 fft8807 = _mm512_fmadd_ps(fft8798, fft8801, _mm512_shuffle_ps(fft8798, fft8798, 78));
__m512 fft8890 = _mm512_fmadd_ps(fft8882, fft8801, _mm512_shuffle_ps(fft8882, fft8882, 78));
__m512 fft8808 = _mm512_fmadd_ps(fft8799, fft8801, _mm512_shuffle_ps(fft8799, fft8799, 78));
__m512 fft8891 = _mm512_fmadd_ps(fft8883, fft8801, _mm512_shuffle_ps(fft8883, fft8883, 78));
__m512 fft8809 = _mm512_fmadd_ps(fft8800, fft8801, _mm512_shuffle_ps(fft8800, fft8800, 78));
__m512 fft8892 = _mm512_fmadd_ps(fft8884, fft8801, _mm512_shuffle_ps(fft8884, fft8884, 78));
// Extra recombination applied only to the first pair of each chain
// (fft8802/fft8803 and fft8885/fft8886). The pair is permuted two ways,
// combined with +/-1/0 coefficients, merged lane-by-lane with masks
// 21845 (0x5555), 43176 (0xA8A8) and 22102 (0x5656), and finally scaled by
// 0.5 in the lanes selected by 64764 (0xFCFC: every lane except 0, 1, 8, 9).
// NOTE(review): presumably the real-input untangling step of the transform --
// not verifiable from this view.
__m512i fft8810 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8811 = _mm512_permutexvar_ps(fft8810, fft8802);
__m512 fft8893 = _mm512_permutexvar_ps(fft8810, fft8885);
__m512i fft8812 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8813 = _mm512_permutexvar_ps(fft8812, fft8802);
__m512 fft8894 = _mm512_permutexvar_ps(fft8812, fft8885);
__m512 fft8814 = _mm512_permutexvar_ps(fft8810, fft8803);
__m512 fft8895 = _mm512_permutexvar_ps(fft8810, fft8886);
__m512 fft8815 = _mm512_permutexvar_ps(fft8812, fft8803);
__m512 fft8896 = _mm512_permutexvar_ps(fft8812, fft8886);
__m512 fft8816 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8817 = _mm512_fmadd_ps(fft8811, fft8816, fft8813);
__m512 fft8897 = _mm512_fmadd_ps(fft8893, fft8816, fft8894);
__m512 fft8818 = _mm512_fnmadd_ps(fft8815, fft8816, fft8814);
__m512 fft8898 = _mm512_fnmadd_ps(fft8896, fft8816, fft8895);
__m512 fft8819 = _mm512_mask_mov_ps(fft8815, 21845, fft8817);
__m512 fft8899 = _mm512_mask_mov_ps(fft8896, 21845, fft8897);
__m512 fft8820 = _mm512_mask_mov_ps(fft8811, 43176, fft8817);
__m512 fft8900 = _mm512_mask_mov_ps(fft8893, 43176, fft8897);
__m512 fft8821 = _mm512_mask_mov_ps(fft8819, 43176, fft8818);
__m512 fft8901 = _mm512_mask_mov_ps(fft8899, 43176, fft8898);
__m512 fft8822 = _mm512_mask_mov_ps(fft8820, 22102, fft8818);
__m512 fft8902 = _mm512_mask_mov_ps(fft8900, 22102, fft8898);
__m512 fft8823 = _mm512_mask_mul_ps(fft8821, 64764, fft8821, _mm512_set1_ps(5e-01f));
__m512 fft8903 = _mm512_mask_mul_ps(fft8901, 64764, fft8901, _mm512_set1_ps(5e-01f));
__m512 fft8824 = _mm512_mask_mul_ps(fft8822, 64764, fft8822, _mm512_set1_ps(5e-01f));
__m512 fft8904 = _mm512_mask_mul_ps(fft8902, 64764, fft8902, _mm512_set1_ps(5e-01f));
// Rename the results for the store phase: df789..df796 come from the
// even-load chain, df797..df804 from the odd-load chain.
__m512 df789 = fft8823;
__m512 df797 = fft8903;
__m512 df790 = fft8824;
__m512 df798 = fft8904;
__m512 df791 = fft8804;
__m512 df799 = fft8887;
__m512 df792 = fft8805;
__m512 df800 = fft8888;
__m512 df793 = fft8806;
__m512 df801 = fft8889;
__m512 df794 = fft8807;
__m512 df802 = fft8890;
__m512 df795 = fft8808;
__m512 df803 = fft8891;
__m512 df796 = fft8809;
__m512 df804 = fft8892;
// eo53 gathers the even source lanes into lanes 0..7 and the odd source lanes
// into lanes 8..15. Each permuted vector is then written with two masked
// stores: mask 255 writes lanes 0..7, mask 65280 writes lanes 8..15.
// The constant of every 65280 store is 1048544 (= 1048576 - 32) larger than
// that of the matching 255 store; the odd-load chain adds 2097152.
// NOTE(review): if dfPtr8 is a byte pointer (declared outside this view), the
// upper lanes therefore land exactly 1048576 bytes after the lower lanes.
__m512i eo53 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df791 = _mm512_permutexvar_ps(eo53, df791);
df792 = _mm512_permutexvar_ps(eo53, df792);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df791);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df792);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df791);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df792);
df799 = _mm512_permutexvar_ps(eo53, df799);
df800 = _mm512_permutexvar_ps(eo53, df800);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df799);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df800);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df799);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df800);
df793 = _mm512_permutexvar_ps(eo53, df793);
df794 = _mm512_permutexvar_ps(eo53, df794);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df793);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df794);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df793);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df794);
df801 = _mm512_permutexvar_ps(eo53, df801);
df802 = _mm512_permutexvar_ps(eo53, df802);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df801);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df802);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df801);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df802);
df795 = _mm512_permutexvar_ps(eo53, df795);
df796 = _mm512_permutexvar_ps(eo53, df796);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df795);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df796);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df795);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df796);
df803 = _mm512_permutexvar_ps(eo53, df803);
df804 = _mm512_permutexvar_ps(eo53, df804);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df803);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df804);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df803);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df804);
// The first pair of each chain (df789/df790, df797/df798) is stored without
// the eo53 permutation; its lane order was already set by the recombination
// step above.
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df789);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df790);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df789);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df790);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df797);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k108+128*m53+32*f56, 255, df798);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df797);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k108+128*m53+32*f56, 65280, df798);
}
// Tail iteration with the block index fixed at b65 = 5, so m54 = 5/2 = 2 and f57 = 5%2 = 1.
// m54 and f57 select the destination slot (128*m54 + 32*f57) used by every store at the end.
ptrdiff_t b65 = 5;
ptrdiff_t m54 = (size_t)b65/2;
ptrdiff_t f57 = (size_t)b65%2;
// Load 16 vectors; consecutive loads are 224 apart, starting at constant offset 3192.
// Mask 32767 (0x7FFF) loads lanes 0..14 and zeroes lane 15. The 0*b65 term contributes nothing.
// NOTE(review): presumably each load is one input row and lane 15 is zero fill past the
// right edge -- confirm against the code that sets up datPtr18, h41 and w50.
__m512 dat1830 = _mm512_maskz_loadu_ps(32767, datPtr18+3192+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1831 = _mm512_maskz_loadu_ps(32767, datPtr18+3416+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1832 = _mm512_maskz_loadu_ps(32767, datPtr18+3640+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1833 = _mm512_maskz_loadu_ps(32767, datPtr18+3864+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1834 = _mm512_maskz_loadu_ps(32767, datPtr18+4088+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1835 = _mm512_maskz_loadu_ps(32767, datPtr18+4312+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1836 = _mm512_maskz_loadu_ps(32767, datPtr18+4536+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1837 = _mm512_maskz_loadu_ps(32767, datPtr18+4760+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1838 = _mm512_maskz_loadu_ps(32767, datPtr18+4984+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1839 = _mm512_maskz_loadu_ps(32767, datPtr18+5208+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1840 = _mm512_maskz_loadu_ps(32767, datPtr18+5432+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1841 = _mm512_maskz_loadu_ps(32767, datPtr18+5656+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1842 = _mm512_maskz_loadu_ps(32767, datPtr18+5880+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1843 = _mm512_maskz_loadu_ps(32767, datPtr18+6104+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1844 = _mm512_maskz_loadu_ps(32767, datPtr18+6328+100864*i38+12608*k108+224*h41+4*w50+0*b65);
__m512 dat1845 = _mm512_maskz_loadu_ps(32767, datPtr18+6552+100864*i38+12608*k108+224*h41+4*w50+0*b65);
// Butterflies across whole vectors. Two data-independent chains are interleaved line by line:
// one consumes the even-numbered loads (dat1830, dat1832, ...) and the other the odd-numbered
// loads (dat1831, dat1833, ...). The first stage pairs load i with load i+8 (sum and difference).
__m512 fft8905 = _mm512_add_ps(dat1830, dat1838);
__m512 fft8993 = _mm512_add_ps(dat1831, dat1839);
__m512 fft8906 = _mm512_sub_ps(dat1830, dat1838);
__m512 fft8994 = _mm512_sub_ps(dat1831, dat1839);
__m512 fft8907 = _mm512_add_ps(dat1832, dat1840);
__m512 fft8995 = _mm512_add_ps(dat1833, dat1841);
__m512 fft8908 = _mm512_sub_ps(dat1832, dat1840);
__m512 fft8996 = _mm512_sub_ps(dat1833, dat1841);
__m512 fft8909 = _mm512_add_ps(dat1834, dat1842);
__m512 fft8997 = _mm512_add_ps(dat1835, dat1843);
__m512 fft8910 = _mm512_sub_ps(dat1834, dat1842);
__m512 fft8998 = _mm512_sub_ps(dat1835, dat1843);
__m512 fft8911 = _mm512_add_ps(dat1836, dat1844);
__m512 fft8999 = _mm512_add_ps(dat1837, dat1845);
__m512 fft8912 = _mm512_sub_ps(dat1836, dat1844);
__m512 fft9000 = _mm512_sub_ps(dat1837, dat1845);
// Second stage: combine the first-stage sums and differences with each other.
__m512 fft8913 = _mm512_add_ps(fft8905, fft8909);
__m512 fft9001 = _mm512_add_ps(fft8993, fft8997);
__m512 fft8914 = _mm512_sub_ps(fft8905, fft8909);
__m512 fft9002 = _mm512_sub_ps(fft8993, fft8997);
__m512 fft8915 = _mm512_add_ps(fft8907, fft8911);
__m512 fft9003 = _mm512_add_ps(fft8995, fft8999);
__m512 fft8916 = _mm512_sub_ps(fft8911, fft8907);
__m512 fft9004 = _mm512_sub_ps(fft8999, fft8995);
__m512 fft8917 = _mm512_sub_ps(fft8908, fft8912);
__m512 fft9005 = _mm512_sub_ps(fft8996, fft9000);
__m512 fft8918 = _mm512_add_ps(fft8908, fft8912);
__m512 fft9006 = _mm512_add_ps(fft8996, fft9000);
// Third stage. The constant 7.0710677e-01f is approximately 1/sqrt(2); it is folded into the
// fused multiply-add forms (fmadd = a*b+c, fnmadd = -(a*b)+c, fnmsub = -(a*b)-c).
__m512 fft8919 = _mm512_add_ps(fft8913, fft8915);
__m512 fft9007 = _mm512_add_ps(fft9001, fft9003);
__m512 fft8920 = _mm512_sub_ps(fft8913, fft8915);
__m512 fft9008 = _mm512_sub_ps(fft9001, fft9003);
__m512 fft8921 = _mm512_fmadd_ps(fft8917, _mm512_set1_ps(7.0710677e-01f), fft8906);
__m512 fft9009 = _mm512_fmadd_ps(fft9005, _mm512_set1_ps(7.0710677e-01f), fft8994);
__m512 fft8922 = _mm512_fnmsub_ps(fft8918, _mm512_set1_ps(7.0710677e-01f), fft8910);
__m512 fft9010 = _mm512_fnmsub_ps(fft9006, _mm512_set1_ps(7.0710677e-01f), fft8998);
__m512 fft8923 = _mm512_fnmadd_ps(fft8917, _mm512_set1_ps(7.0710677e-01f), fft8906);
__m512 fft9011 = _mm512_fnmadd_ps(fft9005, _mm512_set1_ps(7.0710677e-01f), fft8994);
__m512 fft8924 = _mm512_fnmadd_ps(fft8918, _mm512_set1_ps(7.0710677e-01f), fft8910);
__m512 fft9012 = _mm512_fnmadd_ps(fft9006, _mm512_set1_ps(7.0710677e-01f), fft8998);
// Butterfly between the two 256-bit halves of each vector. _mm512_set_ps lists lane 15 first,
// so fft8925 is +1 in lanes 0..7 and -1 in lanes 8..15. Shuffle immediate 78 swaps the halves.
// Result: lanes 0..7 hold low+high, lanes 8..15 hold low-high.
__m512 fft8925 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8926 = _mm512_fmadd_ps(fft8919, fft8925, _mm512_shuffle_f32x4(fft8919, fft8919, 78));
__m512 fft9013 = _mm512_fmadd_ps(fft9007, fft8925, _mm512_shuffle_f32x4(fft9007, fft9007, 78));
__m512 fft8927 = _mm512_fmadd_ps(fft8920, fft8925, _mm512_shuffle_f32x4(fft8920, fft8920, 78));
__m512 fft9014 = _mm512_fmadd_ps(fft9008, fft8925, _mm512_shuffle_f32x4(fft9008, fft9008, 78));
__m512 fft8928 = _mm512_fmadd_ps(fft8921, fft8925, _mm512_shuffle_f32x4(fft8921, fft8921, 78));
__m512 fft9015 = _mm512_fmadd_ps(fft9009, fft8925, _mm512_shuffle_f32x4(fft9009, fft9009, 78));
__m512 fft8929 = _mm512_fmadd_ps(fft8922, fft8925, _mm512_shuffle_f32x4(fft8922, fft8922, 78));
__m512 fft9016 = _mm512_fmadd_ps(fft9010, fft8925, _mm512_shuffle_f32x4(fft9010, fft9010, 78));
__m512 fft8930 = _mm512_fmadd_ps(fft8914, fft8925, _mm512_shuffle_f32x4(fft8914, fft8914, 78));
__m512 fft9017 = _mm512_fmadd_ps(fft9002, fft8925, _mm512_shuffle_f32x4(fft9002, fft9002, 78));
__m512 fft8931 = _mm512_fmadd_ps(fft8916, fft8925, _mm512_shuffle_f32x4(fft8916, fft8916, 78));
__m512 fft9018 = _mm512_fmadd_ps(fft9004, fft8925, _mm512_shuffle_f32x4(fft9004, fft9004, 78));
__m512 fft8932 = _mm512_fmadd_ps(fft8923, fft8925, _mm512_shuffle_f32x4(fft8923, fft8923, 78));
__m512 fft9019 = _mm512_fmadd_ps(fft9011, fft8925, _mm512_shuffle_f32x4(fft9011, fft9011, 78));
__m512 fft8933 = _mm512_fmadd_ps(fft8924, fft8925, _mm512_shuffle_f32x4(fft8924, fft8924, 78));
__m512 fft9020 = _mm512_fmadd_ps(fft9012, fft8925, _mm512_shuffle_f32x4(fft9012, fft9012, 78));
// Per-lane scaling, first factor. fft8934 is 1 in lanes 0..9, ~1/sqrt(2) in lanes 10..11,
// 0 in lanes 12..13 and ~-1/sqrt(2) in lanes 14..15.
// NOTE(review): together with fft8943 below this looks like a complex twiddle multiply on
// adjacent vector pairs -- confirm against the generator.
__m512 fft8934 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft8935 = _mm512_mul_ps(fft8926, fft8934);
__m512 fft9021 = _mm512_mul_ps(fft9013, fft8934);
__m512 fft8936 = _mm512_mul_ps(fft8927, fft8934);
__m512 fft9022 = _mm512_mul_ps(fft9014, fft8934);
__m512 fft8937 = _mm512_mul_ps(fft8928, fft8934);
__m512 fft9023 = _mm512_mul_ps(fft9015, fft8934);
__m512 fft8938 = _mm512_mul_ps(fft8929, fft8934);
__m512 fft9024 = _mm512_mul_ps(fft9016, fft8934);
__m512 fft8939 = _mm512_mul_ps(fft8930, fft8934);
__m512 fft9025 = _mm512_mul_ps(fft9017, fft8934);
__m512 fft8940 = _mm512_mul_ps(fft8931, fft8934);
__m512 fft9026 = _mm512_mul_ps(fft9018, fft8934);
__m512 fft8941 = _mm512_mul_ps(fft8932, fft8934);
__m512 fft9027 = _mm512_mul_ps(fft9019, fft8934);
__m512 fft8942 = _mm512_mul_ps(fft8933, fft8934);
__m512 fft9028 = _mm512_mul_ps(fft9020, fft8934);
// Second factor: fft8943 is 0 in lanes 0..9, so lanes 0..9 pass through unchanged; in lanes
// 10..15 each vector gets its partner vector added (fmadd) or subtracted (fnmadd), scaled.
__m512 fft8943 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft8944 = _mm512_fmadd_ps(fft8927, fft8943, fft8935);
__m512 fft9029 = _mm512_fmadd_ps(fft9014, fft8943, fft9021);
__m512 fft8945 = _mm512_fnmadd_ps(fft8926, fft8943, fft8936);
__m512 fft9030 = _mm512_fnmadd_ps(fft9013, fft8943, fft9022);
__m512 fft8946 = _mm512_fmadd_ps(fft8929, fft8943, fft8937);
__m512 fft9031 = _mm512_fmadd_ps(fft9016, fft8943, fft9023);
__m512 fft8947 = _mm512_fnmadd_ps(fft8928, fft8943, fft8938);
__m512 fft9032 = _mm512_fnmadd_ps(fft9015, fft8943, fft9024);
__m512 fft8948 = _mm512_fmadd_ps(fft8931, fft8943, fft8939);
__m512 fft9033 = _mm512_fmadd_ps(fft9018, fft8943, fft9025);
__m512 fft8949 = _mm512_fnmadd_ps(fft8930, fft8943, fft8940);
__m512 fft9034 = _mm512_fnmadd_ps(fft9017, fft8943, fft9026);
__m512 fft8950 = _mm512_fmadd_ps(fft8933, fft8943, fft8941);
__m512 fft9035 = _mm512_fmadd_ps(fft9020, fft8943, fft9027);
__m512 fft8951 = _mm512_fnmadd_ps(fft8932, fft8943, fft8942);
__m512 fft9036 = _mm512_fnmadd_ps(fft9019, fft8943, fft9028);
// Butterfly between adjacent 128-bit quarters: shuffle immediate 177 swaps quarters 0<->1 and
// 2<->3; fft8952 is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15.
__m512 fft8952 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft8953 = _mm512_fmadd_ps(fft8944, fft8952, _mm512_shuffle_f32x4(fft8944, fft8944, 177));
__m512 fft9037 = _mm512_fmadd_ps(fft9029, fft8952, _mm512_shuffle_f32x4(fft9029, fft9029, 177));
__m512 fft8954 = _mm512_fmadd_ps(fft8945, fft8952, _mm512_shuffle_f32x4(fft8945, fft8945, 177));
__m512 fft9038 = _mm512_fmadd_ps(fft9030, fft8952, _mm512_shuffle_f32x4(fft9030, fft9030, 177));
__m512 fft8955 = _mm512_fmadd_ps(fft8946, fft8952, _mm512_shuffle_f32x4(fft8946, fft8946, 177));
__m512 fft9039 = _mm512_fmadd_ps(fft9031, fft8952, _mm512_shuffle_f32x4(fft9031, fft9031, 177));
__m512 fft8956 = _mm512_fmadd_ps(fft8947, fft8952, _mm512_shuffle_f32x4(fft8947, fft8947, 177));
__m512 fft9040 = _mm512_fmadd_ps(fft9032, fft8952, _mm512_shuffle_f32x4(fft9032, fft9032, 177));
__m512 fft8957 = _mm512_fmadd_ps(fft8948, fft8952, _mm512_shuffle_f32x4(fft8948, fft8948, 177));
__m512 fft9041 = _mm512_fmadd_ps(fft9033, fft8952, _mm512_shuffle_f32x4(fft9033, fft9033, 177));
__m512 fft8958 = _mm512_fmadd_ps(fft8949, fft8952, _mm512_shuffle_f32x4(fft8949, fft8949, 177));
__m512 fft9042 = _mm512_fmadd_ps(fft9034, fft8952, _mm512_shuffle_f32x4(fft9034, fft9034, 177));
__m512 fft8959 = _mm512_fmadd_ps(fft8950, fft8952, _mm512_shuffle_f32x4(fft8950, fft8950, 177));
__m512 fft9043 = _mm512_fmadd_ps(fft9035, fft8952, _mm512_shuffle_f32x4(fft9035, fft9035, 177));
__m512 fft8960 = _mm512_fmadd_ps(fft8951, fft8952, _mm512_shuffle_f32x4(fft8951, fft8951, 177));
__m512 fft9044 = _mm512_fmadd_ps(fft9036, fft8952, _mm512_shuffle_f32x4(fft9036, fft9036, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. Within each vector pair, those lanes are
// exchanged: the first result takes the partner's lanes, the second takes the negation of the
// first vector's lanes (0 - x). All other lanes are left as they were.
__m512 fft8961 = _mm512_mask_mov_ps(fft8953, 49344, fft8954);
__m512 fft9045 = _mm512_mask_mov_ps(fft9037, 49344, fft9038);
__m512 fft8962 = _mm512_mask_sub_ps(fft8954, 49344, _mm512_setzero_ps(), fft8953);
__m512 fft9046 = _mm512_mask_sub_ps(fft9038, 49344, _mm512_setzero_ps(), fft9037);
__m512 fft8963 = _mm512_mask_mov_ps(fft8955, 49344, fft8956);
__m512 fft9047 = _mm512_mask_mov_ps(fft9039, 49344, fft9040);
__m512 fft8964 = _mm512_mask_sub_ps(fft8956, 49344, _mm512_setzero_ps(), fft8955);
__m512 fft9048 = _mm512_mask_sub_ps(fft9040, 49344, _mm512_setzero_ps(), fft9039);
__m512 fft8965 = _mm512_mask_mov_ps(fft8957, 49344, fft8958);
__m512 fft9049 = _mm512_mask_mov_ps(fft9041, 49344, fft9042);
__m512 fft8966 = _mm512_mask_sub_ps(fft8958, 49344, _mm512_setzero_ps(), fft8957);
__m512 fft9050 = _mm512_mask_sub_ps(fft9042, 49344, _mm512_setzero_ps(), fft9041);
__m512 fft8967 = _mm512_mask_mov_ps(fft8959, 49344, fft8960);
__m512 fft9051 = _mm512_mask_mov_ps(fft9043, 49344, fft9044);
__m512 fft8968 = _mm512_mask_sub_ps(fft8960, 49344, _mm512_setzero_ps(), fft8959);
__m512 fft9052 = _mm512_mask_sub_ps(fft9044, 49344, _mm512_setzero_ps(), fft9043);
// Butterfly inside every 128-bit quarter: _mm512_shuffle_ps with immediate 78 swaps the two
// 64-bit pairs of each quarter; fft8969 is +1 on the first pair and -1 on the second.
__m512 fft8969 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft8970 = _mm512_fmadd_ps(fft8961, fft8969, _mm512_shuffle_ps(fft8961, fft8961, 78));
__m512 fft9053 = _mm512_fmadd_ps(fft9045, fft8969, _mm512_shuffle_ps(fft9045, fft9045, 78));
__m512 fft8971 = _mm512_fmadd_ps(fft8962, fft8969, _mm512_shuffle_ps(fft8962, fft8962, 78));
__m512 fft9054 = _mm512_fmadd_ps(fft9046, fft8969, _mm512_shuffle_ps(fft9046, fft9046, 78));
__m512 fft8972 = _mm512_fmadd_ps(fft8963, fft8969, _mm512_shuffle_ps(fft8963, fft8963, 78));
__m512 fft9055 = _mm512_fmadd_ps(fft9047, fft8969, _mm512_shuffle_ps(fft9047, fft9047, 78));
__m512 fft8973 = _mm512_fmadd_ps(fft8964, fft8969, _mm512_shuffle_ps(fft8964, fft8964, 78));
__m512 fft9056 = _mm512_fmadd_ps(fft9048, fft8969, _mm512_shuffle_ps(fft9048, fft9048, 78));
__m512 fft8974 = _mm512_fmadd_ps(fft8965, fft8969, _mm512_shuffle_ps(fft8965, fft8965, 78));
__m512 fft9057 = _mm512_fmadd_ps(fft9049, fft8969, _mm512_shuffle_ps(fft9049, fft9049, 78));
__m512 fft8975 = _mm512_fmadd_ps(fft8966, fft8969, _mm512_shuffle_ps(fft8966, fft8966, 78));
__m512 fft9058 = _mm512_fmadd_ps(fft9050, fft8969, _mm512_shuffle_ps(fft9050, fft9050, 78));
__m512 fft8976 = _mm512_fmadd_ps(fft8967, fft8969, _mm512_shuffle_ps(fft8967, fft8967, 78));
__m512 fft9059 = _mm512_fmadd_ps(fft9051, fft8969, _mm512_shuffle_ps(fft9051, fft9051, 78));
__m512 fft8977 = _mm512_fmadd_ps(fft8968, fft8969, _mm512_shuffle_ps(fft8968, fft8968, 78));
__m512 fft9060 = _mm512_fmadd_ps(fft9052, fft8969, _mm512_shuffle_ps(fft9052, fft9052, 78));
// Extra post-processing applied only to the first vector pair of each chain (fft8970/fft8971
// and fft9053/fft9054): two lane permutations with duplicated indices, a signed combine, a
// sequence of masked merges, and finally a multiply by 0.5 in the lanes of mask 64764
// (0xFCFC, i.e. lanes 2..7 and 10..15).
// NOTE(review): presumably the real-input FFT untangling step -- confirm against the generator.
__m512i fft8978 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft8979 = _mm512_permutexvar_ps(fft8978, fft8970);
__m512 fft9061 = _mm512_permutexvar_ps(fft8978, fft9053);
__m512i fft8980 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft8981 = _mm512_permutexvar_ps(fft8980, fft8970);
__m512 fft9062 = _mm512_permutexvar_ps(fft8980, fft9053);
__m512 fft8982 = _mm512_permutexvar_ps(fft8978, fft8971);
__m512 fft9063 = _mm512_permutexvar_ps(fft8978, fft9054);
__m512 fft8983 = _mm512_permutexvar_ps(fft8980, fft8971);
__m512 fft9064 = _mm512_permutexvar_ps(fft8980, fft9054);
__m512 fft8984 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft8985 = _mm512_fmadd_ps(fft8979, fft8984, fft8981);
__m512 fft9065 = _mm512_fmadd_ps(fft9061, fft8984, fft9062);
__m512 fft8986 = _mm512_fnmadd_ps(fft8983, fft8984, fft8982);
__m512 fft9066 = _mm512_fnmadd_ps(fft9064, fft8984, fft9063);
// Merge masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8 (lanes 3,5,7,11,13,15),
// 22102 = 0x5656 (lanes 1,2,4,6,9,10,12,14).
__m512 fft8987 = _mm512_mask_mov_ps(fft8983, 21845, fft8985);
__m512 fft9067 = _mm512_mask_mov_ps(fft9064, 21845, fft9065);
__m512 fft8988 = _mm512_mask_mov_ps(fft8979, 43176, fft8985);
__m512 fft9068 = _mm512_mask_mov_ps(fft9061, 43176, fft9065);
__m512 fft8989 = _mm512_mask_mov_ps(fft8987, 43176, fft8986);
__m512 fft9069 = _mm512_mask_mov_ps(fft9067, 43176, fft9066);
__m512 fft8990 = _mm512_mask_mov_ps(fft8988, 22102, fft8986);
__m512 fft9070 = _mm512_mask_mov_ps(fft9068, 22102, fft9066);
__m512 fft8991 = _mm512_mask_mul_ps(fft8989, 64764, fft8989, _mm512_set1_ps(5e-01f));
__m512 fft9071 = _mm512_mask_mul_ps(fft9069, 64764, fft9069, _mm512_set1_ps(5e-01f));
__m512 fft8992 = _mm512_mask_mul_ps(fft8990, 64764, fft8990, _mm512_set1_ps(5e-01f));
__m512 fft9072 = _mm512_mask_mul_ps(fft9070, 64764, fft9070, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store phase. df805/df806 and df813/df814 are the
// post-processed pairs; the remaining df values come straight from the in-quarter butterfly.
__m512 df805 = fft8991;
__m512 df813 = fft9071;
__m512 df806 = fft8992;
__m512 df814 = fft9072;
__m512 df807 = fft8972;
__m512 df815 = fft9055;
__m512 df808 = fft8973;
__m512 df816 = fft9056;
__m512 df809 = fft8974;
__m512 df817 = fft9057;
__m512 df810 = fft8975;
__m512 df818 = fft9058;
__m512 df811 = fft8976;
__m512 df819 = fft9059;
__m512 df812 = fft8977;
__m512 df820 = fft9060;
// eo54 de-interleaves a vector: even source lanes (0,2,...,14) go to lanes 0..7 and odd source
// lanes (1,3,...,15) go to lanes 8..15.
__m512i eo54 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store phase. Each vector is written with two masked stores: mask 255 writes lanes 0..7 and
// mask 65280 (0xFF00) writes lanes 8..15 at a second, far-apart base offset.
// NOTE(review): the second offsets are each 32 short of (first offset + 1048576); if dfPtr8 is
// a byte pointer this cancels the 32-byte displacement of lane 8 -- confirm dfPtr8's type.
df807 = _mm512_permutexvar_ps(eo54, df807);
df808 = _mm512_permutexvar_ps(eo54, df808);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df807);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df808);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df807);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df808);
df815 = _mm512_permutexvar_ps(eo54, df815);
df816 = _mm512_permutexvar_ps(eo54, df816);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df815);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df816);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df815);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df816);
df809 = _mm512_permutexvar_ps(eo54, df809);
df810 = _mm512_permutexvar_ps(eo54, df810);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df809);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df810);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df809);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df810);
df817 = _mm512_permutexvar_ps(eo54, df817);
df818 = _mm512_permutexvar_ps(eo54, df818);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df817);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df818);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df817);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df818);
df811 = _mm512_permutexvar_ps(eo54, df811);
df812 = _mm512_permutexvar_ps(eo54, df812);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df811);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df812);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df811);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df812);
df819 = _mm512_permutexvar_ps(eo54, df819);
df820 = _mm512_permutexvar_ps(eo54, df820);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df819);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df820);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df819);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df820);
// The post-processed vectors (df805, df806, df813, df814) are stored without the eo54
// de-interleave; they go to the lowest base offsets (0, 64, 2097152, 2097216 and partners).
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df805);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df806);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df805);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df806);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df813);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+384*k108+128*m54+32*f57, 255, df814);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df813);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+384*k108+128*m54+32*f57, 65280, df814);
}
++j31;
rel19 = 2;
}
ptrdiff_t h42 = base19+42;
ptrdiff_t w51 = 0;
ptrdiff_t k109 = 8*s26;
ptrdiff_t kk36 = k109+7;
for (; k109 <= kk36; ++k109) {
ptrdiff_t b66 = 0;
ptrdiff_t m55 = (size_t)b66/2;
ptrdiff_t f58 = (size_t)b66%2;
__m512 dat1846 = _mm512_maskz_loadu_ps(65534, datPtr18+0+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1847 = _mm512_maskz_loadu_ps(65534, datPtr18+224+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1848 = _mm512_maskz_loadu_ps(65534, datPtr18+448+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1849 = _mm512_maskz_loadu_ps(65534, datPtr18+672+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1850 = _mm512_maskz_loadu_ps(65534, datPtr18+896+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1851 = _mm512_maskz_loadu_ps(65534, datPtr18+1120+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1852 = _mm512_maskz_loadu_ps(65534, datPtr18+1344+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1853 = _mm512_maskz_loadu_ps(65534, datPtr18+1568+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1854 = _mm512_maskz_loadu_ps(65534, datPtr18+1792+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1855 = _mm512_maskz_loadu_ps(65534, datPtr18+2016+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1856 = _mm512_maskz_loadu_ps(65534, datPtr18+2240+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1857 = _mm512_maskz_loadu_ps(65534, datPtr18+2464+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1858 = _mm512_maskz_loadu_ps(65534, datPtr18+2688+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1859 = _mm512_maskz_loadu_ps(65534, datPtr18+2912+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 dat1860 = _mm512_maskz_loadu_ps(65534, datPtr18+3136+100864*i38+12608*k109+224*h42+4*w51+0*b66);
__m512 fft9073 = _mm512_add_ps(dat1846, dat1854);
__m512 fft9161 = _mm512_add_ps(dat1847, dat1855);
__m512 fft9074 = _mm512_sub_ps(dat1846, dat1854);
__m512 fft9162 = _mm512_sub_ps(dat1847, dat1855);
__m512 fft9075 = _mm512_add_ps(dat1848, dat1856);
__m512 fft9163 = _mm512_add_ps(dat1849, dat1857);
__m512 fft9076 = _mm512_sub_ps(dat1848, dat1856);
__m512 fft9164 = _mm512_sub_ps(dat1849, dat1857);
__m512 fft9077 = _mm512_add_ps(dat1850, dat1858);
__m512 fft9165 = _mm512_add_ps(dat1851, dat1859);
__m512 fft9078 = _mm512_sub_ps(dat1850, dat1858);
__m512 fft9166 = _mm512_sub_ps(dat1851, dat1859);
__m512 fft9079 = _mm512_add_ps(dat1852, dat1860);
__m512 fft9167 = _mm512_add_ps(dat1853, _mm512_setzero_ps());
__m512 fft9080 = _mm512_sub_ps(dat1852, dat1860);
__m512 fft9168 = _mm512_sub_ps(dat1853, _mm512_setzero_ps());
__m512 fft9081 = _mm512_add_ps(fft9073, fft9077);
__m512 fft9169 = _mm512_add_ps(fft9161, fft9165);
__m512 fft9082 = _mm512_sub_ps(fft9073, fft9077);
__m512 fft9170 = _mm512_sub_ps(fft9161, fft9165);
__m512 fft9083 = _mm512_add_ps(fft9075, fft9079);
__m512 fft9171 = _mm512_add_ps(fft9163, fft9167);
__m512 fft9084 = _mm512_sub_ps(fft9079, fft9075);
__m512 fft9172 = _mm512_sub_ps(fft9167, fft9163);
__m512 fft9085 = _mm512_sub_ps(fft9076, fft9080);
__m512 fft9173 = _mm512_sub_ps(fft9164, fft9168);
__m512 fft9086 = _mm512_add_ps(fft9076, fft9080);
__m512 fft9174 = _mm512_add_ps(fft9164, fft9168);
__m512 fft9087 = _mm512_add_ps(fft9081, fft9083);
__m512 fft9175 = _mm512_add_ps(fft9169, fft9171);
__m512 fft9088 = _mm512_sub_ps(fft9081, fft9083);
__m512 fft9176 = _mm512_sub_ps(fft9169, fft9171);
__m512 fft9089 = _mm512_fmadd_ps(fft9085, _mm512_set1_ps(7.0710677e-01f), fft9074);
__m512 fft9177 = _mm512_fmadd_ps(fft9173, _mm512_set1_ps(7.0710677e-01f), fft9162);
__m512 fft9090 = _mm512_fnmsub_ps(fft9086, _mm512_set1_ps(7.0710677e-01f), fft9078);
__m512 fft9178 = _mm512_fnmsub_ps(fft9174, _mm512_set1_ps(7.0710677e-01f), fft9166);
__m512 fft9091 = _mm512_fnmadd_ps(fft9085, _mm512_set1_ps(7.0710677e-01f), fft9074);
__m512 fft9179 = _mm512_fnmadd_ps(fft9173, _mm512_set1_ps(7.0710677e-01f), fft9162);
__m512 fft9092 = _mm512_fnmadd_ps(fft9086, _mm512_set1_ps(7.0710677e-01f), fft9078);
__m512 fft9180 = _mm512_fnmadd_ps(fft9174, _mm512_set1_ps(7.0710677e-01f), fft9166);
__m512 fft9093 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9094 = _mm512_fmadd_ps(fft9087, fft9093, _mm512_shuffle_f32x4(fft9087, fft9087, 78));
__m512 fft9181 = _mm512_fmadd_ps(fft9175, fft9093, _mm512_shuffle_f32x4(fft9175, fft9175, 78));
__m512 fft9095 = _mm512_fmadd_ps(fft9088, fft9093, _mm512_shuffle_f32x4(fft9088, fft9088, 78));
__m512 fft9182 = _mm512_fmadd_ps(fft9176, fft9093, _mm512_shuffle_f32x4(fft9176, fft9176, 78));
__m512 fft9096 = _mm512_fmadd_ps(fft9089, fft9093, _mm512_shuffle_f32x4(fft9089, fft9089, 78));
__m512 fft9183 = _mm512_fmadd_ps(fft9177, fft9093, _mm512_shuffle_f32x4(fft9177, fft9177, 78));
__m512 fft9097 = _mm512_fmadd_ps(fft9090, fft9093, _mm512_shuffle_f32x4(fft9090, fft9090, 78));
__m512 fft9184 = _mm512_fmadd_ps(fft9178, fft9093, _mm512_shuffle_f32x4(fft9178, fft9178, 78));
__m512 fft9098 = _mm512_fmadd_ps(fft9082, fft9093, _mm512_shuffle_f32x4(fft9082, fft9082, 78));
__m512 fft9185 = _mm512_fmadd_ps(fft9170, fft9093, _mm512_shuffle_f32x4(fft9170, fft9170, 78));
__m512 fft9099 = _mm512_fmadd_ps(fft9084, fft9093, _mm512_shuffle_f32x4(fft9084, fft9084, 78));
__m512 fft9186 = _mm512_fmadd_ps(fft9172, fft9093, _mm512_shuffle_f32x4(fft9172, fft9172, 78));
__m512 fft9100 = _mm512_fmadd_ps(fft9091, fft9093, _mm512_shuffle_f32x4(fft9091, fft9091, 78));
__m512 fft9187 = _mm512_fmadd_ps(fft9179, fft9093, _mm512_shuffle_f32x4(fft9179, fft9179, 78));
__m512 fft9101 = _mm512_fmadd_ps(fft9092, fft9093, _mm512_shuffle_f32x4(fft9092, fft9092, 78));
__m512 fft9188 = _mm512_fmadd_ps(fft9180, fft9093, _mm512_shuffle_f32x4(fft9180, fft9180, 78));
__m512 fft9102 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9103 = _mm512_mul_ps(fft9094, fft9102);
__m512 fft9189 = _mm512_mul_ps(fft9181, fft9102);
__m512 fft9104 = _mm512_mul_ps(fft9095, fft9102);
__m512 fft9190 = _mm512_mul_ps(fft9182, fft9102);
__m512 fft9105 = _mm512_mul_ps(fft9096, fft9102);
__m512 fft9191 = _mm512_mul_ps(fft9183, fft9102);
__m512 fft9106 = _mm512_mul_ps(fft9097, fft9102);
__m512 fft9192 = _mm512_mul_ps(fft9184, fft9102);
__m512 fft9107 = _mm512_mul_ps(fft9098, fft9102);
__m512 fft9193 = _mm512_mul_ps(fft9185, fft9102);
__m512 fft9108 = _mm512_mul_ps(fft9099, fft9102);
__m512 fft9194 = _mm512_mul_ps(fft9186, fft9102);
__m512 fft9109 = _mm512_mul_ps(fft9100, fft9102);
__m512 fft9195 = _mm512_mul_ps(fft9187, fft9102);
__m512 fft9110 = _mm512_mul_ps(fft9101, fft9102);
__m512 fft9196 = _mm512_mul_ps(fft9188, fft9102);
__m512 fft9111 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9112 = _mm512_fmadd_ps(fft9095, fft9111, fft9103);
__m512 fft9197 = _mm512_fmadd_ps(fft9182, fft9111, fft9189);
__m512 fft9113 = _mm512_fnmadd_ps(fft9094, fft9111, fft9104);
__m512 fft9198 = _mm512_fnmadd_ps(fft9181, fft9111, fft9190);
__m512 fft9114 = _mm512_fmadd_ps(fft9097, fft9111, fft9105);
__m512 fft9199 = _mm512_fmadd_ps(fft9184, fft9111, fft9191);
__m512 fft9115 = _mm512_fnmadd_ps(fft9096, fft9111, fft9106);
__m512 fft9200 = _mm512_fnmadd_ps(fft9183, fft9111, fft9192);
__m512 fft9116 = _mm512_fmadd_ps(fft9099, fft9111, fft9107);
__m512 fft9201 = _mm512_fmadd_ps(fft9186, fft9111, fft9193);
__m512 fft9117 = _mm512_fnmadd_ps(fft9098, fft9111, fft9108);
__m512 fft9202 = _mm512_fnmadd_ps(fft9185, fft9111, fft9194);
__m512 fft9118 = _mm512_fmadd_ps(fft9101, fft9111, fft9109);
__m512 fft9203 = _mm512_fmadd_ps(fft9188, fft9111, fft9195);
__m512 fft9119 = _mm512_fnmadd_ps(fft9100, fft9111, fft9110);
__m512 fft9204 = _mm512_fnmadd_ps(fft9187, fft9111, fft9196);
__m512 fft9120 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9121 = _mm512_fmadd_ps(fft9112, fft9120, _mm512_shuffle_f32x4(fft9112, fft9112, 177));
__m512 fft9205 = _mm512_fmadd_ps(fft9197, fft9120, _mm512_shuffle_f32x4(fft9197, fft9197, 177));
__m512 fft9122 = _mm512_fmadd_ps(fft9113, fft9120, _mm512_shuffle_f32x4(fft9113, fft9113, 177));
__m512 fft9206 = _mm512_fmadd_ps(fft9198, fft9120, _mm512_shuffle_f32x4(fft9198, fft9198, 177));
__m512 fft9123 = _mm512_fmadd_ps(fft9114, fft9120, _mm512_shuffle_f32x4(fft9114, fft9114, 177));
__m512 fft9207 = _mm512_fmadd_ps(fft9199, fft9120, _mm512_shuffle_f32x4(fft9199, fft9199, 177));
__m512 fft9124 = _mm512_fmadd_ps(fft9115, fft9120, _mm512_shuffle_f32x4(fft9115, fft9115, 177));
__m512 fft9208 = _mm512_fmadd_ps(fft9200, fft9120, _mm512_shuffle_f32x4(fft9200, fft9200, 177));
__m512 fft9125 = _mm512_fmadd_ps(fft9116, fft9120, _mm512_shuffle_f32x4(fft9116, fft9116, 177));
__m512 fft9209 = _mm512_fmadd_ps(fft9201, fft9120, _mm512_shuffle_f32x4(fft9201, fft9201, 177));
__m512 fft9126 = _mm512_fmadd_ps(fft9117, fft9120, _mm512_shuffle_f32x4(fft9117, fft9117, 177));
__m512 fft9210 = _mm512_fmadd_ps(fft9202, fft9120, _mm512_shuffle_f32x4(fft9202, fft9202, 177));
__m512 fft9127 = _mm512_fmadd_ps(fft9118, fft9120, _mm512_shuffle_f32x4(fft9118, fft9118, 177));
__m512 fft9211 = _mm512_fmadd_ps(fft9203, fft9120, _mm512_shuffle_f32x4(fft9203, fft9203, 177));
__m512 fft9128 = _mm512_fmadd_ps(fft9119, fft9120, _mm512_shuffle_f32x4(fft9119, fft9119, 177));
__m512 fft9212 = _mm512_fmadd_ps(fft9204, fft9120, _mm512_shuffle_f32x4(fft9204, fft9204, 177));
__m512 fft9129 = _mm512_mask_mov_ps(fft9121, 49344, fft9122);
__m512 fft9213 = _mm512_mask_mov_ps(fft9205, 49344, fft9206);
__m512 fft9130 = _mm512_mask_sub_ps(fft9122, 49344, _mm512_setzero_ps(), fft9121);
__m512 fft9214 = _mm512_mask_sub_ps(fft9206, 49344, _mm512_setzero_ps(), fft9205);
__m512 fft9131 = _mm512_mask_mov_ps(fft9123, 49344, fft9124);
__m512 fft9215 = _mm512_mask_mov_ps(fft9207, 49344, fft9208);
__m512 fft9132 = _mm512_mask_sub_ps(fft9124, 49344, _mm512_setzero_ps(), fft9123);
__m512 fft9216 = _mm512_mask_sub_ps(fft9208, 49344, _mm512_setzero_ps(), fft9207);
__m512 fft9133 = _mm512_mask_mov_ps(fft9125, 49344, fft9126);
__m512 fft9217 = _mm512_mask_mov_ps(fft9209, 49344, fft9210);
__m512 fft9134 = _mm512_mask_sub_ps(fft9126, 49344, _mm512_setzero_ps(), fft9125);
__m512 fft9218 = _mm512_mask_sub_ps(fft9210, 49344, _mm512_setzero_ps(), fft9209);
__m512 fft9135 = _mm512_mask_mov_ps(fft9127, 49344, fft9128);
__m512 fft9219 = _mm512_mask_mov_ps(fft9211, 49344, fft9212);
__m512 fft9136 = _mm512_mask_sub_ps(fft9128, 49344, _mm512_setzero_ps(), fft9127);
__m512 fft9220 = _mm512_mask_sub_ps(fft9212, 49344, _mm512_setzero_ps(), fft9211);
__m512 fft9137 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9138 = _mm512_fmadd_ps(fft9129, fft9137, _mm512_shuffle_ps(fft9129, fft9129, 78));
__m512 fft9221 = _mm512_fmadd_ps(fft9213, fft9137, _mm512_shuffle_ps(fft9213, fft9213, 78));
__m512 fft9139 = _mm512_fmadd_ps(fft9130, fft9137, _mm512_shuffle_ps(fft9130, fft9130, 78));
__m512 fft9222 = _mm512_fmadd_ps(fft9214, fft9137, _mm512_shuffle_ps(fft9214, fft9214, 78));
__m512 fft9140 = _mm512_fmadd_ps(fft9131, fft9137, _mm512_shuffle_ps(fft9131, fft9131, 78));
__m512 fft9223 = _mm512_fmadd_ps(fft9215, fft9137, _mm512_shuffle_ps(fft9215, fft9215, 78));
__m512 fft9141 = _mm512_fmadd_ps(fft9132, fft9137, _mm512_shuffle_ps(fft9132, fft9132, 78));
__m512 fft9224 = _mm512_fmadd_ps(fft9216, fft9137, _mm512_shuffle_ps(fft9216, fft9216, 78));
__m512 fft9142 = _mm512_fmadd_ps(fft9133, fft9137, _mm512_shuffle_ps(fft9133, fft9133, 78));
__m512 fft9225 = _mm512_fmadd_ps(fft9217, fft9137, _mm512_shuffle_ps(fft9217, fft9217, 78));
__m512 fft9143 = _mm512_fmadd_ps(fft9134, fft9137, _mm512_shuffle_ps(fft9134, fft9134, 78));
__m512 fft9226 = _mm512_fmadd_ps(fft9218, fft9137, _mm512_shuffle_ps(fft9218, fft9218, 78));
__m512 fft9144 = _mm512_fmadd_ps(fft9135, fft9137, _mm512_shuffle_ps(fft9135, fft9135, 78));
__m512 fft9227 = _mm512_fmadd_ps(fft9219, fft9137, _mm512_shuffle_ps(fft9219, fft9219, 78));
__m512 fft9145 = _mm512_fmadd_ps(fft9136, fft9137, _mm512_shuffle_ps(fft9136, fft9136, 78));
__m512 fft9228 = _mm512_fmadd_ps(fft9220, fft9137, _mm512_shuffle_ps(fft9220, fft9220, 78));
__m512i fft9146 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9147 = _mm512_permutexvar_ps(fft9146, fft9138);
__m512 fft9229 = _mm512_permutexvar_ps(fft9146, fft9221);
__m512i fft9148 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9149 = _mm512_permutexvar_ps(fft9148, fft9138);
__m512 fft9230 = _mm512_permutexvar_ps(fft9148, fft9221);
__m512 fft9150 = _mm512_permutexvar_ps(fft9146, fft9139);
__m512 fft9231 = _mm512_permutexvar_ps(fft9146, fft9222);
__m512 fft9151 = _mm512_permutexvar_ps(fft9148, fft9139);
__m512 fft9232 = _mm512_permutexvar_ps(fft9148, fft9222);
__m512 fft9152 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9153 = _mm512_fmadd_ps(fft9147, fft9152, fft9149);
__m512 fft9233 = _mm512_fmadd_ps(fft9229, fft9152, fft9230);
__m512 fft9154 = _mm512_fnmadd_ps(fft9151, fft9152, fft9150);
__m512 fft9234 = _mm512_fnmadd_ps(fft9232, fft9152, fft9231);
__m512 fft9155 = _mm512_mask_mov_ps(fft9151, 21845, fft9153);
__m512 fft9235 = _mm512_mask_mov_ps(fft9232, 21845, fft9233);
__m512 fft9156 = _mm512_mask_mov_ps(fft9147, 43176, fft9153);
__m512 fft9236 = _mm512_mask_mov_ps(fft9229, 43176, fft9233);
__m512 fft9157 = _mm512_mask_mov_ps(fft9155, 43176, fft9154);
__m512 fft9237 = _mm512_mask_mov_ps(fft9235, 43176, fft9234);
__m512 fft9158 = _mm512_mask_mov_ps(fft9156, 22102, fft9154);
__m512 fft9238 = _mm512_mask_mov_ps(fft9236, 22102, fft9234);
__m512 fft9159 = _mm512_mask_mul_ps(fft9157, 64764, fft9157, _mm512_set1_ps(5e-01f));
__m512 fft9239 = _mm512_mask_mul_ps(fft9237, 64764, fft9237, _mm512_set1_ps(5e-01f));
__m512 fft9160 = _mm512_mask_mul_ps(fft9158, 64764, fft9158, _mm512_set1_ps(5e-01f));
__m512 fft9240 = _mm512_mask_mul_ps(fft9238, 64764, fft9238, _mm512_set1_ps(5e-01f));
__m512 df821 = fft9159;
__m512 df829 = fft9239;
__m512 df822 = fft9160;
__m512 df830 = fft9240;
__m512 df823 = fft9140;
__m512 df831 = fft9223;
__m512 df824 = fft9141;
__m512 df832 = fft9224;
__m512 df825 = fft9142;
__m512 df833 = fft9225;
__m512 df826 = fft9143;
__m512 df834 = fft9226;
__m512 df827 = fft9144;
__m512 df835 = fft9227;
__m512 df828 = fft9145;
__m512 df836 = fft9228;
__m512i eo55 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df823 = _mm512_permutexvar_ps(eo55, df823);
df824 = _mm512_permutexvar_ps(eo55, df824);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df823);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df824);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df823);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df824);
df831 = _mm512_permutexvar_ps(eo55, df831);
df832 = _mm512_permutexvar_ps(eo55, df832);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df831);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df832);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df831);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df832);
df825 = _mm512_permutexvar_ps(eo55, df825);
df826 = _mm512_permutexvar_ps(eo55, df826);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df825);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df826);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df825);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df826);
df833 = _mm512_permutexvar_ps(eo55, df833);
df834 = _mm512_permutexvar_ps(eo55, df834);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df833);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df834);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df833);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df834);
df827 = _mm512_permutexvar_ps(eo55, df827);
df828 = _mm512_permutexvar_ps(eo55, df828);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df827);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df828);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df827);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df828);
df835 = _mm512_permutexvar_ps(eo55, df835);
df836 = _mm512_permutexvar_ps(eo55, df836);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df835);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df836);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df835);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df836);
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df821);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df822);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df821);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df822);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df829);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+256*k109+128*m55+32*f58, 255, df830);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df829);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+256*k109+128*m55+32*f58, 65280, df830);
// Runs for b67 = 1 and b67 = 2. Each pass loads 15 vectors from datPtr18, pushes
// them through a fixed sequence of add/sub butterflies, sign-flip shuffles and
// constant multiplies, and writes the results into dfPtr8 with masked stores.
// NOTE(review): the fft* names suggest a real-input FFT of a data tile, but
// nothing in this view confirms that reading.
// NOTE(review): i38, j31, k109, h42, w51, datPtr18 and dfPtr8 are declared
// outside this view, so the units of the address arithmetic (bytes vs. floats)
// must be confirmed against those declarations.
for (ptrdiff_t b67 = 1; b67 < 3; ++b67) {
// Quotient and remainder of b67 by 2; inside this loop they are used only in
// the dfPtr8 store offsets (128*m56 + 32*f59).
ptrdiff_t m56 = (size_t)b67/2;
ptrdiff_t f59 = (size_t)b67%2;
// Load 15 vectors with all 16 lanes enabled (mask 65535 = 0xFFFF). Successive
// loads are 224 apart in offset; b67 contributes 56 per pass.
__m512 dat1861 = _mm512_maskz_loadu_ps(65535, datPtr18+0+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1862 = _mm512_maskz_loadu_ps(65535, datPtr18+224+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1863 = _mm512_maskz_loadu_ps(65535, datPtr18+448+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1864 = _mm512_maskz_loadu_ps(65535, datPtr18+672+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1865 = _mm512_maskz_loadu_ps(65535, datPtr18+896+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1866 = _mm512_maskz_loadu_ps(65535, datPtr18+1120+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1867 = _mm512_maskz_loadu_ps(65535, datPtr18+1344+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1868 = _mm512_maskz_loadu_ps(65535, datPtr18+1568+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1869 = _mm512_maskz_loadu_ps(65535, datPtr18+1792+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1870 = _mm512_maskz_loadu_ps(65535, datPtr18+2016+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1871 = _mm512_maskz_loadu_ps(65535, datPtr18+2240+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1872 = _mm512_maskz_loadu_ps(65535, datPtr18+2464+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1873 = _mm512_maskz_loadu_ps(65535, datPtr18+2688+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1874 = _mm512_maskz_loadu_ps(65535, datPtr18+2912+100864*i38+12608*k109+224*h42+4*w51+56*b67);
__m512 dat1875 = _mm512_maskz_loadu_ps(65535, datPtr18+3136+100864*i38+12608*k109+224*h42+4*w51+56*b67);
// First butterfly layer: the n-th loaded vector is paired with the (n+8)-th.
// Two interleaved chains are built: fft9241.. from the 1st/3rd/5th/7th loads,
// fft9329.. from the 2nd/4th/6th/8th. dat1868 has no partner because only 15
// vectors were loaded, so a zero vector stands in for the missing 16th.
__m512 fft9241 = _mm512_add_ps(dat1861, dat1869);
__m512 fft9329 = _mm512_add_ps(dat1862, dat1870);
__m512 fft9242 = _mm512_sub_ps(dat1861, dat1869);
__m512 fft9330 = _mm512_sub_ps(dat1862, dat1870);
__m512 fft9243 = _mm512_add_ps(dat1863, dat1871);
__m512 fft9331 = _mm512_add_ps(dat1864, dat1872);
__m512 fft9244 = _mm512_sub_ps(dat1863, dat1871);
__m512 fft9332 = _mm512_sub_ps(dat1864, dat1872);
__m512 fft9245 = _mm512_add_ps(dat1865, dat1873);
__m512 fft9333 = _mm512_add_ps(dat1866, dat1874);
__m512 fft9246 = _mm512_sub_ps(dat1865, dat1873);
__m512 fft9334 = _mm512_sub_ps(dat1866, dat1874);
__m512 fft9247 = _mm512_add_ps(dat1867, dat1875);
__m512 fft9335 = _mm512_add_ps(dat1868, _mm512_setzero_ps());
__m512 fft9248 = _mm512_sub_ps(dat1867, dat1875);
__m512 fft9336 = _mm512_sub_ps(dat1868, _mm512_setzero_ps());
// Second layer: combine first-layer results within each chain. Note the
// reversed operand order for fft9252/fft9340 (later term minus earlier term).
__m512 fft9249 = _mm512_add_ps(fft9241, fft9245);
__m512 fft9337 = _mm512_add_ps(fft9329, fft9333);
__m512 fft9250 = _mm512_sub_ps(fft9241, fft9245);
__m512 fft9338 = _mm512_sub_ps(fft9329, fft9333);
__m512 fft9251 = _mm512_add_ps(fft9243, fft9247);
__m512 fft9339 = _mm512_add_ps(fft9331, fft9335);
__m512 fft9252 = _mm512_sub_ps(fft9247, fft9243);
__m512 fft9340 = _mm512_sub_ps(fft9335, fft9331);
__m512 fft9253 = _mm512_sub_ps(fft9244, fft9248);
__m512 fft9341 = _mm512_sub_ps(fft9332, fft9336);
__m512 fft9254 = _mm512_add_ps(fft9244, fft9248);
__m512 fft9342 = _mm512_add_ps(fft9332, fft9336);
// Third layer: final sums/differences; the cross terms are scaled by
// 7.0710677e-01f (approximately 1/sqrt(2)) via fused multiply-add variants.
__m512 fft9255 = _mm512_add_ps(fft9249, fft9251);
__m512 fft9343 = _mm512_add_ps(fft9337, fft9339);
__m512 fft9256 = _mm512_sub_ps(fft9249, fft9251);
__m512 fft9344 = _mm512_sub_ps(fft9337, fft9339);
__m512 fft9257 = _mm512_fmadd_ps(fft9253, _mm512_set1_ps(7.0710677e-01f), fft9242);
__m512 fft9345 = _mm512_fmadd_ps(fft9341, _mm512_set1_ps(7.0710677e-01f), fft9330);
__m512 fft9258 = _mm512_fnmsub_ps(fft9254, _mm512_set1_ps(7.0710677e-01f), fft9246);
__m512 fft9346 = _mm512_fnmsub_ps(fft9342, _mm512_set1_ps(7.0710677e-01f), fft9334);
__m512 fft9259 = _mm512_fnmadd_ps(fft9253, _mm512_set1_ps(7.0710677e-01f), fft9242);
__m512 fft9347 = _mm512_fnmadd_ps(fft9341, _mm512_set1_ps(7.0710677e-01f), fft9330);
__m512 fft9260 = _mm512_fnmadd_ps(fft9254, _mm512_set1_ps(7.0710677e-01f), fft9246);
__m512 fft9348 = _mm512_fnmadd_ps(fft9342, _mm512_set1_ps(7.0710677e-01f), fft9334);
// In-register butterfly across 256-bit halves: shuffle_f32x4 with imm 78
// exchanges the two halves, and the sign vector is +1 in lanes 0-7 and -1 in
// lanes 8-15. Lanes 0-7 become low+high, lanes 8-15 become low-high.
__m512 fft9261 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9262 = _mm512_fmadd_ps(fft9255, fft9261, _mm512_shuffle_f32x4(fft9255, fft9255, 78));
__m512 fft9349 = _mm512_fmadd_ps(fft9343, fft9261, _mm512_shuffle_f32x4(fft9343, fft9343, 78));
__m512 fft9263 = _mm512_fmadd_ps(fft9256, fft9261, _mm512_shuffle_f32x4(fft9256, fft9256, 78));
__m512 fft9350 = _mm512_fmadd_ps(fft9344, fft9261, _mm512_shuffle_f32x4(fft9344, fft9344, 78));
__m512 fft9264 = _mm512_fmadd_ps(fft9257, fft9261, _mm512_shuffle_f32x4(fft9257, fft9257, 78));
__m512 fft9351 = _mm512_fmadd_ps(fft9345, fft9261, _mm512_shuffle_f32x4(fft9345, fft9345, 78));
__m512 fft9265 = _mm512_fmadd_ps(fft9258, fft9261, _mm512_shuffle_f32x4(fft9258, fft9258, 78));
__m512 fft9352 = _mm512_fmadd_ps(fft9346, fft9261, _mm512_shuffle_f32x4(fft9346, fft9346, 78));
__m512 fft9266 = _mm512_fmadd_ps(fft9250, fft9261, _mm512_shuffle_f32x4(fft9250, fft9250, 78));
__m512 fft9353 = _mm512_fmadd_ps(fft9338, fft9261, _mm512_shuffle_f32x4(fft9338, fft9338, 78));
__m512 fft9267 = _mm512_fmadd_ps(fft9252, fft9261, _mm512_shuffle_f32x4(fft9252, fft9252, 78));
__m512 fft9354 = _mm512_fmadd_ps(fft9340, fft9261, _mm512_shuffle_f32x4(fft9340, fft9340, 78));
__m512 fft9268 = _mm512_fmadd_ps(fft9259, fft9261, _mm512_shuffle_f32x4(fft9259, fft9259, 78));
__m512 fft9355 = _mm512_fmadd_ps(fft9347, fft9261, _mm512_shuffle_f32x4(fft9347, fft9347, 78));
__m512 fft9269 = _mm512_fmadd_ps(fft9260, fft9261, _mm512_shuffle_f32x4(fft9260, fft9260, 78));
__m512 fft9356 = _mm512_fmadd_ps(fft9348, fft9261, _mm512_shuffle_f32x4(fft9348, fft9348, 78));
// Per-lane constant multiply across register pairs (p, q) = consecutive
// registers of the previous step: first p*fft9270 and q*fft9270 are formed,
// then p' = q*fft9279 + p*fft9270 and q' = -(p*fft9279) + q*fft9270.
// Lanes 0-7 pass through unchanged because fft9270 is 1 and fft9279 is 0 there.
// NOTE(review): this has the shape of a complex multiply with p/q as the two
// components - confirm against the generator.
__m512 fft9270 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9271 = _mm512_mul_ps(fft9262, fft9270);
__m512 fft9357 = _mm512_mul_ps(fft9349, fft9270);
__m512 fft9272 = _mm512_mul_ps(fft9263, fft9270);
__m512 fft9358 = _mm512_mul_ps(fft9350, fft9270);
__m512 fft9273 = _mm512_mul_ps(fft9264, fft9270);
__m512 fft9359 = _mm512_mul_ps(fft9351, fft9270);
__m512 fft9274 = _mm512_mul_ps(fft9265, fft9270);
__m512 fft9360 = _mm512_mul_ps(fft9352, fft9270);
__m512 fft9275 = _mm512_mul_ps(fft9266, fft9270);
__m512 fft9361 = _mm512_mul_ps(fft9353, fft9270);
__m512 fft9276 = _mm512_mul_ps(fft9267, fft9270);
__m512 fft9362 = _mm512_mul_ps(fft9354, fft9270);
__m512 fft9277 = _mm512_mul_ps(fft9268, fft9270);
__m512 fft9363 = _mm512_mul_ps(fft9355, fft9270);
__m512 fft9278 = _mm512_mul_ps(fft9269, fft9270);
__m512 fft9364 = _mm512_mul_ps(fft9356, fft9270);
__m512 fft9279 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9280 = _mm512_fmadd_ps(fft9263, fft9279, fft9271);
__m512 fft9365 = _mm512_fmadd_ps(fft9350, fft9279, fft9357);
__m512 fft9281 = _mm512_fnmadd_ps(fft9262, fft9279, fft9272);
__m512 fft9366 = _mm512_fnmadd_ps(fft9349, fft9279, fft9358);
__m512 fft9282 = _mm512_fmadd_ps(fft9265, fft9279, fft9273);
__m512 fft9367 = _mm512_fmadd_ps(fft9352, fft9279, fft9359);
__m512 fft9283 = _mm512_fnmadd_ps(fft9264, fft9279, fft9274);
__m512 fft9368 = _mm512_fnmadd_ps(fft9351, fft9279, fft9360);
__m512 fft9284 = _mm512_fmadd_ps(fft9267, fft9279, fft9275);
__m512 fft9369 = _mm512_fmadd_ps(fft9354, fft9279, fft9361);
__m512 fft9285 = _mm512_fnmadd_ps(fft9266, fft9279, fft9276);
__m512 fft9370 = _mm512_fnmadd_ps(fft9353, fft9279, fft9362);
__m512 fft9286 = _mm512_fmadd_ps(fft9269, fft9279, fft9277);
__m512 fft9371 = _mm512_fmadd_ps(fft9356, fft9279, fft9363);
__m512 fft9287 = _mm512_fnmadd_ps(fft9268, fft9279, fft9278);
__m512 fft9372 = _mm512_fnmadd_ps(fft9355, fft9279, fft9364);
// In-register butterfly across adjacent 128-bit lanes: shuffle_f32x4 with
// imm 177 swaps lane 0 with 1 and lane 2 with 3; the sign vector is +1 on the
// even 128-bit lanes and -1 on the odd ones (sum in even, difference in odd).
__m512 fft9288 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9289 = _mm512_fmadd_ps(fft9280, fft9288, _mm512_shuffle_f32x4(fft9280, fft9280, 177));
__m512 fft9373 = _mm512_fmadd_ps(fft9365, fft9288, _mm512_shuffle_f32x4(fft9365, fft9365, 177));
__m512 fft9290 = _mm512_fmadd_ps(fft9281, fft9288, _mm512_shuffle_f32x4(fft9281, fft9281, 177));
__m512 fft9374 = _mm512_fmadd_ps(fft9366, fft9288, _mm512_shuffle_f32x4(fft9366, fft9366, 177));
__m512 fft9291 = _mm512_fmadd_ps(fft9282, fft9288, _mm512_shuffle_f32x4(fft9282, fft9282, 177));
__m512 fft9375 = _mm512_fmadd_ps(fft9367, fft9288, _mm512_shuffle_f32x4(fft9367, fft9367, 177));
__m512 fft9292 = _mm512_fmadd_ps(fft9283, fft9288, _mm512_shuffle_f32x4(fft9283, fft9283, 177));
__m512 fft9376 = _mm512_fmadd_ps(fft9368, fft9288, _mm512_shuffle_f32x4(fft9368, fft9368, 177));
__m512 fft9293 = _mm512_fmadd_ps(fft9284, fft9288, _mm512_shuffle_f32x4(fft9284, fft9284, 177));
__m512 fft9377 = _mm512_fmadd_ps(fft9369, fft9288, _mm512_shuffle_f32x4(fft9369, fft9369, 177));
__m512 fft9294 = _mm512_fmadd_ps(fft9285, fft9288, _mm512_shuffle_f32x4(fft9285, fft9285, 177));
__m512 fft9378 = _mm512_fmadd_ps(fft9370, fft9288, _mm512_shuffle_f32x4(fft9370, fft9370, 177));
__m512 fft9295 = _mm512_fmadd_ps(fft9286, fft9288, _mm512_shuffle_f32x4(fft9286, fft9286, 177));
__m512 fft9379 = _mm512_fmadd_ps(fft9371, fft9288, _mm512_shuffle_f32x4(fft9371, fft9371, 177));
__m512 fft9296 = _mm512_fmadd_ps(fft9287, fft9288, _mm512_shuffle_f32x4(fft9287, fft9287, 177));
__m512 fft9380 = _mm512_fmadd_ps(fft9372, fft9288, _mm512_shuffle_f32x4(fft9372, fft9372, 177));
// Mask 49344 = 0xC0C0 selects lanes 6, 7, 14 and 15. For each register pair
// (p, q): the mask_mov result is p with those lanes taken from q; the
// mask_sub result is q with those lanes replaced by 0 - p (negated p).
__m512 fft9297 = _mm512_mask_mov_ps(fft9289, 49344, fft9290);
__m512 fft9381 = _mm512_mask_mov_ps(fft9373, 49344, fft9374);
__m512 fft9298 = _mm512_mask_sub_ps(fft9290, 49344, _mm512_setzero_ps(), fft9289);
__m512 fft9382 = _mm512_mask_sub_ps(fft9374, 49344, _mm512_setzero_ps(), fft9373);
__m512 fft9299 = _mm512_mask_mov_ps(fft9291, 49344, fft9292);
__m512 fft9383 = _mm512_mask_mov_ps(fft9375, 49344, fft9376);
__m512 fft9300 = _mm512_mask_sub_ps(fft9292, 49344, _mm512_setzero_ps(), fft9291);
__m512 fft9384 = _mm512_mask_sub_ps(fft9376, 49344, _mm512_setzero_ps(), fft9375);
__m512 fft9301 = _mm512_mask_mov_ps(fft9293, 49344, fft9294);
__m512 fft9385 = _mm512_mask_mov_ps(fft9377, 49344, fft9378);
__m512 fft9302 = _mm512_mask_sub_ps(fft9294, 49344, _mm512_setzero_ps(), fft9293);
__m512 fft9386 = _mm512_mask_sub_ps(fft9378, 49344, _mm512_setzero_ps(), fft9377);
__m512 fft9303 = _mm512_mask_mov_ps(fft9295, 49344, fft9296);
__m512 fft9387 = _mm512_mask_mov_ps(fft9379, 49344, fft9380);
__m512 fft9304 = _mm512_mask_sub_ps(fft9296, 49344, _mm512_setzero_ps(), fft9295);
__m512 fft9388 = _mm512_mask_sub_ps(fft9380, 49344, _mm512_setzero_ps(), fft9379);
// In-register butterfly inside each 128-bit lane: shuffle_ps with imm 78 swaps
// the lower and upper element pairs of every lane; the sign vector is +1 on
// the lower pair and -1 on the upper pair (sum in lower, difference in upper).
__m512 fft9305 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9306 = _mm512_fmadd_ps(fft9297, fft9305, _mm512_shuffle_ps(fft9297, fft9297, 78));
__m512 fft9389 = _mm512_fmadd_ps(fft9381, fft9305, _mm512_shuffle_ps(fft9381, fft9381, 78));
__m512 fft9307 = _mm512_fmadd_ps(fft9298, fft9305, _mm512_shuffle_ps(fft9298, fft9298, 78));
__m512 fft9390 = _mm512_fmadd_ps(fft9382, fft9305, _mm512_shuffle_ps(fft9382, fft9382, 78));
__m512 fft9308 = _mm512_fmadd_ps(fft9299, fft9305, _mm512_shuffle_ps(fft9299, fft9299, 78));
__m512 fft9391 = _mm512_fmadd_ps(fft9383, fft9305, _mm512_shuffle_ps(fft9383, fft9383, 78));
__m512 fft9309 = _mm512_fmadd_ps(fft9300, fft9305, _mm512_shuffle_ps(fft9300, fft9300, 78));
__m512 fft9392 = _mm512_fmadd_ps(fft9384, fft9305, _mm512_shuffle_ps(fft9384, fft9384, 78));
__m512 fft9310 = _mm512_fmadd_ps(fft9301, fft9305, _mm512_shuffle_ps(fft9301, fft9301, 78));
__m512 fft9393 = _mm512_fmadd_ps(fft9385, fft9305, _mm512_shuffle_ps(fft9385, fft9385, 78));
__m512 fft9311 = _mm512_fmadd_ps(fft9302, fft9305, _mm512_shuffle_ps(fft9302, fft9302, 78));
__m512 fft9394 = _mm512_fmadd_ps(fft9386, fft9305, _mm512_shuffle_ps(fft9386, fft9386, 78));
__m512 fft9312 = _mm512_fmadd_ps(fft9303, fft9305, _mm512_shuffle_ps(fft9303, fft9303, 78));
__m512 fft9395 = _mm512_fmadd_ps(fft9387, fft9305, _mm512_shuffle_ps(fft9387, fft9387, 78));
__m512 fft9313 = _mm512_fmadd_ps(fft9304, fft9305, _mm512_shuffle_ps(fft9304, fft9304, 78));
__m512 fft9396 = _mm512_fmadd_ps(fft9388, fft9305, _mm512_shuffle_ps(fft9388, fft9388, 78));
// Extra step applied only to the first register pair of each chain
// (fft9306/fft9307 and fft9389/fft9390): two lane permutations, a signed
// combine, a series of masked blends, then a masked multiply by 0.5.
// Mask values: 21845 = 0x5555, 43176 = 0xA8A8, 22102 = 0x5656, 64764 = 0xFCFC.
// The remaining pairs (fft9308.. / fft9391..) skip this step entirely.
__m512i fft9314 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9315 = _mm512_permutexvar_ps(fft9314, fft9306);
__m512 fft9397 = _mm512_permutexvar_ps(fft9314, fft9389);
__m512i fft9316 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9317 = _mm512_permutexvar_ps(fft9316, fft9306);
__m512 fft9398 = _mm512_permutexvar_ps(fft9316, fft9389);
__m512 fft9318 = _mm512_permutexvar_ps(fft9314, fft9307);
__m512 fft9399 = _mm512_permutexvar_ps(fft9314, fft9390);
__m512 fft9319 = _mm512_permutexvar_ps(fft9316, fft9307);
__m512 fft9400 = _mm512_permutexvar_ps(fft9316, fft9390);
__m512 fft9320 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9321 = _mm512_fmadd_ps(fft9315, fft9320, fft9317);
__m512 fft9401 = _mm512_fmadd_ps(fft9397, fft9320, fft9398);
__m512 fft9322 = _mm512_fnmadd_ps(fft9319, fft9320, fft9318);
__m512 fft9402 = _mm512_fnmadd_ps(fft9400, fft9320, fft9399);
__m512 fft9323 = _mm512_mask_mov_ps(fft9319, 21845, fft9321);
__m512 fft9403 = _mm512_mask_mov_ps(fft9400, 21845, fft9401);
__m512 fft9324 = _mm512_mask_mov_ps(fft9315, 43176, fft9321);
__m512 fft9404 = _mm512_mask_mov_ps(fft9397, 43176, fft9401);
__m512 fft9325 = _mm512_mask_mov_ps(fft9323, 43176, fft9322);
__m512 fft9405 = _mm512_mask_mov_ps(fft9403, 43176, fft9402);
__m512 fft9326 = _mm512_mask_mov_ps(fft9324, 22102, fft9322);
__m512 fft9406 = _mm512_mask_mov_ps(fft9404, 22102, fft9402);
__m512 fft9327 = _mm512_mask_mul_ps(fft9325, 64764, fft9325, _mm512_set1_ps(5e-01f));
__m512 fft9407 = _mm512_mask_mul_ps(fft9405, 64764, fft9405, _mm512_set1_ps(5e-01f));
__m512 fft9328 = _mm512_mask_mul_ps(fft9326, 64764, fft9326, _mm512_set1_ps(5e-01f));
__m512 fft9408 = _mm512_mask_mul_ps(fft9406, 64764, fft9406, _mm512_set1_ps(5e-01f));
// Rename the 16 result registers to df837..df852 for the store phase.
__m512 df837 = fft9327;
__m512 df845 = fft9407;
__m512 df838 = fft9328;
__m512 df846 = fft9408;
__m512 df839 = fft9308;
__m512 df847 = fft9391;
__m512 df840 = fft9309;
__m512 df848 = fft9392;
__m512 df841 = fft9310;
__m512 df849 = fft9393;
__m512 df842 = fft9311;
__m512 df850 = fft9394;
__m512 df843 = fft9312;
__m512 df851 = fft9395;
__m512 df844 = fft9313;
__m512 df852 = fft9396;
// eo56 moves the even-indexed elements into lanes 0-7 and the odd-indexed
// elements into lanes 8-15. Each permuted register is then written with two
// masked stores: mask 255 (0x00FF) writes lanes 0-7, mask 65280 (0xFF00)
// writes lanes 8-15. The constant offset of the 65280 store is always 1048544
// greater than that of the matching 255 store.
// NOTE(review): 1048544 = 1048576 - 32, which would cancel the 8-lane (32-byte)
// displacement of the upper half if dfPtr8 is byte-addressed - confirm against
// the declaration of dfPtr8.
__m512i eo56 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df839 = _mm512_permutexvar_ps(eo56, df839);
df840 = _mm512_permutexvar_ps(eo56, df840);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df839);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df840);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df839);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df840);
df847 = _mm512_permutexvar_ps(eo56, df847);
df848 = _mm512_permutexvar_ps(eo56, df848);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df847);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df848);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df847);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df848);
df841 = _mm512_permutexvar_ps(eo56, df841);
df842 = _mm512_permutexvar_ps(eo56, df842);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df841);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df842);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df841);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df842);
df849 = _mm512_permutexvar_ps(eo56, df849);
df850 = _mm512_permutexvar_ps(eo56, df850);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df849);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df850);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df849);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df850);
df843 = _mm512_permutexvar_ps(eo56, df843);
df844 = _mm512_permutexvar_ps(eo56, df844);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df843);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df844);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df843);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df844);
df851 = _mm512_permutexvar_ps(eo56, df851);
df852 = _mm512_permutexvar_ps(eo56, df852);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df851);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df852);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df851);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df852);
// df837, df838, df845 and df846 (the registers that went through the extra
// permute/blend/halve step) are stored as-is, without the eo56 permutation.
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df837);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df838);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df837);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df838);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df845);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+256*k109+128*m56+32*f59, 255, df846);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df845);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+256*k109+128*m56+32*f59, 65280, df846);
}
ptrdiff_t b68 = 3;
ptrdiff_t m57 = (size_t)b68/2;
ptrdiff_t f60 = (size_t)b68%2;
__m512 dat1876 = _mm512_maskz_loadu_ps(32767, datPtr18+168+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1877 = _mm512_maskz_loadu_ps(32767, datPtr18+392+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1878 = _mm512_maskz_loadu_ps(32767, datPtr18+616+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1879 = _mm512_maskz_loadu_ps(32767, datPtr18+840+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1880 = _mm512_maskz_loadu_ps(32767, datPtr18+1064+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1881 = _mm512_maskz_loadu_ps(32767, datPtr18+1288+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1882 = _mm512_maskz_loadu_ps(32767, datPtr18+1512+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1883 = _mm512_maskz_loadu_ps(32767, datPtr18+1736+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1884 = _mm512_maskz_loadu_ps(32767, datPtr18+1960+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1885 = _mm512_maskz_loadu_ps(32767, datPtr18+2184+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1886 = _mm512_maskz_loadu_ps(32767, datPtr18+2408+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1887 = _mm512_maskz_loadu_ps(32767, datPtr18+2632+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1888 = _mm512_maskz_loadu_ps(32767, datPtr18+2856+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1889 = _mm512_maskz_loadu_ps(32767, datPtr18+3080+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 dat1890 = _mm512_maskz_loadu_ps(32767, datPtr18+3304+100864*i38+12608*k109+224*h42+4*w51+0*b68);
__m512 fft9409 = _mm512_add_ps(dat1876, dat1884);
__m512 fft9497 = _mm512_add_ps(dat1877, dat1885);
__m512 fft9410 = _mm512_sub_ps(dat1876, dat1884);
__m512 fft9498 = _mm512_sub_ps(dat1877, dat1885);
__m512 fft9411 = _mm512_add_ps(dat1878, dat1886);
__m512 fft9499 = _mm512_add_ps(dat1879, dat1887);
__m512 fft9412 = _mm512_sub_ps(dat1878, dat1886);
__m512 fft9500 = _mm512_sub_ps(dat1879, dat1887);
__m512 fft9413 = _mm512_add_ps(dat1880, dat1888);
__m512 fft9501 = _mm512_add_ps(dat1881, dat1889);
__m512 fft9414 = _mm512_sub_ps(dat1880, dat1888);
__m512 fft9502 = _mm512_sub_ps(dat1881, dat1889);
__m512 fft9415 = _mm512_add_ps(dat1882, dat1890);
__m512 fft9503 = _mm512_add_ps(dat1883, _mm512_setzero_ps());
__m512 fft9416 = _mm512_sub_ps(dat1882, dat1890);
__m512 fft9504 = _mm512_sub_ps(dat1883, _mm512_setzero_ps());
__m512 fft9417 = _mm512_add_ps(fft9409, fft9413);
__m512 fft9505 = _mm512_add_ps(fft9497, fft9501);
__m512 fft9418 = _mm512_sub_ps(fft9409, fft9413);
__m512 fft9506 = _mm512_sub_ps(fft9497, fft9501);
__m512 fft9419 = _mm512_add_ps(fft9411, fft9415);
__m512 fft9507 = _mm512_add_ps(fft9499, fft9503);
__m512 fft9420 = _mm512_sub_ps(fft9415, fft9411);
__m512 fft9508 = _mm512_sub_ps(fft9503, fft9499);
__m512 fft9421 = _mm512_sub_ps(fft9412, fft9416);
__m512 fft9509 = _mm512_sub_ps(fft9500, fft9504);
__m512 fft9422 = _mm512_add_ps(fft9412, fft9416);
__m512 fft9510 = _mm512_add_ps(fft9500, fft9504);
__m512 fft9423 = _mm512_add_ps(fft9417, fft9419);
__m512 fft9511 = _mm512_add_ps(fft9505, fft9507);
__m512 fft9424 = _mm512_sub_ps(fft9417, fft9419);
__m512 fft9512 = _mm512_sub_ps(fft9505, fft9507);
__m512 fft9425 = _mm512_fmadd_ps(fft9421, _mm512_set1_ps(7.0710677e-01f), fft9410);
__m512 fft9513 = _mm512_fmadd_ps(fft9509, _mm512_set1_ps(7.0710677e-01f), fft9498);
__m512 fft9426 = _mm512_fnmsub_ps(fft9422, _mm512_set1_ps(7.0710677e-01f), fft9414);
__m512 fft9514 = _mm512_fnmsub_ps(fft9510, _mm512_set1_ps(7.0710677e-01f), fft9502);
__m512 fft9427 = _mm512_fnmadd_ps(fft9421, _mm512_set1_ps(7.0710677e-01f), fft9410);
__m512 fft9515 = _mm512_fnmadd_ps(fft9509, _mm512_set1_ps(7.0710677e-01f), fft9498);
__m512 fft9428 = _mm512_fnmadd_ps(fft9422, _mm512_set1_ps(7.0710677e-01f), fft9414);
__m512 fft9516 = _mm512_fnmadd_ps(fft9510, _mm512_set1_ps(7.0710677e-01f), fft9502);
__m512 fft9429 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9430 = _mm512_fmadd_ps(fft9423, fft9429, _mm512_shuffle_f32x4(fft9423, fft9423, 78));
__m512 fft9517 = _mm512_fmadd_ps(fft9511, fft9429, _mm512_shuffle_f32x4(fft9511, fft9511, 78));
__m512 fft9431 = _mm512_fmadd_ps(fft9424, fft9429, _mm512_shuffle_f32x4(fft9424, fft9424, 78));
__m512 fft9518 = _mm512_fmadd_ps(fft9512, fft9429, _mm512_shuffle_f32x4(fft9512, fft9512, 78));
__m512 fft9432 = _mm512_fmadd_ps(fft9425, fft9429, _mm512_shuffle_f32x4(fft9425, fft9425, 78));
__m512 fft9519 = _mm512_fmadd_ps(fft9513, fft9429, _mm512_shuffle_f32x4(fft9513, fft9513, 78));
__m512 fft9433 = _mm512_fmadd_ps(fft9426, fft9429, _mm512_shuffle_f32x4(fft9426, fft9426, 78));
__m512 fft9520 = _mm512_fmadd_ps(fft9514, fft9429, _mm512_shuffle_f32x4(fft9514, fft9514, 78));
__m512 fft9434 = _mm512_fmadd_ps(fft9418, fft9429, _mm512_shuffle_f32x4(fft9418, fft9418, 78));
__m512 fft9521 = _mm512_fmadd_ps(fft9506, fft9429, _mm512_shuffle_f32x4(fft9506, fft9506, 78));
__m512 fft9435 = _mm512_fmadd_ps(fft9420, fft9429, _mm512_shuffle_f32x4(fft9420, fft9420, 78));
__m512 fft9522 = _mm512_fmadd_ps(fft9508, fft9429, _mm512_shuffle_f32x4(fft9508, fft9508, 78));
__m512 fft9436 = _mm512_fmadd_ps(fft9427, fft9429, _mm512_shuffle_f32x4(fft9427, fft9427, 78));
__m512 fft9523 = _mm512_fmadd_ps(fft9515, fft9429, _mm512_shuffle_f32x4(fft9515, fft9515, 78));
__m512 fft9437 = _mm512_fmadd_ps(fft9428, fft9429, _mm512_shuffle_f32x4(fft9428, fft9428, 78));
__m512 fft9524 = _mm512_fmadd_ps(fft9516, fft9429, _mm512_shuffle_f32x4(fft9516, fft9516, 78));
__m512 fft9438 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9439 = _mm512_mul_ps(fft9430, fft9438);
__m512 fft9525 = _mm512_mul_ps(fft9517, fft9438);
__m512 fft9440 = _mm512_mul_ps(fft9431, fft9438);
__m512 fft9526 = _mm512_mul_ps(fft9518, fft9438);
__m512 fft9441 = _mm512_mul_ps(fft9432, fft9438);
__m512 fft9527 = _mm512_mul_ps(fft9519, fft9438);
__m512 fft9442 = _mm512_mul_ps(fft9433, fft9438);
__m512 fft9528 = _mm512_mul_ps(fft9520, fft9438);
__m512 fft9443 = _mm512_mul_ps(fft9434, fft9438);
__m512 fft9529 = _mm512_mul_ps(fft9521, fft9438);
__m512 fft9444 = _mm512_mul_ps(fft9435, fft9438);
__m512 fft9530 = _mm512_mul_ps(fft9522, fft9438);
__m512 fft9445 = _mm512_mul_ps(fft9436, fft9438);
__m512 fft9531 = _mm512_mul_ps(fft9523, fft9438);
__m512 fft9446 = _mm512_mul_ps(fft9437, fft9438);
__m512 fft9532 = _mm512_mul_ps(fft9524, fft9438);
__m512 fft9447 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9448 = _mm512_fmadd_ps(fft9431, fft9447, fft9439);
__m512 fft9533 = _mm512_fmadd_ps(fft9518, fft9447, fft9525);
__m512 fft9449 = _mm512_fnmadd_ps(fft9430, fft9447, fft9440);
__m512 fft9534 = _mm512_fnmadd_ps(fft9517, fft9447, fft9526);
__m512 fft9450 = _mm512_fmadd_ps(fft9433, fft9447, fft9441);
__m512 fft9535 = _mm512_fmadd_ps(fft9520, fft9447, fft9527);
__m512 fft9451 = _mm512_fnmadd_ps(fft9432, fft9447, fft9442);
__m512 fft9536 = _mm512_fnmadd_ps(fft9519, fft9447, fft9528);
__m512 fft9452 = _mm512_fmadd_ps(fft9435, fft9447, fft9443);
__m512 fft9537 = _mm512_fmadd_ps(fft9522, fft9447, fft9529);
__m512 fft9453 = _mm512_fnmadd_ps(fft9434, fft9447, fft9444);
__m512 fft9538 = _mm512_fnmadd_ps(fft9521, fft9447, fft9530);
__m512 fft9454 = _mm512_fmadd_ps(fft9437, fft9447, fft9445);
__m512 fft9539 = _mm512_fmadd_ps(fft9524, fft9447, fft9531);
__m512 fft9455 = _mm512_fnmadd_ps(fft9436, fft9447, fft9446);
__m512 fft9540 = _mm512_fnmadd_ps(fft9523, fft9447, fft9532);
__m512 fft9456 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9457 = _mm512_fmadd_ps(fft9448, fft9456, _mm512_shuffle_f32x4(fft9448, fft9448, 177));
__m512 fft9541 = _mm512_fmadd_ps(fft9533, fft9456, _mm512_shuffle_f32x4(fft9533, fft9533, 177));
__m512 fft9458 = _mm512_fmadd_ps(fft9449, fft9456, _mm512_shuffle_f32x4(fft9449, fft9449, 177));
__m512 fft9542 = _mm512_fmadd_ps(fft9534, fft9456, _mm512_shuffle_f32x4(fft9534, fft9534, 177));
__m512 fft9459 = _mm512_fmadd_ps(fft9450, fft9456, _mm512_shuffle_f32x4(fft9450, fft9450, 177));
__m512 fft9543 = _mm512_fmadd_ps(fft9535, fft9456, _mm512_shuffle_f32x4(fft9535, fft9535, 177));
__m512 fft9460 = _mm512_fmadd_ps(fft9451, fft9456, _mm512_shuffle_f32x4(fft9451, fft9451, 177));
__m512 fft9544 = _mm512_fmadd_ps(fft9536, fft9456, _mm512_shuffle_f32x4(fft9536, fft9536, 177));
__m512 fft9461 = _mm512_fmadd_ps(fft9452, fft9456, _mm512_shuffle_f32x4(fft9452, fft9452, 177));
__m512 fft9545 = _mm512_fmadd_ps(fft9537, fft9456, _mm512_shuffle_f32x4(fft9537, fft9537, 177));
__m512 fft9462 = _mm512_fmadd_ps(fft9453, fft9456, _mm512_shuffle_f32x4(fft9453, fft9453, 177));
__m512 fft9546 = _mm512_fmadd_ps(fft9538, fft9456, _mm512_shuffle_f32x4(fft9538, fft9538, 177));
__m512 fft9463 = _mm512_fmadd_ps(fft9454, fft9456, _mm512_shuffle_f32x4(fft9454, fft9454, 177));
__m512 fft9547 = _mm512_fmadd_ps(fft9539, fft9456, _mm512_shuffle_f32x4(fft9539, fft9539, 177));
__m512 fft9464 = _mm512_fmadd_ps(fft9455, fft9456, _mm512_shuffle_f32x4(fft9455, fft9455, 177));
__m512 fft9548 = _mm512_fmadd_ps(fft9540, fft9456, _mm512_shuffle_f32x4(fft9540, fft9540, 177));
__m512 fft9465 = _mm512_mask_mov_ps(fft9457, 49344, fft9458);
__m512 fft9549 = _mm512_mask_mov_ps(fft9541, 49344, fft9542);
__m512 fft9466 = _mm512_mask_sub_ps(fft9458, 49344, _mm512_setzero_ps(), fft9457);
__m512 fft9550 = _mm512_mask_sub_ps(fft9542, 49344, _mm512_setzero_ps(), fft9541);
__m512 fft9467 = _mm512_mask_mov_ps(fft9459, 49344, fft9460);
__m512 fft9551 = _mm512_mask_mov_ps(fft9543, 49344, fft9544);
__m512 fft9468 = _mm512_mask_sub_ps(fft9460, 49344, _mm512_setzero_ps(), fft9459);
__m512 fft9552 = _mm512_mask_sub_ps(fft9544, 49344, _mm512_setzero_ps(), fft9543);
__m512 fft9469 = _mm512_mask_mov_ps(fft9461, 49344, fft9462);
__m512 fft9553 = _mm512_mask_mov_ps(fft9545, 49344, fft9546);
__m512 fft9470 = _mm512_mask_sub_ps(fft9462, 49344, _mm512_setzero_ps(), fft9461);
__m512 fft9554 = _mm512_mask_sub_ps(fft9546, 49344, _mm512_setzero_ps(), fft9545);
__m512 fft9471 = _mm512_mask_mov_ps(fft9463, 49344, fft9464);
__m512 fft9555 = _mm512_mask_mov_ps(fft9547, 49344, fft9548);
__m512 fft9472 = _mm512_mask_sub_ps(fft9464, 49344, _mm512_setzero_ps(), fft9463);
__m512 fft9556 = _mm512_mask_sub_ps(fft9548, 49344, _mm512_setzero_ps(), fft9547);
__m512 fft9473 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9474 = _mm512_fmadd_ps(fft9465, fft9473, _mm512_shuffle_ps(fft9465, fft9465, 78));
__m512 fft9557 = _mm512_fmadd_ps(fft9549, fft9473, _mm512_shuffle_ps(fft9549, fft9549, 78));
__m512 fft9475 = _mm512_fmadd_ps(fft9466, fft9473, _mm512_shuffle_ps(fft9466, fft9466, 78));
__m512 fft9558 = _mm512_fmadd_ps(fft9550, fft9473, _mm512_shuffle_ps(fft9550, fft9550, 78));
__m512 fft9476 = _mm512_fmadd_ps(fft9467, fft9473, _mm512_shuffle_ps(fft9467, fft9467, 78));
__m512 fft9559 = _mm512_fmadd_ps(fft9551, fft9473, _mm512_shuffle_ps(fft9551, fft9551, 78));
__m512 fft9477 = _mm512_fmadd_ps(fft9468, fft9473, _mm512_shuffle_ps(fft9468, fft9468, 78));
__m512 fft9560 = _mm512_fmadd_ps(fft9552, fft9473, _mm512_shuffle_ps(fft9552, fft9552, 78));
__m512 fft9478 = _mm512_fmadd_ps(fft9469, fft9473, _mm512_shuffle_ps(fft9469, fft9469, 78));
__m512 fft9561 = _mm512_fmadd_ps(fft9553, fft9473, _mm512_shuffle_ps(fft9553, fft9553, 78));
__m512 fft9479 = _mm512_fmadd_ps(fft9470, fft9473, _mm512_shuffle_ps(fft9470, fft9470, 78));
__m512 fft9562 = _mm512_fmadd_ps(fft9554, fft9473, _mm512_shuffle_ps(fft9554, fft9554, 78));
__m512 fft9480 = _mm512_fmadd_ps(fft9471, fft9473, _mm512_shuffle_ps(fft9471, fft9471, 78));
__m512 fft9563 = _mm512_fmadd_ps(fft9555, fft9473, _mm512_shuffle_ps(fft9555, fft9555, 78));
__m512 fft9481 = _mm512_fmadd_ps(fft9472, fft9473, _mm512_shuffle_ps(fft9472, fft9472, 78));
__m512 fft9564 = _mm512_fmadd_ps(fft9556, fft9473, _mm512_shuffle_ps(fft9556, fft9556, 78));
__m512i fft9482 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9483 = _mm512_permutexvar_ps(fft9482, fft9474);
__m512 fft9565 = _mm512_permutexvar_ps(fft9482, fft9557);
__m512i fft9484 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9485 = _mm512_permutexvar_ps(fft9484, fft9474);
__m512 fft9566 = _mm512_permutexvar_ps(fft9484, fft9557);
__m512 fft9486 = _mm512_permutexvar_ps(fft9482, fft9475);
__m512 fft9567 = _mm512_permutexvar_ps(fft9482, fft9558);
__m512 fft9487 = _mm512_permutexvar_ps(fft9484, fft9475);
__m512 fft9568 = _mm512_permutexvar_ps(fft9484, fft9558);
__m512 fft9488 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9489 = _mm512_fmadd_ps(fft9483, fft9488, fft9485);
__m512 fft9569 = _mm512_fmadd_ps(fft9565, fft9488, fft9566);
__m512 fft9490 = _mm512_fnmadd_ps(fft9487, fft9488, fft9486);
__m512 fft9570 = _mm512_fnmadd_ps(fft9568, fft9488, fft9567);
__m512 fft9491 = _mm512_mask_mov_ps(fft9487, 21845, fft9489);
__m512 fft9571 = _mm512_mask_mov_ps(fft9568, 21845, fft9569);
__m512 fft9492 = _mm512_mask_mov_ps(fft9483, 43176, fft9489);
__m512 fft9572 = _mm512_mask_mov_ps(fft9565, 43176, fft9569);
__m512 fft9493 = _mm512_mask_mov_ps(fft9491, 43176, fft9490);
__m512 fft9573 = _mm512_mask_mov_ps(fft9571, 43176, fft9570);
__m512 fft9494 = _mm512_mask_mov_ps(fft9492, 22102, fft9490);
__m512 fft9574 = _mm512_mask_mov_ps(fft9572, 22102, fft9570);
__m512 fft9495 = _mm512_mask_mul_ps(fft9493, 64764, fft9493, _mm512_set1_ps(5e-01f));
__m512 fft9575 = _mm512_mask_mul_ps(fft9573, 64764, fft9573, _mm512_set1_ps(5e-01f));
__m512 fft9496 = _mm512_mask_mul_ps(fft9494, 64764, fft9494, _mm512_set1_ps(5e-01f));
__m512 fft9576 = _mm512_mask_mul_ps(fft9574, 64764, fft9574, _mm512_set1_ps(5e-01f));
__m512 df853 = fft9495;
__m512 df861 = fft9575;
__m512 df854 = fft9496;
__m512 df862 = fft9576;
__m512 df855 = fft9476;
__m512 df863 = fft9559;
__m512 df856 = fft9477;
__m512 df864 = fft9560;
__m512 df857 = fft9478;
__m512 df865 = fft9561;
__m512 df858 = fft9479;
__m512 df866 = fft9562;
__m512 df859 = fft9480;
__m512 df867 = fft9563;
__m512 df860 = fft9481;
__m512 df868 = fft9564;
__m512i eo57 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df855 = _mm512_permutexvar_ps(eo57, df855);
df856 = _mm512_permutexvar_ps(eo57, df856);
_mm512_mask_storeu_ps(dfPtr8+8192+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df855);
_mm512_mask_storeu_ps(dfPtr8+8256+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df856);
_mm512_mask_storeu_ps(dfPtr8+1056736+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df855);
_mm512_mask_storeu_ps(dfPtr8+1056800+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df856);
df863 = _mm512_permutexvar_ps(eo57, df863);
df864 = _mm512_permutexvar_ps(eo57, df864);
_mm512_mask_storeu_ps(dfPtr8+2105344+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df863);
_mm512_mask_storeu_ps(dfPtr8+2105408+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df864);
_mm512_mask_storeu_ps(dfPtr8+3153888+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df863);
_mm512_mask_storeu_ps(dfPtr8+3153952+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df864);
df857 = _mm512_permutexvar_ps(eo57, df857);
df858 = _mm512_permutexvar_ps(eo57, df858);
_mm512_mask_storeu_ps(dfPtr8+16384+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df857);
_mm512_mask_storeu_ps(dfPtr8+16448+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df858);
_mm512_mask_storeu_ps(dfPtr8+1064928+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df857);
_mm512_mask_storeu_ps(dfPtr8+1064992+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df858);
df865 = _mm512_permutexvar_ps(eo57, df865);
df866 = _mm512_permutexvar_ps(eo57, df866);
_mm512_mask_storeu_ps(dfPtr8+2113536+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df865);
_mm512_mask_storeu_ps(dfPtr8+2113600+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df866);
_mm512_mask_storeu_ps(dfPtr8+3162080+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df865);
_mm512_mask_storeu_ps(dfPtr8+3162144+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df866);
df859 = _mm512_permutexvar_ps(eo57, df859);
df860 = _mm512_permutexvar_ps(eo57, df860);
_mm512_mask_storeu_ps(dfPtr8+24576+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df859);
_mm512_mask_storeu_ps(dfPtr8+24640+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df860);
_mm512_mask_storeu_ps(dfPtr8+1073120+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df859);
_mm512_mask_storeu_ps(dfPtr8+1073184+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df860);
df867 = _mm512_permutexvar_ps(eo57, df867);
df868 = _mm512_permutexvar_ps(eo57, df868);
_mm512_mask_storeu_ps(dfPtr8+2121728+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df867);
_mm512_mask_storeu_ps(dfPtr8+2121792+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df868);
_mm512_mask_storeu_ps(dfPtr8+3170272+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df867);
_mm512_mask_storeu_ps(dfPtr8+3170336+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df868);
_mm512_mask_storeu_ps(dfPtr8+0+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df853);
_mm512_mask_storeu_ps(dfPtr8+64+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df854);
_mm512_mask_storeu_ps(dfPtr8+1048544+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df853);
_mm512_mask_storeu_ps(dfPtr8+1048608+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df854);
_mm512_mask_storeu_ps(dfPtr8+2097152+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df861);
_mm512_mask_storeu_ps(dfPtr8+2097216+32768*i38+3072*j31+256*k109+128*m57+32*f60, 255, df862);
_mm512_mask_storeu_ps(dfPtr8+3145696+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df861);
_mm512_mask_storeu_ps(dfPtr8+3145760+32768*i38+3072*j31+256*k109+128*m57+32*f60, 65280, df862);
}
++j31;
}

/*
 * Dispatch the "arrange dats" stage to a threader team.
 *
 * Fills in a ResNeXt50ThreaderTask1 descriptor and hands it to
 * ResNeXt50ThreaderDo1:
 *   - callee1: ResNeXt50StriderArrangeDats2Callee1 (the per-task worker)
 *   - any1:    tensors59, forwarded untouched as the opaque payload
 *   - nd1:     4 (number of hull entries populated below)
 *   - hull1:   {1, 1, 32, 1}
 *
 * NOTE(review): the hull entries are presumably per-dimension extents that
 * the threader splits across its workers (so 32 units along index 2) --
 * confirm against ResNeXt50ThreaderDo1.
 *
 * team43    - threader team that executes the task.
 * tensors59 - tensor pointer array passed through to the callee.
 */
static void ResNeXt50StriderArrangeDats2(ResNeXt50ThreaderTeam1* team43, char** tensors59) {
	/* Same four values the descriptor has always carried, in index order. */
	static const int hullExtents[4] = {1, 1, 32, 1};
	ResNeXt50ThreaderTask1 arrangeTask;
	int axis;

	arrangeTask.nd1 = 4;
	for (axis = 0; axis < 4; ++axis) {
		arrangeTask.hull1[axis] = hullExtents[axis];
	}
	arrangeTask.any1 = tensors59;
	arrangeTask.callee1 = ResNeXt50StriderArrangeDats2Callee1;

	ResNeXt50ThreaderDo1(team43, &arrangeTask);
}

static void ResNeXt50StriderProduceSums2Callee1(ResNeXt50ThreaderTask1* task64, int64_t* pt37) {
void** tuple4 = task64->any1;
char** tensors62 = tuple4[0];
ptrdiff_t e19 = 0;
ptrdiff_t z4 = (ptrdiff_t)tuple4[2];
ptrdiff_t g21 = pt37[3];
ptrdiff_t p2 = 0;
ptrdiff_t d12 = 0;
ptrdiff_t w52 = 0;
if (__builtin_expect(!(e19|z4), 0)) {
z4 = 0;
char*restrict bfPtr9 = tensors62[0]+1024*e19;
char*restrict wfPtr9 = tensors62[0]+1024+51904512*e19+262144*z4;
char*restrict dfPtr9 = tensors62[1]+207618048*e19+1048576*z4;
char*restrict sfPtr8 = tensors62[2];
ptrdiff_t i39 = 2*g21;
ptrdiff_t ii25 = i39+1;
for (; i39 <= ii25; ++i39) {
ptrdiff_t j32 = 4*p2;
ptrdiff_t jj36 = j32+3;
if (__builtin_expect(!j32, 0)) {
ptrdiff_t k110 = 3*d12;
for (; k110 != 2; ++k110) {
ptrdiff_t l41 = 2*w52;
for (; l41 != 2; ++l41) {
__m512 sfRe317 = _mm512_setzero_ps();
__m512 sfIm317 = _mm512_setzero_ps();
__m512 sfRe323 = _mm512_setzero_ps();
__m512 sfIm323 = _mm512_setzero_ps();
sfRe317 = _mm512_mask_mov_ps(sfRe317, 1, _mm512_set1_ps(*(float*)(bfPtr9+0+32*i39+16*l41)));
sfRe317 = _mm512_mask_mov_ps(sfRe317, 256, _mm512_set1_ps(*(float*)(bfPtr9+4+32*i39+16*l41)));
sfRe323 = _mm512_mask_mov_ps(sfRe323, 1, _mm512_set1_ps(*(float*)(bfPtr9+8+32*i39+16*l41)));
sfRe323 = _mm512_mask_mov_ps(sfRe323, 256, _mm512_set1_ps(*(float*)(bfPtr9+12+32*i39+16*l41)));
__m512 sfRe318 = sfRe317;
__m512 sfIm318 = sfIm317;
__m512 sfRe319 = sfRe317;
__m512 sfIm319 = sfIm317;
__m512 sfRe320 = sfRe317;
__m512 sfIm320 = sfIm317;
__m512 sfRe321 = sfRe317;
__m512 sfIm321 = sfIm317;
__m512 sfRe322 = sfRe317;
__m512 sfIm322 = sfIm317;
__m512 sfRe324 = sfRe323;
__m512 sfIm324 = sfIm323;
__m512 sfRe325 = sfRe323;
__m512 sfIm325 = sfIm323;
__m512 sfRe326 = sfRe323;
__m512 sfIm326 = sfIm323;
__m512 sfRe327 = sfRe323;
__m512 sfIm327 = sfIm323;
__m512 sfRe328 = sfRe323;
__m512 sfIm328 = sfIm323;
for (ptrdiff_t s27 = 0; s27 < 8; ++s27) {
__m512i wfLd17 = _mm512_loadu_si512(wfPtr9+0+8192*i39+2048*j32+1024*l41+128*s27);
__m512 wfRe17 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd17));
__m512 wfIm17 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd17, 1));
__m512 wfMx9 = _mm512_mask_mov_ps(wfIm17, 64764, wfRe17);
__m512i wfLd18 = _mm512_loadu_si512(wfPtr9+64+8192*i39+2048*j32+1024*l41+128*s27);
__m512 wfRe18 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd18));
__m512 wfIm18 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd18, 1));
__m512 wfMx10 = _mm512_mask_mov_ps(wfIm18, 64764, wfRe18);
__m512 dfRe17 = _mm512_loadu_ps(dfPtr9+0+32768*i39+8192*j32+3072*k110+384*s27);
__m512 dfIm17 = _mm512_loadu_ps(dfPtr9+64+32768*i39+8192*j32+3072*k110+384*s27);
sfRe317 = _mm512_fmadd_ps(wfRe17, dfRe17, sfRe317);
sfRe317 = _mm512_mask3_fmadd_ps(wfIm17, dfIm17, sfRe317, 64764);
sfIm317 = _mm512_fmadd_ps(wfMx9, dfIm17, sfIm317);
sfIm317 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe17, sfIm317, 64764);
sfRe323 = _mm512_fmadd_ps(wfRe18, dfRe17, sfRe323);
sfRe323 = _mm512_mask3_fmadd_ps(wfIm18, dfIm17, sfRe323, 64764);
sfIm323 = _mm512_fmadd_ps(wfMx10, dfIm17, sfIm323);
sfIm323 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe17, sfIm323, 64764);
dfRe17 = _mm512_shuffle_f32x4(dfRe17, dfRe17, 78);
dfIm17 = _mm512_shuffle_f32x4(dfIm17, dfIm17, 78);
sfRe318 = _mm512_fmadd_ps(wfRe17, dfRe17, sfRe318);
sfRe318 = _mm512_mask3_fmadd_ps(wfIm17, dfIm17, sfRe318, 64764);
sfIm318 = _mm512_fmadd_ps(wfMx9, dfIm17, sfIm318);
sfIm318 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe17, sfIm318, 64764);
sfRe324 = _mm512_fmadd_ps(wfRe18, dfRe17, sfRe324);
sfRe324 = _mm512_mask3_fmadd_ps(wfIm18, dfIm17, sfRe324, 64764);
sfIm324 = _mm512_fmadd_ps(wfMx10, dfIm17, sfIm324);
sfIm324 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe17, sfIm324, 64764);
__m512 dfRe18 = _mm512_loadu_ps(dfPtr9+128+32768*i39+8192*j32+3072*k110+384*s27);
__m512 dfIm18 = _mm512_loadu_ps(dfPtr9+192+32768*i39+8192*j32+3072*k110+384*s27);
sfRe319 = _mm512_fmadd_ps(wfRe17, dfRe18, sfRe319);
sfRe319 = _mm512_mask3_fmadd_ps(wfIm17, dfIm18, sfRe319, 64764);
sfIm319 = _mm512_fmadd_ps(wfMx9, dfIm18, sfIm319);
sfIm319 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe18, sfIm319, 64764);
sfRe325 = _mm512_fmadd_ps(wfRe18, dfRe18, sfRe325);
sfRe325 = _mm512_mask3_fmadd_ps(wfIm18, dfIm18, sfRe325, 64764);
sfIm325 = _mm512_fmadd_ps(wfMx10, dfIm18, sfIm325);
sfIm325 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe18, sfIm325, 64764);
dfRe18 = _mm512_shuffle_f32x4(dfRe18, dfRe18, 78);
dfIm18 = _mm512_shuffle_f32x4(dfIm18, dfIm18, 78);
sfRe320 = _mm512_fmadd_ps(wfRe17, dfRe18, sfRe320);
sfRe320 = _mm512_mask3_fmadd_ps(wfIm17, dfIm18, sfRe320, 64764);
sfIm320 = _mm512_fmadd_ps(wfMx9, dfIm18, sfIm320);
sfIm320 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe18, sfIm320, 64764);
sfRe326 = _mm512_fmadd_ps(wfRe18, dfRe18, sfRe326);
sfRe326 = _mm512_mask3_fmadd_ps(wfIm18, dfIm18, sfRe326, 64764);
sfIm326 = _mm512_fmadd_ps(wfMx10, dfIm18, sfIm326);
sfIm326 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe18, sfIm326, 64764);
__m512 dfRe19 = _mm512_loadu_ps(dfPtr9+256+32768*i39+8192*j32+3072*k110+384*s27);
__m512 dfIm19 = _mm512_loadu_ps(dfPtr9+320+32768*i39+8192*j32+3072*k110+384*s27);
sfRe321 = _mm512_fmadd_ps(wfRe17, dfRe19, sfRe321);
sfRe321 = _mm512_mask3_fmadd_ps(wfIm17, dfIm19, sfRe321, 64764);
sfIm321 = _mm512_fmadd_ps(wfMx9, dfIm19, sfIm321);
sfIm321 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe19, sfIm321, 64764);
sfRe327 = _mm512_fmadd_ps(wfRe18, dfRe19, sfRe327);
sfRe327 = _mm512_mask3_fmadd_ps(wfIm18, dfIm19, sfRe327, 64764);
sfIm327 = _mm512_fmadd_ps(wfMx10, dfIm19, sfIm327);
sfIm327 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe19, sfIm327, 64764);
dfRe19 = _mm512_shuffle_f32x4(dfRe19, dfRe19, 78);
dfIm19 = _mm512_shuffle_f32x4(dfIm19, dfIm19, 78);
sfRe322 = _mm512_fmadd_ps(wfRe17, dfRe19, sfRe322);
sfRe322 = _mm512_mask3_fmadd_ps(wfIm17, dfIm19, sfRe322, 64764);
sfIm322 = _mm512_fmadd_ps(wfMx9, dfIm19, sfIm322);
sfIm322 = _mm512_mask3_fnmadd_ps(wfIm17, dfRe19, sfIm322, 64764);
sfRe328 = _mm512_fmadd_ps(wfRe18, dfRe19, sfRe328);
sfRe328 = _mm512_mask3_fmadd_ps(wfIm18, dfIm19, sfRe328, 64764);
sfIm328 = _mm512_fmadd_ps(wfMx10, dfIm19, sfIm328);
sfIm328 = _mm512_mask3_fnmadd_ps(wfIm18, dfRe19, sfIm328, 64764);
}
_mm512_storeu_ps(sfPtr8+0+32768*i39+8192*j32+3072*k110+1536*l41, sfRe317);
_mm512_storeu_ps(sfPtr8+64+32768*i39+8192*j32+3072*k110+1536*l41, sfIm317);
_mm512_storeu_ps(sfPtr8+128+32768*i39+8192*j32+3072*k110+1536*l41, sfRe318);
_mm512_storeu_ps(sfPtr8+192+32768*i39+8192*j32+3072*k110+1536*l41, sfIm318);
_mm512_storeu_ps(sfPtr8+256+32768*i39+8192*j32+3072*k110+1536*l41, sfRe319);
_mm512_storeu_ps(sfPtr8+320+32768*i39+8192*j32+3072*k110+1536*l41, sfIm319);
_mm512_storeu_ps(sfPtr8+384+32768*i39+8192*j32+3072*k110+1536*l41, sfRe320);
_mm512_storeu_ps(sfPtr8+448+32768*i39+8192*j32+3072*k110+1536*l41, sfIm320);
_mm512_storeu_ps(sfPtr8+512+32768*i39+8192*j32+3072*k110+1536*l41, sfRe321);
_mm512_storeu_ps(sfPtr8+576+32768*i39+8192*j32+3072*k110+1536*l41, sfIm321);
_mm512_storeu_ps(sfPtr8+640+32768*i39+8192*j32+3072*k110+1536*l41, sfRe322);
_mm512_storeu_ps(sfPtr8+704+32768*i39+8192*j32+3072*k110+1536*l41, sfIm322);
_mm512_storeu_ps(sfPtr8+768+32768*i39+8192*j32+3072*k110+1536*l41, sfRe323);
_mm512_storeu_ps(sfPtr8+832+32768*i39+8192*j32+3072*k110+1536*l41, sfIm323);
_mm512_storeu_ps(sfPtr8+896+32768*i39+8192*j32+3072*k110+1536*l41, sfRe324);
_mm512_storeu_ps(sfPtr8+960+32768*i39+8192*j32+3072*k110+1536*l41, sfIm324);
_mm512_storeu_ps(sfPtr8+1024+32768*i39+8192*j32+3072*k110+1536*l41, sfRe325);
_mm512_storeu_ps(sfPtr8+1088+32768*i39+8192*j32+3072*k110+1536*l41, sfIm325);
_mm512_storeu_ps(sfPtr8+1152+32768*i39+8192*j32+3072*k110+1536*l41, sfRe326);
_mm512_storeu_ps(sfPtr8+1216+32768*i39+8192*j32+3072*k110+1536*l41, sfIm326);
_mm512_storeu_ps(sfPtr8+1280+32768*i39+8192*j32+3072*k110+1536*l41, sfRe327);
_mm512_storeu_ps(sfPtr8+1344+32768*i39+8192*j32+3072*k110+1536*l41, sfIm327);
_mm512_storeu_ps(sfPtr8+1408+32768*i39+8192*j32+3072*k110+1536*l41, sfRe328);
_mm512_storeu_ps(sfPtr8+1472+32768*i39+8192*j32+3072*k110+1536*l41, sfIm328);
}
}
ptrdiff_t l42 = 2*w52;
for (; l42 != 2; ++l42) {
__m512 sfRe329 = _mm512_setzero_ps();
__m512 sfIm329 = _mm512_setzero_ps();
__m512 sfRe333 = _mm512_setzero_ps();
__m512 sfIm333 = _mm512_setzero_ps();
sfRe329 = _mm512_mask_mov_ps(sfRe329, 1, _mm512_set1_ps(*(float*)(bfPtr9+0+32*i39+16*l42)));
sfRe329 = _mm512_mask_mov_ps(sfRe329, 256, _mm512_set1_ps(*(float*)(bfPtr9+4+32*i39+16*l42)));
sfRe333 = _mm512_mask_mov_ps(sfRe333, 1, _mm512_set1_ps(*(float*)(bfPtr9+8+32*i39+16*l42)));
sfRe333 = _mm512_mask_mov_ps(sfRe333, 256, _mm512_set1_ps(*(float*)(bfPtr9+12+32*i39+16*l42)));
__m512 sfRe330 = sfRe329;
__m512 sfIm330 = sfIm329;
__m512 sfRe331 = sfRe329;
__m512 sfIm331 = sfIm329;
__m512 sfRe332 = sfRe329;
__m512 sfIm332 = sfIm329;
__m512 sfRe334 = sfRe333;
__m512 sfIm334 = sfIm333;
__m512 sfRe335 = sfRe333;
__m512 sfIm335 = sfIm333;
__m512 sfRe336 = sfRe333;
__m512 sfIm336 = sfIm333;
for (ptrdiff_t s28 = 0; s28 < 8; ++s28) {
__m512i wfLd19 = _mm512_loadu_si512(wfPtr9+0+8192*i39+2048*j32+1024*l42+128*s28);
__m512 wfRe19 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd19));
__m512 wfIm19 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd19, 1));
__m512 wfMx11 = _mm512_mask_mov_ps(wfIm19, 64764, wfRe19);
__m512i wfLd20 = _mm512_loadu_si512(wfPtr9+64+8192*i39+2048*j32+1024*l42+128*s28);
__m512 wfRe20 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd20));
__m512 wfIm20 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd20, 1));
__m512 wfMx12 = _mm512_mask_mov_ps(wfIm20, 64764, wfRe20);
__m512 dfRe20 = _mm512_loadu_ps(dfPtr9+0+32768*i39+8192*j32+3072*k110+256*s28);
__m512 dfIm20 = _mm512_loadu_ps(dfPtr9+64+32768*i39+8192*j32+3072*k110+256*s28);
sfRe329 = _mm512_fmadd_ps(wfRe19, dfRe20, sfRe329);
sfRe329 = _mm512_mask3_fmadd_ps(wfIm19, dfIm20, sfRe329, 64764);
sfIm329 = _mm512_fmadd_ps(wfMx11, dfIm20, sfIm329);
sfIm329 = _mm512_mask3_fnmadd_ps(wfIm19, dfRe20, sfIm329, 64764);
sfRe333 = _mm512_fmadd_ps(wfRe20, dfRe20, sfRe333);
sfRe333 = _mm512_mask3_fmadd_ps(wfIm20, dfIm20, sfRe333, 64764);
sfIm333 = _mm512_fmadd_ps(wfMx12, dfIm20, sfIm333);
sfIm333 = _mm512_mask3_fnmadd_ps(wfIm20, dfRe20, sfIm333, 64764);
dfRe20 = _mm512_shuffle_f32x4(dfRe20, dfRe20, 78);
dfIm20 = _mm512_shuffle_f32x4(dfIm20, dfIm20, 78);
sfRe330 = _mm512_fmadd_ps(wfRe19, dfRe20, sfRe330);
sfRe330 = _mm512_mask3_fmadd_ps(wfIm19, dfIm20, sfRe330, 64764);
sfIm330 = _mm512_fmadd_ps(wfMx11, dfIm20, sfIm330);
sfIm330 = _mm512_mask3_fnmadd_ps(wfIm19, dfRe20, sfIm330, 64764);
sfRe334 = _mm512_fmadd_ps(wfRe20, dfRe20, sfRe334);
sfRe334 = _mm512_mask3_fmadd_ps(wfIm20, dfIm20, sfRe334, 64764);
sfIm334 = _mm512_fmadd_ps(wfMx12, dfIm20, sfIm334);
sfIm334 = _mm512_mask3_fnmadd_ps(wfIm20, dfRe20, sfIm334, 64764);
__m512 dfRe21 = _mm512_loadu_ps(dfPtr9+128+32768*i39+8192*j32+3072*k110+256*s28);
__m512 dfIm21 = _mm512_loadu_ps(dfPtr9+192+32768*i39+8192*j32+3072*k110+256*s28);
sfRe331 = _mm512_fmadd_ps(wfRe19, dfRe21, sfRe331);
sfRe331 = _mm512_mask3_fmadd_ps(wfIm19, dfIm21, sfRe331, 64764);
sfIm331 = _mm512_fmadd_ps(wfMx11, dfIm21, sfIm331);
sfIm331 = _mm512_mask3_fnmadd_ps(wfIm19, dfRe21, sfIm331, 64764);
sfRe335 = _mm512_fmadd_ps(wfRe20, dfRe21, sfRe335);
sfRe335 = _mm512_mask3_fmadd_ps(wfIm20, dfIm21, sfRe335, 64764);
sfIm335 = _mm512_fmadd_ps(wfMx12, dfIm21, sfIm335);
sfIm335 = _mm512_mask3_fnmadd_ps(wfIm20, dfRe21, sfIm335, 64764);
dfRe21 = _mm512_shuffle_f32x4(dfRe21, dfRe21, 78);
dfIm21 = _mm512_shuffle_f32x4(dfIm21, dfIm21, 78);
sfRe332 = _mm512_fmadd_ps(wfRe19, dfRe21, sfRe332);
sfRe332 = _mm512_mask3_fmadd_ps(wfIm19, dfIm21, sfRe332, 64764);
sfIm332 = _mm512_fmadd_ps(wfMx11, dfIm21, sfIm332);
sfIm332 = _mm512_mask3_fnmadd_ps(wfIm19, dfRe21, sfIm332, 64764);
sfRe336 = _mm512_fmadd_ps(wfRe20, dfRe21, sfRe336);
sfRe336 = _mm512_mask3_fmadd_ps(wfIm20, dfIm21, sfRe336, 64764);
sfIm336 = _mm512_fmadd_ps(wfMx12, dfIm21, sfIm336);
sfIm336 = _mm512_mask3_fnmadd_ps(wfIm20, dfRe21, sfIm336, 64764);
}
_mm512_storeu_ps(sfPtr8+0+32768*i39+8192*j32+3072*k110+1024*l42, sfRe329);
_mm512_storeu_ps(sfPtr8+64+32768*i39+8192*j32+3072*k110+1024*l42, sfIm329);
_mm512_storeu_ps(sfPtr8+128+32768*i39+8192*j32+3072*k110+1024*l42, sfRe330);
_mm512_storeu_ps(sfPtr8+192+32768*i39+8192*j32+3072*k110+1024*l42, sfIm330);
_mm512_storeu_ps(sfPtr8+256+32768*i39+8192*j32+3072*k110+1024*l42, sfRe331);
_mm512_storeu_ps(sfPtr8+320+32768*i39+8192*j32+3072*k110+1024*l42, sfIm331);
_mm512_storeu_ps(sfPtr8+384+32768*i39+8192*j32+3072*k110+1024*l42, sfRe332);
_mm512_storeu_ps(sfPtr8+448+32768*i39+8192*j32+3072*k110+1024*l42, sfIm332);
_mm512_storeu_ps(sfPtr8+512+32768*i39+8192*j32+3072*k110+1024*l42, sfRe333);
_mm512_storeu_ps(sfPtr8+576+32768*i39+8192*j32+3072*k110+1024*l42, sfIm333);
_mm512_storeu_ps(sfPtr8+640+32768*i39+8192*j32+3072*k110+1024*l42, sfRe334);
_mm512_storeu_ps(sfPtr8+704+32768*i39+8192*j32+3072*k110+1024*l42, sfIm334);
_mm512_storeu_ps(sfPtr8+768+32768*i39+8192*j32+3072*k110+1024*l42, sfRe335);
_mm512_storeu_ps(sfPtr8+832+32768*i39+8192*j32+3072*k110+1024*l42, sfIm335);
_mm512_storeu_ps(sfPtr8+896+32768*i39+8192*j32+3072*k110+1024*l42, sfRe336);
_mm512_storeu_ps(sfPtr8+960+32768*i39+8192*j32+3072*k110+1024*l42, sfIm336);
}
j32 = 1;
}
for (; j32 <= jj36; ++j32) {
ptrdiff_t k111 = 3*d12;
for (; k111 != 2; ++k111) {
ptrdiff_t l43 = 2*w52;
for (; l43 != 2; ++l43) {
__m512 sfRe337 = _mm512_setzero_ps();
__m512 sfIm337 = _mm512_setzero_ps();
__m512 sfRe343 = _mm512_setzero_ps();
__m512 sfIm343 = _mm512_setzero_ps();
(void)bfPtr9;
__m512 sfRe338 = sfRe337;
__m512 sfIm338 = sfIm337;
__m512 sfRe339 = sfRe337;
__m512 sfIm339 = sfIm337;
__m512 sfRe340 = sfRe337;
__m512 sfIm340 = sfIm337;
__m512 sfRe341 = sfRe337;
__m512 sfIm341 = sfIm337;
__m512 sfRe342 = sfRe337;
__m512 sfIm342 = sfIm337;
__m512 sfRe344 = sfRe343;
__m512 sfIm344 = sfIm343;
__m512 sfRe345 = sfRe343;
__m512 sfIm345 = sfIm343;
__m512 sfRe346 = sfRe343;
__m512 sfIm346 = sfIm343;
__m512 sfRe347 = sfRe343;
__m512 sfIm347 = sfIm343;
__m512 sfRe348 = sfRe343;
__m512 sfIm348 = sfIm343;
for (ptrdiff_t s29 = 0; s29 < 8; ++s29) {
__m512i wfLd21 = _mm512_loadu_si512(wfPtr9+0+8192*i39+2048*j32+1024*l43+128*s29);
__m512 wfRe21 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd21));
__m512 wfIm21 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd21, 1));
__m512i wfLd22 = _mm512_loadu_si512(wfPtr9+64+8192*i39+2048*j32+1024*l43+128*s29);
__m512 wfRe22 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd22));
__m512 wfIm22 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd22, 1));
__m512 dfRe22 = _mm512_loadu_ps(dfPtr9+0+32768*i39+8192*j32+3072*k111+384*s29);
__m512 dfIm22 = _mm512_loadu_ps(dfPtr9+64+32768*i39+8192*j32+3072*k111+384*s29);
sfRe337 = _mm512_fmadd_ps(wfRe21, dfRe22, sfRe337);
sfRe337 = _mm512_fmadd_ps(wfIm21, dfIm22, sfRe337);
sfIm337 = _mm512_fmadd_ps(wfRe21, dfIm22, sfIm337);
sfIm337 = _mm512_fnmadd_ps(wfIm21, dfRe22, sfIm337);
sfRe343 = _mm512_fmadd_ps(wfRe22, dfRe22, sfRe343);
sfRe343 = _mm512_fmadd_ps(wfIm22, dfIm22, sfRe343);
sfIm343 = _mm512_fmadd_ps(wfRe22, dfIm22, sfIm343);
sfIm343 = _mm512_fnmadd_ps(wfIm22, dfRe22, sfIm343);
dfRe22 = _mm512_shuffle_f32x4(dfRe22, dfRe22, 78);
dfIm22 = _mm512_shuffle_f32x4(dfIm22, dfIm22, 78);
sfRe338 = _mm512_fmadd_ps(wfRe21, dfRe22, sfRe338);
sfRe338 = _mm512_fmadd_ps(wfIm21, dfIm22, sfRe338);
sfIm338 = _mm512_fmadd_ps(wfRe21, dfIm22, sfIm338);
sfIm338 = _mm512_fnmadd_ps(wfIm21, dfRe22, sfIm338);
sfRe344 = _mm512_fmadd_ps(wfRe22, dfRe22, sfRe344);
sfRe344 = _mm512_fmadd_ps(wfIm22, dfIm22, sfRe344);
sfIm344 = _mm512_fmadd_ps(wfRe22, dfIm22, sfIm344);
sfIm344 = _mm512_fnmadd_ps(wfIm22, dfRe22, sfIm344);
__m512 dfRe23 = _mm512_loadu_ps(dfPtr9+128+32768*i39+8192*j32+3072*k111+384*s29);
__m512 dfIm23 = _mm512_loadu_ps(dfPtr9+192+32768*i39+8192*j32+3072*k111+384*s29);
sfRe339 = _mm512_fmadd_ps(wfRe21, dfRe23, sfRe339);
sfRe339 = _mm512_fmadd_ps(wfIm21, dfIm23, sfRe339);
sfIm339 = _mm512_fmadd_ps(wfRe21, dfIm23, sfIm339);
sfIm339 = _mm512_fnmadd_ps(wfIm21, dfRe23, sfIm339);
sfRe345 = _mm512_fmadd_ps(wfRe22, dfRe23, sfRe345);
sfRe345 = _mm512_fmadd_ps(wfIm22, dfIm23, sfRe345);
sfIm345 = _mm512_fmadd_ps(wfRe22, dfIm23, sfIm345);
sfIm345 = _mm512_fnmadd_ps(wfIm22, dfRe23, sfIm345);
dfRe23 = _mm512_shuffle_f32x4(dfRe23, dfRe23, 78);
dfIm23 = _mm512_shuffle_f32x4(dfIm23, dfIm23, 78);
sfRe340 = _mm512_fmadd_ps(wfRe21, dfRe23, sfRe340);
sfRe340 = _mm512_fmadd_ps(wfIm21, dfIm23, sfRe340);
sfIm340 = _mm512_fmadd_ps(wfRe21, dfIm23, sfIm340);
sfIm340 = _mm512_fnmadd_ps(wfIm21, dfRe23, sfIm340);
sfRe346 = _mm512_fmadd_ps(wfRe22, dfRe23, sfRe346);
sfRe346 = _mm512_fmadd_ps(wfIm22, dfIm23, sfRe346);
sfIm346 = _mm512_fmadd_ps(wfRe22, dfIm23, sfIm346);
sfIm346 = _mm512_fnmadd_ps(wfIm22, dfRe23, sfIm346);
__m512 dfRe24 = _mm512_loadu_ps(dfPtr9+256+32768*i39+8192*j32+3072*k111+384*s29);
__m512 dfIm24 = _mm512_loadu_ps(dfPtr9+320+32768*i39+8192*j32+3072*k111+384*s29);
sfRe341 = _mm512_fmadd_ps(wfRe21, dfRe24, sfRe341);
sfRe341 = _mm512_fmadd_ps(wfIm21, dfIm24, sfRe341);
sfIm341 = _mm512_fmadd_ps(wfRe21, dfIm24, sfIm341);
sfIm341 = _mm512_fnmadd_ps(wfIm21, dfRe24, sfIm341);
sfRe347 = _mm512_fmadd_ps(wfRe22, dfRe24, sfRe347);
sfRe347 = _mm512_fmadd_ps(wfIm22, dfIm24, sfRe347);
sfIm347 = _mm512_fmadd_ps(wfRe22, dfIm24, sfIm347);
sfIm347 = _mm512_fnmadd_ps(wfIm22, dfRe24, sfIm347);
dfRe24 = _mm512_shuffle_f32x4(dfRe24, dfRe24, 78);
dfIm24 = _mm512_shuffle_f32x4(dfIm24, dfIm24, 78);
sfRe342 = _mm512_fmadd_ps(wfRe21, dfRe24, sfRe342);
sfRe342 = _mm512_fmadd_ps(wfIm21, dfIm24, sfRe342);
sfIm342 = _mm512_fmadd_ps(wfRe21, dfIm24, sfIm342);
sfIm342 = _mm512_fnmadd_ps(wfIm21, dfRe24, sfIm342);
sfRe348 = _mm512_fmadd_ps(wfRe22, dfRe24, sfRe348);
sfRe348 = _mm512_fmadd_ps(wfIm22, dfIm24, sfRe348);
sfIm348 = _mm512_fmadd_ps(wfRe22, dfIm24, sfIm348);
sfIm348 = _mm512_fnmadd_ps(wfIm22, dfRe24, sfIm348);
}
_mm512_storeu_ps(sfPtr8+0+32768*i39+8192*j32+3072*k111+1536*l43, sfRe337);
_mm512_storeu_ps(sfPtr8+64+32768*i39+8192*j32+3072*k111+1536*l43, sfIm337);
_mm512_storeu_ps(sfPtr8+128+32768*i39+8192*j32+3072*k111+1536*l43, sfRe338);
_mm512_storeu_ps(sfPtr8+192+32768*i39+8192*j32+3072*k111+1536*l43, sfIm338);
_mm512_storeu_ps(sfPtr8+256+32768*i39+8192*j32+3072*k111+1536*l43, sfRe339);
_mm512_storeu_ps(sfPtr8+320+32768*i39+8192*j32+3072*k111+1536*l43, sfIm339);
_mm512_storeu_ps(sfPtr8+384+32768*i39+8192*j32+3072*k111+1536*l43, sfRe340);
_mm512_storeu_ps(sfPtr8+448+32768*i39+8192*j32+3072*k111+1536*l43, sfIm340);
_mm512_storeu_ps(sfPtr8+512+32768*i39+8192*j32+3072*k111+1536*l43, sfRe341);
_mm512_storeu_ps(sfPtr8+576+32768*i39+8192*j32+3072*k111+1536*l43, sfIm341);
_mm512_storeu_ps(sfPtr8+640+32768*i39+8192*j32+3072*k111+1536*l43, sfRe342);
_mm512_storeu_ps(sfPtr8+704+32768*i39+8192*j32+3072*k111+1536*l43, sfIm342);
_mm512_storeu_ps(sfPtr8+768+32768*i39+8192*j32+3072*k111+1536*l43, sfRe343);
_mm512_storeu_ps(sfPtr8+832+32768*i39+8192*j32+3072*k111+1536*l43, sfIm343);
_mm512_storeu_ps(sfPtr8+896+32768*i39+8192*j32+3072*k111+1536*l43, sfRe344);
_mm512_storeu_ps(sfPtr8+960+32768*i39+8192*j32+3072*k111+1536*l43, sfIm344);
_mm512_storeu_ps(sfPtr8+1024+32768*i39+8192*j32+3072*k111+1536*l43, sfRe345);
_mm512_storeu_ps(sfPtr8+1088+32768*i39+8192*j32+3072*k111+1536*l43, sfIm345);
_mm512_storeu_ps(sfPtr8+1152+32768*i39+8192*j32+3072*k111+1536*l43, sfRe346);
_mm512_storeu_ps(sfPtr8+1216+32768*i39+8192*j32+3072*k111+1536*l43, sfIm346);
_mm512_storeu_ps(sfPtr8+1280+32768*i39+8192*j32+3072*k111+1536*l43, sfRe347);
_mm512_storeu_ps(sfPtr8+1344+32768*i39+8192*j32+3072*k111+1536*l43, sfIm347);
_mm512_storeu_ps(sfPtr8+1408+32768*i39+8192*j32+3072*k111+1536*l43, sfRe348);
_mm512_storeu_ps(sfPtr8+1472+32768*i39+8192*j32+3072*k111+1536*l43, sfIm348);
}
}
ptrdiff_t l44 = 2*w52;
for (; l44 != 2; ++l44) {
__m512 sfRe349 = _mm512_setzero_ps();
__m512 sfIm349 = _mm512_setzero_ps();
__m512 sfRe353 = _mm512_setzero_ps();
__m512 sfIm353 = _mm512_setzero_ps();
(void)bfPtr9;
__m512 sfRe350 = sfRe349;
__m512 sfIm350 = sfIm349;
__m512 sfRe351 = sfRe349;
__m512 sfIm351 = sfIm349;
__m512 sfRe352 = sfRe349;
__m512 sfIm352 = sfIm349;
__m512 sfRe354 = sfRe353;
__m512 sfIm354 = sfIm353;
__m512 sfRe355 = sfRe353;
__m512 sfIm355 = sfIm353;
__m512 sfRe356 = sfRe353;
__m512 sfIm356 = sfIm353;
for (ptrdiff_t s30 = 0; s30 < 8; ++s30) {
__m512i wfLd23 = _mm512_loadu_si512(wfPtr9+0+8192*i39+2048*j32+1024*l44+128*s30);
__m512 wfRe23 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd23));
__m512 wfIm23 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd23, 1));
__m512i wfLd24 = _mm512_loadu_si512(wfPtr9+64+8192*i39+2048*j32+1024*l44+128*s30);
__m512 wfRe24 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd24));
__m512 wfIm24 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd24, 1));
__m512 dfRe25 = _mm512_loadu_ps(dfPtr9+0+32768*i39+8192*j32+3072*k111+256*s30);
__m512 dfIm25 = _mm512_loadu_ps(dfPtr9+64+32768*i39+8192*j32+3072*k111+256*s30);
sfRe349 = _mm512_fmadd_ps(wfRe23, dfRe25, sfRe349);
sfRe349 = _mm512_fmadd_ps(wfIm23, dfIm25, sfRe349);
sfIm349 = _mm512_fmadd_ps(wfRe23, dfIm25, sfIm349);
sfIm349 = _mm512_fnmadd_ps(wfIm23, dfRe25, sfIm349);
sfRe353 = _mm512_fmadd_ps(wfRe24, dfRe25, sfRe353);
sfRe353 = _mm512_fmadd_ps(wfIm24, dfIm25, sfRe353);
sfIm353 = _mm512_fmadd_ps(wfRe24, dfIm25, sfIm353);
sfIm353 = _mm512_fnmadd_ps(wfIm24, dfRe25, sfIm353);
dfRe25 = _mm512_shuffle_f32x4(dfRe25, dfRe25, 78);
dfIm25 = _mm512_shuffle_f32x4(dfIm25, dfIm25, 78);
sfRe350 = _mm512_fmadd_ps(wfRe23, dfRe25, sfRe350);
sfRe350 = _mm512_fmadd_ps(wfIm23, dfIm25, sfRe350);
sfIm350 = _mm512_fmadd_ps(wfRe23, dfIm25, sfIm350);
sfIm350 = _mm512_fnmadd_ps(wfIm23, dfRe25, sfIm350);
sfRe354 = _mm512_fmadd_ps(wfRe24, dfRe25, sfRe354);
sfRe354 = _mm512_fmadd_ps(wfIm24, dfIm25, sfRe354);
sfIm354 = _mm512_fmadd_ps(wfRe24, dfIm25, sfIm354);
sfIm354 = _mm512_fnmadd_ps(wfIm24, dfRe25, sfIm354);
__m512 dfRe26 = _mm512_loadu_ps(dfPtr9+128+32768*i39+8192*j32+3072*k111+256*s30);
__m512 dfIm26 = _mm512_loadu_ps(dfPtr9+192+32768*i39+8192*j32+3072*k111+256*s30);
sfRe351 = _mm512_fmadd_ps(wfRe23, dfRe26, sfRe351);
sfRe351 = _mm512_fmadd_ps(wfIm23, dfIm26, sfRe351);
sfIm351 = _mm512_fmadd_ps(wfRe23, dfIm26, sfIm351);
sfIm351 = _mm512_fnmadd_ps(wfIm23, dfRe26, sfIm351);
sfRe355 = _mm512_fmadd_ps(wfRe24, dfRe26, sfRe355);
sfRe355 = _mm512_fmadd_ps(wfIm24, dfIm26, sfRe355);
sfIm355 = _mm512_fmadd_ps(wfRe24, dfIm26, sfIm355);
sfIm355 = _mm512_fnmadd_ps(wfIm24, dfRe26, sfIm355);
dfRe26 = _mm512_shuffle_f32x4(dfRe26, dfRe26, 78);
dfIm26 = _mm512_shuffle_f32x4(dfIm26, dfIm26, 78);
sfRe352 = _mm512_fmadd_ps(wfRe23, dfRe26, sfRe352);
sfRe352 = _mm512_fmadd_ps(wfIm23, dfIm26, sfRe352);
sfIm352 = _mm512_fmadd_ps(wfRe23, dfIm26, sfIm352);
sfIm352 = _mm512_fnmadd_ps(wfIm23, dfRe26, sfIm352);
sfRe356 = _mm512_fmadd_ps(wfRe24, dfRe26, sfRe356);
sfRe356 = _mm512_fmadd_ps(wfIm24, dfIm26, sfRe356);
sfIm356 = _mm512_fmadd_ps(wfRe24, dfIm26, sfIm356);
sfIm356 = _mm512_fnmadd_ps(wfIm24, dfRe26, sfIm356);
}
_mm512_storeu_ps(sfPtr8+0+32768*i39+8192*j32+3072*k111+1024*l44, sfRe349);
_mm512_storeu_ps(sfPtr8+64+32768*i39+8192*j32+3072*k111+1024*l44, sfIm349);
_mm512_storeu_ps(sfPtr8+128+32768*i39+8192*j32+3072*k111+1024*l44, sfRe350);
_mm512_storeu_ps(sfPtr8+192+32768*i39+8192*j32+3072*k111+1024*l44, sfIm350);
_mm512_storeu_ps(sfPtr8+256+32768*i39+8192*j32+3072*k111+1024*l44, sfRe351);
_mm512_storeu_ps(sfPtr8+320+32768*i39+8192*j32+3072*k111+1024*l44, sfIm351);
_mm512_storeu_ps(sfPtr8+384+32768*i39+8192*j32+3072*k111+1024*l44, sfRe352);
_mm512_storeu_ps(sfPtr8+448+32768*i39+8192*j32+3072*k111+1024*l44, sfIm352);
_mm512_storeu_ps(sfPtr8+512+32768*i39+8192*j32+3072*k111+1024*l44, sfRe353);
_mm512_storeu_ps(sfPtr8+576+32768*i39+8192*j32+3072*k111+1024*l44, sfIm353);
_mm512_storeu_ps(sfPtr8+640+32768*i39+8192*j32+3072*k111+1024*l44, sfRe354);
_mm512_storeu_ps(sfPtr8+704+32768*i39+8192*j32+3072*k111+1024*l44, sfIm354);
_mm512_storeu_ps(sfPtr8+768+32768*i39+8192*j32+3072*k111+1024*l44, sfRe355);
_mm512_storeu_ps(sfPtr8+832+32768*i39+8192*j32+3072*k111+1024*l44, sfIm355);
_mm512_storeu_ps(sfPtr8+896+32768*i39+8192*j32+3072*k111+1024*l44, sfRe356);
_mm512_storeu_ps(sfPtr8+960+32768*i39+8192*j32+3072*k111+1024*l44, sfIm356);
}
}
}
return;
}
char*restrict bfPtr10 = tensors62[0]+1024*e19;
char*restrict wfPtr10 = tensors62[0]+1024+51904512*e19+262144*z4;
char*restrict dfPtr10 = tensors62[1]+207618048*e19+1048576*z4;
char*restrict sfPtr9 = tensors62[2];
ptrdiff_t i40 = 2*g21;
ptrdiff_t ii26 = i40+1;
for (; i40 <= ii26; ++i40) {
ptrdiff_t j33 = 4*p2;
ptrdiff_t jj37 = j33+3;
if (__builtin_expect(!j33, 0)) {
ptrdiff_t k112 = 3*d12;
for (; k112 != 2; ++k112) {
ptrdiff_t l45 = 2*w52;
for (; l45 != 2; ++l45) {
__m512 sfRe357 = _mm512_setzero_ps();
__m512 sfIm357 = _mm512_setzero_ps();
__m512 sfRe363 = _mm512_setzero_ps();
__m512 sfIm363 = _mm512_setzero_ps();
(void)bfPtr10;
__m512 sfRe358 = sfRe357;
__m512 sfIm358 = sfIm357;
__m512 sfRe359 = sfRe357;
__m512 sfIm359 = sfIm357;
__m512 sfRe360 = sfRe357;
__m512 sfIm360 = sfIm357;
__m512 sfRe361 = sfRe357;
__m512 sfIm361 = sfIm357;
__m512 sfRe362 = sfRe357;
__m512 sfIm362 = sfIm357;
__m512 sfRe364 = sfRe363;
__m512 sfIm364 = sfIm363;
__m512 sfRe365 = sfRe363;
__m512 sfIm365 = sfIm363;
__m512 sfRe366 = sfRe363;
__m512 sfIm366 = sfIm363;
__m512 sfRe367 = sfRe363;
__m512 sfIm367 = sfIm363;
__m512 sfRe368 = sfRe363;
__m512 sfIm368 = sfIm363;
for (ptrdiff_t s31 = 0; s31 < 8; ++s31) {
__m512i wfLd25 = _mm512_loadu_si512(wfPtr10+0+8192*i40+2048*j33+1024*l45+128*s31);
__m512 wfRe25 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd25));
__m512 wfIm25 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd25, 1));
__m512 wfMx13 = _mm512_mask_mov_ps(wfIm25, 64764, wfRe25);
__m512i wfLd26 = _mm512_loadu_si512(wfPtr10+64+8192*i40+2048*j33+1024*l45+128*s31);
__m512 wfRe26 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd26));
__m512 wfIm26 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd26, 1));
__m512 wfMx14 = _mm512_mask_mov_ps(wfIm26, 64764, wfRe26);
__m512 dfRe27 = _mm512_loadu_ps(dfPtr10+0+32768*i40+8192*j33+3072*k112+384*s31);
__m512 dfIm27 = _mm512_loadu_ps(dfPtr10+64+32768*i40+8192*j33+3072*k112+384*s31);
sfRe357 = _mm512_fmadd_ps(wfRe25, dfRe27, sfRe357);
sfRe357 = _mm512_mask3_fmadd_ps(wfIm25, dfIm27, sfRe357, 64764);
sfIm357 = _mm512_fmadd_ps(wfMx13, dfIm27, sfIm357);
sfIm357 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe27, sfIm357, 64764);
sfRe363 = _mm512_fmadd_ps(wfRe26, dfRe27, sfRe363);
sfRe363 = _mm512_mask3_fmadd_ps(wfIm26, dfIm27, sfRe363, 64764);
sfIm363 = _mm512_fmadd_ps(wfMx14, dfIm27, sfIm363);
sfIm363 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe27, sfIm363, 64764);
dfRe27 = _mm512_shuffle_f32x4(dfRe27, dfRe27, 78);
dfIm27 = _mm512_shuffle_f32x4(dfIm27, dfIm27, 78);
sfRe358 = _mm512_fmadd_ps(wfRe25, dfRe27, sfRe358);
sfRe358 = _mm512_mask3_fmadd_ps(wfIm25, dfIm27, sfRe358, 64764);
sfIm358 = _mm512_fmadd_ps(wfMx13, dfIm27, sfIm358);
sfIm358 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe27, sfIm358, 64764);
sfRe364 = _mm512_fmadd_ps(wfRe26, dfRe27, sfRe364);
sfRe364 = _mm512_mask3_fmadd_ps(wfIm26, dfIm27, sfRe364, 64764);
sfIm364 = _mm512_fmadd_ps(wfMx14, dfIm27, sfIm364);
sfIm364 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe27, sfIm364, 64764);
__m512 dfRe28 = _mm512_loadu_ps(dfPtr10+128+32768*i40+8192*j33+3072*k112+384*s31);
__m512 dfIm28 = _mm512_loadu_ps(dfPtr10+192+32768*i40+8192*j33+3072*k112+384*s31);
sfRe359 = _mm512_fmadd_ps(wfRe25, dfRe28, sfRe359);
sfRe359 = _mm512_mask3_fmadd_ps(wfIm25, dfIm28, sfRe359, 64764);
sfIm359 = _mm512_fmadd_ps(wfMx13, dfIm28, sfIm359);
sfIm359 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe28, sfIm359, 64764);
sfRe365 = _mm512_fmadd_ps(wfRe26, dfRe28, sfRe365);
sfRe365 = _mm512_mask3_fmadd_ps(wfIm26, dfIm28, sfRe365, 64764);
sfIm365 = _mm512_fmadd_ps(wfMx14, dfIm28, sfIm365);
sfIm365 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe28, sfIm365, 64764);
dfRe28 = _mm512_shuffle_f32x4(dfRe28, dfRe28, 78);
dfIm28 = _mm512_shuffle_f32x4(dfIm28, dfIm28, 78);
sfRe360 = _mm512_fmadd_ps(wfRe25, dfRe28, sfRe360);
sfRe360 = _mm512_mask3_fmadd_ps(wfIm25, dfIm28, sfRe360, 64764);
sfIm360 = _mm512_fmadd_ps(wfMx13, dfIm28, sfIm360);
sfIm360 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe28, sfIm360, 64764);
sfRe366 = _mm512_fmadd_ps(wfRe26, dfRe28, sfRe366);
sfRe366 = _mm512_mask3_fmadd_ps(wfIm26, dfIm28, sfRe366, 64764);
sfIm366 = _mm512_fmadd_ps(wfMx14, dfIm28, sfIm366);
sfIm366 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe28, sfIm366, 64764);
__m512 dfRe29 = _mm512_loadu_ps(dfPtr10+256+32768*i40+8192*j33+3072*k112+384*s31);
__m512 dfIm29 = _mm512_loadu_ps(dfPtr10+320+32768*i40+8192*j33+3072*k112+384*s31);
sfRe361 = _mm512_fmadd_ps(wfRe25, dfRe29, sfRe361);
sfRe361 = _mm512_mask3_fmadd_ps(wfIm25, dfIm29, sfRe361, 64764);
sfIm361 = _mm512_fmadd_ps(wfMx13, dfIm29, sfIm361);
sfIm361 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe29, sfIm361, 64764);
sfRe367 = _mm512_fmadd_ps(wfRe26, dfRe29, sfRe367);
sfRe367 = _mm512_mask3_fmadd_ps(wfIm26, dfIm29, sfRe367, 64764);
sfIm367 = _mm512_fmadd_ps(wfMx14, dfIm29, sfIm367);
sfIm367 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe29, sfIm367, 64764);
dfRe29 = _mm512_shuffle_f32x4(dfRe29, dfRe29, 78);
dfIm29 = _mm512_shuffle_f32x4(dfIm29, dfIm29, 78);
sfRe362 = _mm512_fmadd_ps(wfRe25, dfRe29, sfRe362);
sfRe362 = _mm512_mask3_fmadd_ps(wfIm25, dfIm29, sfRe362, 64764);
sfIm362 = _mm512_fmadd_ps(wfMx13, dfIm29, sfIm362);
sfIm362 = _mm512_mask3_fnmadd_ps(wfIm25, dfRe29, sfIm362, 64764);
sfRe368 = _mm512_fmadd_ps(wfRe26, dfRe29, sfRe368);
sfRe368 = _mm512_mask3_fmadd_ps(wfIm26, dfIm29, sfRe368, 64764);
sfIm368 = _mm512_fmadd_ps(wfMx14, dfIm29, sfIm368);
sfIm368 = _mm512_mask3_fnmadd_ps(wfIm26, dfRe29, sfIm368, 64764);
}
sfRe357 = _mm512_add_ps(sfRe357, _mm512_loadu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm357 = _mm512_add_ps(sfIm357, _mm512_loadu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe358 = _mm512_add_ps(sfRe358, _mm512_loadu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm358 = _mm512_add_ps(sfIm358, _mm512_loadu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe359 = _mm512_add_ps(sfRe359, _mm512_loadu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm359 = _mm512_add_ps(sfIm359, _mm512_loadu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe360 = _mm512_add_ps(sfRe360, _mm512_loadu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm360 = _mm512_add_ps(sfIm360, _mm512_loadu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe361 = _mm512_add_ps(sfRe361, _mm512_loadu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm361 = _mm512_add_ps(sfIm361, _mm512_loadu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe362 = _mm512_add_ps(sfRe362, _mm512_loadu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm362 = _mm512_add_ps(sfIm362, _mm512_loadu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe363 = _mm512_add_ps(sfRe363, _mm512_loadu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm363 = _mm512_add_ps(sfIm363, _mm512_loadu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe364 = _mm512_add_ps(sfRe364, _mm512_loadu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm364 = _mm512_add_ps(sfIm364, _mm512_loadu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe365 = _mm512_add_ps(sfRe365, _mm512_loadu_ps(sfPtr9+1024+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm365 = _mm512_add_ps(sfIm365, _mm512_loadu_ps(sfPtr9+1088+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe366 = _mm512_add_ps(sfRe366, _mm512_loadu_ps(sfPtr9+1152+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm366 = _mm512_add_ps(sfIm366, _mm512_loadu_ps(sfPtr9+1216+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe367 = _mm512_add_ps(sfRe367, _mm512_loadu_ps(sfPtr9+1280+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm367 = _mm512_add_ps(sfIm367, _mm512_loadu_ps(sfPtr9+1344+32768*i40+8192*j33+3072*k112+1536*l45));
sfRe368 = _mm512_add_ps(sfRe368, _mm512_loadu_ps(sfPtr9+1408+32768*i40+8192*j33+3072*k112+1536*l45));
sfIm368 = _mm512_add_ps(sfIm368, _mm512_loadu_ps(sfPtr9+1472+32768*i40+8192*j33+3072*k112+1536*l45));
_mm512_storeu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k112+1536*l45, sfRe357);
_mm512_storeu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k112+1536*l45, sfIm357);
_mm512_storeu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k112+1536*l45, sfRe358);
_mm512_storeu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k112+1536*l45, sfIm358);
_mm512_storeu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k112+1536*l45, sfRe359);
_mm512_storeu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k112+1536*l45, sfIm359);
_mm512_storeu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k112+1536*l45, sfRe360);
_mm512_storeu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k112+1536*l45, sfIm360);
_mm512_storeu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k112+1536*l45, sfRe361);
_mm512_storeu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k112+1536*l45, sfIm361);
_mm512_storeu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k112+1536*l45, sfRe362);
_mm512_storeu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k112+1536*l45, sfIm362);
_mm512_storeu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k112+1536*l45, sfRe363);
_mm512_storeu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k112+1536*l45, sfIm363);
_mm512_storeu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k112+1536*l45, sfRe364);
_mm512_storeu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k112+1536*l45, sfIm364);
_mm512_storeu_ps(sfPtr9+1024+32768*i40+8192*j33+3072*k112+1536*l45, sfRe365);
_mm512_storeu_ps(sfPtr9+1088+32768*i40+8192*j33+3072*k112+1536*l45, sfIm365);
_mm512_storeu_ps(sfPtr9+1152+32768*i40+8192*j33+3072*k112+1536*l45, sfRe366);
_mm512_storeu_ps(sfPtr9+1216+32768*i40+8192*j33+3072*k112+1536*l45, sfIm366);
_mm512_storeu_ps(sfPtr9+1280+32768*i40+8192*j33+3072*k112+1536*l45, sfRe367);
_mm512_storeu_ps(sfPtr9+1344+32768*i40+8192*j33+3072*k112+1536*l45, sfIm367);
_mm512_storeu_ps(sfPtr9+1408+32768*i40+8192*j33+3072*k112+1536*l45, sfRe368);
_mm512_storeu_ps(sfPtr9+1472+32768*i40+8192*j33+3072*k112+1536*l45, sfIm368);
}
}
ptrdiff_t l46 = 2*w52;
for (; l46 != 2; ++l46) {
__m512 sfRe369 = _mm512_setzero_ps();
__m512 sfIm369 = _mm512_setzero_ps();
__m512 sfRe373 = _mm512_setzero_ps();
__m512 sfIm373 = _mm512_setzero_ps();
(void)bfPtr10;
__m512 sfRe370 = sfRe369;
__m512 sfIm370 = sfIm369;
__m512 sfRe371 = sfRe369;
__m512 sfIm371 = sfIm369;
__m512 sfRe372 = sfRe369;
__m512 sfIm372 = sfIm369;
__m512 sfRe374 = sfRe373;
__m512 sfIm374 = sfIm373;
__m512 sfRe375 = sfRe373;
__m512 sfIm375 = sfIm373;
__m512 sfRe376 = sfRe373;
__m512 sfIm376 = sfIm373;
for (ptrdiff_t s32 = 0; s32 < 8; ++s32) {
__m512i wfLd27 = _mm512_loadu_si512(wfPtr10+0+8192*i40+2048*j33+1024*l46+128*s32);
__m512 wfRe27 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd27));
__m512 wfIm27 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd27, 1));
__m512 wfMx15 = _mm512_mask_mov_ps(wfIm27, 64764, wfRe27);
__m512i wfLd28 = _mm512_loadu_si512(wfPtr10+64+8192*i40+2048*j33+1024*l46+128*s32);
__m512 wfRe28 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd28));
__m512 wfIm28 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd28, 1));
__m512 wfMx16 = _mm512_mask_mov_ps(wfIm28, 64764, wfRe28);
__m512 dfRe30 = _mm512_loadu_ps(dfPtr10+0+32768*i40+8192*j33+3072*k112+256*s32);
__m512 dfIm30 = _mm512_loadu_ps(dfPtr10+64+32768*i40+8192*j33+3072*k112+256*s32);
sfRe369 = _mm512_fmadd_ps(wfRe27, dfRe30, sfRe369);
sfRe369 = _mm512_mask3_fmadd_ps(wfIm27, dfIm30, sfRe369, 64764);
sfIm369 = _mm512_fmadd_ps(wfMx15, dfIm30, sfIm369);
sfIm369 = _mm512_mask3_fnmadd_ps(wfIm27, dfRe30, sfIm369, 64764);
sfRe373 = _mm512_fmadd_ps(wfRe28, dfRe30, sfRe373);
sfRe373 = _mm512_mask3_fmadd_ps(wfIm28, dfIm30, sfRe373, 64764);
sfIm373 = _mm512_fmadd_ps(wfMx16, dfIm30, sfIm373);
sfIm373 = _mm512_mask3_fnmadd_ps(wfIm28, dfRe30, sfIm373, 64764);
dfRe30 = _mm512_shuffle_f32x4(dfRe30, dfRe30, 78);
dfIm30 = _mm512_shuffle_f32x4(dfIm30, dfIm30, 78);
sfRe370 = _mm512_fmadd_ps(wfRe27, dfRe30, sfRe370);
sfRe370 = _mm512_mask3_fmadd_ps(wfIm27, dfIm30, sfRe370, 64764);
sfIm370 = _mm512_fmadd_ps(wfMx15, dfIm30, sfIm370);
sfIm370 = _mm512_mask3_fnmadd_ps(wfIm27, dfRe30, sfIm370, 64764);
sfRe374 = _mm512_fmadd_ps(wfRe28, dfRe30, sfRe374);
sfRe374 = _mm512_mask3_fmadd_ps(wfIm28, dfIm30, sfRe374, 64764);
sfIm374 = _mm512_fmadd_ps(wfMx16, dfIm30, sfIm374);
sfIm374 = _mm512_mask3_fnmadd_ps(wfIm28, dfRe30, sfIm374, 64764);
__m512 dfRe31 = _mm512_loadu_ps(dfPtr10+128+32768*i40+8192*j33+3072*k112+256*s32);
__m512 dfIm31 = _mm512_loadu_ps(dfPtr10+192+32768*i40+8192*j33+3072*k112+256*s32);
sfRe371 = _mm512_fmadd_ps(wfRe27, dfRe31, sfRe371);
sfRe371 = _mm512_mask3_fmadd_ps(wfIm27, dfIm31, sfRe371, 64764);
sfIm371 = _mm512_fmadd_ps(wfMx15, dfIm31, sfIm371);
sfIm371 = _mm512_mask3_fnmadd_ps(wfIm27, dfRe31, sfIm371, 64764);
sfRe375 = _mm512_fmadd_ps(wfRe28, dfRe31, sfRe375);
sfRe375 = _mm512_mask3_fmadd_ps(wfIm28, dfIm31, sfRe375, 64764);
sfIm375 = _mm512_fmadd_ps(wfMx16, dfIm31, sfIm375);
sfIm375 = _mm512_mask3_fnmadd_ps(wfIm28, dfRe31, sfIm375, 64764);
dfRe31 = _mm512_shuffle_f32x4(dfRe31, dfRe31, 78);
dfIm31 = _mm512_shuffle_f32x4(dfIm31, dfIm31, 78);
sfRe372 = _mm512_fmadd_ps(wfRe27, dfRe31, sfRe372);
sfRe372 = _mm512_mask3_fmadd_ps(wfIm27, dfIm31, sfRe372, 64764);
sfIm372 = _mm512_fmadd_ps(wfMx15, dfIm31, sfIm372);
sfIm372 = _mm512_mask3_fnmadd_ps(wfIm27, dfRe31, sfIm372, 64764);
sfRe376 = _mm512_fmadd_ps(wfRe28, dfRe31, sfRe376);
sfRe376 = _mm512_mask3_fmadd_ps(wfIm28, dfIm31, sfRe376, 64764);
sfIm376 = _mm512_fmadd_ps(wfMx16, dfIm31, sfIm376);
sfIm376 = _mm512_mask3_fnmadd_ps(wfIm28, dfRe31, sfIm376, 64764);
}
sfRe369 = _mm512_add_ps(sfRe369, _mm512_loadu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm369 = _mm512_add_ps(sfIm369, _mm512_loadu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe370 = _mm512_add_ps(sfRe370, _mm512_loadu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm370 = _mm512_add_ps(sfIm370, _mm512_loadu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe371 = _mm512_add_ps(sfRe371, _mm512_loadu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm371 = _mm512_add_ps(sfIm371, _mm512_loadu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe372 = _mm512_add_ps(sfRe372, _mm512_loadu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm372 = _mm512_add_ps(sfIm372, _mm512_loadu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe373 = _mm512_add_ps(sfRe373, _mm512_loadu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm373 = _mm512_add_ps(sfIm373, _mm512_loadu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe374 = _mm512_add_ps(sfRe374, _mm512_loadu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm374 = _mm512_add_ps(sfIm374, _mm512_loadu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe375 = _mm512_add_ps(sfRe375, _mm512_loadu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm375 = _mm512_add_ps(sfIm375, _mm512_loadu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k112+1024*l46));
sfRe376 = _mm512_add_ps(sfRe376, _mm512_loadu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k112+1024*l46));
sfIm376 = _mm512_add_ps(sfIm376, _mm512_loadu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k112+1024*l46));
_mm512_storeu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k112+1024*l46, sfRe369);
_mm512_storeu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k112+1024*l46, sfIm369);
_mm512_storeu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k112+1024*l46, sfRe370);
_mm512_storeu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k112+1024*l46, sfIm370);
_mm512_storeu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k112+1024*l46, sfRe371);
_mm512_storeu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k112+1024*l46, sfIm371);
_mm512_storeu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k112+1024*l46, sfRe372);
_mm512_storeu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k112+1024*l46, sfIm372);
_mm512_storeu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k112+1024*l46, sfRe373);
_mm512_storeu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k112+1024*l46, sfIm373);
_mm512_storeu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k112+1024*l46, sfRe374);
_mm512_storeu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k112+1024*l46, sfIm374);
_mm512_storeu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k112+1024*l46, sfRe375);
_mm512_storeu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k112+1024*l46, sfIm375);
_mm512_storeu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k112+1024*l46, sfRe376);
_mm512_storeu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k112+1024*l46, sfIm376);
}
j33 = 1;
}
for (; j33 <= jj37; ++j33) {
ptrdiff_t k113 = 3*d12;
for (; k113 != 2; ++k113) {
ptrdiff_t l47 = 2*w52;
for (; l47 != 2; ++l47) {
__m512 sfRe377 = _mm512_setzero_ps();
__m512 sfIm377 = _mm512_setzero_ps();
__m512 sfRe383 = _mm512_setzero_ps();
__m512 sfIm383 = _mm512_setzero_ps();
(void)bfPtr10;
__m512 sfRe378 = sfRe377;
__m512 sfIm378 = sfIm377;
__m512 sfRe379 = sfRe377;
__m512 sfIm379 = sfIm377;
__m512 sfRe380 = sfRe377;
__m512 sfIm380 = sfIm377;
__m512 sfRe381 = sfRe377;
__m512 sfIm381 = sfIm377;
__m512 sfRe382 = sfRe377;
__m512 sfIm382 = sfIm377;
__m512 sfRe384 = sfRe383;
__m512 sfIm384 = sfIm383;
__m512 sfRe385 = sfRe383;
__m512 sfIm385 = sfIm383;
__m512 sfRe386 = sfRe383;
__m512 sfIm386 = sfIm383;
__m512 sfRe387 = sfRe383;
__m512 sfIm387 = sfIm383;
__m512 sfRe388 = sfRe383;
__m512 sfIm388 = sfIm383;
for (ptrdiff_t s33 = 0; s33 < 8; ++s33) {
__m512i wfLd29 = _mm512_loadu_si512(wfPtr10+0+8192*i40+2048*j33+1024*l47+128*s33);
__m512 wfRe29 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd29));
__m512 wfIm29 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd29, 1));
__m512i wfLd30 = _mm512_loadu_si512(wfPtr10+64+8192*i40+2048*j33+1024*l47+128*s33);
__m512 wfRe30 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd30));
__m512 wfIm30 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd30, 1));
__m512 dfRe32 = _mm512_loadu_ps(dfPtr10+0+32768*i40+8192*j33+3072*k113+384*s33);
__m512 dfIm32 = _mm512_loadu_ps(dfPtr10+64+32768*i40+8192*j33+3072*k113+384*s33);
sfRe377 = _mm512_fmadd_ps(wfRe29, dfRe32, sfRe377);
sfRe377 = _mm512_fmadd_ps(wfIm29, dfIm32, sfRe377);
sfIm377 = _mm512_fmadd_ps(wfRe29, dfIm32, sfIm377);
sfIm377 = _mm512_fnmadd_ps(wfIm29, dfRe32, sfIm377);
sfRe383 = _mm512_fmadd_ps(wfRe30, dfRe32, sfRe383);
sfRe383 = _mm512_fmadd_ps(wfIm30, dfIm32, sfRe383);
sfIm383 = _mm512_fmadd_ps(wfRe30, dfIm32, sfIm383);
sfIm383 = _mm512_fnmadd_ps(wfIm30, dfRe32, sfIm383);
dfRe32 = _mm512_shuffle_f32x4(dfRe32, dfRe32, 78);
dfIm32 = _mm512_shuffle_f32x4(dfIm32, dfIm32, 78);
sfRe378 = _mm512_fmadd_ps(wfRe29, dfRe32, sfRe378);
sfRe378 = _mm512_fmadd_ps(wfIm29, dfIm32, sfRe378);
sfIm378 = _mm512_fmadd_ps(wfRe29, dfIm32, sfIm378);
sfIm378 = _mm512_fnmadd_ps(wfIm29, dfRe32, sfIm378);
sfRe384 = _mm512_fmadd_ps(wfRe30, dfRe32, sfRe384);
sfRe384 = _mm512_fmadd_ps(wfIm30, dfIm32, sfRe384);
sfIm384 = _mm512_fmadd_ps(wfRe30, dfIm32, sfIm384);
sfIm384 = _mm512_fnmadd_ps(wfIm30, dfRe32, sfIm384);
__m512 dfRe33 = _mm512_loadu_ps(dfPtr10+128+32768*i40+8192*j33+3072*k113+384*s33);
__m512 dfIm33 = _mm512_loadu_ps(dfPtr10+192+32768*i40+8192*j33+3072*k113+384*s33);
sfRe379 = _mm512_fmadd_ps(wfRe29, dfRe33, sfRe379);
sfRe379 = _mm512_fmadd_ps(wfIm29, dfIm33, sfRe379);
sfIm379 = _mm512_fmadd_ps(wfRe29, dfIm33, sfIm379);
sfIm379 = _mm512_fnmadd_ps(wfIm29, dfRe33, sfIm379);
sfRe385 = _mm512_fmadd_ps(wfRe30, dfRe33, sfRe385);
sfRe385 = _mm512_fmadd_ps(wfIm30, dfIm33, sfRe385);
sfIm385 = _mm512_fmadd_ps(wfRe30, dfIm33, sfIm385);
sfIm385 = _mm512_fnmadd_ps(wfIm30, dfRe33, sfIm385);
dfRe33 = _mm512_shuffle_f32x4(dfRe33, dfRe33, 78);
dfIm33 = _mm512_shuffle_f32x4(dfIm33, dfIm33, 78);
sfRe380 = _mm512_fmadd_ps(wfRe29, dfRe33, sfRe380);
sfRe380 = _mm512_fmadd_ps(wfIm29, dfIm33, sfRe380);
sfIm380 = _mm512_fmadd_ps(wfRe29, dfIm33, sfIm380);
sfIm380 = _mm512_fnmadd_ps(wfIm29, dfRe33, sfIm380);
sfRe386 = _mm512_fmadd_ps(wfRe30, dfRe33, sfRe386);
sfRe386 = _mm512_fmadd_ps(wfIm30, dfIm33, sfRe386);
sfIm386 = _mm512_fmadd_ps(wfRe30, dfIm33, sfIm386);
sfIm386 = _mm512_fnmadd_ps(wfIm30, dfRe33, sfIm386);
__m512 dfRe34 = _mm512_loadu_ps(dfPtr10+256+32768*i40+8192*j33+3072*k113+384*s33);
__m512 dfIm34 = _mm512_loadu_ps(dfPtr10+320+32768*i40+8192*j33+3072*k113+384*s33);
sfRe381 = _mm512_fmadd_ps(wfRe29, dfRe34, sfRe381);
sfRe381 = _mm512_fmadd_ps(wfIm29, dfIm34, sfRe381);
sfIm381 = _mm512_fmadd_ps(wfRe29, dfIm34, sfIm381);
sfIm381 = _mm512_fnmadd_ps(wfIm29, dfRe34, sfIm381);
sfRe387 = _mm512_fmadd_ps(wfRe30, dfRe34, sfRe387);
sfRe387 = _mm512_fmadd_ps(wfIm30, dfIm34, sfRe387);
sfIm387 = _mm512_fmadd_ps(wfRe30, dfIm34, sfIm387);
sfIm387 = _mm512_fnmadd_ps(wfIm30, dfRe34, sfIm387);
dfRe34 = _mm512_shuffle_f32x4(dfRe34, dfRe34, 78);
dfIm34 = _mm512_shuffle_f32x4(dfIm34, dfIm34, 78);
sfRe382 = _mm512_fmadd_ps(wfRe29, dfRe34, sfRe382);
sfRe382 = _mm512_fmadd_ps(wfIm29, dfIm34, sfRe382);
sfIm382 = _mm512_fmadd_ps(wfRe29, dfIm34, sfIm382);
sfIm382 = _mm512_fnmadd_ps(wfIm29, dfRe34, sfIm382);
sfRe388 = _mm512_fmadd_ps(wfRe30, dfRe34, sfRe388);
sfRe388 = _mm512_fmadd_ps(wfIm30, dfIm34, sfRe388);
sfIm388 = _mm512_fmadd_ps(wfRe30, dfIm34, sfIm388);
sfIm388 = _mm512_fnmadd_ps(wfIm30, dfRe34, sfIm388);
}
sfRe377 = _mm512_add_ps(sfRe377, _mm512_loadu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm377 = _mm512_add_ps(sfIm377, _mm512_loadu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe378 = _mm512_add_ps(sfRe378, _mm512_loadu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm378 = _mm512_add_ps(sfIm378, _mm512_loadu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe379 = _mm512_add_ps(sfRe379, _mm512_loadu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm379 = _mm512_add_ps(sfIm379, _mm512_loadu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe380 = _mm512_add_ps(sfRe380, _mm512_loadu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm380 = _mm512_add_ps(sfIm380, _mm512_loadu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe381 = _mm512_add_ps(sfRe381, _mm512_loadu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm381 = _mm512_add_ps(sfIm381, _mm512_loadu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe382 = _mm512_add_ps(sfRe382, _mm512_loadu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm382 = _mm512_add_ps(sfIm382, _mm512_loadu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe383 = _mm512_add_ps(sfRe383, _mm512_loadu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm383 = _mm512_add_ps(sfIm383, _mm512_loadu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe384 = _mm512_add_ps(sfRe384, _mm512_loadu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm384 = _mm512_add_ps(sfIm384, _mm512_loadu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe385 = _mm512_add_ps(sfRe385, _mm512_loadu_ps(sfPtr9+1024+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm385 = _mm512_add_ps(sfIm385, _mm512_loadu_ps(sfPtr9+1088+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe386 = _mm512_add_ps(sfRe386, _mm512_loadu_ps(sfPtr9+1152+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm386 = _mm512_add_ps(sfIm386, _mm512_loadu_ps(sfPtr9+1216+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe387 = _mm512_add_ps(sfRe387, _mm512_loadu_ps(sfPtr9+1280+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm387 = _mm512_add_ps(sfIm387, _mm512_loadu_ps(sfPtr9+1344+32768*i40+8192*j33+3072*k113+1536*l47));
sfRe388 = _mm512_add_ps(sfRe388, _mm512_loadu_ps(sfPtr9+1408+32768*i40+8192*j33+3072*k113+1536*l47));
sfIm388 = _mm512_add_ps(sfIm388, _mm512_loadu_ps(sfPtr9+1472+32768*i40+8192*j33+3072*k113+1536*l47));
_mm512_storeu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k113+1536*l47, sfRe377);
_mm512_storeu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k113+1536*l47, sfIm377);
_mm512_storeu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k113+1536*l47, sfRe378);
_mm512_storeu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k113+1536*l47, sfIm378);
_mm512_storeu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k113+1536*l47, sfRe379);
_mm512_storeu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k113+1536*l47, sfIm379);
_mm512_storeu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k113+1536*l47, sfRe380);
_mm512_storeu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k113+1536*l47, sfIm380);
_mm512_storeu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k113+1536*l47, sfRe381);
_mm512_storeu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k113+1536*l47, sfIm381);
_mm512_storeu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k113+1536*l47, sfRe382);
_mm512_storeu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k113+1536*l47, sfIm382);
_mm512_storeu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k113+1536*l47, sfRe383);
_mm512_storeu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k113+1536*l47, sfIm383);
_mm512_storeu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k113+1536*l47, sfRe384);
_mm512_storeu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k113+1536*l47, sfIm384);
_mm512_storeu_ps(sfPtr9+1024+32768*i40+8192*j33+3072*k113+1536*l47, sfRe385);
_mm512_storeu_ps(sfPtr9+1088+32768*i40+8192*j33+3072*k113+1536*l47, sfIm385);
_mm512_storeu_ps(sfPtr9+1152+32768*i40+8192*j33+3072*k113+1536*l47, sfRe386);
_mm512_storeu_ps(sfPtr9+1216+32768*i40+8192*j33+3072*k113+1536*l47, sfIm386);
_mm512_storeu_ps(sfPtr9+1280+32768*i40+8192*j33+3072*k113+1536*l47, sfRe387);
_mm512_storeu_ps(sfPtr9+1344+32768*i40+8192*j33+3072*k113+1536*l47, sfIm387);
_mm512_storeu_ps(sfPtr9+1408+32768*i40+8192*j33+3072*k113+1536*l47, sfRe388);
_mm512_storeu_ps(sfPtr9+1472+32768*i40+8192*j33+3072*k113+1536*l47, sfIm388);
}
}
ptrdiff_t l48 = 2*w52;
for (; l48 != 2; ++l48) {
__m512 sfRe389 = _mm512_setzero_ps();
__m512 sfIm389 = _mm512_setzero_ps();
__m512 sfRe393 = _mm512_setzero_ps();
__m512 sfIm393 = _mm512_setzero_ps();
(void)bfPtr10;
__m512 sfRe390 = sfRe389;
__m512 sfIm390 = sfIm389;
__m512 sfRe391 = sfRe389;
__m512 sfIm391 = sfIm389;
__m512 sfRe392 = sfRe389;
__m512 sfIm392 = sfIm389;
__m512 sfRe394 = sfRe393;
__m512 sfIm394 = sfIm393;
__m512 sfRe395 = sfRe393;
__m512 sfIm395 = sfIm393;
__m512 sfRe396 = sfRe393;
__m512 sfIm396 = sfIm393;
for (ptrdiff_t s34 = 0; s34 < 8; ++s34) {
__m512i wfLd31 = _mm512_loadu_si512(wfPtr10+0+8192*i40+2048*j33+1024*l48+128*s34);
__m512 wfRe31 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd31));
__m512 wfIm31 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd31, 1));
__m512i wfLd32 = _mm512_loadu_si512(wfPtr10+64+8192*i40+2048*j33+1024*l48+128*s34);
__m512 wfRe32 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd32));
__m512 wfIm32 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd32, 1));
__m512 dfRe35 = _mm512_loadu_ps(dfPtr10+0+32768*i40+8192*j33+3072*k113+256*s34);
__m512 dfIm35 = _mm512_loadu_ps(dfPtr10+64+32768*i40+8192*j33+3072*k113+256*s34);
sfRe389 = _mm512_fmadd_ps(wfRe31, dfRe35, sfRe389);
sfRe389 = _mm512_fmadd_ps(wfIm31, dfIm35, sfRe389);
sfIm389 = _mm512_fmadd_ps(wfRe31, dfIm35, sfIm389);
sfIm389 = _mm512_fnmadd_ps(wfIm31, dfRe35, sfIm389);
sfRe393 = _mm512_fmadd_ps(wfRe32, dfRe35, sfRe393);
sfRe393 = _mm512_fmadd_ps(wfIm32, dfIm35, sfRe393);
sfIm393 = _mm512_fmadd_ps(wfRe32, dfIm35, sfIm393);
sfIm393 = _mm512_fnmadd_ps(wfIm32, dfRe35, sfIm393);
dfRe35 = _mm512_shuffle_f32x4(dfRe35, dfRe35, 78);
dfIm35 = _mm512_shuffle_f32x4(dfIm35, dfIm35, 78);
sfRe390 = _mm512_fmadd_ps(wfRe31, dfRe35, sfRe390);
sfRe390 = _mm512_fmadd_ps(wfIm31, dfIm35, sfRe390);
sfIm390 = _mm512_fmadd_ps(wfRe31, dfIm35, sfIm390);
sfIm390 = _mm512_fnmadd_ps(wfIm31, dfRe35, sfIm390);
sfRe394 = _mm512_fmadd_ps(wfRe32, dfRe35, sfRe394);
sfRe394 = _mm512_fmadd_ps(wfIm32, dfIm35, sfRe394);
sfIm394 = _mm512_fmadd_ps(wfRe32, dfIm35, sfIm394);
sfIm394 = _mm512_fnmadd_ps(wfIm32, dfRe35, sfIm394);
__m512 dfRe36 = _mm512_loadu_ps(dfPtr10+128+32768*i40+8192*j33+3072*k113+256*s34);
__m512 dfIm36 = _mm512_loadu_ps(dfPtr10+192+32768*i40+8192*j33+3072*k113+256*s34);
sfRe391 = _mm512_fmadd_ps(wfRe31, dfRe36, sfRe391);
sfRe391 = _mm512_fmadd_ps(wfIm31, dfIm36, sfRe391);
sfIm391 = _mm512_fmadd_ps(wfRe31, dfIm36, sfIm391);
sfIm391 = _mm512_fnmadd_ps(wfIm31, dfRe36, sfIm391);
sfRe395 = _mm512_fmadd_ps(wfRe32, dfRe36, sfRe395);
sfRe395 = _mm512_fmadd_ps(wfIm32, dfIm36, sfRe395);
sfIm395 = _mm512_fmadd_ps(wfRe32, dfIm36, sfIm395);
sfIm395 = _mm512_fnmadd_ps(wfIm32, dfRe36, sfIm395);
dfRe36 = _mm512_shuffle_f32x4(dfRe36, dfRe36, 78);
dfIm36 = _mm512_shuffle_f32x4(dfIm36, dfIm36, 78);
sfRe392 = _mm512_fmadd_ps(wfRe31, dfRe36, sfRe392);
sfRe392 = _mm512_fmadd_ps(wfIm31, dfIm36, sfRe392);
sfIm392 = _mm512_fmadd_ps(wfRe31, dfIm36, sfIm392);
sfIm392 = _mm512_fnmadd_ps(wfIm31, dfRe36, sfIm392);
sfRe396 = _mm512_fmadd_ps(wfRe32, dfRe36, sfRe396);
sfRe396 = _mm512_fmadd_ps(wfIm32, dfIm36, sfRe396);
sfIm396 = _mm512_fmadd_ps(wfRe32, dfIm36, sfIm396);
sfIm396 = _mm512_fnmadd_ps(wfIm32, dfRe36, sfIm396);
}
sfRe389 = _mm512_add_ps(sfRe389, _mm512_loadu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm389 = _mm512_add_ps(sfIm389, _mm512_loadu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe390 = _mm512_add_ps(sfRe390, _mm512_loadu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm390 = _mm512_add_ps(sfIm390, _mm512_loadu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe391 = _mm512_add_ps(sfRe391, _mm512_loadu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm391 = _mm512_add_ps(sfIm391, _mm512_loadu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe392 = _mm512_add_ps(sfRe392, _mm512_loadu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm392 = _mm512_add_ps(sfIm392, _mm512_loadu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe393 = _mm512_add_ps(sfRe393, _mm512_loadu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm393 = _mm512_add_ps(sfIm393, _mm512_loadu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe394 = _mm512_add_ps(sfRe394, _mm512_loadu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm394 = _mm512_add_ps(sfIm394, _mm512_loadu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe395 = _mm512_add_ps(sfRe395, _mm512_loadu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm395 = _mm512_add_ps(sfIm395, _mm512_loadu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k113+1024*l48));
sfRe396 = _mm512_add_ps(sfRe396, _mm512_loadu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k113+1024*l48));
sfIm396 = _mm512_add_ps(sfIm396, _mm512_loadu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k113+1024*l48));
_mm512_storeu_ps(sfPtr9+0+32768*i40+8192*j33+3072*k113+1024*l48, sfRe389);
_mm512_storeu_ps(sfPtr9+64+32768*i40+8192*j33+3072*k113+1024*l48, sfIm389);
_mm512_storeu_ps(sfPtr9+128+32768*i40+8192*j33+3072*k113+1024*l48, sfRe390);
_mm512_storeu_ps(sfPtr9+192+32768*i40+8192*j33+3072*k113+1024*l48, sfIm390);
_mm512_storeu_ps(sfPtr9+256+32768*i40+8192*j33+3072*k113+1024*l48, sfRe391);
_mm512_storeu_ps(sfPtr9+320+32768*i40+8192*j33+3072*k113+1024*l48, sfIm391);
_mm512_storeu_ps(sfPtr9+384+32768*i40+8192*j33+3072*k113+1024*l48, sfRe392);
_mm512_storeu_ps(sfPtr9+448+32768*i40+8192*j33+3072*k113+1024*l48, sfIm392);
_mm512_storeu_ps(sfPtr9+512+32768*i40+8192*j33+3072*k113+1024*l48, sfRe393);
_mm512_storeu_ps(sfPtr9+576+32768*i40+8192*j33+3072*k113+1024*l48, sfIm393);
_mm512_storeu_ps(sfPtr9+640+32768*i40+8192*j33+3072*k113+1024*l48, sfRe394);
_mm512_storeu_ps(sfPtr9+704+32768*i40+8192*j33+3072*k113+1024*l48, sfIm394);
_mm512_storeu_ps(sfPtr9+768+32768*i40+8192*j33+3072*k113+1024*l48, sfRe395);
_mm512_storeu_ps(sfPtr9+832+32768*i40+8192*j33+3072*k113+1024*l48, sfIm395);
_mm512_storeu_ps(sfPtr9+896+32768*i40+8192*j33+3072*k113+1024*l48, sfRe396);
_mm512_storeu_ps(sfPtr9+960+32768*i40+8192*j33+3072*k113+1024*l48, sfIm396);
}
}
}
}

// Hands the "produce sums" work to the thread team.
//
// team44:    thread team that ResNeXt50ThreaderDo1 runs the task on.
// tensors61: tensor pointer array, forwarded untouched to the callee.
//
// The callee receives a three-slot argument bundle through the task's
// any1 field: slot 0 is tensors61, slot 1 is the outer loop index (only
// ever 0 here), slot 2 is the inner loop index (0 through 3). One task is
// submitted per inner index, each describing a 4-dimensional hull of
// extents 1 x 1 x 1 x 16.
static void ResNeXt50StriderProduceSums2(ResNeXt50ThreaderTeam1* team44, char** tensors61) {
	void* args[3];
	args[0] = tensors61;
	ptrdiff_t outer = 0;
	while (outer < 1) {
		// Loop indices travel to the callee as pointer-sized integers.
		args[1] = (void*)outer;
		ptrdiff_t inner = 0;
		while (inner < 4) {
			args[2] = (void*)inner;
			// A fresh task descriptor is filled in for every submission.
			ResNeXt50ThreaderTask1 job;
			job.nd1 = 4;
			job.hull1[0] = 1;
			job.hull1[1] = 1;
			job.hull1[2] = 1;
			job.hull1[3] = 16;
			job.any1 = args;
			job.callee1 = ResNeXt50StriderProduceSums2Callee1;
			ResNeXt50ThreaderDo1(team44, &job);
			++inner;
		}
		++outer;
	}
}

static void ResNeXt50StriderConsumeSums2Callee1(ResNeXt50ThreaderTask1* task66, int64_t* pt38) {
char** tensors64 = task66->any1;
ptrdiff_t w53 = 0;
ptrdiff_t d13 = 0;
ptrdiff_t g22 = pt38[2];
char*restrict sfPtr10 = tensors64[0];
char*restrict datPtr19 = tensors64[1];
ptrdiff_t i41 = 4*g22;
ptrdiff_t ii27 = i41+3;
for (; i41 <= ii27; ++i41) {
ptrdiff_t j34 = 3*d13;
ptrdiff_t rel20 = j34-0;
ptrdiff_t base20 = 0;
if (rel20 < 1) {
ptrdiff_t toH38 = base20+0;
ptrdiff_t toW38 = 0;
ptrdiff_t k114 = 2*w53;
for (; k114 != 2; ++k114) {
ptrdiff_t r21 = 0;
for (; r21 != 2; ++r21) {
ptrdiff_t t35 = 0;
__m512 sfRe397 = _mm512_loadu_ps(sfPtr10+0+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm397 = _mm512_loadu_ps(sfPtr10+64+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe401 = _mm512_loadu_ps(sfPtr10+128+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm401 = _mm512_loadu_ps(sfPtr10+192+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe398 = _mm512_loadu_ps(sfPtr10+8192+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm398 = _mm512_loadu_ps(sfPtr10+8256+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe402 = _mm512_loadu_ps(sfPtr10+8320+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm402 = _mm512_loadu_ps(sfPtr10+8384+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe399 = _mm512_loadu_ps(sfPtr10+16384+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm399 = _mm512_loadu_ps(sfPtr10+16448+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe403 = _mm512_loadu_ps(sfPtr10+16512+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm403 = _mm512_loadu_ps(sfPtr10+16576+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe400 = _mm512_loadu_ps(sfPtr10+24576+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm400 = _mm512_loadu_ps(sfPtr10+24640+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfRe404 = _mm512_loadu_ps(sfPtr10+24704+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512 sfIm404 = _mm512_loadu_ps(sfPtr10+24768+32768*i41+3072*j34+1536*k114+768*r21+256*t35);
__m512i ifft5725 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5726 = _mm512_permutexvar_ps(ifft5725, sfRe397);
__m512 ifft5817 = _mm512_permutexvar_ps(ifft5725, sfRe401);
__m512i ifft5727 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5728 = _mm512_permutexvar_ps(ifft5727, sfRe397);
__m512 ifft5818 = _mm512_permutexvar_ps(ifft5727, sfRe401);
__m512 ifft5729 = _mm512_permutexvar_ps(ifft5725, sfIm397);
__m512 ifft5819 = _mm512_permutexvar_ps(ifft5725, sfIm401);
__m512 ifft5730 = _mm512_permutexvar_ps(ifft5727, sfIm397);
__m512 ifft5820 = _mm512_permutexvar_ps(ifft5727, sfIm401);
__m512 ifft5731 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5732 = _mm512_mask_fmadd_ps(ifft5730, 65021, ifft5731, ifft5726);
__m512 ifft5821 = _mm512_mask_fmadd_ps(ifft5820, 65021, ifft5731, ifft5817);
__m512 ifft5733 = _mm512_mask_fnmadd_ps(ifft5729, 65021, ifft5731, ifft5728);
__m512 ifft5822 = _mm512_mask_fnmadd_ps(ifft5819, 65021, ifft5731, ifft5818);
__m512 ifft5734 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5735 = _mm512_fmadd_ps(ifft5732, ifft5734, _mm512_shuffle_ps(ifft5732, ifft5732, 177));
__m512 ifft5823 = _mm512_fmadd_ps(ifft5821, ifft5734, _mm512_shuffle_ps(ifft5821, ifft5821, 177));
__m512 ifft5736 = _mm512_fmadd_ps(ifft5733, ifft5734, _mm512_shuffle_ps(ifft5733, ifft5733, 177));
__m512 ifft5824 = _mm512_fmadd_ps(ifft5822, ifft5734, _mm512_shuffle_ps(ifft5822, ifft5822, 177));
__m512 ifft5737 = _mm512_fmadd_ps(sfRe398, ifft5734, _mm512_shuffle_ps(sfRe398, sfRe398, 177));
__m512 ifft5825 = _mm512_fmadd_ps(sfRe402, ifft5734, _mm512_shuffle_ps(sfRe402, sfRe402, 177));
__m512 ifft5738 = _mm512_fmadd_ps(sfIm398, ifft5734, _mm512_shuffle_ps(sfIm398, sfIm398, 177));
__m512 ifft5826 = _mm512_fmadd_ps(sfIm402, ifft5734, _mm512_shuffle_ps(sfIm402, sfIm402, 177));
__m512 ifft5739 = _mm512_fmadd_ps(sfRe399, ifft5734, _mm512_shuffle_ps(sfRe399, sfRe399, 177));
__m512 ifft5827 = _mm512_fmadd_ps(sfRe403, ifft5734, _mm512_shuffle_ps(sfRe403, sfRe403, 177));
__m512 ifft5740 = _mm512_fmadd_ps(sfIm399, ifft5734, _mm512_shuffle_ps(sfIm399, sfIm399, 177));
__m512 ifft5828 = _mm512_fmadd_ps(sfIm403, ifft5734, _mm512_shuffle_ps(sfIm403, sfIm403, 177));
__m512 ifft5741 = _mm512_fmadd_ps(sfRe400, ifft5734, _mm512_shuffle_ps(sfRe400, sfRe400, 177));
__m512 ifft5829 = _mm512_fmadd_ps(sfRe404, ifft5734, _mm512_shuffle_ps(sfRe404, sfRe404, 177));
__m512 ifft5742 = _mm512_fmadd_ps(sfIm400, ifft5734, _mm512_shuffle_ps(sfIm400, sfIm400, 177));
__m512 ifft5830 = _mm512_fmadd_ps(sfIm404, ifft5734, _mm512_shuffle_ps(sfIm404, sfIm404, 177));
__m512 ifft5743 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5744 = _mm512_mul_ps(ifft5735, ifft5743);
__m512 ifft5831 = _mm512_mul_ps(ifft5823, ifft5743);
__m512 ifft5745 = _mm512_mul_ps(ifft5736, ifft5743);
__m512 ifft5832 = _mm512_mul_ps(ifft5824, ifft5743);
__m512 ifft5746 = _mm512_mul_ps(ifft5737, ifft5743);
__m512 ifft5833 = _mm512_mul_ps(ifft5825, ifft5743);
__m512 ifft5747 = _mm512_mul_ps(ifft5738, ifft5743);
__m512 ifft5834 = _mm512_mul_ps(ifft5826, ifft5743);
__m512 ifft5748 = _mm512_mul_ps(ifft5739, ifft5743);
__m512 ifft5835 = _mm512_mul_ps(ifft5827, ifft5743);
__m512 ifft5749 = _mm512_mul_ps(ifft5740, ifft5743);
__m512 ifft5836 = _mm512_mul_ps(ifft5828, ifft5743);
__m512 ifft5750 = _mm512_mul_ps(ifft5741, ifft5743);
__m512 ifft5837 = _mm512_mul_ps(ifft5829, ifft5743);
__m512 ifft5751 = _mm512_mul_ps(ifft5742, ifft5743);
__m512 ifft5838 = _mm512_mul_ps(ifft5830, ifft5743);
__m512 ifft5752 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5753 = _mm512_fnmadd_ps(ifft5736, ifft5752, ifft5744);
__m512 ifft5839 = _mm512_fnmadd_ps(ifft5824, ifft5752, ifft5831);
__m512 ifft5754 = _mm512_fmadd_ps(ifft5735, ifft5752, ifft5745);
__m512 ifft5840 = _mm512_fmadd_ps(ifft5823, ifft5752, ifft5832);
__m512 ifft5755 = _mm512_fnmadd_ps(ifft5738, ifft5752, ifft5746);
__m512 ifft5841 = _mm512_fnmadd_ps(ifft5826, ifft5752, ifft5833);
__m512 ifft5756 = _mm512_fmadd_ps(ifft5737, ifft5752, ifft5747);
__m512 ifft5842 = _mm512_fmadd_ps(ifft5825, ifft5752, ifft5834);
__m512 ifft5757 = _mm512_fnmadd_ps(ifft5740, ifft5752, ifft5748);
__m512 ifft5843 = _mm512_fnmadd_ps(ifft5828, ifft5752, ifft5835);
__m512 ifft5758 = _mm512_fmadd_ps(ifft5739, ifft5752, ifft5749);
__m512 ifft5844 = _mm512_fmadd_ps(ifft5827, ifft5752, ifft5836);
__m512 ifft5759 = _mm512_fnmadd_ps(ifft5742, ifft5752, ifft5750);
__m512 ifft5845 = _mm512_fnmadd_ps(ifft5830, ifft5752, ifft5837);
__m512 ifft5760 = _mm512_fmadd_ps(ifft5741, ifft5752, ifft5751);
__m512 ifft5846 = _mm512_fmadd_ps(ifft5829, ifft5752, ifft5838);
__m512 ifft5761 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5762 = _mm512_fmadd_ps(ifft5753, ifft5761, _mm512_shuffle_ps(ifft5753, ifft5753, 78));
__m512 ifft5847 = _mm512_fmadd_ps(ifft5839, ifft5761, _mm512_shuffle_ps(ifft5839, ifft5839, 78));
__m512 ifft5763 = _mm512_fmadd_ps(ifft5754, ifft5761, _mm512_shuffle_ps(ifft5754, ifft5754, 78));
__m512 ifft5848 = _mm512_fmadd_ps(ifft5840, ifft5761, _mm512_shuffle_ps(ifft5840, ifft5840, 78));
__m512 ifft5764 = _mm512_fmadd_ps(ifft5755, ifft5761, _mm512_shuffle_ps(ifft5755, ifft5755, 78));
__m512 ifft5849 = _mm512_fmadd_ps(ifft5841, ifft5761, _mm512_shuffle_ps(ifft5841, ifft5841, 78));
__m512 ifft5765 = _mm512_fmadd_ps(ifft5756, ifft5761, _mm512_shuffle_ps(ifft5756, ifft5756, 78));
__m512 ifft5850 = _mm512_fmadd_ps(ifft5842, ifft5761, _mm512_shuffle_ps(ifft5842, ifft5842, 78));
__m512 ifft5766 = _mm512_fmadd_ps(ifft5757, ifft5761, _mm512_shuffle_ps(ifft5757, ifft5757, 78));
__m512 ifft5851 = _mm512_fmadd_ps(ifft5843, ifft5761, _mm512_shuffle_ps(ifft5843, ifft5843, 78));
__m512 ifft5767 = _mm512_fmadd_ps(ifft5758, ifft5761, _mm512_shuffle_ps(ifft5758, ifft5758, 78));
__m512 ifft5852 = _mm512_fmadd_ps(ifft5844, ifft5761, _mm512_shuffle_ps(ifft5844, ifft5844, 78));
__m512 ifft5768 = _mm512_fmadd_ps(ifft5759, ifft5761, _mm512_shuffle_ps(ifft5759, ifft5759, 78));
__m512 ifft5853 = _mm512_fmadd_ps(ifft5845, ifft5761, _mm512_shuffle_ps(ifft5845, ifft5845, 78));
__m512 ifft5769 = _mm512_fmadd_ps(ifft5760, ifft5761, _mm512_shuffle_ps(ifft5760, ifft5760, 78));
__m512 ifft5854 = _mm512_fmadd_ps(ifft5846, ifft5761, _mm512_shuffle_ps(ifft5846, ifft5846, 78));
__m512 ifft5770 = _mm512_mask_sub_ps(ifft5762, 49344, _mm512_setzero_ps(), ifft5763);
__m512 ifft5855 = _mm512_mask_sub_ps(ifft5847, 49344, _mm512_setzero_ps(), ifft5848);
__m512 ifft5771 = _mm512_mask_mov_ps(ifft5763, 49344, ifft5762);
__m512 ifft5856 = _mm512_mask_mov_ps(ifft5848, 49344, ifft5847);
__m512 ifft5772 = _mm512_mask_sub_ps(ifft5764, 49344, _mm512_setzero_ps(), ifft5765);
__m512 ifft5857 = _mm512_mask_sub_ps(ifft5849, 49344, _mm512_setzero_ps(), ifft5850);
__m512 ifft5773 = _mm512_mask_mov_ps(ifft5765, 49344, ifft5764);
__m512 ifft5858 = _mm512_mask_mov_ps(ifft5850, 49344, ifft5849);
__m512 ifft5774 = _mm512_mask_sub_ps(ifft5766, 49344, _mm512_setzero_ps(), ifft5767);
__m512 ifft5859 = _mm512_mask_sub_ps(ifft5851, 49344, _mm512_setzero_ps(), ifft5852);
__m512 ifft5775 = _mm512_mask_mov_ps(ifft5767, 49344, ifft5766);
__m512 ifft5860 = _mm512_mask_mov_ps(ifft5852, 49344, ifft5851);
__m512 ifft5776 = _mm512_mask_sub_ps(ifft5768, 49344, _mm512_setzero_ps(), ifft5769);
__m512 ifft5861 = _mm512_mask_sub_ps(ifft5853, 49344, _mm512_setzero_ps(), ifft5854);
__m512 ifft5777 = _mm512_mask_mov_ps(ifft5769, 49344, ifft5768);
__m512 ifft5862 = _mm512_mask_mov_ps(ifft5854, 49344, ifft5853);
__m512 ifft5778 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5779 = _mm512_fmadd_ps(ifft5770, ifft5778, _mm512_shuffle_f32x4(ifft5770, ifft5770, 177));
__m512 ifft5863 = _mm512_fmadd_ps(ifft5855, ifft5778, _mm512_shuffle_f32x4(ifft5855, ifft5855, 177));
__m512 ifft5780 = _mm512_fmadd_ps(ifft5771, ifft5778, _mm512_shuffle_f32x4(ifft5771, ifft5771, 177));
__m512 ifft5864 = _mm512_fmadd_ps(ifft5856, ifft5778, _mm512_shuffle_f32x4(ifft5856, ifft5856, 177));
__m512 ifft5781 = _mm512_fmadd_ps(ifft5772, ifft5778, _mm512_shuffle_f32x4(ifft5772, ifft5772, 177));
__m512 ifft5865 = _mm512_fmadd_ps(ifft5857, ifft5778, _mm512_shuffle_f32x4(ifft5857, ifft5857, 177));
__m512 ifft5782 = _mm512_fmadd_ps(ifft5773, ifft5778, _mm512_shuffle_f32x4(ifft5773, ifft5773, 177));
__m512 ifft5866 = _mm512_fmadd_ps(ifft5858, ifft5778, _mm512_shuffle_f32x4(ifft5858, ifft5858, 177));
__m512 ifft5783 = _mm512_fmadd_ps(ifft5774, ifft5778, _mm512_shuffle_f32x4(ifft5774, ifft5774, 177));
__m512 ifft5867 = _mm512_fmadd_ps(ifft5859, ifft5778, _mm512_shuffle_f32x4(ifft5859, ifft5859, 177));
__m512 ifft5784 = _mm512_fnmsub_ps(ifft5775, ifft5778, _mm512_shuffle_f32x4(ifft5775, ifft5775, 177));
__m512 ifft5868 = _mm512_fnmsub_ps(ifft5860, ifft5778, _mm512_shuffle_f32x4(ifft5860, ifft5860, 177));
__m512 ifft5785 = _mm512_fmadd_ps(ifft5776, ifft5778, _mm512_shuffle_f32x4(ifft5776, ifft5776, 177));
__m512 ifft5869 = _mm512_fmadd_ps(ifft5861, ifft5778, _mm512_shuffle_f32x4(ifft5861, ifft5861, 177));
__m512 ifft5786 = _mm512_fmadd_ps(ifft5777, ifft5778, _mm512_shuffle_f32x4(ifft5777, ifft5777, 177));
__m512 ifft5870 = _mm512_fmadd_ps(ifft5862, ifft5778, _mm512_shuffle_f32x4(ifft5862, ifft5862, 177));
__m512 ifft5787 = _mm512_add_ps(ifft5779, ifft5780);
__m512 ifft5871 = _mm512_add_ps(ifft5863, ifft5864);
__m512 ifft5788 = _mm512_sub_ps(ifft5779, ifft5780);
__m512 ifft5872 = _mm512_sub_ps(ifft5863, ifft5864);
__m512 ifft5789 = _mm512_sub_ps(ifft5781, ifft5785);
__m512 ifft5873 = _mm512_sub_ps(ifft5865, ifft5869);
__m512 ifft5790 = _mm512_add_ps(ifft5782, ifft5786);
__m512 ifft5874 = _mm512_add_ps(ifft5866, ifft5870);
__m512 ifft5791 = _mm512_add_ps(ifft5781, ifft5785);
__m512 ifft5875 = _mm512_add_ps(ifft5865, ifft5869);
__m512 ifft5792 = _mm512_sub_ps(ifft5782, ifft5786);
__m512 ifft5876 = _mm512_sub_ps(ifft5866, ifft5870);
__m512 ifft5793 = _mm512_mul_ps(ifft5783, _mm512_set1_ps(3.125e-02f));
__m512 ifft5877 = _mm512_mul_ps(ifft5867, _mm512_set1_ps(3.125e-02f));
__m512 ifft5794 = _mm512_mul_ps(ifft5784, _mm512_set1_ps(3.125e-02f));
__m512 ifft5878 = _mm512_mul_ps(ifft5868, _mm512_set1_ps(3.125e-02f));
__m512 ifft5795 = _mm512_fmadd_ps(ifft5787, _mm512_set1_ps(1.5625e-02f), ifft5793);
__m512 ifft5879 = _mm512_fmadd_ps(ifft5871, _mm512_set1_ps(1.5625e-02f), ifft5877);
__m512 ifft5796 = _mm512_fmsub_ps(ifft5787, _mm512_set1_ps(1.5625e-02f), ifft5793);
__m512 ifft5880 = _mm512_fmsub_ps(ifft5871, _mm512_set1_ps(1.5625e-02f), ifft5877);
__m512 ifft5797 = _mm512_fmadd_ps(ifft5788, _mm512_set1_ps(1.5625e-02f), ifft5794);
__m512 ifft5881 = _mm512_fmadd_ps(ifft5872, _mm512_set1_ps(1.5625e-02f), ifft5878);
__m512 ifft5798 = _mm512_fmsub_ps(ifft5788, _mm512_set1_ps(1.5625e-02f), ifft5794);
__m512 ifft5882 = _mm512_fmsub_ps(ifft5872, _mm512_set1_ps(1.5625e-02f), ifft5878);
__m512 ifft5799 = _mm512_add_ps(ifft5789, ifft5790);
__m512 ifft5883 = _mm512_add_ps(ifft5873, ifft5874);
__m512 ifft5800 = _mm512_sub_ps(ifft5789, ifft5790);
__m512 ifft5884 = _mm512_sub_ps(ifft5873, ifft5874);
__m512 ifft5801 = _mm512_fnmadd_ps(ifft5799, _mm512_set1_ps(7.0710677e-01f), ifft5791);
__m512 ifft5885 = _mm512_fnmadd_ps(ifft5883, _mm512_set1_ps(7.0710677e-01f), ifft5875);
__m512 ifft5802 = _mm512_fmadd_ps(ifft5799, _mm512_set1_ps(7.0710677e-01f), ifft5791);
__m512 ifft5886 = _mm512_fmadd_ps(ifft5883, _mm512_set1_ps(7.0710677e-01f), ifft5875);
__m512 ifft5803 = _mm512_fmadd_ps(ifft5800, _mm512_set1_ps(7.0710677e-01f), ifft5792);
__m512 ifft5887 = _mm512_fmadd_ps(ifft5884, _mm512_set1_ps(7.0710677e-01f), ifft5876);
__m512 ifft5804 = _mm512_fmsub_ps(ifft5800, _mm512_set1_ps(7.0710677e-01f), ifft5792);
__m512 ifft5888 = _mm512_fmsub_ps(ifft5884, _mm512_set1_ps(7.0710677e-01f), ifft5876);
__m512 ifft5805 = _mm512_add_ps(ifft5801, ifft5802);
__m512 ifft5889 = _mm512_add_ps(ifft5885, ifft5886);
__m512 ifft5806 = _mm512_sub_ps(ifft5801, ifft5802);
__m512 ifft5890 = _mm512_sub_ps(ifft5885, ifft5886);
__m512 ifft5807 = _mm512_add_ps(ifft5803, ifft5804);
__m512 ifft5891 = _mm512_add_ps(ifft5887, ifft5888);
__m512 ifft5808 = _mm512_sub_ps(ifft5803, ifft5804);
__m512 ifft5892 = _mm512_sub_ps(ifft5887, ifft5888);
__m512 ifft5809 = _mm512_fmadd_ps(ifft5805, _mm512_set1_ps(1.5625e-02f), ifft5795);
__m512 ifft5893 = _mm512_fmadd_ps(ifft5889, _mm512_set1_ps(1.5625e-02f), ifft5879);
__m512 ifft5810 = _mm512_fnmadd_ps(ifft5805, _mm512_set1_ps(1.5625e-02f), ifft5795);
__m512 ifft5894 = _mm512_fnmadd_ps(ifft5889, _mm512_set1_ps(1.5625e-02f), ifft5879);
__m512 ifft5811 = _mm512_fmadd_ps(ifft5807, _mm512_set1_ps(1.5625e-02f), ifft5797);
__m512 ifft5895 = _mm512_fmadd_ps(ifft5891, _mm512_set1_ps(1.5625e-02f), ifft5881);
__m512 ifft5812 = _mm512_fnmadd_ps(ifft5807, _mm512_set1_ps(1.5625e-02f), ifft5797);
__m512 ifft5896 = _mm512_fnmadd_ps(ifft5891, _mm512_set1_ps(1.5625e-02f), ifft5881);
__m512 ifft5813 = _mm512_fnmadd_ps(ifft5808, _mm512_set1_ps(1.5625e-02f), ifft5796);
__m512 ifft5897 = _mm512_fnmadd_ps(ifft5892, _mm512_set1_ps(1.5625e-02f), ifft5880);
__m512 ifft5814 = _mm512_fmadd_ps(ifft5808, _mm512_set1_ps(1.5625e-02f), ifft5796);
__m512 ifft5898 = _mm512_fmadd_ps(ifft5892, _mm512_set1_ps(1.5625e-02f), ifft5880);
__m512 ifft5815 = _mm512_fmadd_ps(ifft5806, _mm512_set1_ps(1.5625e-02f), ifft5798);
__m512 ifft5899 = _mm512_fmadd_ps(ifft5890, _mm512_set1_ps(1.5625e-02f), ifft5882);
__m512 ifft5816 = _mm512_fnmadd_ps(ifft5806, _mm512_set1_ps(1.5625e-02f), ifft5798);
__m512 ifft5900 = _mm512_fnmadd_ps(ifft5890, _mm512_set1_ps(1.5625e-02f), ifft5882);
__m512 dat1891 = ifft5809;
__m512 dat1898 = ifft5893;
__m512 dat1892 = ifft5811;
__m512 dat1899 = ifft5895;
__m512 dat1893 = ifft5813;
__m512 dat1900 = ifft5897;
__m512 dat1894 = ifft5815;
__m512 dat1901 = ifft5899;
__m512 dat1895 = ifft5810;
__m512 dat1902 = ifft5894;
__m512 dat1896 = ifft5812;
__m512 dat1903 = ifft5896;
__m512 dat1897 = ifft5814;
__m512 dat1904 = ifft5898;
(void)ifft5816;
(void)ifft5900;
__m512i pm157 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack275 = _mm512_permutex2var_ps(dat1891, pm157, dat1898);
__m512i pm158 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack276 = _mm512_permutex2var_ps(dat1891, pm158, dat1898);
__m512 pack277 = _mm512_permutex2var_ps(dat1892, pm157, dat1899);
__m512 pack278 = _mm512_permutex2var_ps(dat1892, pm158, dat1899);
__m512 pack279 = _mm512_permutex2var_ps(dat1893, pm157, dat1900);
__m512 pack280 = _mm512_permutex2var_ps(dat1893, pm158, dat1900);
__m512 pack281 = _mm512_permutex2var_ps(dat1894, pm157, dat1901);
__m512 pack282 = _mm512_permutex2var_ps(dat1894, pm158, dat1901);
__m512 pack283 = _mm512_permutex2var_ps(dat1895, pm157, dat1902);
__m512 pack284 = _mm512_permutex2var_ps(dat1895, pm158, dat1902);
__m512 pack285 = _mm512_permutex2var_ps(dat1896, pm157, dat1903);
__m512 pack286 = _mm512_permutex2var_ps(dat1896, pm158, dat1903);
__m512 pack287 = _mm512_permutex2var_ps(dat1897, pm157, dat1904);
__m512 pack288 = _mm512_permutex2var_ps(dat1897, pm158, dat1904);
pack275 = _mm512_max_ps(_mm512_setzero_ps(), pack275);
pack276 = _mm512_max_ps(_mm512_setzero_ps(), pack276);
pack277 = _mm512_max_ps(_mm512_setzero_ps(), pack277);
pack278 = _mm512_max_ps(_mm512_setzero_ps(), pack278);
pack279 = _mm512_max_ps(_mm512_setzero_ps(), pack279);
pack280 = _mm512_max_ps(_mm512_setzero_ps(), pack280);
pack281 = _mm512_max_ps(_mm512_setzero_ps(), pack281);
pack282 = _mm512_max_ps(_mm512_setzero_ps(), pack282);
pack283 = _mm512_max_ps(_mm512_setzero_ps(), pack283);
pack284 = _mm512_max_ps(_mm512_setzero_ps(), pack284);
pack285 = _mm512_max_ps(_mm512_setzero_ps(), pack285);
pack286 = _mm512_max_ps(_mm512_setzero_ps(), pack286);
pack287 = _mm512_max_ps(_mm512_setzero_ps(), pack287);
pack288 = _mm512_max_ps(_mm512_setzero_ps(), pack288);
_mm512_mask_storeu_ps(datPtr19+0+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack275);
_mm512_mask_storeu_ps(datPtr19+3136+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack276);
_mm512_mask_storeu_ps(datPtr19+112+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack277);
_mm512_mask_storeu_ps(datPtr19+3248+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack278);
_mm512_mask_storeu_ps(datPtr19+224+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack279);
_mm512_mask_storeu_ps(datPtr19+3360+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack280);
_mm512_mask_storeu_ps(datPtr19+336+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack281);
_mm512_mask_storeu_ps(datPtr19+3472+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack282);
_mm512_mask_storeu_ps(datPtr19+448+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack283);
_mm512_mask_storeu_ps(datPtr19+3584+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack284);
_mm512_mask_storeu_ps(datPtr19+560+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack285);
_mm512_mask_storeu_ps(datPtr19+3696+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack286);
_mm512_mask_storeu_ps(datPtr19+672+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack287);
_mm512_mask_storeu_ps(datPtr19+3808+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t35, 16383, pack288);
ptrdiff_t t36 = 0;
__m512 sfRe405 = _mm512_loadu_ps(sfPtr10+256+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm405 = _mm512_loadu_ps(sfPtr10+320+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe409 = _mm512_loadu_ps(sfPtr10+384+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm409 = _mm512_loadu_ps(sfPtr10+448+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe406 = _mm512_loadu_ps(sfPtr10+8448+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm406 = _mm512_loadu_ps(sfPtr10+8512+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe410 = _mm512_loadu_ps(sfPtr10+8576+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm410 = _mm512_loadu_ps(sfPtr10+8640+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe407 = _mm512_loadu_ps(sfPtr10+16640+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm407 = _mm512_loadu_ps(sfPtr10+16704+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe411 = _mm512_loadu_ps(sfPtr10+16768+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm411 = _mm512_loadu_ps(sfPtr10+16832+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe408 = _mm512_loadu_ps(sfPtr10+24832+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm408 = _mm512_loadu_ps(sfPtr10+24896+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfRe412 = _mm512_loadu_ps(sfPtr10+24960+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512 sfIm412 = _mm512_loadu_ps(sfPtr10+25024+32768*i41+3072*j34+1536*k114+768*r21+256*t36);
__m512i ifft5901 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft5902 = _mm512_permutexvar_ps(ifft5901, sfRe405);
__m512 ifft5993 = _mm512_permutexvar_ps(ifft5901, sfRe409);
__m512i ifft5903 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft5904 = _mm512_permutexvar_ps(ifft5903, sfRe405);
__m512 ifft5994 = _mm512_permutexvar_ps(ifft5903, sfRe409);
__m512 ifft5905 = _mm512_permutexvar_ps(ifft5901, sfIm405);
__m512 ifft5995 = _mm512_permutexvar_ps(ifft5901, sfIm409);
__m512 ifft5906 = _mm512_permutexvar_ps(ifft5903, sfIm405);
__m512 ifft5996 = _mm512_permutexvar_ps(ifft5903, sfIm409);
__m512 ifft5907 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft5908 = _mm512_mask_fmadd_ps(ifft5906, 65021, ifft5907, ifft5902);
__m512 ifft5997 = _mm512_mask_fmadd_ps(ifft5996, 65021, ifft5907, ifft5993);
__m512 ifft5909 = _mm512_mask_fnmadd_ps(ifft5905, 65021, ifft5907, ifft5904);
__m512 ifft5998 = _mm512_mask_fnmadd_ps(ifft5995, 65021, ifft5907, ifft5994);
__m512 ifft5910 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft5911 = _mm512_fmadd_ps(ifft5908, ifft5910, _mm512_shuffle_ps(ifft5908, ifft5908, 177));
__m512 ifft5999 = _mm512_fmadd_ps(ifft5997, ifft5910, _mm512_shuffle_ps(ifft5997, ifft5997, 177));
__m512 ifft5912 = _mm512_fmadd_ps(ifft5909, ifft5910, _mm512_shuffle_ps(ifft5909, ifft5909, 177));
__m512 ifft6000 = _mm512_fmadd_ps(ifft5998, ifft5910, _mm512_shuffle_ps(ifft5998, ifft5998, 177));
__m512 ifft5913 = _mm512_fmadd_ps(sfRe406, ifft5910, _mm512_shuffle_ps(sfRe406, sfRe406, 177));
__m512 ifft6001 = _mm512_fmadd_ps(sfRe410, ifft5910, _mm512_shuffle_ps(sfRe410, sfRe410, 177));
__m512 ifft5914 = _mm512_fmadd_ps(sfIm406, ifft5910, _mm512_shuffle_ps(sfIm406, sfIm406, 177));
__m512 ifft6002 = _mm512_fmadd_ps(sfIm410, ifft5910, _mm512_shuffle_ps(sfIm410, sfIm410, 177));
__m512 ifft5915 = _mm512_fmadd_ps(sfRe407, ifft5910, _mm512_shuffle_ps(sfRe407, sfRe407, 177));
__m512 ifft6003 = _mm512_fmadd_ps(sfRe411, ifft5910, _mm512_shuffle_ps(sfRe411, sfRe411, 177));
__m512 ifft5916 = _mm512_fmadd_ps(sfIm407, ifft5910, _mm512_shuffle_ps(sfIm407, sfIm407, 177));
__m512 ifft6004 = _mm512_fmadd_ps(sfIm411, ifft5910, _mm512_shuffle_ps(sfIm411, sfIm411, 177));
__m512 ifft5917 = _mm512_fmadd_ps(sfRe408, ifft5910, _mm512_shuffle_ps(sfRe408, sfRe408, 177));
__m512 ifft6005 = _mm512_fmadd_ps(sfRe412, ifft5910, _mm512_shuffle_ps(sfRe412, sfRe412, 177));
__m512 ifft5918 = _mm512_fmadd_ps(sfIm408, ifft5910, _mm512_shuffle_ps(sfIm408, sfIm408, 177));
__m512 ifft6006 = _mm512_fmadd_ps(sfIm412, ifft5910, _mm512_shuffle_ps(sfIm412, sfIm412, 177));
__m512 ifft5919 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft5920 = _mm512_mul_ps(ifft5911, ifft5919);
__m512 ifft6007 = _mm512_mul_ps(ifft5999, ifft5919);
__m512 ifft5921 = _mm512_mul_ps(ifft5912, ifft5919);
__m512 ifft6008 = _mm512_mul_ps(ifft6000, ifft5919);
__m512 ifft5922 = _mm512_mul_ps(ifft5913, ifft5919);
__m512 ifft6009 = _mm512_mul_ps(ifft6001, ifft5919);
__m512 ifft5923 = _mm512_mul_ps(ifft5914, ifft5919);
__m512 ifft6010 = _mm512_mul_ps(ifft6002, ifft5919);
__m512 ifft5924 = _mm512_mul_ps(ifft5915, ifft5919);
__m512 ifft6011 = _mm512_mul_ps(ifft6003, ifft5919);
__m512 ifft5925 = _mm512_mul_ps(ifft5916, ifft5919);
__m512 ifft6012 = _mm512_mul_ps(ifft6004, ifft5919);
__m512 ifft5926 = _mm512_mul_ps(ifft5917, ifft5919);
__m512 ifft6013 = _mm512_mul_ps(ifft6005, ifft5919);
__m512 ifft5927 = _mm512_mul_ps(ifft5918, ifft5919);
__m512 ifft6014 = _mm512_mul_ps(ifft6006, ifft5919);
__m512 ifft5928 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft5929 = _mm512_fnmadd_ps(ifft5912, ifft5928, ifft5920);
__m512 ifft6015 = _mm512_fnmadd_ps(ifft6000, ifft5928, ifft6007);
__m512 ifft5930 = _mm512_fmadd_ps(ifft5911, ifft5928, ifft5921);
__m512 ifft6016 = _mm512_fmadd_ps(ifft5999, ifft5928, ifft6008);
__m512 ifft5931 = _mm512_fnmadd_ps(ifft5914, ifft5928, ifft5922);
__m512 ifft6017 = _mm512_fnmadd_ps(ifft6002, ifft5928, ifft6009);
__m512 ifft5932 = _mm512_fmadd_ps(ifft5913, ifft5928, ifft5923);
__m512 ifft6018 = _mm512_fmadd_ps(ifft6001, ifft5928, ifft6010);
__m512 ifft5933 = _mm512_fnmadd_ps(ifft5916, ifft5928, ifft5924);
__m512 ifft6019 = _mm512_fnmadd_ps(ifft6004, ifft5928, ifft6011);
__m512 ifft5934 = _mm512_fmadd_ps(ifft5915, ifft5928, ifft5925);
__m512 ifft6020 = _mm512_fmadd_ps(ifft6003, ifft5928, ifft6012);
__m512 ifft5935 = _mm512_fnmadd_ps(ifft5918, ifft5928, ifft5926);
__m512 ifft6021 = _mm512_fnmadd_ps(ifft6006, ifft5928, ifft6013);
__m512 ifft5936 = _mm512_fmadd_ps(ifft5917, ifft5928, ifft5927);
__m512 ifft6022 = _mm512_fmadd_ps(ifft6005, ifft5928, ifft6014);
__m512 ifft5937 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft5938 = _mm512_fmadd_ps(ifft5929, ifft5937, _mm512_shuffle_ps(ifft5929, ifft5929, 78));
__m512 ifft6023 = _mm512_fmadd_ps(ifft6015, ifft5937, _mm512_shuffle_ps(ifft6015, ifft6015, 78));
__m512 ifft5939 = _mm512_fmadd_ps(ifft5930, ifft5937, _mm512_shuffle_ps(ifft5930, ifft5930, 78));
__m512 ifft6024 = _mm512_fmadd_ps(ifft6016, ifft5937, _mm512_shuffle_ps(ifft6016, ifft6016, 78));
__m512 ifft5940 = _mm512_fmadd_ps(ifft5931, ifft5937, _mm512_shuffle_ps(ifft5931, ifft5931, 78));
__m512 ifft6025 = _mm512_fmadd_ps(ifft6017, ifft5937, _mm512_shuffle_ps(ifft6017, ifft6017, 78));
__m512 ifft5941 = _mm512_fmadd_ps(ifft5932, ifft5937, _mm512_shuffle_ps(ifft5932, ifft5932, 78));
__m512 ifft6026 = _mm512_fmadd_ps(ifft6018, ifft5937, _mm512_shuffle_ps(ifft6018, ifft6018, 78));
__m512 ifft5942 = _mm512_fmadd_ps(ifft5933, ifft5937, _mm512_shuffle_ps(ifft5933, ifft5933, 78));
__m512 ifft6027 = _mm512_fmadd_ps(ifft6019, ifft5937, _mm512_shuffle_ps(ifft6019, ifft6019, 78));
__m512 ifft5943 = _mm512_fmadd_ps(ifft5934, ifft5937, _mm512_shuffle_ps(ifft5934, ifft5934, 78));
__m512 ifft6028 = _mm512_fmadd_ps(ifft6020, ifft5937, _mm512_shuffle_ps(ifft6020, ifft6020, 78));
__m512 ifft5944 = _mm512_fmadd_ps(ifft5935, ifft5937, _mm512_shuffle_ps(ifft5935, ifft5935, 78));
__m512 ifft6029 = _mm512_fmadd_ps(ifft6021, ifft5937, _mm512_shuffle_ps(ifft6021, ifft6021, 78));
__m512 ifft5945 = _mm512_fmadd_ps(ifft5936, ifft5937, _mm512_shuffle_ps(ifft5936, ifft5936, 78));
__m512 ifft6030 = _mm512_fmadd_ps(ifft6022, ifft5937, _mm512_shuffle_ps(ifft6022, ifft6022, 78));
__m512 ifft5946 = _mm512_mask_sub_ps(ifft5938, 49344, _mm512_setzero_ps(), ifft5939);
__m512 ifft6031 = _mm512_mask_sub_ps(ifft6023, 49344, _mm512_setzero_ps(), ifft6024);
__m512 ifft5947 = _mm512_mask_mov_ps(ifft5939, 49344, ifft5938);
__m512 ifft6032 = _mm512_mask_mov_ps(ifft6024, 49344, ifft6023);
__m512 ifft5948 = _mm512_mask_sub_ps(ifft5940, 49344, _mm512_setzero_ps(), ifft5941);
__m512 ifft6033 = _mm512_mask_sub_ps(ifft6025, 49344, _mm512_setzero_ps(), ifft6026);
__m512 ifft5949 = _mm512_mask_mov_ps(ifft5941, 49344, ifft5940);
__m512 ifft6034 = _mm512_mask_mov_ps(ifft6026, 49344, ifft6025);
__m512 ifft5950 = _mm512_mask_sub_ps(ifft5942, 49344, _mm512_setzero_ps(), ifft5943);
__m512 ifft6035 = _mm512_mask_sub_ps(ifft6027, 49344, _mm512_setzero_ps(), ifft6028);
__m512 ifft5951 = _mm512_mask_mov_ps(ifft5943, 49344, ifft5942);
__m512 ifft6036 = _mm512_mask_mov_ps(ifft6028, 49344, ifft6027);
__m512 ifft5952 = _mm512_mask_sub_ps(ifft5944, 49344, _mm512_setzero_ps(), ifft5945);
__m512 ifft6037 = _mm512_mask_sub_ps(ifft6029, 49344, _mm512_setzero_ps(), ifft6030);
__m512 ifft5953 = _mm512_mask_mov_ps(ifft5945, 49344, ifft5944);
__m512 ifft6038 = _mm512_mask_mov_ps(ifft6030, 49344, ifft6029);
__m512 ifft5954 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft5955 = _mm512_fmadd_ps(ifft5946, ifft5954, _mm512_shuffle_f32x4(ifft5946, ifft5946, 177));
__m512 ifft6039 = _mm512_fmadd_ps(ifft6031, ifft5954, _mm512_shuffle_f32x4(ifft6031, ifft6031, 177));
__m512 ifft5956 = _mm512_fmadd_ps(ifft5947, ifft5954, _mm512_shuffle_f32x4(ifft5947, ifft5947, 177));
__m512 ifft6040 = _mm512_fmadd_ps(ifft6032, ifft5954, _mm512_shuffle_f32x4(ifft6032, ifft6032, 177));
__m512 ifft5957 = _mm512_fmadd_ps(ifft5948, ifft5954, _mm512_shuffle_f32x4(ifft5948, ifft5948, 177));
__m512 ifft6041 = _mm512_fmadd_ps(ifft6033, ifft5954, _mm512_shuffle_f32x4(ifft6033, ifft6033, 177));
__m512 ifft5958 = _mm512_fmadd_ps(ifft5949, ifft5954, _mm512_shuffle_f32x4(ifft5949, ifft5949, 177));
__m512 ifft6042 = _mm512_fmadd_ps(ifft6034, ifft5954, _mm512_shuffle_f32x4(ifft6034, ifft6034, 177));
__m512 ifft5959 = _mm512_fmadd_ps(ifft5950, ifft5954, _mm512_shuffle_f32x4(ifft5950, ifft5950, 177));
__m512 ifft6043 = _mm512_fmadd_ps(ifft6035, ifft5954, _mm512_shuffle_f32x4(ifft6035, ifft6035, 177));
__m512 ifft5960 = _mm512_fnmsub_ps(ifft5951, ifft5954, _mm512_shuffle_f32x4(ifft5951, ifft5951, 177));
__m512 ifft6044 = _mm512_fnmsub_ps(ifft6036, ifft5954, _mm512_shuffle_f32x4(ifft6036, ifft6036, 177));
__m512 ifft5961 = _mm512_fmadd_ps(ifft5952, ifft5954, _mm512_shuffle_f32x4(ifft5952, ifft5952, 177));
__m512 ifft6045 = _mm512_fmadd_ps(ifft6037, ifft5954, _mm512_shuffle_f32x4(ifft6037, ifft6037, 177));
__m512 ifft5962 = _mm512_fmadd_ps(ifft5953, ifft5954, _mm512_shuffle_f32x4(ifft5953, ifft5953, 177));
__m512 ifft6046 = _mm512_fmadd_ps(ifft6038, ifft5954, _mm512_shuffle_f32x4(ifft6038, ifft6038, 177));
__m512 ifft5963 = _mm512_add_ps(ifft5955, ifft5956);
__m512 ifft6047 = _mm512_add_ps(ifft6039, ifft6040);
__m512 ifft5964 = _mm512_sub_ps(ifft5955, ifft5956);
__m512 ifft6048 = _mm512_sub_ps(ifft6039, ifft6040);
__m512 ifft5965 = _mm512_sub_ps(ifft5957, ifft5961);
__m512 ifft6049 = _mm512_sub_ps(ifft6041, ifft6045);
__m512 ifft5966 = _mm512_add_ps(ifft5958, ifft5962);
__m512 ifft6050 = _mm512_add_ps(ifft6042, ifft6046);
__m512 ifft5967 = _mm512_add_ps(ifft5957, ifft5961);
__m512 ifft6051 = _mm512_add_ps(ifft6041, ifft6045);
__m512 ifft5968 = _mm512_sub_ps(ifft5958, ifft5962);
__m512 ifft6052 = _mm512_sub_ps(ifft6042, ifft6046);
__m512 ifft5969 = _mm512_mul_ps(ifft5959, _mm512_set1_ps(3.125e-02f));
__m512 ifft6053 = _mm512_mul_ps(ifft6043, _mm512_set1_ps(3.125e-02f));
__m512 ifft5970 = _mm512_mul_ps(ifft5960, _mm512_set1_ps(3.125e-02f));
__m512 ifft6054 = _mm512_mul_ps(ifft6044, _mm512_set1_ps(3.125e-02f));
__m512 ifft5971 = _mm512_fmadd_ps(ifft5963, _mm512_set1_ps(1.5625e-02f), ifft5969);
__m512 ifft6055 = _mm512_fmadd_ps(ifft6047, _mm512_set1_ps(1.5625e-02f), ifft6053);
__m512 ifft5972 = _mm512_fmsub_ps(ifft5963, _mm512_set1_ps(1.5625e-02f), ifft5969);
__m512 ifft6056 = _mm512_fmsub_ps(ifft6047, _mm512_set1_ps(1.5625e-02f), ifft6053);
__m512 ifft5973 = _mm512_fmadd_ps(ifft5964, _mm512_set1_ps(1.5625e-02f), ifft5970);
__m512 ifft6057 = _mm512_fmadd_ps(ifft6048, _mm512_set1_ps(1.5625e-02f), ifft6054);
__m512 ifft5974 = _mm512_fmsub_ps(ifft5964, _mm512_set1_ps(1.5625e-02f), ifft5970);
__m512 ifft6058 = _mm512_fmsub_ps(ifft6048, _mm512_set1_ps(1.5625e-02f), ifft6054);
__m512 ifft5975 = _mm512_add_ps(ifft5965, ifft5966);
__m512 ifft6059 = _mm512_add_ps(ifft6049, ifft6050);
__m512 ifft5976 = _mm512_sub_ps(ifft5965, ifft5966);
__m512 ifft6060 = _mm512_sub_ps(ifft6049, ifft6050);
__m512 ifft5977 = _mm512_fnmadd_ps(ifft5975, _mm512_set1_ps(7.0710677e-01f), ifft5967);
__m512 ifft6061 = _mm512_fnmadd_ps(ifft6059, _mm512_set1_ps(7.0710677e-01f), ifft6051);
__m512 ifft5978 = _mm512_fmadd_ps(ifft5975, _mm512_set1_ps(7.0710677e-01f), ifft5967);
__m512 ifft6062 = _mm512_fmadd_ps(ifft6059, _mm512_set1_ps(7.0710677e-01f), ifft6051);
__m512 ifft5979 = _mm512_fmadd_ps(ifft5976, _mm512_set1_ps(7.0710677e-01f), ifft5968);
__m512 ifft6063 = _mm512_fmadd_ps(ifft6060, _mm512_set1_ps(7.0710677e-01f), ifft6052);
__m512 ifft5980 = _mm512_fmsub_ps(ifft5976, _mm512_set1_ps(7.0710677e-01f), ifft5968);
__m512 ifft6064 = _mm512_fmsub_ps(ifft6060, _mm512_set1_ps(7.0710677e-01f), ifft6052);
__m512 ifft5981 = _mm512_add_ps(ifft5977, ifft5978);
__m512 ifft6065 = _mm512_add_ps(ifft6061, ifft6062);
__m512 ifft5982 = _mm512_sub_ps(ifft5977, ifft5978);
__m512 ifft6066 = _mm512_sub_ps(ifft6061, ifft6062);
__m512 ifft5983 = _mm512_add_ps(ifft5979, ifft5980);
__m512 ifft6067 = _mm512_add_ps(ifft6063, ifft6064);
__m512 ifft5984 = _mm512_sub_ps(ifft5979, ifft5980);
__m512 ifft6068 = _mm512_sub_ps(ifft6063, ifft6064);
__m512 ifft5985 = _mm512_fmadd_ps(ifft5981, _mm512_set1_ps(1.5625e-02f), ifft5971);
__m512 ifft6069 = _mm512_fmadd_ps(ifft6065, _mm512_set1_ps(1.5625e-02f), ifft6055);
__m512 ifft5986 = _mm512_fnmadd_ps(ifft5981, _mm512_set1_ps(1.5625e-02f), ifft5971);
__m512 ifft6070 = _mm512_fnmadd_ps(ifft6065, _mm512_set1_ps(1.5625e-02f), ifft6055);
__m512 ifft5987 = _mm512_fmadd_ps(ifft5983, _mm512_set1_ps(1.5625e-02f), ifft5973);
__m512 ifft6071 = _mm512_fmadd_ps(ifft6067, _mm512_set1_ps(1.5625e-02f), ifft6057);
__m512 ifft5988 = _mm512_fnmadd_ps(ifft5983, _mm512_set1_ps(1.5625e-02f), ifft5973);
__m512 ifft6072 = _mm512_fnmadd_ps(ifft6067, _mm512_set1_ps(1.5625e-02f), ifft6057);
__m512 ifft5989 = _mm512_fnmadd_ps(ifft5984, _mm512_set1_ps(1.5625e-02f), ifft5972);
__m512 ifft6073 = _mm512_fnmadd_ps(ifft6068, _mm512_set1_ps(1.5625e-02f), ifft6056);
__m512 ifft5990 = _mm512_fmadd_ps(ifft5984, _mm512_set1_ps(1.5625e-02f), ifft5972);
__m512 ifft6074 = _mm512_fmadd_ps(ifft6068, _mm512_set1_ps(1.5625e-02f), ifft6056);
__m512 ifft5991 = _mm512_fmadd_ps(ifft5982, _mm512_set1_ps(1.5625e-02f), ifft5974);
__m512 ifft6075 = _mm512_fmadd_ps(ifft6066, _mm512_set1_ps(1.5625e-02f), ifft6058);
__m512 ifft5992 = _mm512_fnmadd_ps(ifft5982, _mm512_set1_ps(1.5625e-02f), ifft5974);
__m512 ifft6076 = _mm512_fnmadd_ps(ifft6066, _mm512_set1_ps(1.5625e-02f), ifft6058);
// Output phase for the results computed just above.
// Fourteen of the sixteen final values are renamed to dat1905..dat1918
// (dat1905..dat1911 from the ifft598x/ifft599x chain, dat1912..dat1918 from
// the ifft606x/ifft607x chain). ifft5992 and ifft6076 are cast to void, i.e.
// they are intentionally left unused.
__m512 dat1905 = ifft5985;
__m512 dat1912 = ifft6069;
__m512 dat1906 = ifft5987;
__m512 dat1913 = ifft6071;
__m512 dat1907 = ifft5989;
__m512 dat1914 = ifft6073;
__m512 dat1908 = ifft5991;
__m512 dat1915 = ifft6075;
__m512 dat1909 = ifft5986;
__m512 dat1916 = ifft6070;
__m512 dat1910 = ifft5988;
__m512 dat1917 = ifft6072;
__m512 dat1911 = ifft5990;
__m512 dat1918 = ifft6074;
(void)ifft5992;
(void)ifft6076;
// Two-source permutes: index values 0..15 select from the first operand,
// 16..31 from the second. _mm512_set_epi32 lists lanes from 15 down to 0.
// pm159: lanes 0..6 <- first[0..6], lanes 7..13 <- second[0..6].
// pm160: lanes 0..6 <- second[8..14], lanes 7..13 <- first[8..14].
// Lanes 14 and 15 of each result are never stored (see the store mask below).
__m512i pm159 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack289 = _mm512_permutex2var_ps(dat1905, pm159, dat1912);
__m512i pm160 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack290 = _mm512_permutex2var_ps(dat1905, pm160, dat1912);
__m512 pack291 = _mm512_permutex2var_ps(dat1906, pm159, dat1913);
__m512 pack292 = _mm512_permutex2var_ps(dat1906, pm160, dat1913);
__m512 pack293 = _mm512_permutex2var_ps(dat1907, pm159, dat1914);
__m512 pack294 = _mm512_permutex2var_ps(dat1907, pm160, dat1914);
__m512 pack295 = _mm512_permutex2var_ps(dat1908, pm159, dat1915);
__m512 pack296 = _mm512_permutex2var_ps(dat1908, pm160, dat1915);
__m512 pack297 = _mm512_permutex2var_ps(dat1909, pm159, dat1916);
__m512 pack298 = _mm512_permutex2var_ps(dat1909, pm160, dat1916);
__m512 pack299 = _mm512_permutex2var_ps(dat1910, pm159, dat1917);
__m512 pack300 = _mm512_permutex2var_ps(dat1910, pm160, dat1917);
__m512 pack301 = _mm512_permutex2var_ps(dat1911, pm159, dat1918);
__m512 pack302 = _mm512_permutex2var_ps(dat1911, pm160, dat1918);
// Element-wise max against zero (ReLU) on every packed vector.
pack289 = _mm512_max_ps(_mm512_setzero_ps(), pack289);
pack290 = _mm512_max_ps(_mm512_setzero_ps(), pack290);
pack291 = _mm512_max_ps(_mm512_setzero_ps(), pack291);
pack292 = _mm512_max_ps(_mm512_setzero_ps(), pack292);
pack293 = _mm512_max_ps(_mm512_setzero_ps(), pack293);
pack294 = _mm512_max_ps(_mm512_setzero_ps(), pack294);
pack295 = _mm512_max_ps(_mm512_setzero_ps(), pack295);
pack296 = _mm512_max_ps(_mm512_setzero_ps(), pack296);
pack297 = _mm512_max_ps(_mm512_setzero_ps(), pack297);
pack298 = _mm512_max_ps(_mm512_setzero_ps(), pack298);
pack299 = _mm512_max_ps(_mm512_setzero_ps(), pack299);
pack300 = _mm512_max_ps(_mm512_setzero_ps(), pack300);
pack301 = _mm512_max_ps(_mm512_setzero_ps(), pack301);
pack302 = _mm512_max_ps(_mm512_setzero_ps(), pack302);
// Masked unaligned stores: mask 16383 (0x3FFF) writes lanes 0..13 only.
// pm159-packed vectors go to constant offsets 56, 168, ..., 728 (step 112);
// pm160-packed vectors go to the same offsets plus 3136.
// NOTE(review): the offsets are in units of datPtr19's pointee type, which is
// declared outside this excerpt -- confirm before interpreting them as rows
// or channels.
_mm512_mask_storeu_ps(datPtr19+56+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack289);
_mm512_mask_storeu_ps(datPtr19+3192+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack290);
_mm512_mask_storeu_ps(datPtr19+168+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack291);
_mm512_mask_storeu_ps(datPtr19+3304+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack292);
_mm512_mask_storeu_ps(datPtr19+280+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack293);
_mm512_mask_storeu_ps(datPtr19+3416+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack294);
_mm512_mask_storeu_ps(datPtr19+392+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack295);
_mm512_mask_storeu_ps(datPtr19+3528+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack296);
_mm512_mask_storeu_ps(datPtr19+504+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack297);
_mm512_mask_storeu_ps(datPtr19+3640+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack298);
_mm512_mask_storeu_ps(datPtr19+616+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack299);
_mm512_mask_storeu_ps(datPtr19+3752+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack300);
_mm512_mask_storeu_ps(datPtr19+728+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack301);
_mm512_mask_storeu_ps(datPtr19+3864+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+56*t36, 16383, pack302);
// Second straight-line pass over the same (i41, j34, k114, r21) position.
// t37 is fixed at 0, so the 256*t37 term contributes nothing to the addresses.
// Sixteen unaligned 512-bit loads from sfPtr10, arranged as four groups whose
// constant offsets start at 512, 8704, 16896 and 25088 (8192 apart); inside
// each group the four loads are 64 apart and are named Re, Im, Re, Im.
// NOTE(review): presumably Re/Im hold real and imaginary parts of
// frequency-domain sums (suggested by the sfRe/sfIm names only) -- confirm
// against the code that fills sfPtr10.
ptrdiff_t t37 = 0;
__m512 sfRe413 = _mm512_loadu_ps(sfPtr10+512+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm413 = _mm512_loadu_ps(sfPtr10+576+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe417 = _mm512_loadu_ps(sfPtr10+640+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm417 = _mm512_loadu_ps(sfPtr10+704+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe414 = _mm512_loadu_ps(sfPtr10+8704+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm414 = _mm512_loadu_ps(sfPtr10+8768+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe418 = _mm512_loadu_ps(sfPtr10+8832+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm418 = _mm512_loadu_ps(sfPtr10+8896+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe415 = _mm512_loadu_ps(sfPtr10+16896+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm415 = _mm512_loadu_ps(sfPtr10+16960+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe419 = _mm512_loadu_ps(sfPtr10+17024+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm419 = _mm512_loadu_ps(sfPtr10+17088+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe416 = _mm512_loadu_ps(sfPtr10+25088+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm416 = _mm512_loadu_ps(sfPtr10+25152+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfRe420 = _mm512_loadu_ps(sfPtr10+25216+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
__m512 sfIm420 = _mm512_loadu_ps(sfPtr10+25280+32768*i41+3072*j34+1536*k114+768*r21+256*t37);
// Pre-processing applied only to the first Re/Im pair of each chain
// (sfRe413/sfIm413 and sfRe417/sfIm417); the other loaded pairs skip it.
// _mm512_set_epi32 / _mm512_set_ps list lanes from 15 down to 0.
// ifft6077 gathers source lanes (low to high) 0,1,2,2,4,6,6,4 in the lower
// half and the same pattern +8 in the upper half.
__m512i ifft6077 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6078 = _mm512_permutexvar_ps(ifft6077, sfRe413);
__m512 ifft6169 = _mm512_permutexvar_ps(ifft6077, sfRe417);
// ifft6079 gathers source lanes (low to high) 1,0,3,3,5,7,7,5, again
// repeated +8 for the upper half.
__m512i ifft6079 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6080 = _mm512_permutexvar_ps(ifft6079, sfRe413);
__m512 ifft6170 = _mm512_permutexvar_ps(ifft6079, sfRe417);
__m512 ifft6081 = _mm512_permutexvar_ps(ifft6077, sfIm413);
__m512 ifft6171 = _mm512_permutexvar_ps(ifft6077, sfIm417);
__m512 ifft6082 = _mm512_permutexvar_ps(ifft6079, sfIm413);
__m512 ifft6172 = _mm512_permutexvar_ps(ifft6079, sfIm417);
// Per-lane coefficients (low to high): 0, 0, -1, +1, -1, +1, -1, +1, twice.
__m512 ifft6083 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
// Mask 65021 (0xFDFD) selects every lane except 1 and 9; those two lanes keep
// the first operand (the permuted Im value) unchanged.
// Selected lanes: fmadd  ->  Im(perm ifft6079) * coeff + Re(perm ifft6077)
//                 fnmadd -> -Im(perm ifft6077) * coeff + Re(perm ifft6079)
__m512 ifft6084 = _mm512_mask_fmadd_ps(ifft6082, 65021, ifft6083, ifft6078);
__m512 ifft6173 = _mm512_mask_fmadd_ps(ifft6172, 65021, ifft6083, ifft6169);
__m512 ifft6085 = _mm512_mask_fnmadd_ps(ifft6081, 65021, ifft6083, ifft6080);
__m512 ifft6174 = _mm512_mask_fnmadd_ps(ifft6171, 65021, ifft6083, ifft6170);
// Sum/difference of adjacent elements inside each register.
// _mm512_shuffle_ps(x, x, 177) swaps elements 0<->1 and 2<->3 within every
// 128-bit lane. ifft6086 is +1 in even lanes and -1 in odd lanes, so
// x*ifft6086 + swapped(x) yields x[2n]+x[2n+1] in even lanes and
// x[2n]-x[2n+1] in odd lanes.
// Applied to the two pre-processed pairs and directly to the remaining loaded
// Re/Im registers of both chains.
__m512 ifft6086 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6087 = _mm512_fmadd_ps(ifft6084, ifft6086, _mm512_shuffle_ps(ifft6084, ifft6084, 177));
__m512 ifft6175 = _mm512_fmadd_ps(ifft6173, ifft6086, _mm512_shuffle_ps(ifft6173, ifft6173, 177));
__m512 ifft6088 = _mm512_fmadd_ps(ifft6085, ifft6086, _mm512_shuffle_ps(ifft6085, ifft6085, 177));
__m512 ifft6176 = _mm512_fmadd_ps(ifft6174, ifft6086, _mm512_shuffle_ps(ifft6174, ifft6174, 177));
__m512 ifft6089 = _mm512_fmadd_ps(sfRe414, ifft6086, _mm512_shuffle_ps(sfRe414, sfRe414, 177));
__m512 ifft6177 = _mm512_fmadd_ps(sfRe418, ifft6086, _mm512_shuffle_ps(sfRe418, sfRe418, 177));
__m512 ifft6090 = _mm512_fmadd_ps(sfIm414, ifft6086, _mm512_shuffle_ps(sfIm414, sfIm414, 177));
__m512 ifft6178 = _mm512_fmadd_ps(sfIm418, ifft6086, _mm512_shuffle_ps(sfIm418, sfIm418, 177));
__m512 ifft6091 = _mm512_fmadd_ps(sfRe415, ifft6086, _mm512_shuffle_ps(sfRe415, sfRe415, 177));
__m512 ifft6179 = _mm512_fmadd_ps(sfRe419, ifft6086, _mm512_shuffle_ps(sfRe419, sfRe419, 177));
__m512 ifft6092 = _mm512_fmadd_ps(sfIm415, ifft6086, _mm512_shuffle_ps(sfIm415, sfIm415, 177));
__m512 ifft6180 = _mm512_fmadd_ps(sfIm419, ifft6086, _mm512_shuffle_ps(sfIm419, sfIm419, 177));
__m512 ifft6093 = _mm512_fmadd_ps(sfRe416, ifft6086, _mm512_shuffle_ps(sfRe416, sfRe416, 177));
__m512 ifft6181 = _mm512_fmadd_ps(sfRe420, ifft6086, _mm512_shuffle_ps(sfRe420, sfRe420, 177));
__m512 ifft6094 = _mm512_fmadd_ps(sfIm416, ifft6086, _mm512_shuffle_ps(sfIm416, sfIm416, 177));
__m512 ifft6182 = _mm512_fmadd_ps(sfIm420, ifft6086, _mm512_shuffle_ps(sfIm420, sfIm420, 177));
// Per-lane coefficient multiply on each (a, b) register pair:
//   a' = a*C - b*S      (mul by C, then fnmadd with S)
//   b' = a*S + b*C      (mul by C, then fmadd with S)
// C = ifft6095 and S = ifft6104. Per lane, low to high, repeated for both
// 8-lane halves, (C, S) is:
//   lanes 0,1,2,4,6: (1, 0)   lane 3: (0, 1)
//   lane 5: (0.70710677, 0.70710677)   lane 7: (-0.70710677, 0.70710677)
// NOTE(review): this has the shape of a complex multiply of (a + i*b) by
// (C + i*S), presumably FFT twiddle factors -- inferred from the arithmetic
// and the ifft naming, not stated anywhere in this excerpt.
__m512 ifft6095 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6096 = _mm512_mul_ps(ifft6087, ifft6095);
__m512 ifft6183 = _mm512_mul_ps(ifft6175, ifft6095);
__m512 ifft6097 = _mm512_mul_ps(ifft6088, ifft6095);
__m512 ifft6184 = _mm512_mul_ps(ifft6176, ifft6095);
__m512 ifft6098 = _mm512_mul_ps(ifft6089, ifft6095);
__m512 ifft6185 = _mm512_mul_ps(ifft6177, ifft6095);
__m512 ifft6099 = _mm512_mul_ps(ifft6090, ifft6095);
__m512 ifft6186 = _mm512_mul_ps(ifft6178, ifft6095);
__m512 ifft6100 = _mm512_mul_ps(ifft6091, ifft6095);
__m512 ifft6187 = _mm512_mul_ps(ifft6179, ifft6095);
__m512 ifft6101 = _mm512_mul_ps(ifft6092, ifft6095);
__m512 ifft6188 = _mm512_mul_ps(ifft6180, ifft6095);
__m512 ifft6102 = _mm512_mul_ps(ifft6093, ifft6095);
__m512 ifft6189 = _mm512_mul_ps(ifft6181, ifft6095);
__m512 ifft6103 = _mm512_mul_ps(ifft6094, ifft6095);
__m512 ifft6190 = _mm512_mul_ps(ifft6182, ifft6095);
__m512 ifft6104 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
// Cross terms: each result combines the C-scaled value with the partner
// register scaled by S.
__m512 ifft6105 = _mm512_fnmadd_ps(ifft6088, ifft6104, ifft6096);
__m512 ifft6191 = _mm512_fnmadd_ps(ifft6176, ifft6104, ifft6183);
__m512 ifft6106 = _mm512_fmadd_ps(ifft6087, ifft6104, ifft6097);
__m512 ifft6192 = _mm512_fmadd_ps(ifft6175, ifft6104, ifft6184);
__m512 ifft6107 = _mm512_fnmadd_ps(ifft6090, ifft6104, ifft6098);
__m512 ifft6193 = _mm512_fnmadd_ps(ifft6178, ifft6104, ifft6185);
__m512 ifft6108 = _mm512_fmadd_ps(ifft6089, ifft6104, ifft6099);
__m512 ifft6194 = _mm512_fmadd_ps(ifft6177, ifft6104, ifft6186);
__m512 ifft6109 = _mm512_fnmadd_ps(ifft6092, ifft6104, ifft6100);
__m512 ifft6195 = _mm512_fnmadd_ps(ifft6180, ifft6104, ifft6187);
__m512 ifft6110 = _mm512_fmadd_ps(ifft6091, ifft6104, ifft6101);
__m512 ifft6196 = _mm512_fmadd_ps(ifft6179, ifft6104, ifft6188);
__m512 ifft6111 = _mm512_fnmadd_ps(ifft6094, ifft6104, ifft6102);
__m512 ifft6197 = _mm512_fnmadd_ps(ifft6182, ifft6104, ifft6189);
__m512 ifft6112 = _mm512_fmadd_ps(ifft6093, ifft6104, ifft6103);
__m512 ifft6198 = _mm512_fmadd_ps(ifft6181, ifft6104, ifft6190);
// Sum/difference at element distance 2 inside each 128-bit lane.
// _mm512_shuffle_ps(x, x, 78) swaps the low and high 64-bit halves of every
// 128-bit lane (elements 0,1 <-> 2,3). ifft6113 is +1 on elements 0,1 and -1
// on elements 2,3, so x*ifft6113 + swapped(x) gives x[k]+x[k+2] in the low
// half and x[k]-x[k+2] in the high half of each 128-bit lane.
__m512 ifft6113 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6114 = _mm512_fmadd_ps(ifft6105, ifft6113, _mm512_shuffle_ps(ifft6105, ifft6105, 78));
__m512 ifft6199 = _mm512_fmadd_ps(ifft6191, ifft6113, _mm512_shuffle_ps(ifft6191, ifft6191, 78));
__m512 ifft6115 = _mm512_fmadd_ps(ifft6106, ifft6113, _mm512_shuffle_ps(ifft6106, ifft6106, 78));
__m512 ifft6200 = _mm512_fmadd_ps(ifft6192, ifft6113, _mm512_shuffle_ps(ifft6192, ifft6192, 78));
__m512 ifft6116 = _mm512_fmadd_ps(ifft6107, ifft6113, _mm512_shuffle_ps(ifft6107, ifft6107, 78));
__m512 ifft6201 = _mm512_fmadd_ps(ifft6193, ifft6113, _mm512_shuffle_ps(ifft6193, ifft6193, 78));
__m512 ifft6117 = _mm512_fmadd_ps(ifft6108, ifft6113, _mm512_shuffle_ps(ifft6108, ifft6108, 78));
__m512 ifft6202 = _mm512_fmadd_ps(ifft6194, ifft6113, _mm512_shuffle_ps(ifft6194, ifft6194, 78));
__m512 ifft6118 = _mm512_fmadd_ps(ifft6109, ifft6113, _mm512_shuffle_ps(ifft6109, ifft6109, 78));
__m512 ifft6203 = _mm512_fmadd_ps(ifft6195, ifft6113, _mm512_shuffle_ps(ifft6195, ifft6195, 78));
__m512 ifft6119 = _mm512_fmadd_ps(ifft6110, ifft6113, _mm512_shuffle_ps(ifft6110, ifft6110, 78));
__m512 ifft6204 = _mm512_fmadd_ps(ifft6196, ifft6113, _mm512_shuffle_ps(ifft6196, ifft6196, 78));
__m512 ifft6120 = _mm512_fmadd_ps(ifft6111, ifft6113, _mm512_shuffle_ps(ifft6111, ifft6111, 78));
__m512 ifft6205 = _mm512_fmadd_ps(ifft6197, ifft6113, _mm512_shuffle_ps(ifft6197, ifft6197, 78));
__m512 ifft6121 = _mm512_fmadd_ps(ifft6112, ifft6113, _mm512_shuffle_ps(ifft6112, ifft6112, 78));
__m512 ifft6206 = _mm512_fmadd_ps(ifft6198, ifft6113, _mm512_shuffle_ps(ifft6198, ifft6198, 78));
// Masked fix-up on lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0).
// For each register pair (a, b), in the masked lanes:
//   new a = 0 - b     (mask_sub with a zero minuend)
//   new b = a         (mask_mov)
// All other lanes pass a and b through unchanged.
// NOTE(review): if (a, b) are read as (real, imaginary), (a, b) -> (-b, a) is
// a multiplication by i; that reading is an inference from the arithmetic.
__m512 ifft6122 = _mm512_mask_sub_ps(ifft6114, 49344, _mm512_setzero_ps(), ifft6115);
__m512 ifft6207 = _mm512_mask_sub_ps(ifft6199, 49344, _mm512_setzero_ps(), ifft6200);
__m512 ifft6123 = _mm512_mask_mov_ps(ifft6115, 49344, ifft6114);
__m512 ifft6208 = _mm512_mask_mov_ps(ifft6200, 49344, ifft6199);
__m512 ifft6124 = _mm512_mask_sub_ps(ifft6116, 49344, _mm512_setzero_ps(), ifft6117);
__m512 ifft6209 = _mm512_mask_sub_ps(ifft6201, 49344, _mm512_setzero_ps(), ifft6202);
__m512 ifft6125 = _mm512_mask_mov_ps(ifft6117, 49344, ifft6116);
__m512 ifft6210 = _mm512_mask_mov_ps(ifft6202, 49344, ifft6201);
__m512 ifft6126 = _mm512_mask_sub_ps(ifft6118, 49344, _mm512_setzero_ps(), ifft6119);
__m512 ifft6211 = _mm512_mask_sub_ps(ifft6203, 49344, _mm512_setzero_ps(), ifft6204);
__m512 ifft6127 = _mm512_mask_mov_ps(ifft6119, 49344, ifft6118);
__m512 ifft6212 = _mm512_mask_mov_ps(ifft6204, 49344, ifft6203);
__m512 ifft6128 = _mm512_mask_sub_ps(ifft6120, 49344, _mm512_setzero_ps(), ifft6121);
__m512 ifft6213 = _mm512_mask_sub_ps(ifft6205, 49344, _mm512_setzero_ps(), ifft6206);
__m512 ifft6129 = _mm512_mask_mov_ps(ifft6121, 49344, ifft6120);
__m512 ifft6214 = _mm512_mask_mov_ps(ifft6206, 49344, ifft6205);
// Sum/difference between neighbouring 128-bit lanes.
// _mm512_shuffle_f32x4(x, x, 177) swaps 128-bit lanes 0<->1 and 2<->3.
// ifft6130 is +1 on elements 0..3 and -1 on elements 4..7 of each 256-bit
// half, so x*ifft6130 + swapped(x) gives x[k]+x[k+4] in the lower 128-bit
// lane and x[k]-x[k+4] in the upper one.
__m512 ifft6130 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft6131 = _mm512_fmadd_ps(ifft6122, ifft6130, _mm512_shuffle_f32x4(ifft6122, ifft6122, 177));
__m512 ifft6215 = _mm512_fmadd_ps(ifft6207, ifft6130, _mm512_shuffle_f32x4(ifft6207, ifft6207, 177));
__m512 ifft6132 = _mm512_fmadd_ps(ifft6123, ifft6130, _mm512_shuffle_f32x4(ifft6123, ifft6123, 177));
__m512 ifft6216 = _mm512_fmadd_ps(ifft6208, ifft6130, _mm512_shuffle_f32x4(ifft6208, ifft6208, 177));
__m512 ifft6133 = _mm512_fmadd_ps(ifft6124, ifft6130, _mm512_shuffle_f32x4(ifft6124, ifft6124, 177));
__m512 ifft6217 = _mm512_fmadd_ps(ifft6209, ifft6130, _mm512_shuffle_f32x4(ifft6209, ifft6209, 177));
__m512 ifft6134 = _mm512_fmadd_ps(ifft6125, ifft6130, _mm512_shuffle_f32x4(ifft6125, ifft6125, 177));
__m512 ifft6218 = _mm512_fmadd_ps(ifft6210, ifft6130, _mm512_shuffle_f32x4(ifft6210, ifft6210, 177));
__m512 ifft6135 = _mm512_fmadd_ps(ifft6126, ifft6130, _mm512_shuffle_f32x4(ifft6126, ifft6126, 177));
__m512 ifft6219 = _mm512_fmadd_ps(ifft6211, ifft6130, _mm512_shuffle_f32x4(ifft6211, ifft6211, 177));
// These two use fnmsub, -(x*ifft6130) - swapped(x): the same combination as
// the neighbouring lines but with the overall sign flipped.
__m512 ifft6136 = _mm512_fnmsub_ps(ifft6127, ifft6130, _mm512_shuffle_f32x4(ifft6127, ifft6127, 177));
__m512 ifft6220 = _mm512_fnmsub_ps(ifft6212, ifft6130, _mm512_shuffle_f32x4(ifft6212, ifft6212, 177));
__m512 ifft6137 = _mm512_fmadd_ps(ifft6128, ifft6130, _mm512_shuffle_f32x4(ifft6128, ifft6128, 177));
__m512 ifft6221 = _mm512_fmadd_ps(ifft6213, ifft6130, _mm512_shuffle_f32x4(ifft6213, ifft6213, 177));
__m512 ifft6138 = _mm512_fmadd_ps(ifft6129, ifft6130, _mm512_shuffle_f32x4(ifft6129, ifft6129, 177));
__m512 ifft6222 = _mm512_fmadd_ps(ifft6214, ifft6130, _mm512_shuffle_f32x4(ifft6214, ifft6214, 177));
// Cross-register combination stage. No lane shuffles occur here: every
// result lane depends only on the same lane of its inputs. Statements come in
// pairs, the first for the ifft613x chain and the second applying the
// identical operation to the ifft621x/ifft622x chain.
// First level: plain sums and differences between registers.
__m512 ifft6139 = _mm512_add_ps(ifft6131, ifft6132);
__m512 ifft6223 = _mm512_add_ps(ifft6215, ifft6216);
__m512 ifft6140 = _mm512_sub_ps(ifft6131, ifft6132);
__m512 ifft6224 = _mm512_sub_ps(ifft6215, ifft6216);
__m512 ifft6141 = _mm512_sub_ps(ifft6133, ifft6137);
__m512 ifft6225 = _mm512_sub_ps(ifft6217, ifft6221);
__m512 ifft6142 = _mm512_add_ps(ifft6134, ifft6138);
__m512 ifft6226 = _mm512_add_ps(ifft6218, ifft6222);
__m512 ifft6143 = _mm512_add_ps(ifft6133, ifft6137);
__m512 ifft6227 = _mm512_add_ps(ifft6217, ifft6221);
__m512 ifft6144 = _mm512_sub_ps(ifft6134, ifft6138);
__m512 ifft6228 = _mm512_sub_ps(ifft6218, ifft6222);
// Scaling is folded into the arithmetic from here on: 3.125e-02 is 1/32 and
// 1.5625e-02 is 1/64.
__m512 ifft6145 = _mm512_mul_ps(ifft6135, _mm512_set1_ps(3.125e-02f));
__m512 ifft6229 = _mm512_mul_ps(ifft6219, _mm512_set1_ps(3.125e-02f));
__m512 ifft6146 = _mm512_mul_ps(ifft6136, _mm512_set1_ps(3.125e-02f));
__m512 ifft6230 = _mm512_mul_ps(ifft6220, _mm512_set1_ps(3.125e-02f));
__m512 ifft6147 = _mm512_fmadd_ps(ifft6139, _mm512_set1_ps(1.5625e-02f), ifft6145);
__m512 ifft6231 = _mm512_fmadd_ps(ifft6223, _mm512_set1_ps(1.5625e-02f), ifft6229);
__m512 ifft6148 = _mm512_fmsub_ps(ifft6139, _mm512_set1_ps(1.5625e-02f), ifft6145);
__m512 ifft6232 = _mm512_fmsub_ps(ifft6223, _mm512_set1_ps(1.5625e-02f), ifft6229);
__m512 ifft6149 = _mm512_fmadd_ps(ifft6140, _mm512_set1_ps(1.5625e-02f), ifft6146);
__m512 ifft6233 = _mm512_fmadd_ps(ifft6224, _mm512_set1_ps(1.5625e-02f), ifft6230);
__m512 ifft6150 = _mm512_fmsub_ps(ifft6140, _mm512_set1_ps(1.5625e-02f), ifft6146);
__m512 ifft6234 = _mm512_fmsub_ps(ifft6224, _mm512_set1_ps(1.5625e-02f), ifft6230);
__m512 ifft6151 = _mm512_add_ps(ifft6141, ifft6142);
__m512 ifft6235 = _mm512_add_ps(ifft6225, ifft6226);
__m512 ifft6152 = _mm512_sub_ps(ifft6141, ifft6142);
__m512 ifft6236 = _mm512_sub_ps(ifft6225, ifft6226);
// Combinations weighted by 7.0710677e-01 (approximately sqrt(1/2)).
__m512 ifft6153 = _mm512_fnmadd_ps(ifft6151, _mm512_set1_ps(7.0710677e-01f), ifft6143);
__m512 ifft6237 = _mm512_fnmadd_ps(ifft6235, _mm512_set1_ps(7.0710677e-01f), ifft6227);
__m512 ifft6154 = _mm512_fmadd_ps(ifft6151, _mm512_set1_ps(7.0710677e-01f), ifft6143);
__m512 ifft6238 = _mm512_fmadd_ps(ifft6235, _mm512_set1_ps(7.0710677e-01f), ifft6227);
__m512 ifft6155 = _mm512_fmadd_ps(ifft6152, _mm512_set1_ps(7.0710677e-01f), ifft6144);
__m512 ifft6239 = _mm512_fmadd_ps(ifft6236, _mm512_set1_ps(7.0710677e-01f), ifft6228);
__m512 ifft6156 = _mm512_fmsub_ps(ifft6152, _mm512_set1_ps(7.0710677e-01f), ifft6144);
__m512 ifft6240 = _mm512_fmsub_ps(ifft6236, _mm512_set1_ps(7.0710677e-01f), ifft6228);
__m512 ifft6157 = _mm512_add_ps(ifft6153, ifft6154);
__m512 ifft6241 = _mm512_add_ps(ifft6237, ifft6238);
__m512 ifft6158 = _mm512_sub_ps(ifft6153, ifft6154);
__m512 ifft6242 = _mm512_sub_ps(ifft6237, ifft6238);
__m512 ifft6159 = _mm512_add_ps(ifft6155, ifft6156);
__m512 ifft6243 = _mm512_add_ps(ifft6239, ifft6240);
__m512 ifft6160 = _mm512_sub_ps(ifft6155, ifft6156);
__m512 ifft6244 = _mm512_sub_ps(ifft6239, ifft6240);
// Final level: merge the 1/64-scaled combinations with the partial results
// computed above, producing eight outputs per chain.
__m512 ifft6161 = _mm512_fmadd_ps(ifft6157, _mm512_set1_ps(1.5625e-02f), ifft6147);
__m512 ifft6245 = _mm512_fmadd_ps(ifft6241, _mm512_set1_ps(1.5625e-02f), ifft6231);
__m512 ifft6162 = _mm512_fnmadd_ps(ifft6157, _mm512_set1_ps(1.5625e-02f), ifft6147);
__m512 ifft6246 = _mm512_fnmadd_ps(ifft6241, _mm512_set1_ps(1.5625e-02f), ifft6231);
__m512 ifft6163 = _mm512_fmadd_ps(ifft6159, _mm512_set1_ps(1.5625e-02f), ifft6149);
__m512 ifft6247 = _mm512_fmadd_ps(ifft6243, _mm512_set1_ps(1.5625e-02f), ifft6233);
__m512 ifft6164 = _mm512_fnmadd_ps(ifft6159, _mm512_set1_ps(1.5625e-02f), ifft6149);
__m512 ifft6248 = _mm512_fnmadd_ps(ifft6243, _mm512_set1_ps(1.5625e-02f), ifft6233);
__m512 ifft6165 = _mm512_fnmadd_ps(ifft6160, _mm512_set1_ps(1.5625e-02f), ifft6148);
__m512 ifft6249 = _mm512_fnmadd_ps(ifft6244, _mm512_set1_ps(1.5625e-02f), ifft6232);
__m512 ifft6166 = _mm512_fmadd_ps(ifft6160, _mm512_set1_ps(1.5625e-02f), ifft6148);
__m512 ifft6250 = _mm512_fmadd_ps(ifft6244, _mm512_set1_ps(1.5625e-02f), ifft6232);
__m512 ifft6167 = _mm512_fmadd_ps(ifft6158, _mm512_set1_ps(1.5625e-02f), ifft6150);
__m512 ifft6251 = _mm512_fmadd_ps(ifft6242, _mm512_set1_ps(1.5625e-02f), ifft6234);
__m512 ifft6168 = _mm512_fnmadd_ps(ifft6158, _mm512_set1_ps(1.5625e-02f), ifft6150);
__m512 ifft6252 = _mm512_fnmadd_ps(ifft6242, _mm512_set1_ps(1.5625e-02f), ifft6234);
// Output phase for the t37 pass.
// Fourteen of the sixteen final values are renamed to dat1919..dat1932
// (dat1919..dat1925 from the ifft616x chain, dat1926..dat1932 from the
// ifft624x/ifft625x chain). ifft6168 and ifft6252 are cast to void, i.e. they
// are intentionally left unused.
__m512 dat1919 = ifft6161;
__m512 dat1926 = ifft6245;
__m512 dat1920 = ifft6163;
__m512 dat1927 = ifft6247;
__m512 dat1921 = ifft6165;
__m512 dat1928 = ifft6249;
__m512 dat1922 = ifft6167;
__m512 dat1929 = ifft6251;
__m512 dat1923 = ifft6162;
__m512 dat1930 = ifft6246;
__m512 dat1924 = ifft6164;
__m512 dat1931 = ifft6248;
__m512 dat1925 = ifft6166;
__m512 dat1932 = ifft6250;
(void)ifft6168;
(void)ifft6252;
// Two-source permutes: index values 0..15 select from the first operand,
// 16..31 from the second. _mm512_set_epi32 lists lanes from 15 down to 0.
// pm161: lanes 0..6 <- first[0..6], lanes 7..13 <- second[0..6].
// pm162: lanes 0..6 <- second[8..14], lanes 7..13 <- first[8..14].
// Lanes 14 and 15 of each result are never stored (see the store mask below).
__m512i pm161 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack303 = _mm512_permutex2var_ps(dat1919, pm161, dat1926);
__m512i pm162 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack304 = _mm512_permutex2var_ps(dat1919, pm162, dat1926);
__m512 pack305 = _mm512_permutex2var_ps(dat1920, pm161, dat1927);
__m512 pack306 = _mm512_permutex2var_ps(dat1920, pm162, dat1927);
__m512 pack307 = _mm512_permutex2var_ps(dat1921, pm161, dat1928);
__m512 pack308 = _mm512_permutex2var_ps(dat1921, pm162, dat1928);
__m512 pack309 = _mm512_permutex2var_ps(dat1922, pm161, dat1929);
__m512 pack310 = _mm512_permutex2var_ps(dat1922, pm162, dat1929);
__m512 pack311 = _mm512_permutex2var_ps(dat1923, pm161, dat1930);
__m512 pack312 = _mm512_permutex2var_ps(dat1923, pm162, dat1930);
__m512 pack313 = _mm512_permutex2var_ps(dat1924, pm161, dat1931);
__m512 pack314 = _mm512_permutex2var_ps(dat1924, pm162, dat1931);
__m512 pack315 = _mm512_permutex2var_ps(dat1925, pm161, dat1932);
__m512 pack316 = _mm512_permutex2var_ps(dat1925, pm162, dat1932);
// Element-wise max against zero (ReLU) on every packed vector.
pack303 = _mm512_max_ps(_mm512_setzero_ps(), pack303);
pack304 = _mm512_max_ps(_mm512_setzero_ps(), pack304);
pack305 = _mm512_max_ps(_mm512_setzero_ps(), pack305);
pack306 = _mm512_max_ps(_mm512_setzero_ps(), pack306);
pack307 = _mm512_max_ps(_mm512_setzero_ps(), pack307);
pack308 = _mm512_max_ps(_mm512_setzero_ps(), pack308);
pack309 = _mm512_max_ps(_mm512_setzero_ps(), pack309);
pack310 = _mm512_max_ps(_mm512_setzero_ps(), pack310);
pack311 = _mm512_max_ps(_mm512_setzero_ps(), pack311);
pack312 = _mm512_max_ps(_mm512_setzero_ps(), pack312);
pack313 = _mm512_max_ps(_mm512_setzero_ps(), pack313);
pack314 = _mm512_max_ps(_mm512_setzero_ps(), pack314);
pack315 = _mm512_max_ps(_mm512_setzero_ps(), pack315);
pack316 = _mm512_max_ps(_mm512_setzero_ps(), pack316);
// Masked unaligned stores: mask 16383 (0x3FFF) writes lanes 0..13 only.
// pm161-packed vectors go to constant offsets 784, 896, ..., 1456 (step 112);
// pm162-packed vectors go to the same offsets plus 3136. The 0*t37 term is
// always zero.
// NOTE(review): the offsets are in units of datPtr19's pointee type, which is
// declared outside this excerpt -- confirm before interpreting them as rows
// or channels.
_mm512_mask_storeu_ps(datPtr19+784+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack303);
_mm512_mask_storeu_ps(datPtr19+3920+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack304);
_mm512_mask_storeu_ps(datPtr19+896+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack305);
_mm512_mask_storeu_ps(datPtr19+4032+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack306);
_mm512_mask_storeu_ps(datPtr19+1008+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack307);
_mm512_mask_storeu_ps(datPtr19+4144+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack308);
_mm512_mask_storeu_ps(datPtr19+1120+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack309);
_mm512_mask_storeu_ps(datPtr19+4256+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack310);
_mm512_mask_storeu_ps(datPtr19+1232+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack311);
_mm512_mask_storeu_ps(datPtr19+4368+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack312);
_mm512_mask_storeu_ps(datPtr19+1344+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack313);
_mm512_mask_storeu_ps(datPtr19+4480+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack314);
_mm512_mask_storeu_ps(datPtr19+1456+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack315);
_mm512_mask_storeu_ps(datPtr19+4592+25088*i41+12544*k114+6272*r21+112*toH38+4*toW38+0*t37, 16383, pack316);
}
}
++j34;
rel20 = 1;
}
if (rel20 < 2) {
ptrdiff_t toH39 = base20+7;
ptrdiff_t toW39 = 14;
ptrdiff_t k115 = 2*w53;
for (; k115 != 2; ++k115) {
ptrdiff_t r22 = 0;
for (; r22 != 2; ++r22) {
ptrdiff_t t38 = 0;
__m512 sfRe421 = _mm512_loadu_ps(sfPtr10+0+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm421 = _mm512_loadu_ps(sfPtr10+64+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe425 = _mm512_loadu_ps(sfPtr10+128+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm425 = _mm512_loadu_ps(sfPtr10+192+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe422 = _mm512_loadu_ps(sfPtr10+8192+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm422 = _mm512_loadu_ps(sfPtr10+8256+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe426 = _mm512_loadu_ps(sfPtr10+8320+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm426 = _mm512_loadu_ps(sfPtr10+8384+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe423 = _mm512_loadu_ps(sfPtr10+16384+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm423 = _mm512_loadu_ps(sfPtr10+16448+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe427 = _mm512_loadu_ps(sfPtr10+16512+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm427 = _mm512_loadu_ps(sfPtr10+16576+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe424 = _mm512_loadu_ps(sfPtr10+24576+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm424 = _mm512_loadu_ps(sfPtr10+24640+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfRe428 = _mm512_loadu_ps(sfPtr10+24704+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512 sfIm428 = _mm512_loadu_ps(sfPtr10+24768+32768*i41+3072*j34+1536*k115+768*r22+256*t38);
__m512i ifft6253 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6254 = _mm512_permutexvar_ps(ifft6253, sfRe421);
__m512 ifft6345 = _mm512_permutexvar_ps(ifft6253, sfRe425);
__m512i ifft6255 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6256 = _mm512_permutexvar_ps(ifft6255, sfRe421);
__m512 ifft6346 = _mm512_permutexvar_ps(ifft6255, sfRe425);
__m512 ifft6257 = _mm512_permutexvar_ps(ifft6253, sfIm421);
__m512 ifft6347 = _mm512_permutexvar_ps(ifft6253, sfIm425);
__m512 ifft6258 = _mm512_permutexvar_ps(ifft6255, sfIm421);
__m512 ifft6348 = _mm512_permutexvar_ps(ifft6255, sfIm425);
__m512 ifft6259 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft6260 = _mm512_mask_fmadd_ps(ifft6258, 65021, ifft6259, ifft6254);
__m512 ifft6349 = _mm512_mask_fmadd_ps(ifft6348, 65021, ifft6259, ifft6345);
__m512 ifft6261 = _mm512_mask_fnmadd_ps(ifft6257, 65021, ifft6259, ifft6256);
__m512 ifft6350 = _mm512_mask_fnmadd_ps(ifft6347, 65021, ifft6259, ifft6346);
__m512 ifft6262 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6263 = _mm512_fmadd_ps(ifft6260, ifft6262, _mm512_shuffle_ps(ifft6260, ifft6260, 177));
__m512 ifft6351 = _mm512_fmadd_ps(ifft6349, ifft6262, _mm512_shuffle_ps(ifft6349, ifft6349, 177));
__m512 ifft6264 = _mm512_fmadd_ps(ifft6261, ifft6262, _mm512_shuffle_ps(ifft6261, ifft6261, 177));
__m512 ifft6352 = _mm512_fmadd_ps(ifft6350, ifft6262, _mm512_shuffle_ps(ifft6350, ifft6350, 177));
__m512 ifft6265 = _mm512_fmadd_ps(sfRe422, ifft6262, _mm512_shuffle_ps(sfRe422, sfRe422, 177));
__m512 ifft6353 = _mm512_fmadd_ps(sfRe426, ifft6262, _mm512_shuffle_ps(sfRe426, sfRe426, 177));
__m512 ifft6266 = _mm512_fmadd_ps(sfIm422, ifft6262, _mm512_shuffle_ps(sfIm422, sfIm422, 177));
__m512 ifft6354 = _mm512_fmadd_ps(sfIm426, ifft6262, _mm512_shuffle_ps(sfIm426, sfIm426, 177));
__m512 ifft6267 = _mm512_fmadd_ps(sfRe423, ifft6262, _mm512_shuffle_ps(sfRe423, sfRe423, 177));
__m512 ifft6355 = _mm512_fmadd_ps(sfRe427, ifft6262, _mm512_shuffle_ps(sfRe427, sfRe427, 177));
__m512 ifft6268 = _mm512_fmadd_ps(sfIm423, ifft6262, _mm512_shuffle_ps(sfIm423, sfIm423, 177));
__m512 ifft6356 = _mm512_fmadd_ps(sfIm427, ifft6262, _mm512_shuffle_ps(sfIm427, sfIm427, 177));
__m512 ifft6269 = _mm512_fmadd_ps(sfRe424, ifft6262, _mm512_shuffle_ps(sfRe424, sfRe424, 177));
__m512 ifft6357 = _mm512_fmadd_ps(sfRe428, ifft6262, _mm512_shuffle_ps(sfRe428, sfRe428, 177));
__m512 ifft6270 = _mm512_fmadd_ps(sfIm424, ifft6262, _mm512_shuffle_ps(sfIm424, sfIm424, 177));
__m512 ifft6358 = _mm512_fmadd_ps(sfIm428, ifft6262, _mm512_shuffle_ps(sfIm428, sfIm428, 177));
__m512 ifft6271 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6272 = _mm512_mul_ps(ifft6263, ifft6271);
__m512 ifft6359 = _mm512_mul_ps(ifft6351, ifft6271);
__m512 ifft6273 = _mm512_mul_ps(ifft6264, ifft6271);
__m512 ifft6360 = _mm512_mul_ps(ifft6352, ifft6271);
__m512 ifft6274 = _mm512_mul_ps(ifft6265, ifft6271);
__m512 ifft6361 = _mm512_mul_ps(ifft6353, ifft6271);
__m512 ifft6275 = _mm512_mul_ps(ifft6266, ifft6271);
__m512 ifft6362 = _mm512_mul_ps(ifft6354, ifft6271);
__m512 ifft6276 = _mm512_mul_ps(ifft6267, ifft6271);
__m512 ifft6363 = _mm512_mul_ps(ifft6355, ifft6271);
__m512 ifft6277 = _mm512_mul_ps(ifft6268, ifft6271);
__m512 ifft6364 = _mm512_mul_ps(ifft6356, ifft6271);
__m512 ifft6278 = _mm512_mul_ps(ifft6269, ifft6271);
__m512 ifft6365 = _mm512_mul_ps(ifft6357, ifft6271);
__m512 ifft6279 = _mm512_mul_ps(ifft6270, ifft6271);
__m512 ifft6366 = _mm512_mul_ps(ifft6358, ifft6271);
__m512 ifft6280 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft6281 = _mm512_fnmadd_ps(ifft6264, ifft6280, ifft6272);
__m512 ifft6367 = _mm512_fnmadd_ps(ifft6352, ifft6280, ifft6359);
__m512 ifft6282 = _mm512_fmadd_ps(ifft6263, ifft6280, ifft6273);
__m512 ifft6368 = _mm512_fmadd_ps(ifft6351, ifft6280, ifft6360);
__m512 ifft6283 = _mm512_fnmadd_ps(ifft6266, ifft6280, ifft6274);
__m512 ifft6369 = _mm512_fnmadd_ps(ifft6354, ifft6280, ifft6361);
__m512 ifft6284 = _mm512_fmadd_ps(ifft6265, ifft6280, ifft6275);
__m512 ifft6370 = _mm512_fmadd_ps(ifft6353, ifft6280, ifft6362);
__m512 ifft6285 = _mm512_fnmadd_ps(ifft6268, ifft6280, ifft6276);
__m512 ifft6371 = _mm512_fnmadd_ps(ifft6356, ifft6280, ifft6363);
__m512 ifft6286 = _mm512_fmadd_ps(ifft6267, ifft6280, ifft6277);
__m512 ifft6372 = _mm512_fmadd_ps(ifft6355, ifft6280, ifft6364);
__m512 ifft6287 = _mm512_fnmadd_ps(ifft6270, ifft6280, ifft6278);
__m512 ifft6373 = _mm512_fnmadd_ps(ifft6358, ifft6280, ifft6365);
__m512 ifft6288 = _mm512_fmadd_ps(ifft6269, ifft6280, ifft6279);
__m512 ifft6374 = _mm512_fmadd_ps(ifft6357, ifft6280, ifft6366);
__m512 ifft6289 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6290 = _mm512_fmadd_ps(ifft6281, ifft6289, _mm512_shuffle_ps(ifft6281, ifft6281, 78));
__m512 ifft6375 = _mm512_fmadd_ps(ifft6367, ifft6289, _mm512_shuffle_ps(ifft6367, ifft6367, 78));
__m512 ifft6291 = _mm512_fmadd_ps(ifft6282, ifft6289, _mm512_shuffle_ps(ifft6282, ifft6282, 78));
__m512 ifft6376 = _mm512_fmadd_ps(ifft6368, ifft6289, _mm512_shuffle_ps(ifft6368, ifft6368, 78));
__m512 ifft6292 = _mm512_fmadd_ps(ifft6283, ifft6289, _mm512_shuffle_ps(ifft6283, ifft6283, 78));
__m512 ifft6377 = _mm512_fmadd_ps(ifft6369, ifft6289, _mm512_shuffle_ps(ifft6369, ifft6369, 78));
__m512 ifft6293 = _mm512_fmadd_ps(ifft6284, ifft6289, _mm512_shuffle_ps(ifft6284, ifft6284, 78));
__m512 ifft6378 = _mm512_fmadd_ps(ifft6370, ifft6289, _mm512_shuffle_ps(ifft6370, ifft6370, 78));
__m512 ifft6294 = _mm512_fmadd_ps(ifft6285, ifft6289, _mm512_shuffle_ps(ifft6285, ifft6285, 78));
__m512 ifft6379 = _mm512_fmadd_ps(ifft6371, ifft6289, _mm512_shuffle_ps(ifft6371, ifft6371, 78));
__m512 ifft6295 = _mm512_fmadd_ps(ifft6286, ifft6289, _mm512_shuffle_ps(ifft6286, ifft6286, 78));
__m512 ifft6380 = _mm512_fmadd_ps(ifft6372, ifft6289, _mm512_shuffle_ps(ifft6372, ifft6372, 78));
__m512 ifft6296 = _mm512_fmadd_ps(ifft6287, ifft6289, _mm512_shuffle_ps(ifft6287, ifft6287, 78));
__m512 ifft6381 = _mm512_fmadd_ps(ifft6373, ifft6289, _mm512_shuffle_ps(ifft6373, ifft6373, 78));
__m512 ifft6297 = _mm512_fmadd_ps(ifft6288, ifft6289, _mm512_shuffle_ps(ifft6288, ifft6288, 78));
__m512 ifft6382 = _mm512_fmadd_ps(ifft6374, ifft6289, _mm512_shuffle_ps(ifft6374, ifft6374, 78));
__m512 ifft6298 = _mm512_mask_sub_ps(ifft6290, 49344, _mm512_setzero_ps(), ifft6291);
__m512 ifft6383 = _mm512_mask_sub_ps(ifft6375, 49344, _mm512_setzero_ps(), ifft6376);
__m512 ifft6299 = _mm512_mask_mov_ps(ifft6291, 49344, ifft6290);
__m512 ifft6384 = _mm512_mask_mov_ps(ifft6376, 49344, ifft6375);
__m512 ifft6300 = _mm512_mask_sub_ps(ifft6292, 49344, _mm512_setzero_ps(), ifft6293);
__m512 ifft6385 = _mm512_mask_sub_ps(ifft6377, 49344, _mm512_setzero_ps(), ifft6378);
__m512 ifft6301 = _mm512_mask_mov_ps(ifft6293, 49344, ifft6292);
__m512 ifft6386 = _mm512_mask_mov_ps(ifft6378, 49344, ifft6377);
__m512 ifft6302 = _mm512_mask_sub_ps(ifft6294, 49344, _mm512_setzero_ps(), ifft6295);
__m512 ifft6387 = _mm512_mask_sub_ps(ifft6379, 49344, _mm512_setzero_ps(), ifft6380);
__m512 ifft6303 = _mm512_mask_mov_ps(ifft6295, 49344, ifft6294);
__m512 ifft6388 = _mm512_mask_mov_ps(ifft6380, 49344, ifft6379);
__m512 ifft6304 = _mm512_mask_sub_ps(ifft6296, 49344, _mm512_setzero_ps(), ifft6297);
__m512 ifft6389 = _mm512_mask_sub_ps(ifft6381, 49344, _mm512_setzero_ps(), ifft6382);
__m512 ifft6305 = _mm512_mask_mov_ps(ifft6297, 49344, ifft6296);
__m512 ifft6390 = _mm512_mask_mov_ps(ifft6382, 49344, ifft6381);
__m512 ifft6306 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft6307 = _mm512_fmadd_ps(ifft6298, ifft6306, _mm512_shuffle_f32x4(ifft6298, ifft6298, 177));
__m512 ifft6391 = _mm512_fmadd_ps(ifft6383, ifft6306, _mm512_shuffle_f32x4(ifft6383, ifft6383, 177));
__m512 ifft6308 = _mm512_fmadd_ps(ifft6299, ifft6306, _mm512_shuffle_f32x4(ifft6299, ifft6299, 177));
__m512 ifft6392 = _mm512_fmadd_ps(ifft6384, ifft6306, _mm512_shuffle_f32x4(ifft6384, ifft6384, 177));
__m512 ifft6309 = _mm512_fmadd_ps(ifft6300, ifft6306, _mm512_shuffle_f32x4(ifft6300, ifft6300, 177));
__m512 ifft6393 = _mm512_fmadd_ps(ifft6385, ifft6306, _mm512_shuffle_f32x4(ifft6385, ifft6385, 177));
__m512 ifft6310 = _mm512_fmadd_ps(ifft6301, ifft6306, _mm512_shuffle_f32x4(ifft6301, ifft6301, 177));
__m512 ifft6394 = _mm512_fmadd_ps(ifft6386, ifft6306, _mm512_shuffle_f32x4(ifft6386, ifft6386, 177));
__m512 ifft6311 = _mm512_fmadd_ps(ifft6302, ifft6306, _mm512_shuffle_f32x4(ifft6302, ifft6302, 177));
__m512 ifft6395 = _mm512_fmadd_ps(ifft6387, ifft6306, _mm512_shuffle_f32x4(ifft6387, ifft6387, 177));
__m512 ifft6312 = _mm512_fnmsub_ps(ifft6303, ifft6306, _mm512_shuffle_f32x4(ifft6303, ifft6303, 177));
__m512 ifft6396 = _mm512_fnmsub_ps(ifft6388, ifft6306, _mm512_shuffle_f32x4(ifft6388, ifft6388, 177));
__m512 ifft6313 = _mm512_fmadd_ps(ifft6304, ifft6306, _mm512_shuffle_f32x4(ifft6304, ifft6304, 177));
__m512 ifft6397 = _mm512_fmadd_ps(ifft6389, ifft6306, _mm512_shuffle_f32x4(ifft6389, ifft6389, 177));
__m512 ifft6314 = _mm512_fmadd_ps(ifft6305, ifft6306, _mm512_shuffle_f32x4(ifft6305, ifft6305, 177));
__m512 ifft6398 = _mm512_fmadd_ps(ifft6390, ifft6306, _mm512_shuffle_f32x4(ifft6390, ifft6390, 177));
__m512 ifft6315 = _mm512_add_ps(ifft6307, ifft6308);
__m512 ifft6399 = _mm512_add_ps(ifft6391, ifft6392);
__m512 ifft6316 = _mm512_sub_ps(ifft6307, ifft6308);
__m512 ifft6400 = _mm512_sub_ps(ifft6391, ifft6392);
__m512 ifft6317 = _mm512_sub_ps(ifft6309, ifft6313);
__m512 ifft6401 = _mm512_sub_ps(ifft6393, ifft6397);
__m512 ifft6318 = _mm512_add_ps(ifft6310, ifft6314);
__m512 ifft6402 = _mm512_add_ps(ifft6394, ifft6398);
__m512 ifft6319 = _mm512_add_ps(ifft6309, ifft6313);
__m512 ifft6403 = _mm512_add_ps(ifft6393, ifft6397);
__m512 ifft6320 = _mm512_sub_ps(ifft6310, ifft6314);
__m512 ifft6404 = _mm512_sub_ps(ifft6394, ifft6398);
__m512 ifft6321 = _mm512_mul_ps(ifft6311, _mm512_set1_ps(3.125e-02f));
__m512 ifft6405 = _mm512_mul_ps(ifft6395, _mm512_set1_ps(3.125e-02f));
__m512 ifft6322 = _mm512_mul_ps(ifft6312, _mm512_set1_ps(3.125e-02f));
__m512 ifft6406 = _mm512_mul_ps(ifft6396, _mm512_set1_ps(3.125e-02f));
__m512 ifft6323 = _mm512_fmadd_ps(ifft6315, _mm512_set1_ps(1.5625e-02f), ifft6321);
__m512 ifft6407 = _mm512_fmadd_ps(ifft6399, _mm512_set1_ps(1.5625e-02f), ifft6405);
__m512 ifft6324 = _mm512_fmsub_ps(ifft6315, _mm512_set1_ps(1.5625e-02f), ifft6321);
__m512 ifft6408 = _mm512_fmsub_ps(ifft6399, _mm512_set1_ps(1.5625e-02f), ifft6405);
__m512 ifft6325 = _mm512_fmadd_ps(ifft6316, _mm512_set1_ps(1.5625e-02f), ifft6322);
__m512 ifft6409 = _mm512_fmadd_ps(ifft6400, _mm512_set1_ps(1.5625e-02f), ifft6406);
__m512 ifft6326 = _mm512_fmsub_ps(ifft6316, _mm512_set1_ps(1.5625e-02f), ifft6322);
__m512 ifft6410 = _mm512_fmsub_ps(ifft6400, _mm512_set1_ps(1.5625e-02f), ifft6406);
__m512 ifft6327 = _mm512_add_ps(ifft6317, ifft6318);
__m512 ifft6411 = _mm512_add_ps(ifft6401, ifft6402);
__m512 ifft6328 = _mm512_sub_ps(ifft6317, ifft6318);
__m512 ifft6412 = _mm512_sub_ps(ifft6401, ifft6402);
__m512 ifft6329 = _mm512_fnmadd_ps(ifft6327, _mm512_set1_ps(7.0710677e-01f), ifft6319);
__m512 ifft6413 = _mm512_fnmadd_ps(ifft6411, _mm512_set1_ps(7.0710677e-01f), ifft6403);
__m512 ifft6330 = _mm512_fmadd_ps(ifft6327, _mm512_set1_ps(7.0710677e-01f), ifft6319);
__m512 ifft6414 = _mm512_fmadd_ps(ifft6411, _mm512_set1_ps(7.0710677e-01f), ifft6403);
__m512 ifft6331 = _mm512_fmadd_ps(ifft6328, _mm512_set1_ps(7.0710677e-01f), ifft6320);
__m512 ifft6415 = _mm512_fmadd_ps(ifft6412, _mm512_set1_ps(7.0710677e-01f), ifft6404);
__m512 ifft6332 = _mm512_fmsub_ps(ifft6328, _mm512_set1_ps(7.0710677e-01f), ifft6320);
__m512 ifft6416 = _mm512_fmsub_ps(ifft6412, _mm512_set1_ps(7.0710677e-01f), ifft6404);
__m512 ifft6333 = _mm512_add_ps(ifft6329, ifft6330);
__m512 ifft6417 = _mm512_add_ps(ifft6413, ifft6414);
__m512 ifft6334 = _mm512_sub_ps(ifft6329, ifft6330);
__m512 ifft6418 = _mm512_sub_ps(ifft6413, ifft6414);
__m512 ifft6335 = _mm512_add_ps(ifft6331, ifft6332);
__m512 ifft6419 = _mm512_add_ps(ifft6415, ifft6416);
__m512 ifft6336 = _mm512_sub_ps(ifft6331, ifft6332);
__m512 ifft6420 = _mm512_sub_ps(ifft6415, ifft6416);
__m512 ifft6337 = _mm512_fmadd_ps(ifft6333, _mm512_set1_ps(1.5625e-02f), ifft6323);
__m512 ifft6421 = _mm512_fmadd_ps(ifft6417, _mm512_set1_ps(1.5625e-02f), ifft6407);
__m512 ifft6338 = _mm512_fnmadd_ps(ifft6333, _mm512_set1_ps(1.5625e-02f), ifft6323);
__m512 ifft6422 = _mm512_fnmadd_ps(ifft6417, _mm512_set1_ps(1.5625e-02f), ifft6407);
__m512 ifft6339 = _mm512_fmadd_ps(ifft6335, _mm512_set1_ps(1.5625e-02f), ifft6325);
__m512 ifft6423 = _mm512_fmadd_ps(ifft6419, _mm512_set1_ps(1.5625e-02f), ifft6409);
__m512 ifft6340 = _mm512_fnmadd_ps(ifft6335, _mm512_set1_ps(1.5625e-02f), ifft6325);
__m512 ifft6424 = _mm512_fnmadd_ps(ifft6419, _mm512_set1_ps(1.5625e-02f), ifft6409);
__m512 ifft6341 = _mm512_fnmadd_ps(ifft6336, _mm512_set1_ps(1.5625e-02f), ifft6324);
__m512 ifft6425 = _mm512_fnmadd_ps(ifft6420, _mm512_set1_ps(1.5625e-02f), ifft6408);
__m512 ifft6342 = _mm512_fmadd_ps(ifft6336, _mm512_set1_ps(1.5625e-02f), ifft6324);
__m512 ifft6426 = _mm512_fmadd_ps(ifft6420, _mm512_set1_ps(1.5625e-02f), ifft6408);
__m512 ifft6343 = _mm512_fmadd_ps(ifft6334, _mm512_set1_ps(1.5625e-02f), ifft6326);
__m512 ifft6427 = _mm512_fmadd_ps(ifft6418, _mm512_set1_ps(1.5625e-02f), ifft6410);
__m512 ifft6344 = _mm512_fnmadd_ps(ifft6334, _mm512_set1_ps(1.5625e-02f), ifft6326);
__m512 ifft6428 = _mm512_fnmadd_ps(ifft6418, _mm512_set1_ps(1.5625e-02f), ifft6410);
// Output stage for the t38 tile: select the inverse-transform results to keep,
// interleave the two chains into store order, clamp at zero, and write to datPtr19.
// Seven results per chain are kept, in the order 6337, 6339, 6341, 6343, 6338, 6340, 6342
// (first chain) and 6421, 6423, 6425, 6427, 6422, 6424, 6426 (second chain).
__m512 dat1933 = ifft6337;
__m512 dat1940 = ifft6421;
__m512 dat1934 = ifft6339;
__m512 dat1941 = ifft6423;
__m512 dat1935 = ifft6341;
__m512 dat1942 = ifft6425;
__m512 dat1936 = ifft6343;
__m512 dat1943 = ifft6427;
__m512 dat1937 = ifft6338;
__m512 dat1944 = ifft6422;
__m512 dat1938 = ifft6340;
__m512 dat1945 = ifft6424;
__m512 dat1939 = ifft6342;
__m512 dat1946 = ifft6426;
// The eighth result of each chain is not stored; the casts to void discard it explicitly
// (presumably to keep the compiler from warning about an unused variable).
(void)ifft6344;
(void)ifft6428;
// pm163 (lane 0 is the LAST argument of _mm512_set_epi32):
//   lanes 0-6  <- elements 0-6 of the first operand  (indices 0-6)
//   lanes 7-13 <- elements 0-6 of the second operand (indices 16-22)
//   lanes 14-15 are filled with indices 0,1 but are never written by the masked stores below.
__m512i pm163 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack317 = _mm512_permutex2var_ps(dat1933, pm163, dat1940);
// pm164:
//   lanes 0-6  <- elements 8-14 of the second operand (indices 24-30)
//   lanes 7-13 <- elements 8-14 of the first operand  (indices 8-14)
//   lanes 14-15 are filled with indices 24,25 but are never written by the masked stores below.
__m512i pm164 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack318 = _mm512_permutex2var_ps(dat1933, pm164, dat1940);
__m512 pack319 = _mm512_permutex2var_ps(dat1934, pm163, dat1941);
__m512 pack320 = _mm512_permutex2var_ps(dat1934, pm164, dat1941);
__m512 pack321 = _mm512_permutex2var_ps(dat1935, pm163, dat1942);
__m512 pack322 = _mm512_permutex2var_ps(dat1935, pm164, dat1942);
__m512 pack323 = _mm512_permutex2var_ps(dat1936, pm163, dat1943);
__m512 pack324 = _mm512_permutex2var_ps(dat1936, pm164, dat1943);
__m512 pack325 = _mm512_permutex2var_ps(dat1937, pm163, dat1944);
__m512 pack326 = _mm512_permutex2var_ps(dat1937, pm164, dat1944);
__m512 pack327 = _mm512_permutex2var_ps(dat1938, pm163, dat1945);
__m512 pack328 = _mm512_permutex2var_ps(dat1938, pm164, dat1945);
__m512 pack329 = _mm512_permutex2var_ps(dat1939, pm163, dat1946);
__m512 pack330 = _mm512_permutex2var_ps(dat1939, pm164, dat1946);
// Element-wise max(0, x), i.e. a ReLU applied in registers before the store.
// NOTE(review): presumably this is an Activation Kind=ReLU from the graph fused into the
// convolution epilogue -- which graph node it corresponds to is not visible from here.
pack317 = _mm512_max_ps(_mm512_setzero_ps(), pack317);
pack318 = _mm512_max_ps(_mm512_setzero_ps(), pack318);
pack319 = _mm512_max_ps(_mm512_setzero_ps(), pack319);
pack320 = _mm512_max_ps(_mm512_setzero_ps(), pack320);
pack321 = _mm512_max_ps(_mm512_setzero_ps(), pack321);
pack322 = _mm512_max_ps(_mm512_setzero_ps(), pack322);
pack323 = _mm512_max_ps(_mm512_setzero_ps(), pack323);
pack324 = _mm512_max_ps(_mm512_setzero_ps(), pack324);
pack325 = _mm512_max_ps(_mm512_setzero_ps(), pack325);
pack326 = _mm512_max_ps(_mm512_setzero_ps(), pack326);
pack327 = _mm512_max_ps(_mm512_setzero_ps(), pack327);
pack328 = _mm512_max_ps(_mm512_setzero_ps(), pack328);
pack329 = _mm512_max_ps(_mm512_setzero_ps(), pack329);
pack330 = _mm512_max_ps(_mm512_setzero_ps(), pack330);
// Masked unaligned stores: mask 16383 == 0x3FFF writes lanes 0-13 only.
// Each pm163 result goes to constant offset c and its pm164 partner to c+3136;
// c advances by 112 per pair (0, 112, ..., 672). The `0*t38` term contributes nothing.
// NOTE(review): offset units depend on the declared type of datPtr19, which is outside this view.
_mm512_mask_storeu_ps(datPtr19+0+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack317);
_mm512_mask_storeu_ps(datPtr19+3136+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack318);
_mm512_mask_storeu_ps(datPtr19+112+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack319);
_mm512_mask_storeu_ps(datPtr19+3248+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack320);
_mm512_mask_storeu_ps(datPtr19+224+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack321);
_mm512_mask_storeu_ps(datPtr19+3360+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack322);
_mm512_mask_storeu_ps(datPtr19+336+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack323);
_mm512_mask_storeu_ps(datPtr19+3472+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack324);
_mm512_mask_storeu_ps(datPtr19+448+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack325);
_mm512_mask_storeu_ps(datPtr19+3584+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack326);
_mm512_mask_storeu_ps(datPtr19+560+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack327);
_mm512_mask_storeu_ps(datPtr19+3696+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack328);
_mm512_mask_storeu_ps(datPtr19+672+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack329);
_mm512_mask_storeu_ps(datPtr19+3808+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t38, 16383, pack330);
// Input stage for the t39 tile: load sixteen 16-float vectors from sfPtr10.
// t39 is fixed at 0 here, so every `256*t39` term below evaluates to zero.
ptrdiff_t t39 = 0;
// Four groups at constant offsets 256, 8448, 16640 and 24832 (8192 apart). Within a group
// the four loads are 64 apart and are named Re, Im, Re, Im: the first Re/Im pair
// (suffixes 429-432) feeds one chain of the transform below, the second pair
// (suffixes 433-436) feeds the other.
// NOTE(review): the Re/Im names suggest real and imaginary parts of frequency-domain data;
// offset units depend on the declared type of sfPtr10, which is outside this view
// (if it is a char pointer, 64 is exactly one 16-float vector) -- confirm.
__m512 sfRe429 = _mm512_loadu_ps(sfPtr10+256+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm429 = _mm512_loadu_ps(sfPtr10+320+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe433 = _mm512_loadu_ps(sfPtr10+384+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm433 = _mm512_loadu_ps(sfPtr10+448+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe430 = _mm512_loadu_ps(sfPtr10+8448+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm430 = _mm512_loadu_ps(sfPtr10+8512+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe434 = _mm512_loadu_ps(sfPtr10+8576+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm434 = _mm512_loadu_ps(sfPtr10+8640+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe431 = _mm512_loadu_ps(sfPtr10+16640+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm431 = _mm512_loadu_ps(sfPtr10+16704+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe435 = _mm512_loadu_ps(sfPtr10+16768+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm435 = _mm512_loadu_ps(sfPtr10+16832+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe432 = _mm512_loadu_ps(sfPtr10+24832+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm432 = _mm512_loadu_ps(sfPtr10+24896+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfRe436 = _mm512_loadu_ps(sfPtr10+24960+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
__m512 sfIm436 = _mm512_loadu_ps(sfPtr10+25024+32768*i41+3072*j34+1536*k115+768*r22+256*t39);
// Transform arithmetic for the t39 tile (the `ifft` prefix suggests an inverse FFT).
// Two independent chains are interleaved statement by statement: one seeded from
// sfRe429..sfIm432 (results ifft6430..ifft6520) and one from sfRe433..sfIm436
// (results ifft6521..ifft6604). Each chain carries eight vectors (Re/Im of four groups).
// Reminder: _mm512_set_ps / _mm512_set_epi32 list lanes from 15 down to 0.
//
// Step 1: only the first group (sfRe429/sfIm429, sfRe433/sfIm433) is lane-permuted and
// recombined. Mask 65021 == 0xFDFD covers every lane except 1 and 9; in those two lanes the
// masked FMA passes its first operand through unchanged.
// NOTE(review): presumably this unpacks a packed real-input spectrum layout for the first
// group -- confirm against the matching forward transform.
__m512i ifft6429 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6430 = _mm512_permutexvar_ps(ifft6429, sfRe429);
__m512 ifft6521 = _mm512_permutexvar_ps(ifft6429, sfRe433);
__m512i ifft6431 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6432 = _mm512_permutexvar_ps(ifft6431, sfRe429);
__m512 ifft6522 = _mm512_permutexvar_ps(ifft6431, sfRe433);
__m512 ifft6433 = _mm512_permutexvar_ps(ifft6429, sfIm429);
__m512 ifft6523 = _mm512_permutexvar_ps(ifft6429, sfIm433);
__m512 ifft6434 = _mm512_permutexvar_ps(ifft6431, sfIm429);
__m512 ifft6524 = _mm512_permutexvar_ps(ifft6431, sfIm433);
__m512 ifft6435 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft6436 = _mm512_mask_fmadd_ps(ifft6434, 65021, ifft6435, ifft6430);
__m512 ifft6525 = _mm512_mask_fmadd_ps(ifft6524, 65021, ifft6435, ifft6521);
__m512 ifft6437 = _mm512_mask_fnmadd_ps(ifft6433, 65021, ifft6435, ifft6432);
__m512 ifft6526 = _mm512_mask_fnmadd_ps(ifft6523, 65021, ifft6435, ifft6522);
// Step 2: butterfly across adjacent lanes, applied to all sixteen vectors.
// Shuffle immediate 177 swaps the two lanes of every adjacent pair; with signs (+1, -1)
// the even lane becomes x[2i] + x[2i+1] and the odd lane becomes x[2i] - x[2i+1].
__m512 ifft6438 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6439 = _mm512_fmadd_ps(ifft6436, ifft6438, _mm512_shuffle_ps(ifft6436, ifft6436, 177));
__m512 ifft6527 = _mm512_fmadd_ps(ifft6525, ifft6438, _mm512_shuffle_ps(ifft6525, ifft6525, 177));
__m512 ifft6440 = _mm512_fmadd_ps(ifft6437, ifft6438, _mm512_shuffle_ps(ifft6437, ifft6437, 177));
__m512 ifft6528 = _mm512_fmadd_ps(ifft6526, ifft6438, _mm512_shuffle_ps(ifft6526, ifft6526, 177));
__m512 ifft6441 = _mm512_fmadd_ps(sfRe430, ifft6438, _mm512_shuffle_ps(sfRe430, sfRe430, 177));
__m512 ifft6529 = _mm512_fmadd_ps(sfRe434, ifft6438, _mm512_shuffle_ps(sfRe434, sfRe434, 177));
__m512 ifft6442 = _mm512_fmadd_ps(sfIm430, ifft6438, _mm512_shuffle_ps(sfIm430, sfIm430, 177));
__m512 ifft6530 = _mm512_fmadd_ps(sfIm434, ifft6438, _mm512_shuffle_ps(sfIm434, sfIm434, 177));
__m512 ifft6443 = _mm512_fmadd_ps(sfRe431, ifft6438, _mm512_shuffle_ps(sfRe431, sfRe431, 177));
__m512 ifft6531 = _mm512_fmadd_ps(sfRe435, ifft6438, _mm512_shuffle_ps(sfRe435, sfRe435, 177));
__m512 ifft6444 = _mm512_fmadd_ps(sfIm431, ifft6438, _mm512_shuffle_ps(sfIm431, sfIm431, 177));
__m512 ifft6532 = _mm512_fmadd_ps(sfIm435, ifft6438, _mm512_shuffle_ps(sfIm435, sfIm435, 177));
__m512 ifft6445 = _mm512_fmadd_ps(sfRe432, ifft6438, _mm512_shuffle_ps(sfRe432, sfRe432, 177));
__m512 ifft6533 = _mm512_fmadd_ps(sfRe436, ifft6438, _mm512_shuffle_ps(sfRe436, sfRe436, 177));
__m512 ifft6446 = _mm512_fmadd_ps(sfIm432, ifft6438, _mm512_shuffle_ps(sfIm432, sfIm432, 177));
__m512 ifft6534 = _mm512_fmadd_ps(sfIm436, ifft6438, _mm512_shuffle_ps(sfIm436, sfIm436, 177));
// Step 3: per-lane complex multiply of each (Re, Im) pair by the constant c + i*s, where
// c is ifft6447 and s is ifft6456 below: Re' = Re*c - Im*s, Im' = Im*c + Re*s.
// Within every 8-lane half, lanes 0,1,2,4,6 multiply by 1 (c=1, s=0); lane 3 by i;
// lane 5 by (1+i)/sqrt(2); lane 7 by (-1+i)/sqrt(2). 7.0710677e-01f is 1/sqrt(2) in float.
__m512 ifft6447 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6448 = _mm512_mul_ps(ifft6439, ifft6447);
__m512 ifft6535 = _mm512_mul_ps(ifft6527, ifft6447);
__m512 ifft6449 = _mm512_mul_ps(ifft6440, ifft6447);
__m512 ifft6536 = _mm512_mul_ps(ifft6528, ifft6447);
__m512 ifft6450 = _mm512_mul_ps(ifft6441, ifft6447);
__m512 ifft6537 = _mm512_mul_ps(ifft6529, ifft6447);
__m512 ifft6451 = _mm512_mul_ps(ifft6442, ifft6447);
__m512 ifft6538 = _mm512_mul_ps(ifft6530, ifft6447);
__m512 ifft6452 = _mm512_mul_ps(ifft6443, ifft6447);
__m512 ifft6539 = _mm512_mul_ps(ifft6531, ifft6447);
__m512 ifft6453 = _mm512_mul_ps(ifft6444, ifft6447);
__m512 ifft6540 = _mm512_mul_ps(ifft6532, ifft6447);
__m512 ifft6454 = _mm512_mul_ps(ifft6445, ifft6447);
__m512 ifft6541 = _mm512_mul_ps(ifft6533, ifft6447);
__m512 ifft6455 = _mm512_mul_ps(ifft6446, ifft6447);
__m512 ifft6542 = _mm512_mul_ps(ifft6534, ifft6447);
__m512 ifft6456 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft6457 = _mm512_fnmadd_ps(ifft6440, ifft6456, ifft6448);
__m512 ifft6543 = _mm512_fnmadd_ps(ifft6528, ifft6456, ifft6535);
__m512 ifft6458 = _mm512_fmadd_ps(ifft6439, ifft6456, ifft6449);
__m512 ifft6544 = _mm512_fmadd_ps(ifft6527, ifft6456, ifft6536);
__m512 ifft6459 = _mm512_fnmadd_ps(ifft6442, ifft6456, ifft6450);
__m512 ifft6545 = _mm512_fnmadd_ps(ifft6530, ifft6456, ifft6537);
__m512 ifft6460 = _mm512_fmadd_ps(ifft6441, ifft6456, ifft6451);
__m512 ifft6546 = _mm512_fmadd_ps(ifft6529, ifft6456, ifft6538);
__m512 ifft6461 = _mm512_fnmadd_ps(ifft6444, ifft6456, ifft6452);
__m512 ifft6547 = _mm512_fnmadd_ps(ifft6532, ifft6456, ifft6539);
__m512 ifft6462 = _mm512_fmadd_ps(ifft6443, ifft6456, ifft6453);
__m512 ifft6548 = _mm512_fmadd_ps(ifft6531, ifft6456, ifft6540);
__m512 ifft6463 = _mm512_fnmadd_ps(ifft6446, ifft6456, ifft6454);
__m512 ifft6549 = _mm512_fnmadd_ps(ifft6534, ifft6456, ifft6541);
__m512 ifft6464 = _mm512_fmadd_ps(ifft6445, ifft6456, ifft6455);
__m512 ifft6550 = _mm512_fmadd_ps(ifft6533, ifft6456, ifft6542);
// Step 4: butterfly at distance 2. Shuffle immediate 78 swaps the two 64-bit halves of each
// 128-bit lane; with signs (+1, +1, -1, -1) lanes 0-1 of each group of four become
// x[j] + x[j+2] and lanes 2-3 become x[j-2] - x[j].
__m512 ifft6465 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6466 = _mm512_fmadd_ps(ifft6457, ifft6465, _mm512_shuffle_ps(ifft6457, ifft6457, 78));
__m512 ifft6551 = _mm512_fmadd_ps(ifft6543, ifft6465, _mm512_shuffle_ps(ifft6543, ifft6543, 78));
__m512 ifft6467 = _mm512_fmadd_ps(ifft6458, ifft6465, _mm512_shuffle_ps(ifft6458, ifft6458, 78));
__m512 ifft6552 = _mm512_fmadd_ps(ifft6544, ifft6465, _mm512_shuffle_ps(ifft6544, ifft6544, 78));
__m512 ifft6468 = _mm512_fmadd_ps(ifft6459, ifft6465, _mm512_shuffle_ps(ifft6459, ifft6459, 78));
__m512 ifft6553 = _mm512_fmadd_ps(ifft6545, ifft6465, _mm512_shuffle_ps(ifft6545, ifft6545, 78));
__m512 ifft6469 = _mm512_fmadd_ps(ifft6460, ifft6465, _mm512_shuffle_ps(ifft6460, ifft6460, 78));
__m512 ifft6554 = _mm512_fmadd_ps(ifft6546, ifft6465, _mm512_shuffle_ps(ifft6546, ifft6546, 78));
__m512 ifft6470 = _mm512_fmadd_ps(ifft6461, ifft6465, _mm512_shuffle_ps(ifft6461, ifft6461, 78));
__m512 ifft6555 = _mm512_fmadd_ps(ifft6547, ifft6465, _mm512_shuffle_ps(ifft6547, ifft6547, 78));
__m512 ifft6471 = _mm512_fmadd_ps(ifft6462, ifft6465, _mm512_shuffle_ps(ifft6462, ifft6462, 78));
__m512 ifft6556 = _mm512_fmadd_ps(ifft6548, ifft6465, _mm512_shuffle_ps(ifft6548, ifft6548, 78));
__m512 ifft6472 = _mm512_fmadd_ps(ifft6463, ifft6465, _mm512_shuffle_ps(ifft6463, ifft6463, 78));
__m512 ifft6557 = _mm512_fmadd_ps(ifft6549, ifft6465, _mm512_shuffle_ps(ifft6549, ifft6549, 78));
__m512 ifft6473 = _mm512_fmadd_ps(ifft6464, ifft6465, _mm512_shuffle_ps(ifft6464, ifft6464, 78));
__m512 ifft6558 = _mm512_fmadd_ps(ifft6550, ifft6465, _mm512_shuffle_ps(ifft6550, ifft6550, 78));
// Step 5: in lanes 6, 7, 14 and 15 only (mask 49344 == 0xC0C0) each (Re, Im) pair is replaced
// by (-Im, Re), which is a multiplication by i; every other lane passes through unchanged.
__m512 ifft6474 = _mm512_mask_sub_ps(ifft6466, 49344, _mm512_setzero_ps(), ifft6467);
__m512 ifft6559 = _mm512_mask_sub_ps(ifft6551, 49344, _mm512_setzero_ps(), ifft6552);
__m512 ifft6475 = _mm512_mask_mov_ps(ifft6467, 49344, ifft6466);
__m512 ifft6560 = _mm512_mask_mov_ps(ifft6552, 49344, ifft6551);
__m512 ifft6476 = _mm512_mask_sub_ps(ifft6468, 49344, _mm512_setzero_ps(), ifft6469);
__m512 ifft6561 = _mm512_mask_sub_ps(ifft6553, 49344, _mm512_setzero_ps(), ifft6554);
__m512 ifft6477 = _mm512_mask_mov_ps(ifft6469, 49344, ifft6468);
__m512 ifft6562 = _mm512_mask_mov_ps(ifft6554, 49344, ifft6553);
__m512 ifft6478 = _mm512_mask_sub_ps(ifft6470, 49344, _mm512_setzero_ps(), ifft6471);
__m512 ifft6563 = _mm512_mask_sub_ps(ifft6555, 49344, _mm512_setzero_ps(), ifft6556);
__m512 ifft6479 = _mm512_mask_mov_ps(ifft6471, 49344, ifft6470);
__m512 ifft6564 = _mm512_mask_mov_ps(ifft6556, 49344, ifft6555);
__m512 ifft6480 = _mm512_mask_sub_ps(ifft6472, 49344, _mm512_setzero_ps(), ifft6473);
__m512 ifft6565 = _mm512_mask_sub_ps(ifft6557, 49344, _mm512_setzero_ps(), ifft6558);
__m512 ifft6481 = _mm512_mask_mov_ps(ifft6473, 49344, ifft6472);
__m512 ifft6566 = _mm512_mask_mov_ps(ifft6558, 49344, ifft6557);
// Step 6: butterfly at distance 4. _mm512_shuffle_f32x4 with immediate 177 swaps adjacent
// 128-bit lanes; with signs (+1 x4, -1 x4) lanes 0-3 of each 8-lane half become x[j] + x[j+4]
// and lanes 4-7 become x[j-4] - x[j].
// Note the single exception per chain: ifft6488 / ifft6572 use fnmsub, so that one vector
// (the Im part of the third group) comes out with the opposite sign of the plain butterfly.
__m512 ifft6482 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft6483 = _mm512_fmadd_ps(ifft6474, ifft6482, _mm512_shuffle_f32x4(ifft6474, ifft6474, 177));
__m512 ifft6567 = _mm512_fmadd_ps(ifft6559, ifft6482, _mm512_shuffle_f32x4(ifft6559, ifft6559, 177));
__m512 ifft6484 = _mm512_fmadd_ps(ifft6475, ifft6482, _mm512_shuffle_f32x4(ifft6475, ifft6475, 177));
__m512 ifft6568 = _mm512_fmadd_ps(ifft6560, ifft6482, _mm512_shuffle_f32x4(ifft6560, ifft6560, 177));
__m512 ifft6485 = _mm512_fmadd_ps(ifft6476, ifft6482, _mm512_shuffle_f32x4(ifft6476, ifft6476, 177));
__m512 ifft6569 = _mm512_fmadd_ps(ifft6561, ifft6482, _mm512_shuffle_f32x4(ifft6561, ifft6561, 177));
__m512 ifft6486 = _mm512_fmadd_ps(ifft6477, ifft6482, _mm512_shuffle_f32x4(ifft6477, ifft6477, 177));
__m512 ifft6570 = _mm512_fmadd_ps(ifft6562, ifft6482, _mm512_shuffle_f32x4(ifft6562, ifft6562, 177));
__m512 ifft6487 = _mm512_fmadd_ps(ifft6478, ifft6482, _mm512_shuffle_f32x4(ifft6478, ifft6478, 177));
__m512 ifft6571 = _mm512_fmadd_ps(ifft6563, ifft6482, _mm512_shuffle_f32x4(ifft6563, ifft6563, 177));
__m512 ifft6488 = _mm512_fnmsub_ps(ifft6479, ifft6482, _mm512_shuffle_f32x4(ifft6479, ifft6479, 177));
__m512 ifft6572 = _mm512_fnmsub_ps(ifft6564, ifft6482, _mm512_shuffle_f32x4(ifft6564, ifft6564, 177));
__m512 ifft6489 = _mm512_fmadd_ps(ifft6480, ifft6482, _mm512_shuffle_f32x4(ifft6480, ifft6480, 177));
__m512 ifft6573 = _mm512_fmadd_ps(ifft6565, ifft6482, _mm512_shuffle_f32x4(ifft6565, ifft6565, 177));
__m512 ifft6490 = _mm512_fmadd_ps(ifft6481, ifft6482, _mm512_shuffle_f32x4(ifft6481, ifft6481, 177));
__m512 ifft6574 = _mm512_fmadd_ps(ifft6566, ifft6482, _mm512_shuffle_f32x4(ifft6566, ifft6566, 177));
// Step 7: combine the eight vectors of each chain with one another (whole-vector adds and
// subtracts, no further lane shuffles) to produce eight result vectors per chain
// (ifft6513..ifft6520 and ifft6597..ifft6604).
// Constants folded into the FMAs: 1.5625e-02f == 1/64, 3.125e-02f == 1/32,
// 7.0710677e-01f is 1/sqrt(2) in float.
// NOTE(review): the 1/64 and 1/32 factors are presumably the inverse-transform
// normalization -- confirm against the forward transform's scaling.
__m512 ifft6491 = _mm512_add_ps(ifft6483, ifft6484);
__m512 ifft6575 = _mm512_add_ps(ifft6567, ifft6568);
__m512 ifft6492 = _mm512_sub_ps(ifft6483, ifft6484);
__m512 ifft6576 = _mm512_sub_ps(ifft6567, ifft6568);
__m512 ifft6493 = _mm512_sub_ps(ifft6485, ifft6489);
__m512 ifft6577 = _mm512_sub_ps(ifft6569, ifft6573);
__m512 ifft6494 = _mm512_add_ps(ifft6486, ifft6490);
__m512 ifft6578 = _mm512_add_ps(ifft6570, ifft6574);
__m512 ifft6495 = _mm512_add_ps(ifft6485, ifft6489);
__m512 ifft6579 = _mm512_add_ps(ifft6569, ifft6573);
__m512 ifft6496 = _mm512_sub_ps(ifft6486, ifft6490);
__m512 ifft6580 = _mm512_sub_ps(ifft6570, ifft6574);
__m512 ifft6497 = _mm512_mul_ps(ifft6487, _mm512_set1_ps(3.125e-02f));
__m512 ifft6581 = _mm512_mul_ps(ifft6571, _mm512_set1_ps(3.125e-02f));
__m512 ifft6498 = _mm512_mul_ps(ifft6488, _mm512_set1_ps(3.125e-02f));
__m512 ifft6582 = _mm512_mul_ps(ifft6572, _mm512_set1_ps(3.125e-02f));
__m512 ifft6499 = _mm512_fmadd_ps(ifft6491, _mm512_set1_ps(1.5625e-02f), ifft6497);
__m512 ifft6583 = _mm512_fmadd_ps(ifft6575, _mm512_set1_ps(1.5625e-02f), ifft6581);
__m512 ifft6500 = _mm512_fmsub_ps(ifft6491, _mm512_set1_ps(1.5625e-02f), ifft6497);
__m512 ifft6584 = _mm512_fmsub_ps(ifft6575, _mm512_set1_ps(1.5625e-02f), ifft6581);
__m512 ifft6501 = _mm512_fmadd_ps(ifft6492, _mm512_set1_ps(1.5625e-02f), ifft6498);
__m512 ifft6585 = _mm512_fmadd_ps(ifft6576, _mm512_set1_ps(1.5625e-02f), ifft6582);
__m512 ifft6502 = _mm512_fmsub_ps(ifft6492, _mm512_set1_ps(1.5625e-02f), ifft6498);
__m512 ifft6586 = _mm512_fmsub_ps(ifft6576, _mm512_set1_ps(1.5625e-02f), ifft6582);
__m512 ifft6503 = _mm512_add_ps(ifft6493, ifft6494);
__m512 ifft6587 = _mm512_add_ps(ifft6577, ifft6578);
__m512 ifft6504 = _mm512_sub_ps(ifft6493, ifft6494);
__m512 ifft6588 = _mm512_sub_ps(ifft6577, ifft6578);
__m512 ifft6505 = _mm512_fnmadd_ps(ifft6503, _mm512_set1_ps(7.0710677e-01f), ifft6495);
__m512 ifft6589 = _mm512_fnmadd_ps(ifft6587, _mm512_set1_ps(7.0710677e-01f), ifft6579);
__m512 ifft6506 = _mm512_fmadd_ps(ifft6503, _mm512_set1_ps(7.0710677e-01f), ifft6495);
__m512 ifft6590 = _mm512_fmadd_ps(ifft6587, _mm512_set1_ps(7.0710677e-01f), ifft6579);
__m512 ifft6507 = _mm512_fmadd_ps(ifft6504, _mm512_set1_ps(7.0710677e-01f), ifft6496);
__m512 ifft6591 = _mm512_fmadd_ps(ifft6588, _mm512_set1_ps(7.0710677e-01f), ifft6580);
__m512 ifft6508 = _mm512_fmsub_ps(ifft6504, _mm512_set1_ps(7.0710677e-01f), ifft6496);
__m512 ifft6592 = _mm512_fmsub_ps(ifft6588, _mm512_set1_ps(7.0710677e-01f), ifft6580);
__m512 ifft6509 = _mm512_add_ps(ifft6505, ifft6506);
__m512 ifft6593 = _mm512_add_ps(ifft6589, ifft6590);
__m512 ifft6510 = _mm512_sub_ps(ifft6505, ifft6506);
__m512 ifft6594 = _mm512_sub_ps(ifft6589, ifft6590);
__m512 ifft6511 = _mm512_add_ps(ifft6507, ifft6508);
__m512 ifft6595 = _mm512_add_ps(ifft6591, ifft6592);
__m512 ifft6512 = _mm512_sub_ps(ifft6507, ifft6508);
__m512 ifft6596 = _mm512_sub_ps(ifft6591, ifft6592);
__m512 ifft6513 = _mm512_fmadd_ps(ifft6509, _mm512_set1_ps(1.5625e-02f), ifft6499);
__m512 ifft6597 = _mm512_fmadd_ps(ifft6593, _mm512_set1_ps(1.5625e-02f), ifft6583);
__m512 ifft6514 = _mm512_fnmadd_ps(ifft6509, _mm512_set1_ps(1.5625e-02f), ifft6499);
__m512 ifft6598 = _mm512_fnmadd_ps(ifft6593, _mm512_set1_ps(1.5625e-02f), ifft6583);
__m512 ifft6515 = _mm512_fmadd_ps(ifft6511, _mm512_set1_ps(1.5625e-02f), ifft6501);
__m512 ifft6599 = _mm512_fmadd_ps(ifft6595, _mm512_set1_ps(1.5625e-02f), ifft6585);
__m512 ifft6516 = _mm512_fnmadd_ps(ifft6511, _mm512_set1_ps(1.5625e-02f), ifft6501);
__m512 ifft6600 = _mm512_fnmadd_ps(ifft6595, _mm512_set1_ps(1.5625e-02f), ifft6585);
__m512 ifft6517 = _mm512_fnmadd_ps(ifft6512, _mm512_set1_ps(1.5625e-02f), ifft6500);
__m512 ifft6601 = _mm512_fnmadd_ps(ifft6596, _mm512_set1_ps(1.5625e-02f), ifft6584);
__m512 ifft6518 = _mm512_fmadd_ps(ifft6512, _mm512_set1_ps(1.5625e-02f), ifft6500);
__m512 ifft6602 = _mm512_fmadd_ps(ifft6596, _mm512_set1_ps(1.5625e-02f), ifft6584);
__m512 ifft6519 = _mm512_fmadd_ps(ifft6510, _mm512_set1_ps(1.5625e-02f), ifft6502);
__m512 ifft6603 = _mm512_fmadd_ps(ifft6594, _mm512_set1_ps(1.5625e-02f), ifft6586);
__m512 ifft6520 = _mm512_fnmadd_ps(ifft6510, _mm512_set1_ps(1.5625e-02f), ifft6502);
__m512 ifft6604 = _mm512_fnmadd_ps(ifft6594, _mm512_set1_ps(1.5625e-02f), ifft6586);
// Output stage for the t39 tile: select the transform results to keep, interleave the two
// chains into store order, clamp at zero, and write to datPtr19.
// Seven results per chain are kept, in the order 6513, 6515, 6517, 6519, 6514, 6516, 6518
// (first chain) and 6597, 6599, 6601, 6603, 6598, 6600, 6602 (second chain).
__m512 dat1947 = ifft6513;
__m512 dat1954 = ifft6597;
__m512 dat1948 = ifft6515;
__m512 dat1955 = ifft6599;
__m512 dat1949 = ifft6517;
__m512 dat1956 = ifft6601;
__m512 dat1950 = ifft6519;
__m512 dat1957 = ifft6603;
__m512 dat1951 = ifft6514;
__m512 dat1958 = ifft6598;
__m512 dat1952 = ifft6516;
__m512 dat1959 = ifft6600;
__m512 dat1953 = ifft6518;
__m512 dat1960 = ifft6602;
// The eighth result of each chain is not stored; the casts to void discard it explicitly
// (presumably to keep the compiler from warning about an unused variable).
(void)ifft6520;
(void)ifft6604;
// pm165 (lane 0 is the LAST argument of _mm512_set_epi32):
//   lanes 0-6  <- elements 0-6 of the first operand  (indices 0-6)
//   lanes 7-13 <- elements 0-6 of the second operand (indices 16-22)
//   lanes 14-15 are filled with indices 0,1 but are never written by the masked stores below.
__m512i pm165 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack331 = _mm512_permutex2var_ps(dat1947, pm165, dat1954);
// pm166:
//   lanes 0-6  <- elements 8-14 of the second operand (indices 24-30)
//   lanes 7-13 <- elements 8-14 of the first operand  (indices 8-14)
//   lanes 14-15 are filled with indices 24,25 but are never written by the masked stores below.
__m512i pm166 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack332 = _mm512_permutex2var_ps(dat1947, pm166, dat1954);
__m512 pack333 = _mm512_permutex2var_ps(dat1948, pm165, dat1955);
__m512 pack334 = _mm512_permutex2var_ps(dat1948, pm166, dat1955);
__m512 pack335 = _mm512_permutex2var_ps(dat1949, pm165, dat1956);
__m512 pack336 = _mm512_permutex2var_ps(dat1949, pm166, dat1956);
__m512 pack337 = _mm512_permutex2var_ps(dat1950, pm165, dat1957);
__m512 pack338 = _mm512_permutex2var_ps(dat1950, pm166, dat1957);
__m512 pack339 = _mm512_permutex2var_ps(dat1951, pm165, dat1958);
__m512 pack340 = _mm512_permutex2var_ps(dat1951, pm166, dat1958);
__m512 pack341 = _mm512_permutex2var_ps(dat1952, pm165, dat1959);
__m512 pack342 = _mm512_permutex2var_ps(dat1952, pm166, dat1959);
__m512 pack343 = _mm512_permutex2var_ps(dat1953, pm165, dat1960);
__m512 pack344 = _mm512_permutex2var_ps(dat1953, pm166, dat1960);
// Element-wise max(0, x), i.e. a ReLU applied in registers before the store.
// NOTE(review): presumably this is an Activation Kind=ReLU from the graph fused into the
// convolution epilogue -- which graph node it corresponds to is not visible from here.
pack331 = _mm512_max_ps(_mm512_setzero_ps(), pack331);
pack332 = _mm512_max_ps(_mm512_setzero_ps(), pack332);
pack333 = _mm512_max_ps(_mm512_setzero_ps(), pack333);
pack334 = _mm512_max_ps(_mm512_setzero_ps(), pack334);
pack335 = _mm512_max_ps(_mm512_setzero_ps(), pack335);
pack336 = _mm512_max_ps(_mm512_setzero_ps(), pack336);
pack337 = _mm512_max_ps(_mm512_setzero_ps(), pack337);
pack338 = _mm512_max_ps(_mm512_setzero_ps(), pack338);
pack339 = _mm512_max_ps(_mm512_setzero_ps(), pack339);
pack340 = _mm512_max_ps(_mm512_setzero_ps(), pack340);
pack341 = _mm512_max_ps(_mm512_setzero_ps(), pack341);
pack342 = _mm512_max_ps(_mm512_setzero_ps(), pack342);
pack343 = _mm512_max_ps(_mm512_setzero_ps(), pack343);
pack344 = _mm512_max_ps(_mm512_setzero_ps(), pack344);
// Masked unaligned stores: mask 16383 == 0x3FFF writes lanes 0-13 only.
// Each pm165 result goes to constant offset c and its pm166 partner to c+3136;
// c advances by 112 per pair (728, 840, ..., 1400). The `0*t39` term contributes nothing.
// NOTE(review): offset units depend on the declared type of datPtr19, which is outside this view.
_mm512_mask_storeu_ps(datPtr19+728+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack331);
_mm512_mask_storeu_ps(datPtr19+3864+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack332);
_mm512_mask_storeu_ps(datPtr19+840+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack333);
_mm512_mask_storeu_ps(datPtr19+3976+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack334);
_mm512_mask_storeu_ps(datPtr19+952+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack335);
_mm512_mask_storeu_ps(datPtr19+4088+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack336);
_mm512_mask_storeu_ps(datPtr19+1064+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack337);
_mm512_mask_storeu_ps(datPtr19+4200+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack338);
_mm512_mask_storeu_ps(datPtr19+1176+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack339);
_mm512_mask_storeu_ps(datPtr19+4312+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack340);
_mm512_mask_storeu_ps(datPtr19+1288+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack341);
_mm512_mask_storeu_ps(datPtr19+4424+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack342);
_mm512_mask_storeu_ps(datPtr19+1400+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack343);
_mm512_mask_storeu_ps(datPtr19+4536+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+0*t39, 16383, pack344);
// Load stage of the t40 pass: sixteen unaligned 512-bit loads from sfPtr10.
// t40 is fixed at 0 here, so the 256*t40 term in every address is zero.
// The constant offsets form four groups (512.., 8704.., 16896.., 25088..)
// that are 8192 apart; inside each group the four loads are 64 apart in the
// order Re, Im, Re, Im. The first Re/Im pair of every group feeds the
// 437..440 set and the second pair feeds the 441..444 set.
// NOTE(review): the Re/Im naming suggests real and imaginary parts of
// frequency-domain data -- confirm against the producer of sfPtr10.
ptrdiff_t t40 = 0;
__m512 sfRe437 = _mm512_loadu_ps(sfPtr10+512+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm437 = _mm512_loadu_ps(sfPtr10+576+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe441 = _mm512_loadu_ps(sfPtr10+640+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm441 = _mm512_loadu_ps(sfPtr10+704+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe438 = _mm512_loadu_ps(sfPtr10+8704+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm438 = _mm512_loadu_ps(sfPtr10+8768+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe442 = _mm512_loadu_ps(sfPtr10+8832+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm442 = _mm512_loadu_ps(sfPtr10+8896+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe439 = _mm512_loadu_ps(sfPtr10+16896+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm439 = _mm512_loadu_ps(sfPtr10+16960+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe443 = _mm512_loadu_ps(sfPtr10+17024+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm443 = _mm512_loadu_ps(sfPtr10+17088+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe440 = _mm512_loadu_ps(sfPtr10+25088+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm440 = _mm512_loadu_ps(sfPtr10+25152+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfRe444 = _mm512_loadu_ps(sfPtr10+25216+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
__m512 sfIm444 = _mm512_loadu_ps(sfPtr10+25280+32768*i41+3072*j34+1536*k115+768*r22+256*t40);
// Transform stage: turns the sixteen sfRe*/sfIm* vectors into two sets of
// eight result vectors (ifft6689..ifft6696 and ifft6773..ifft6780). Every
// step is issued twice, once for the 437..440 inputs and once for the
// 441..444 inputs, with the two streams interleaved line by line.
// NOTE(review): presumably an inverse FFT, going by the ifft* names --
// confirm against the generator before relying on that.
//
// Step 1 (first Re/Im pair of each set only): gather lanes with two index
// vectors, then combine under mask 65021 (0xFDFD = every lane except 1 and
// 9). In unmasked lanes the masked FMA passes its first operand through.
__m512i ifft6605 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6606 = _mm512_permutexvar_ps(ifft6605, sfRe437);
__m512 ifft6697 = _mm512_permutexvar_ps(ifft6605, sfRe441);
__m512i ifft6607 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6608 = _mm512_permutexvar_ps(ifft6607, sfRe437);
__m512 ifft6698 = _mm512_permutexvar_ps(ifft6607, sfRe441);
__m512 ifft6609 = _mm512_permutexvar_ps(ifft6605, sfIm437);
__m512 ifft6699 = _mm512_permutexvar_ps(ifft6605, sfIm441);
__m512 ifft6610 = _mm512_permutexvar_ps(ifft6607, sfIm437);
__m512 ifft6700 = _mm512_permutexvar_ps(ifft6607, sfIm441);
__m512 ifft6611 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft6612 = _mm512_mask_fmadd_ps(ifft6610, 65021, ifft6611, ifft6606);
__m512 ifft6701 = _mm512_mask_fmadd_ps(ifft6700, 65021, ifft6611, ifft6697);
__m512 ifft6613 = _mm512_mask_fnmadd_ps(ifft6609, 65021, ifft6611, ifft6608);
__m512 ifft6702 = _mm512_mask_fnmadd_ps(ifft6699, 65021, ifft6611, ifft6698);
// Step 2: butterfly between adjacent lanes. Shuffle immediate 177 swaps the
// two elements of every adjacent pair and the sign vector is +1 in even
// lanes, -1 in odd lanes, so even lanes receive x[2i]+x[2i+1] and odd lanes
// receive x[2i]-x[2i+1].
__m512 ifft6614 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6615 = _mm512_fmadd_ps(ifft6612, ifft6614, _mm512_shuffle_ps(ifft6612, ifft6612, 177));
__m512 ifft6703 = _mm512_fmadd_ps(ifft6701, ifft6614, _mm512_shuffle_ps(ifft6701, ifft6701, 177));
__m512 ifft6616 = _mm512_fmadd_ps(ifft6613, ifft6614, _mm512_shuffle_ps(ifft6613, ifft6613, 177));
__m512 ifft6704 = _mm512_fmadd_ps(ifft6702, ifft6614, _mm512_shuffle_ps(ifft6702, ifft6702, 177));
__m512 ifft6617 = _mm512_fmadd_ps(sfRe438, ifft6614, _mm512_shuffle_ps(sfRe438, sfRe438, 177));
__m512 ifft6705 = _mm512_fmadd_ps(sfRe442, ifft6614, _mm512_shuffle_ps(sfRe442, sfRe442, 177));
__m512 ifft6618 = _mm512_fmadd_ps(sfIm438, ifft6614, _mm512_shuffle_ps(sfIm438, sfIm438, 177));
__m512 ifft6706 = _mm512_fmadd_ps(sfIm442, ifft6614, _mm512_shuffle_ps(sfIm442, sfIm442, 177));
__m512 ifft6619 = _mm512_fmadd_ps(sfRe439, ifft6614, _mm512_shuffle_ps(sfRe439, sfRe439, 177));
__m512 ifft6707 = _mm512_fmadd_ps(sfRe443, ifft6614, _mm512_shuffle_ps(sfRe443, sfRe443, 177));
__m512 ifft6620 = _mm512_fmadd_ps(sfIm439, ifft6614, _mm512_shuffle_ps(sfIm439, sfIm439, 177));
__m512 ifft6708 = _mm512_fmadd_ps(sfIm443, ifft6614, _mm512_shuffle_ps(sfIm443, sfIm443, 177));
__m512 ifft6621 = _mm512_fmadd_ps(sfRe440, ifft6614, _mm512_shuffle_ps(sfRe440, sfRe440, 177));
__m512 ifft6709 = _mm512_fmadd_ps(sfRe444, ifft6614, _mm512_shuffle_ps(sfRe444, sfRe444, 177));
__m512 ifft6622 = _mm512_fmadd_ps(sfIm440, ifft6614, _mm512_shuffle_ps(sfIm440, sfIm440, 177));
__m512 ifft6710 = _mm512_fmadd_ps(sfIm444, ifft6614, _mm512_shuffle_ps(sfIm444, sfIm444, 177));
// Step 3: per-lane factor. With c = ifft6623 and s = ifft6632, each (a, b)
// pair becomes (a*c - b*s, b*c + a*s), which has the form of a complex
// multiply of a + i*b by c + i*s. 7.0710677e-01f is sqrt(1/2) rounded to
// float. The products with c are formed first; the s terms are then folded
// in with fnmadd/fmadd.
__m512 ifft6623 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6624 = _mm512_mul_ps(ifft6615, ifft6623);
__m512 ifft6711 = _mm512_mul_ps(ifft6703, ifft6623);
__m512 ifft6625 = _mm512_mul_ps(ifft6616, ifft6623);
__m512 ifft6712 = _mm512_mul_ps(ifft6704, ifft6623);
__m512 ifft6626 = _mm512_mul_ps(ifft6617, ifft6623);
__m512 ifft6713 = _mm512_mul_ps(ifft6705, ifft6623);
__m512 ifft6627 = _mm512_mul_ps(ifft6618, ifft6623);
__m512 ifft6714 = _mm512_mul_ps(ifft6706, ifft6623);
__m512 ifft6628 = _mm512_mul_ps(ifft6619, ifft6623);
__m512 ifft6715 = _mm512_mul_ps(ifft6707, ifft6623);
__m512 ifft6629 = _mm512_mul_ps(ifft6620, ifft6623);
__m512 ifft6716 = _mm512_mul_ps(ifft6708, ifft6623);
__m512 ifft6630 = _mm512_mul_ps(ifft6621, ifft6623);
__m512 ifft6717 = _mm512_mul_ps(ifft6709, ifft6623);
__m512 ifft6631 = _mm512_mul_ps(ifft6622, ifft6623);
__m512 ifft6718 = _mm512_mul_ps(ifft6710, ifft6623);
__m512 ifft6632 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft6633 = _mm512_fnmadd_ps(ifft6616, ifft6632, ifft6624);
__m512 ifft6719 = _mm512_fnmadd_ps(ifft6704, ifft6632, ifft6711);
__m512 ifft6634 = _mm512_fmadd_ps(ifft6615, ifft6632, ifft6625);
__m512 ifft6720 = _mm512_fmadd_ps(ifft6703, ifft6632, ifft6712);
__m512 ifft6635 = _mm512_fnmadd_ps(ifft6618, ifft6632, ifft6626);
__m512 ifft6721 = _mm512_fnmadd_ps(ifft6706, ifft6632, ifft6713);
__m512 ifft6636 = _mm512_fmadd_ps(ifft6617, ifft6632, ifft6627);
__m512 ifft6722 = _mm512_fmadd_ps(ifft6705, ifft6632, ifft6714);
__m512 ifft6637 = _mm512_fnmadd_ps(ifft6620, ifft6632, ifft6628);
__m512 ifft6723 = _mm512_fnmadd_ps(ifft6708, ifft6632, ifft6715);
__m512 ifft6638 = _mm512_fmadd_ps(ifft6619, ifft6632, ifft6629);
__m512 ifft6724 = _mm512_fmadd_ps(ifft6707, ifft6632, ifft6716);
__m512 ifft6639 = _mm512_fnmadd_ps(ifft6622, ifft6632, ifft6630);
__m512 ifft6725 = _mm512_fnmadd_ps(ifft6710, ifft6632, ifft6717);
__m512 ifft6640 = _mm512_fmadd_ps(ifft6621, ifft6632, ifft6631);
__m512 ifft6726 = _mm512_fmadd_ps(ifft6709, ifft6632, ifft6718);
// Step 4: butterfly at distance two. Shuffle immediate 78 swaps the 64-bit
// halves of every 128-bit group and the sign vector is +1,+1,-1,-1 per
// group, giving x0+x2, x1+x3, x0-x2, x1-x3.
__m512 ifft6641 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6642 = _mm512_fmadd_ps(ifft6633, ifft6641, _mm512_shuffle_ps(ifft6633, ifft6633, 78));
__m512 ifft6727 = _mm512_fmadd_ps(ifft6719, ifft6641, _mm512_shuffle_ps(ifft6719, ifft6719, 78));
__m512 ifft6643 = _mm512_fmadd_ps(ifft6634, ifft6641, _mm512_shuffle_ps(ifft6634, ifft6634, 78));
__m512 ifft6728 = _mm512_fmadd_ps(ifft6720, ifft6641, _mm512_shuffle_ps(ifft6720, ifft6720, 78));
__m512 ifft6644 = _mm512_fmadd_ps(ifft6635, ifft6641, _mm512_shuffle_ps(ifft6635, ifft6635, 78));
__m512 ifft6729 = _mm512_fmadd_ps(ifft6721, ifft6641, _mm512_shuffle_ps(ifft6721, ifft6721, 78));
__m512 ifft6645 = _mm512_fmadd_ps(ifft6636, ifft6641, _mm512_shuffle_ps(ifft6636, ifft6636, 78));
__m512 ifft6730 = _mm512_fmadd_ps(ifft6722, ifft6641, _mm512_shuffle_ps(ifft6722, ifft6722, 78));
__m512 ifft6646 = _mm512_fmadd_ps(ifft6637, ifft6641, _mm512_shuffle_ps(ifft6637, ifft6637, 78));
__m512 ifft6731 = _mm512_fmadd_ps(ifft6723, ifft6641, _mm512_shuffle_ps(ifft6723, ifft6723, 78));
__m512 ifft6647 = _mm512_fmadd_ps(ifft6638, ifft6641, _mm512_shuffle_ps(ifft6638, ifft6638, 78));
__m512 ifft6732 = _mm512_fmadd_ps(ifft6724, ifft6641, _mm512_shuffle_ps(ifft6724, ifft6724, 78));
__m512 ifft6648 = _mm512_fmadd_ps(ifft6639, ifft6641, _mm512_shuffle_ps(ifft6639, ifft6639, 78));
__m512 ifft6733 = _mm512_fmadd_ps(ifft6725, ifft6641, _mm512_shuffle_ps(ifft6725, ifft6725, 78));
__m512 ifft6649 = _mm512_fmadd_ps(ifft6640, ifft6641, _mm512_shuffle_ps(ifft6640, ifft6640, 78));
__m512 ifft6734 = _mm512_fmadd_ps(ifft6726, ifft6641, _mm512_shuffle_ps(ifft6726, ifft6726, 78));
// Step 5: in lanes 6, 7, 14 and 15 only (mask 49344 = 0xC0C0) each (a, b)
// pair is replaced by (-b, a); all other lanes pass through unchanged.
__m512 ifft6650 = _mm512_mask_sub_ps(ifft6642, 49344, _mm512_setzero_ps(), ifft6643);
__m512 ifft6735 = _mm512_mask_sub_ps(ifft6727, 49344, _mm512_setzero_ps(), ifft6728);
__m512 ifft6651 = _mm512_mask_mov_ps(ifft6643, 49344, ifft6642);
__m512 ifft6736 = _mm512_mask_mov_ps(ifft6728, 49344, ifft6727);
__m512 ifft6652 = _mm512_mask_sub_ps(ifft6644, 49344, _mm512_setzero_ps(), ifft6645);
__m512 ifft6737 = _mm512_mask_sub_ps(ifft6729, 49344, _mm512_setzero_ps(), ifft6730);
__m512 ifft6653 = _mm512_mask_mov_ps(ifft6645, 49344, ifft6644);
__m512 ifft6738 = _mm512_mask_mov_ps(ifft6730, 49344, ifft6729);
__m512 ifft6654 = _mm512_mask_sub_ps(ifft6646, 49344, _mm512_setzero_ps(), ifft6647);
__m512 ifft6739 = _mm512_mask_sub_ps(ifft6731, 49344, _mm512_setzero_ps(), ifft6732);
__m512 ifft6655 = _mm512_mask_mov_ps(ifft6647, 49344, ifft6646);
__m512 ifft6740 = _mm512_mask_mov_ps(ifft6732, 49344, ifft6731);
__m512 ifft6656 = _mm512_mask_sub_ps(ifft6648, 49344, _mm512_setzero_ps(), ifft6649);
__m512 ifft6741 = _mm512_mask_sub_ps(ifft6733, 49344, _mm512_setzero_ps(), ifft6734);
__m512 ifft6657 = _mm512_mask_mov_ps(ifft6649, 49344, ifft6648);
__m512 ifft6742 = _mm512_mask_mov_ps(ifft6734, 49344, ifft6733);
// Step 6: butterfly at distance four. _mm512_shuffle_f32x4 with immediate
// 177 swaps neighbouring 128-bit groups (0<->1, 2<->3) and the sign vector
// is +1 for groups 0 and 2, -1 for groups 1 and 3. ifft6664/ifft6748 use
// fnmsub instead of fmadd, so those two results are the negation of what
// the fmadd form would produce.
__m512 ifft6658 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft6659 = _mm512_fmadd_ps(ifft6650, ifft6658, _mm512_shuffle_f32x4(ifft6650, ifft6650, 177));
__m512 ifft6743 = _mm512_fmadd_ps(ifft6735, ifft6658, _mm512_shuffle_f32x4(ifft6735, ifft6735, 177));
__m512 ifft6660 = _mm512_fmadd_ps(ifft6651, ifft6658, _mm512_shuffle_f32x4(ifft6651, ifft6651, 177));
__m512 ifft6744 = _mm512_fmadd_ps(ifft6736, ifft6658, _mm512_shuffle_f32x4(ifft6736, ifft6736, 177));
__m512 ifft6661 = _mm512_fmadd_ps(ifft6652, ifft6658, _mm512_shuffle_f32x4(ifft6652, ifft6652, 177));
__m512 ifft6745 = _mm512_fmadd_ps(ifft6737, ifft6658, _mm512_shuffle_f32x4(ifft6737, ifft6737, 177));
__m512 ifft6662 = _mm512_fmadd_ps(ifft6653, ifft6658, _mm512_shuffle_f32x4(ifft6653, ifft6653, 177));
__m512 ifft6746 = _mm512_fmadd_ps(ifft6738, ifft6658, _mm512_shuffle_f32x4(ifft6738, ifft6738, 177));
__m512 ifft6663 = _mm512_fmadd_ps(ifft6654, ifft6658, _mm512_shuffle_f32x4(ifft6654, ifft6654, 177));
__m512 ifft6747 = _mm512_fmadd_ps(ifft6739, ifft6658, _mm512_shuffle_f32x4(ifft6739, ifft6739, 177));
__m512 ifft6664 = _mm512_fnmsub_ps(ifft6655, ifft6658, _mm512_shuffle_f32x4(ifft6655, ifft6655, 177));
__m512 ifft6748 = _mm512_fnmsub_ps(ifft6740, ifft6658, _mm512_shuffle_f32x4(ifft6740, ifft6740, 177));
__m512 ifft6665 = _mm512_fmadd_ps(ifft6656, ifft6658, _mm512_shuffle_f32x4(ifft6656, ifft6656, 177));
__m512 ifft6749 = _mm512_fmadd_ps(ifft6741, ifft6658, _mm512_shuffle_f32x4(ifft6741, ifft6741, 177));
__m512 ifft6666 = _mm512_fmadd_ps(ifft6657, ifft6658, _mm512_shuffle_f32x4(ifft6657, ifft6657, 177));
__m512 ifft6750 = _mm512_fmadd_ps(ifft6742, ifft6658, _mm512_shuffle_f32x4(ifft6742, ifft6742, 177));
// Step 7: combine the eight vectors of each set with each other (sums and
// differences across registers rather than across lanes). Scaling is folded
// into the FMAs: 1.5625e-02f is 1/64 and 3.125e-02f is 1/32.
__m512 ifft6667 = _mm512_add_ps(ifft6659, ifft6660);
__m512 ifft6751 = _mm512_add_ps(ifft6743, ifft6744);
__m512 ifft6668 = _mm512_sub_ps(ifft6659, ifft6660);
__m512 ifft6752 = _mm512_sub_ps(ifft6743, ifft6744);
__m512 ifft6669 = _mm512_sub_ps(ifft6661, ifft6665);
__m512 ifft6753 = _mm512_sub_ps(ifft6745, ifft6749);
__m512 ifft6670 = _mm512_add_ps(ifft6662, ifft6666);
__m512 ifft6754 = _mm512_add_ps(ifft6746, ifft6750);
__m512 ifft6671 = _mm512_add_ps(ifft6661, ifft6665);
__m512 ifft6755 = _mm512_add_ps(ifft6745, ifft6749);
__m512 ifft6672 = _mm512_sub_ps(ifft6662, ifft6666);
__m512 ifft6756 = _mm512_sub_ps(ifft6746, ifft6750);
__m512 ifft6673 = _mm512_mul_ps(ifft6663, _mm512_set1_ps(3.125e-02f));
__m512 ifft6757 = _mm512_mul_ps(ifft6747, _mm512_set1_ps(3.125e-02f));
__m512 ifft6674 = _mm512_mul_ps(ifft6664, _mm512_set1_ps(3.125e-02f));
__m512 ifft6758 = _mm512_mul_ps(ifft6748, _mm512_set1_ps(3.125e-02f));
__m512 ifft6675 = _mm512_fmadd_ps(ifft6667, _mm512_set1_ps(1.5625e-02f), ifft6673);
__m512 ifft6759 = _mm512_fmadd_ps(ifft6751, _mm512_set1_ps(1.5625e-02f), ifft6757);
__m512 ifft6676 = _mm512_fmsub_ps(ifft6667, _mm512_set1_ps(1.5625e-02f), ifft6673);
__m512 ifft6760 = _mm512_fmsub_ps(ifft6751, _mm512_set1_ps(1.5625e-02f), ifft6757);
__m512 ifft6677 = _mm512_fmadd_ps(ifft6668, _mm512_set1_ps(1.5625e-02f), ifft6674);
__m512 ifft6761 = _mm512_fmadd_ps(ifft6752, _mm512_set1_ps(1.5625e-02f), ifft6758);
__m512 ifft6678 = _mm512_fmsub_ps(ifft6668, _mm512_set1_ps(1.5625e-02f), ifft6674);
__m512 ifft6762 = _mm512_fmsub_ps(ifft6752, _mm512_set1_ps(1.5625e-02f), ifft6758);
__m512 ifft6679 = _mm512_add_ps(ifft6669, ifft6670);
__m512 ifft6763 = _mm512_add_ps(ifft6753, ifft6754);
__m512 ifft6680 = _mm512_sub_ps(ifft6669, ifft6670);
__m512 ifft6764 = _mm512_sub_ps(ifft6753, ifft6754);
__m512 ifft6681 = _mm512_fnmadd_ps(ifft6679, _mm512_set1_ps(7.0710677e-01f), ifft6671);
__m512 ifft6765 = _mm512_fnmadd_ps(ifft6763, _mm512_set1_ps(7.0710677e-01f), ifft6755);
__m512 ifft6682 = _mm512_fmadd_ps(ifft6679, _mm512_set1_ps(7.0710677e-01f), ifft6671);
__m512 ifft6766 = _mm512_fmadd_ps(ifft6763, _mm512_set1_ps(7.0710677e-01f), ifft6755);
__m512 ifft6683 = _mm512_fmadd_ps(ifft6680, _mm512_set1_ps(7.0710677e-01f), ifft6672);
__m512 ifft6767 = _mm512_fmadd_ps(ifft6764, _mm512_set1_ps(7.0710677e-01f), ifft6756);
__m512 ifft6684 = _mm512_fmsub_ps(ifft6680, _mm512_set1_ps(7.0710677e-01f), ifft6672);
__m512 ifft6768 = _mm512_fmsub_ps(ifft6764, _mm512_set1_ps(7.0710677e-01f), ifft6756);
__m512 ifft6685 = _mm512_add_ps(ifft6681, ifft6682);
__m512 ifft6769 = _mm512_add_ps(ifft6765, ifft6766);
__m512 ifft6686 = _mm512_sub_ps(ifft6681, ifft6682);
__m512 ifft6770 = _mm512_sub_ps(ifft6765, ifft6766);
__m512 ifft6687 = _mm512_add_ps(ifft6683, ifft6684);
__m512 ifft6771 = _mm512_add_ps(ifft6767, ifft6768);
__m512 ifft6688 = _mm512_sub_ps(ifft6683, ifft6684);
__m512 ifft6772 = _mm512_sub_ps(ifft6767, ifft6768);
// Final combination: eight results per set (ifft6689..ifft6696 for the
// first set, ifft6773..ifft6780 for the second).
__m512 ifft6689 = _mm512_fmadd_ps(ifft6685, _mm512_set1_ps(1.5625e-02f), ifft6675);
__m512 ifft6773 = _mm512_fmadd_ps(ifft6769, _mm512_set1_ps(1.5625e-02f), ifft6759);
__m512 ifft6690 = _mm512_fnmadd_ps(ifft6685, _mm512_set1_ps(1.5625e-02f), ifft6675);
__m512 ifft6774 = _mm512_fnmadd_ps(ifft6769, _mm512_set1_ps(1.5625e-02f), ifft6759);
__m512 ifft6691 = _mm512_fmadd_ps(ifft6687, _mm512_set1_ps(1.5625e-02f), ifft6677);
__m512 ifft6775 = _mm512_fmadd_ps(ifft6771, _mm512_set1_ps(1.5625e-02f), ifft6761);
__m512 ifft6692 = _mm512_fnmadd_ps(ifft6687, _mm512_set1_ps(1.5625e-02f), ifft6677);
__m512 ifft6776 = _mm512_fnmadd_ps(ifft6771, _mm512_set1_ps(1.5625e-02f), ifft6761);
__m512 ifft6693 = _mm512_fnmadd_ps(ifft6688, _mm512_set1_ps(1.5625e-02f), ifft6676);
__m512 ifft6777 = _mm512_fnmadd_ps(ifft6772, _mm512_set1_ps(1.5625e-02f), ifft6760);
__m512 ifft6694 = _mm512_fmadd_ps(ifft6688, _mm512_set1_ps(1.5625e-02f), ifft6676);
__m512 ifft6778 = _mm512_fmadd_ps(ifft6772, _mm512_set1_ps(1.5625e-02f), ifft6760);
__m512 ifft6695 = _mm512_fmadd_ps(ifft6686, _mm512_set1_ps(1.5625e-02f), ifft6678);
__m512 ifft6779 = _mm512_fmadd_ps(ifft6770, _mm512_set1_ps(1.5625e-02f), ifft6762);
__m512 ifft6696 = _mm512_fnmadd_ps(ifft6686, _mm512_set1_ps(1.5625e-02f), ifft6678);
__m512 ifft6780 = _mm512_fnmadd_ps(ifft6770, _mm512_set1_ps(1.5625e-02f), ifft6762);
// Output stage: reorder the transform results, merge the two sets lane-wise,
// clamp at zero and store.
//
// Seven of the eight results of each set are kept, taken in the order
// 6689, 6691, 6693, 6695, 6690, 6692, 6694 (and 6773.. likewise). The eighth
// result of each set (ifft6696 / ifft6780) is not used; the (void) casts
// only mark that as intentional.
__m512 dat1961 = ifft6689;
__m512 dat1968 = ifft6773;
__m512 dat1962 = ifft6691;
__m512 dat1969 = ifft6775;
__m512 dat1963 = ifft6693;
__m512 dat1970 = ifft6777;
__m512 dat1964 = ifft6695;
__m512 dat1971 = ifft6779;
__m512 dat1965 = ifft6690;
__m512 dat1972 = ifft6774;
__m512 dat1966 = ifft6692;
__m512 dat1973 = ifft6776;
__m512 dat1967 = ifft6694;
__m512 dat1974 = ifft6778;
(void)ifft6696;
(void)ifft6780;
// Two-source permutes: index values 0..15 select from the first operand and
// 16..31 from the second.
//   pm167: lanes 0..6  <- first operand elements 0..6,
//          lanes 7..13 <- second operand elements 0..6.
//   pm168: lanes 0..6  <- second operand elements 8..14,
//          lanes 7..13 <- first operand elements 8..14.
// Lanes 14 and 15 of every packed vector are never written out (see the
// store mask below), so their index values do not affect the stored data.
__m512i pm167 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack345 = _mm512_permutex2var_ps(dat1961, pm167, dat1968);
__m512i pm168 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack346 = _mm512_permutex2var_ps(dat1961, pm168, dat1968);
__m512 pack347 = _mm512_permutex2var_ps(dat1962, pm167, dat1969);
__m512 pack348 = _mm512_permutex2var_ps(dat1962, pm168, dat1969);
__m512 pack349 = _mm512_permutex2var_ps(dat1963, pm167, dat1970);
__m512 pack350 = _mm512_permutex2var_ps(dat1963, pm168, dat1970);
__m512 pack351 = _mm512_permutex2var_ps(dat1964, pm167, dat1971);
__m512 pack352 = _mm512_permutex2var_ps(dat1964, pm168, dat1971);
__m512 pack353 = _mm512_permutex2var_ps(dat1965, pm167, dat1972);
__m512 pack354 = _mm512_permutex2var_ps(dat1965, pm168, dat1972);
__m512 pack355 = _mm512_permutex2var_ps(dat1966, pm167, dat1973);
__m512 pack356 = _mm512_permutex2var_ps(dat1966, pm168, dat1973);
__m512 pack357 = _mm512_permutex2var_ps(dat1967, pm167, dat1974);
__m512 pack358 = _mm512_permutex2var_ps(dat1967, pm168, dat1974);
// Element-wise max against zero, i.e. negative values become 0.
// NOTE(review): this looks like the ReLU activation from the graph fused
// into the store path -- confirm against the generator.
pack345 = _mm512_max_ps(_mm512_setzero_ps(), pack345);
pack346 = _mm512_max_ps(_mm512_setzero_ps(), pack346);
pack347 = _mm512_max_ps(_mm512_setzero_ps(), pack347);
pack348 = _mm512_max_ps(_mm512_setzero_ps(), pack348);
pack349 = _mm512_max_ps(_mm512_setzero_ps(), pack349);
pack350 = _mm512_max_ps(_mm512_setzero_ps(), pack350);
pack351 = _mm512_max_ps(_mm512_setzero_ps(), pack351);
pack352 = _mm512_max_ps(_mm512_setzero_ps(), pack352);
pack353 = _mm512_max_ps(_mm512_setzero_ps(), pack353);
pack354 = _mm512_max_ps(_mm512_setzero_ps(), pack354);
pack355 = _mm512_max_ps(_mm512_setzero_ps(), pack355);
pack356 = _mm512_max_ps(_mm512_setzero_ps(), pack356);
pack357 = _mm512_max_ps(_mm512_setzero_ps(), pack357);
pack358 = _mm512_max_ps(_mm512_setzero_ps(), pack358);
// Masked unaligned stores to datPtr19. Mask 16383 (0x3FFF) writes lanes
// 0..13 only. Stores come in pairs: the pm167-packed vector at offset X and
// the pm168-packed vector at X+3136, with X advancing by 112 per pair
// (784, 896, ..., 1456). t40 is 0 in this pass, so the 56*t40 term is zero.
// NOTE(review): 112 and 3136 presumably are the row and plane strides of
// the destination tensor -- confirm against the declaration of datPtr19.
_mm512_mask_storeu_ps(datPtr19+784+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack345);
_mm512_mask_storeu_ps(datPtr19+3920+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack346);
_mm512_mask_storeu_ps(datPtr19+896+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack347);
_mm512_mask_storeu_ps(datPtr19+4032+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack348);
_mm512_mask_storeu_ps(datPtr19+1008+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack349);
_mm512_mask_storeu_ps(datPtr19+4144+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack350);
_mm512_mask_storeu_ps(datPtr19+1120+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack351);
_mm512_mask_storeu_ps(datPtr19+4256+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack352);
_mm512_mask_storeu_ps(datPtr19+1232+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack353);
_mm512_mask_storeu_ps(datPtr19+4368+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack354);
_mm512_mask_storeu_ps(datPtr19+1344+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack355);
_mm512_mask_storeu_ps(datPtr19+4480+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack356);
_mm512_mask_storeu_ps(datPtr19+1456+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack357);
_mm512_mask_storeu_ps(datPtr19+4592+25088*i41+12544*k115+6272*r22+112*toH39+4*toW39+56*t40, 16383, pack358);
}
}
++j34;
rel20 = 2;
}
ptrdiff_t toH40 = base20+21;
ptrdiff_t toW40 = 0;
ptrdiff_t k116 = 2*w53;
for (; k116 != 2; ++k116) {
ptrdiff_t r23 = 0;
for (; r23 != 2; ++r23) {
ptrdiff_t t41 = 0;
__m512 sfRe445 = _mm512_loadu_ps(sfPtr10+0+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm445 = _mm512_loadu_ps(sfPtr10+64+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe449 = _mm512_loadu_ps(sfPtr10+128+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm449 = _mm512_loadu_ps(sfPtr10+192+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe446 = _mm512_loadu_ps(sfPtr10+8192+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm446 = _mm512_loadu_ps(sfPtr10+8256+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe450 = _mm512_loadu_ps(sfPtr10+8320+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm450 = _mm512_loadu_ps(sfPtr10+8384+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe447 = _mm512_loadu_ps(sfPtr10+16384+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm447 = _mm512_loadu_ps(sfPtr10+16448+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe451 = _mm512_loadu_ps(sfPtr10+16512+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm451 = _mm512_loadu_ps(sfPtr10+16576+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe448 = _mm512_loadu_ps(sfPtr10+24576+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm448 = _mm512_loadu_ps(sfPtr10+24640+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfRe452 = _mm512_loadu_ps(sfPtr10+24704+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512 sfIm452 = _mm512_loadu_ps(sfPtr10+24768+32768*i41+3072*j34+1024*k116+512*r23+256*t41);
__m512i ifft6781 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6782 = _mm512_permutexvar_ps(ifft6781, sfRe445);
__m512 ifft6873 = _mm512_permutexvar_ps(ifft6781, sfRe449);
__m512i ifft6783 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6784 = _mm512_permutexvar_ps(ifft6783, sfRe445);
__m512 ifft6874 = _mm512_permutexvar_ps(ifft6783, sfRe449);
__m512 ifft6785 = _mm512_permutexvar_ps(ifft6781, sfIm445);
__m512 ifft6875 = _mm512_permutexvar_ps(ifft6781, sfIm449);
__m512 ifft6786 = _mm512_permutexvar_ps(ifft6783, sfIm445);
__m512 ifft6876 = _mm512_permutexvar_ps(ifft6783, sfIm449);
__m512 ifft6787 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft6788 = _mm512_mask_fmadd_ps(ifft6786, 65021, ifft6787, ifft6782);
__m512 ifft6877 = _mm512_mask_fmadd_ps(ifft6876, 65021, ifft6787, ifft6873);
__m512 ifft6789 = _mm512_mask_fnmadd_ps(ifft6785, 65021, ifft6787, ifft6784);
__m512 ifft6878 = _mm512_mask_fnmadd_ps(ifft6875, 65021, ifft6787, ifft6874);
__m512 ifft6790 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6791 = _mm512_fmadd_ps(ifft6788, ifft6790, _mm512_shuffle_ps(ifft6788, ifft6788, 177));
__m512 ifft6879 = _mm512_fmadd_ps(ifft6877, ifft6790, _mm512_shuffle_ps(ifft6877, ifft6877, 177));
__m512 ifft6792 = _mm512_fmadd_ps(ifft6789, ifft6790, _mm512_shuffle_ps(ifft6789, ifft6789, 177));
__m512 ifft6880 = _mm512_fmadd_ps(ifft6878, ifft6790, _mm512_shuffle_ps(ifft6878, ifft6878, 177));
__m512 ifft6793 = _mm512_fmadd_ps(sfRe446, ifft6790, _mm512_shuffle_ps(sfRe446, sfRe446, 177));
__m512 ifft6881 = _mm512_fmadd_ps(sfRe450, ifft6790, _mm512_shuffle_ps(sfRe450, sfRe450, 177));
__m512 ifft6794 = _mm512_fmadd_ps(sfIm446, ifft6790, _mm512_shuffle_ps(sfIm446, sfIm446, 177));
__m512 ifft6882 = _mm512_fmadd_ps(sfIm450, ifft6790, _mm512_shuffle_ps(sfIm450, sfIm450, 177));
__m512 ifft6795 = _mm512_fmadd_ps(sfRe447, ifft6790, _mm512_shuffle_ps(sfRe447, sfRe447, 177));
__m512 ifft6883 = _mm512_fmadd_ps(sfRe451, ifft6790, _mm512_shuffle_ps(sfRe451, sfRe451, 177));
__m512 ifft6796 = _mm512_fmadd_ps(sfIm447, ifft6790, _mm512_shuffle_ps(sfIm447, sfIm447, 177));
__m512 ifft6884 = _mm512_fmadd_ps(sfIm451, ifft6790, _mm512_shuffle_ps(sfIm451, sfIm451, 177));
__m512 ifft6797 = _mm512_fmadd_ps(sfRe448, ifft6790, _mm512_shuffle_ps(sfRe448, sfRe448, 177));
__m512 ifft6885 = _mm512_fmadd_ps(sfRe452, ifft6790, _mm512_shuffle_ps(sfRe452, sfRe452, 177));
__m512 ifft6798 = _mm512_fmadd_ps(sfIm448, ifft6790, _mm512_shuffle_ps(sfIm448, sfIm448, 177));
__m512 ifft6886 = _mm512_fmadd_ps(sfIm452, ifft6790, _mm512_shuffle_ps(sfIm452, sfIm452, 177));
__m512 ifft6799 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6800 = _mm512_mul_ps(ifft6791, ifft6799);
__m512 ifft6887 = _mm512_mul_ps(ifft6879, ifft6799);
__m512 ifft6801 = _mm512_mul_ps(ifft6792, ifft6799);
__m512 ifft6888 = _mm512_mul_ps(ifft6880, ifft6799);
__m512 ifft6802 = _mm512_mul_ps(ifft6793, ifft6799);
__m512 ifft6889 = _mm512_mul_ps(ifft6881, ifft6799);
__m512 ifft6803 = _mm512_mul_ps(ifft6794, ifft6799);
__m512 ifft6890 = _mm512_mul_ps(ifft6882, ifft6799);
__m512 ifft6804 = _mm512_mul_ps(ifft6795, ifft6799);
__m512 ifft6891 = _mm512_mul_ps(ifft6883, ifft6799);
__m512 ifft6805 = _mm512_mul_ps(ifft6796, ifft6799);
__m512 ifft6892 = _mm512_mul_ps(ifft6884, ifft6799);
__m512 ifft6806 = _mm512_mul_ps(ifft6797, ifft6799);
__m512 ifft6893 = _mm512_mul_ps(ifft6885, ifft6799);
__m512 ifft6807 = _mm512_mul_ps(ifft6798, ifft6799);
__m512 ifft6894 = _mm512_mul_ps(ifft6886, ifft6799);
__m512 ifft6808 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft6809 = _mm512_fnmadd_ps(ifft6792, ifft6808, ifft6800);
__m512 ifft6895 = _mm512_fnmadd_ps(ifft6880, ifft6808, ifft6887);
__m512 ifft6810 = _mm512_fmadd_ps(ifft6791, ifft6808, ifft6801);
__m512 ifft6896 = _mm512_fmadd_ps(ifft6879, ifft6808, ifft6888);
__m512 ifft6811 = _mm512_fnmadd_ps(ifft6794, ifft6808, ifft6802);
__m512 ifft6897 = _mm512_fnmadd_ps(ifft6882, ifft6808, ifft6889);
__m512 ifft6812 = _mm512_fmadd_ps(ifft6793, ifft6808, ifft6803);
__m512 ifft6898 = _mm512_fmadd_ps(ifft6881, ifft6808, ifft6890);
__m512 ifft6813 = _mm512_fnmadd_ps(ifft6796, ifft6808, ifft6804);
__m512 ifft6899 = _mm512_fnmadd_ps(ifft6884, ifft6808, ifft6891);
__m512 ifft6814 = _mm512_fmadd_ps(ifft6795, ifft6808, ifft6805);
__m512 ifft6900 = _mm512_fmadd_ps(ifft6883, ifft6808, ifft6892);
__m512 ifft6815 = _mm512_fnmadd_ps(ifft6798, ifft6808, ifft6806);
__m512 ifft6901 = _mm512_fnmadd_ps(ifft6886, ifft6808, ifft6893);
__m512 ifft6816 = _mm512_fmadd_ps(ifft6797, ifft6808, ifft6807);
__m512 ifft6902 = _mm512_fmadd_ps(ifft6885, ifft6808, ifft6894);
__m512 ifft6817 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6818 = _mm512_fmadd_ps(ifft6809, ifft6817, _mm512_shuffle_ps(ifft6809, ifft6809, 78));
__m512 ifft6903 = _mm512_fmadd_ps(ifft6895, ifft6817, _mm512_shuffle_ps(ifft6895, ifft6895, 78));
__m512 ifft6819 = _mm512_fmadd_ps(ifft6810, ifft6817, _mm512_shuffle_ps(ifft6810, ifft6810, 78));
__m512 ifft6904 = _mm512_fmadd_ps(ifft6896, ifft6817, _mm512_shuffle_ps(ifft6896, ifft6896, 78));
__m512 ifft6820 = _mm512_fmadd_ps(ifft6811, ifft6817, _mm512_shuffle_ps(ifft6811, ifft6811, 78));
__m512 ifft6905 = _mm512_fmadd_ps(ifft6897, ifft6817, _mm512_shuffle_ps(ifft6897, ifft6897, 78));
__m512 ifft6821 = _mm512_fmadd_ps(ifft6812, ifft6817, _mm512_shuffle_ps(ifft6812, ifft6812, 78));
__m512 ifft6906 = _mm512_fmadd_ps(ifft6898, ifft6817, _mm512_shuffle_ps(ifft6898, ifft6898, 78));
__m512 ifft6822 = _mm512_fmadd_ps(ifft6813, ifft6817, _mm512_shuffle_ps(ifft6813, ifft6813, 78));
__m512 ifft6907 = _mm512_fmadd_ps(ifft6899, ifft6817, _mm512_shuffle_ps(ifft6899, ifft6899, 78));
__m512 ifft6823 = _mm512_fmadd_ps(ifft6814, ifft6817, _mm512_shuffle_ps(ifft6814, ifft6814, 78));
__m512 ifft6908 = _mm512_fmadd_ps(ifft6900, ifft6817, _mm512_shuffle_ps(ifft6900, ifft6900, 78));
__m512 ifft6824 = _mm512_fmadd_ps(ifft6815, ifft6817, _mm512_shuffle_ps(ifft6815, ifft6815, 78));
__m512 ifft6909 = _mm512_fmadd_ps(ifft6901, ifft6817, _mm512_shuffle_ps(ifft6901, ifft6901, 78));
__m512 ifft6825 = _mm512_fmadd_ps(ifft6816, ifft6817, _mm512_shuffle_ps(ifft6816, ifft6816, 78));
__m512 ifft6910 = _mm512_fmadd_ps(ifft6902, ifft6817, _mm512_shuffle_ps(ifft6902, ifft6902, 78));
__m512 ifft6826 = _mm512_mask_sub_ps(ifft6818, 49344, _mm512_setzero_ps(), ifft6819);
__m512 ifft6911 = _mm512_mask_sub_ps(ifft6903, 49344, _mm512_setzero_ps(), ifft6904);
__m512 ifft6827 = _mm512_mask_mov_ps(ifft6819, 49344, ifft6818);
__m512 ifft6912 = _mm512_mask_mov_ps(ifft6904, 49344, ifft6903);
__m512 ifft6828 = _mm512_mask_sub_ps(ifft6820, 49344, _mm512_setzero_ps(), ifft6821);
__m512 ifft6913 = _mm512_mask_sub_ps(ifft6905, 49344, _mm512_setzero_ps(), ifft6906);
__m512 ifft6829 = _mm512_mask_mov_ps(ifft6821, 49344, ifft6820);
__m512 ifft6914 = _mm512_mask_mov_ps(ifft6906, 49344, ifft6905);
__m512 ifft6830 = _mm512_mask_sub_ps(ifft6822, 49344, _mm512_setzero_ps(), ifft6823);
__m512 ifft6915 = _mm512_mask_sub_ps(ifft6907, 49344, _mm512_setzero_ps(), ifft6908);
__m512 ifft6831 = _mm512_mask_mov_ps(ifft6823, 49344, ifft6822);
__m512 ifft6916 = _mm512_mask_mov_ps(ifft6908, 49344, ifft6907);
__m512 ifft6832 = _mm512_mask_sub_ps(ifft6824, 49344, _mm512_setzero_ps(), ifft6825);
__m512 ifft6917 = _mm512_mask_sub_ps(ifft6909, 49344, _mm512_setzero_ps(), ifft6910);
__m512 ifft6833 = _mm512_mask_mov_ps(ifft6825, 49344, ifft6824);
__m512 ifft6918 = _mm512_mask_mov_ps(ifft6910, 49344, ifft6909);
__m512 ifft6834 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft6835 = _mm512_fmadd_ps(ifft6826, ifft6834, _mm512_shuffle_f32x4(ifft6826, ifft6826, 177));
__m512 ifft6919 = _mm512_fmadd_ps(ifft6911, ifft6834, _mm512_shuffle_f32x4(ifft6911, ifft6911, 177));
__m512 ifft6836 = _mm512_fmadd_ps(ifft6827, ifft6834, _mm512_shuffle_f32x4(ifft6827, ifft6827, 177));
__m512 ifft6920 = _mm512_fmadd_ps(ifft6912, ifft6834, _mm512_shuffle_f32x4(ifft6912, ifft6912, 177));
__m512 ifft6837 = _mm512_fmadd_ps(ifft6828, ifft6834, _mm512_shuffle_f32x4(ifft6828, ifft6828, 177));
__m512 ifft6921 = _mm512_fmadd_ps(ifft6913, ifft6834, _mm512_shuffle_f32x4(ifft6913, ifft6913, 177));
__m512 ifft6838 = _mm512_fmadd_ps(ifft6829, ifft6834, _mm512_shuffle_f32x4(ifft6829, ifft6829, 177));
__m512 ifft6922 = _mm512_fmadd_ps(ifft6914, ifft6834, _mm512_shuffle_f32x4(ifft6914, ifft6914, 177));
__m512 ifft6839 = _mm512_fmadd_ps(ifft6830, ifft6834, _mm512_shuffle_f32x4(ifft6830, ifft6830, 177));
__m512 ifft6923 = _mm512_fmadd_ps(ifft6915, ifft6834, _mm512_shuffle_f32x4(ifft6915, ifft6915, 177));
__m512 ifft6840 = _mm512_fnmsub_ps(ifft6831, ifft6834, _mm512_shuffle_f32x4(ifft6831, ifft6831, 177));
__m512 ifft6924 = _mm512_fnmsub_ps(ifft6916, ifft6834, _mm512_shuffle_f32x4(ifft6916, ifft6916, 177));
__m512 ifft6841 = _mm512_fmadd_ps(ifft6832, ifft6834, _mm512_shuffle_f32x4(ifft6832, ifft6832, 177));
__m512 ifft6925 = _mm512_fmadd_ps(ifft6917, ifft6834, _mm512_shuffle_f32x4(ifft6917, ifft6917, 177));
__m512 ifft6842 = _mm512_fmadd_ps(ifft6833, ifft6834, _mm512_shuffle_f32x4(ifft6833, ifft6833, 177));
__m512 ifft6926 = _mm512_fmadd_ps(ifft6918, ifft6834, _mm512_shuffle_f32x4(ifft6918, ifft6918, 177));
__m512 ifft6843 = _mm512_add_ps(ifft6835, ifft6836);
__m512 ifft6927 = _mm512_add_ps(ifft6919, ifft6920);
__m512 ifft6844 = _mm512_sub_ps(ifft6835, ifft6836);
__m512 ifft6928 = _mm512_sub_ps(ifft6919, ifft6920);
__m512 ifft6845 = _mm512_sub_ps(ifft6837, ifft6841);
__m512 ifft6929 = _mm512_sub_ps(ifft6921, ifft6925);
__m512 ifft6846 = _mm512_add_ps(ifft6838, ifft6842);
__m512 ifft6930 = _mm512_add_ps(ifft6922, ifft6926);
__m512 ifft6847 = _mm512_add_ps(ifft6837, ifft6841);
__m512 ifft6931 = _mm512_add_ps(ifft6921, ifft6925);
__m512 ifft6848 = _mm512_sub_ps(ifft6838, ifft6842);
__m512 ifft6932 = _mm512_sub_ps(ifft6922, ifft6926);
__m512 ifft6849 = _mm512_mul_ps(ifft6839, _mm512_set1_ps(3.125e-02f));
__m512 ifft6933 = _mm512_mul_ps(ifft6923, _mm512_set1_ps(3.125e-02f));
__m512 ifft6850 = _mm512_mul_ps(ifft6840, _mm512_set1_ps(3.125e-02f));
__m512 ifft6934 = _mm512_mul_ps(ifft6924, _mm512_set1_ps(3.125e-02f));
__m512 ifft6851 = _mm512_fmadd_ps(ifft6843, _mm512_set1_ps(1.5625e-02f), ifft6849);
__m512 ifft6935 = _mm512_fmadd_ps(ifft6927, _mm512_set1_ps(1.5625e-02f), ifft6933);
__m512 ifft6852 = _mm512_fmsub_ps(ifft6843, _mm512_set1_ps(1.5625e-02f), ifft6849);
__m512 ifft6936 = _mm512_fmsub_ps(ifft6927, _mm512_set1_ps(1.5625e-02f), ifft6933);
__m512 ifft6853 = _mm512_fmadd_ps(ifft6844, _mm512_set1_ps(1.5625e-02f), ifft6850);
__m512 ifft6937 = _mm512_fmadd_ps(ifft6928, _mm512_set1_ps(1.5625e-02f), ifft6934);
__m512 ifft6854 = _mm512_fmsub_ps(ifft6844, _mm512_set1_ps(1.5625e-02f), ifft6850);
__m512 ifft6938 = _mm512_fmsub_ps(ifft6928, _mm512_set1_ps(1.5625e-02f), ifft6934);
__m512 ifft6855 = _mm512_add_ps(ifft6845, ifft6846);
__m512 ifft6939 = _mm512_add_ps(ifft6929, ifft6930);
__m512 ifft6856 = _mm512_sub_ps(ifft6845, ifft6846);
__m512 ifft6940 = _mm512_sub_ps(ifft6929, ifft6930);
__m512 ifft6857 = _mm512_fnmadd_ps(ifft6855, _mm512_set1_ps(7.0710677e-01f), ifft6847);
__m512 ifft6941 = _mm512_fnmadd_ps(ifft6939, _mm512_set1_ps(7.0710677e-01f), ifft6931);
__m512 ifft6858 = _mm512_fmadd_ps(ifft6855, _mm512_set1_ps(7.0710677e-01f), ifft6847);
__m512 ifft6942 = _mm512_fmadd_ps(ifft6939, _mm512_set1_ps(7.0710677e-01f), ifft6931);
__m512 ifft6859 = _mm512_fmadd_ps(ifft6856, _mm512_set1_ps(7.0710677e-01f), ifft6848);
__m512 ifft6943 = _mm512_fmadd_ps(ifft6940, _mm512_set1_ps(7.0710677e-01f), ifft6932);
__m512 ifft6860 = _mm512_fmsub_ps(ifft6856, _mm512_set1_ps(7.0710677e-01f), ifft6848);
__m512 ifft6944 = _mm512_fmsub_ps(ifft6940, _mm512_set1_ps(7.0710677e-01f), ifft6932);
__m512 ifft6861 = _mm512_add_ps(ifft6857, ifft6858);
__m512 ifft6945 = _mm512_add_ps(ifft6941, ifft6942);
__m512 ifft6862 = _mm512_sub_ps(ifft6857, ifft6858);
__m512 ifft6946 = _mm512_sub_ps(ifft6941, ifft6942);
__m512 ifft6863 = _mm512_add_ps(ifft6859, ifft6860);
__m512 ifft6947 = _mm512_add_ps(ifft6943, ifft6944);
__m512 ifft6864 = _mm512_sub_ps(ifft6859, ifft6860);
__m512 ifft6948 = _mm512_sub_ps(ifft6943, ifft6944);
__m512 ifft6865 = _mm512_fmadd_ps(ifft6861, _mm512_set1_ps(1.5625e-02f), ifft6851);
__m512 ifft6949 = _mm512_fmadd_ps(ifft6945, _mm512_set1_ps(1.5625e-02f), ifft6935);
__m512 ifft6866 = _mm512_fnmadd_ps(ifft6861, _mm512_set1_ps(1.5625e-02f), ifft6851);
__m512 ifft6950 = _mm512_fnmadd_ps(ifft6945, _mm512_set1_ps(1.5625e-02f), ifft6935);
__m512 ifft6867 = _mm512_fmadd_ps(ifft6863, _mm512_set1_ps(1.5625e-02f), ifft6853);
__m512 ifft6951 = _mm512_fmadd_ps(ifft6947, _mm512_set1_ps(1.5625e-02f), ifft6937);
__m512 ifft6868 = _mm512_fnmadd_ps(ifft6863, _mm512_set1_ps(1.5625e-02f), ifft6853);
__m512 ifft6952 = _mm512_fnmadd_ps(ifft6947, _mm512_set1_ps(1.5625e-02f), ifft6937);
__m512 ifft6869 = _mm512_fnmadd_ps(ifft6864, _mm512_set1_ps(1.5625e-02f), ifft6852);
__m512 ifft6953 = _mm512_fnmadd_ps(ifft6948, _mm512_set1_ps(1.5625e-02f), ifft6936);
__m512 ifft6870 = _mm512_fmadd_ps(ifft6864, _mm512_set1_ps(1.5625e-02f), ifft6852);
__m512 ifft6954 = _mm512_fmadd_ps(ifft6948, _mm512_set1_ps(1.5625e-02f), ifft6936);
__m512 ifft6871 = _mm512_fmadd_ps(ifft6862, _mm512_set1_ps(1.5625e-02f), ifft6854);
__m512 ifft6955 = _mm512_fmadd_ps(ifft6946, _mm512_set1_ps(1.5625e-02f), ifft6938);
__m512 ifft6872 = _mm512_fnmadd_ps(ifft6862, _mm512_set1_ps(1.5625e-02f), ifft6854);
__m512 ifft6956 = _mm512_fnmadd_ps(ifft6946, _mm512_set1_ps(1.5625e-02f), ifft6938);
__m512 dat1975 = ifft6865;
__m512 dat1982 = ifft6949;
__m512 dat1976 = ifft6867;
__m512 dat1983 = ifft6951;
__m512 dat1977 = ifft6869;
__m512 dat1984 = ifft6953;
__m512 dat1978 = ifft6871;
__m512 dat1985 = ifft6955;
__m512 dat1979 = ifft6866;
__m512 dat1986 = ifft6950;
__m512 dat1980 = ifft6868;
__m512 dat1987 = ifft6952;
__m512 dat1981 = ifft6870;
__m512 dat1988 = ifft6954;
(void)ifft6872;
(void)ifft6956;
__m512i pm169 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack359 = _mm512_permutex2var_ps(dat1975, pm169, dat1982);
__m512i pm170 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack360 = _mm512_permutex2var_ps(dat1975, pm170, dat1982);
__m512 pack361 = _mm512_permutex2var_ps(dat1976, pm169, dat1983);
__m512 pack362 = _mm512_permutex2var_ps(dat1976, pm170, dat1983);
__m512 pack363 = _mm512_permutex2var_ps(dat1977, pm169, dat1984);
__m512 pack364 = _mm512_permutex2var_ps(dat1977, pm170, dat1984);
__m512 pack365 = _mm512_permutex2var_ps(dat1978, pm169, dat1985);
__m512 pack366 = _mm512_permutex2var_ps(dat1978, pm170, dat1985);
__m512 pack367 = _mm512_permutex2var_ps(dat1979, pm169, dat1986);
__m512 pack368 = _mm512_permutex2var_ps(dat1979, pm170, dat1986);
__m512 pack369 = _mm512_permutex2var_ps(dat1980, pm169, dat1987);
__m512 pack370 = _mm512_permutex2var_ps(dat1980, pm170, dat1987);
__m512 pack371 = _mm512_permutex2var_ps(dat1981, pm169, dat1988);
__m512 pack372 = _mm512_permutex2var_ps(dat1981, pm170, dat1988);
pack359 = _mm512_max_ps(_mm512_setzero_ps(), pack359);
pack360 = _mm512_max_ps(_mm512_setzero_ps(), pack360);
pack361 = _mm512_max_ps(_mm512_setzero_ps(), pack361);
pack362 = _mm512_max_ps(_mm512_setzero_ps(), pack362);
pack363 = _mm512_max_ps(_mm512_setzero_ps(), pack363);
pack364 = _mm512_max_ps(_mm512_setzero_ps(), pack364);
pack365 = _mm512_max_ps(_mm512_setzero_ps(), pack365);
pack366 = _mm512_max_ps(_mm512_setzero_ps(), pack366);
pack367 = _mm512_max_ps(_mm512_setzero_ps(), pack367);
pack368 = _mm512_max_ps(_mm512_setzero_ps(), pack368);
pack369 = _mm512_max_ps(_mm512_setzero_ps(), pack369);
pack370 = _mm512_max_ps(_mm512_setzero_ps(), pack370);
pack371 = _mm512_max_ps(_mm512_setzero_ps(), pack371);
pack372 = _mm512_max_ps(_mm512_setzero_ps(), pack372);
_mm512_mask_storeu_ps(datPtr19+0+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack359);
_mm512_mask_storeu_ps(datPtr19+3136+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack360);
_mm512_mask_storeu_ps(datPtr19+112+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack361);
_mm512_mask_storeu_ps(datPtr19+3248+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack362);
_mm512_mask_storeu_ps(datPtr19+224+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack363);
_mm512_mask_storeu_ps(datPtr19+3360+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack364);
_mm512_mask_storeu_ps(datPtr19+336+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack365);
_mm512_mask_storeu_ps(datPtr19+3472+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack366);
_mm512_mask_storeu_ps(datPtr19+448+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack367);
_mm512_mask_storeu_ps(datPtr19+3584+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack368);
_mm512_mask_storeu_ps(datPtr19+560+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack369);
_mm512_mask_storeu_ps(datPtr19+3696+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack370);
_mm512_mask_storeu_ps(datPtr19+672+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack371);
_mm512_mask_storeu_ps(datPtr19+3808+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+0*t41, 16383, pack372);
ptrdiff_t t42 = 0;
__m512 sfRe453 = _mm512_loadu_ps(sfPtr10+256+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm453 = _mm512_loadu_ps(sfPtr10+320+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe457 = _mm512_loadu_ps(sfPtr10+384+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm457 = _mm512_loadu_ps(sfPtr10+448+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe454 = _mm512_loadu_ps(sfPtr10+8448+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm454 = _mm512_loadu_ps(sfPtr10+8512+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe458 = _mm512_loadu_ps(sfPtr10+8576+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm458 = _mm512_loadu_ps(sfPtr10+8640+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe455 = _mm512_loadu_ps(sfPtr10+16640+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm455 = _mm512_loadu_ps(sfPtr10+16704+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe459 = _mm512_loadu_ps(sfPtr10+16768+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm459 = _mm512_loadu_ps(sfPtr10+16832+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe456 = _mm512_loadu_ps(sfPtr10+24832+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm456 = _mm512_loadu_ps(sfPtr10+24896+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfRe460 = _mm512_loadu_ps(sfPtr10+24960+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512 sfIm460 = _mm512_loadu_ps(sfPtr10+25024+32768*i41+3072*j34+1024*k116+512*r23+256*t42);
__m512i ifft6957 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft6958 = _mm512_permutexvar_ps(ifft6957, sfRe453);
__m512 ifft7049 = _mm512_permutexvar_ps(ifft6957, sfRe457);
__m512i ifft6959 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft6960 = _mm512_permutexvar_ps(ifft6959, sfRe453);
__m512 ifft7050 = _mm512_permutexvar_ps(ifft6959, sfRe457);
__m512 ifft6961 = _mm512_permutexvar_ps(ifft6957, sfIm453);
__m512 ifft7051 = _mm512_permutexvar_ps(ifft6957, sfIm457);
__m512 ifft6962 = _mm512_permutexvar_ps(ifft6959, sfIm453);
__m512 ifft7052 = _mm512_permutexvar_ps(ifft6959, sfIm457);
__m512 ifft6963 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft6964 = _mm512_mask_fmadd_ps(ifft6962, 65021, ifft6963, ifft6958);
__m512 ifft7053 = _mm512_mask_fmadd_ps(ifft7052, 65021, ifft6963, ifft7049);
__m512 ifft6965 = _mm512_mask_fnmadd_ps(ifft6961, 65021, ifft6963, ifft6960);
__m512 ifft7054 = _mm512_mask_fnmadd_ps(ifft7051, 65021, ifft6963, ifft7050);
__m512 ifft6966 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft6967 = _mm512_fmadd_ps(ifft6964, ifft6966, _mm512_shuffle_ps(ifft6964, ifft6964, 177));
__m512 ifft7055 = _mm512_fmadd_ps(ifft7053, ifft6966, _mm512_shuffle_ps(ifft7053, ifft7053, 177));
__m512 ifft6968 = _mm512_fmadd_ps(ifft6965, ifft6966, _mm512_shuffle_ps(ifft6965, ifft6965, 177));
__m512 ifft7056 = _mm512_fmadd_ps(ifft7054, ifft6966, _mm512_shuffle_ps(ifft7054, ifft7054, 177));
__m512 ifft6969 = _mm512_fmadd_ps(sfRe454, ifft6966, _mm512_shuffle_ps(sfRe454, sfRe454, 177));
__m512 ifft7057 = _mm512_fmadd_ps(sfRe458, ifft6966, _mm512_shuffle_ps(sfRe458, sfRe458, 177));
__m512 ifft6970 = _mm512_fmadd_ps(sfIm454, ifft6966, _mm512_shuffle_ps(sfIm454, sfIm454, 177));
__m512 ifft7058 = _mm512_fmadd_ps(sfIm458, ifft6966, _mm512_shuffle_ps(sfIm458, sfIm458, 177));
__m512 ifft6971 = _mm512_fmadd_ps(sfRe455, ifft6966, _mm512_shuffle_ps(sfRe455, sfRe455, 177));
__m512 ifft7059 = _mm512_fmadd_ps(sfRe459, ifft6966, _mm512_shuffle_ps(sfRe459, sfRe459, 177));
__m512 ifft6972 = _mm512_fmadd_ps(sfIm455, ifft6966, _mm512_shuffle_ps(sfIm455, sfIm455, 177));
__m512 ifft7060 = _mm512_fmadd_ps(sfIm459, ifft6966, _mm512_shuffle_ps(sfIm459, sfIm459, 177));
__m512 ifft6973 = _mm512_fmadd_ps(sfRe456, ifft6966, _mm512_shuffle_ps(sfRe456, sfRe456, 177));
__m512 ifft7061 = _mm512_fmadd_ps(sfRe460, ifft6966, _mm512_shuffle_ps(sfRe460, sfRe460, 177));
__m512 ifft6974 = _mm512_fmadd_ps(sfIm456, ifft6966, _mm512_shuffle_ps(sfIm456, sfIm456, 177));
__m512 ifft7062 = _mm512_fmadd_ps(sfIm460, ifft6966, _mm512_shuffle_ps(sfIm460, sfIm460, 177));
__m512 ifft6975 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft6976 = _mm512_mul_ps(ifft6967, ifft6975);
__m512 ifft7063 = _mm512_mul_ps(ifft7055, ifft6975);
__m512 ifft6977 = _mm512_mul_ps(ifft6968, ifft6975);
__m512 ifft7064 = _mm512_mul_ps(ifft7056, ifft6975);
__m512 ifft6978 = _mm512_mul_ps(ifft6969, ifft6975);
__m512 ifft7065 = _mm512_mul_ps(ifft7057, ifft6975);
__m512 ifft6979 = _mm512_mul_ps(ifft6970, ifft6975);
__m512 ifft7066 = _mm512_mul_ps(ifft7058, ifft6975);
__m512 ifft6980 = _mm512_mul_ps(ifft6971, ifft6975);
__m512 ifft7067 = _mm512_mul_ps(ifft7059, ifft6975);
__m512 ifft6981 = _mm512_mul_ps(ifft6972, ifft6975);
__m512 ifft7068 = _mm512_mul_ps(ifft7060, ifft6975);
__m512 ifft6982 = _mm512_mul_ps(ifft6973, ifft6975);
__m512 ifft7069 = _mm512_mul_ps(ifft7061, ifft6975);
__m512 ifft6983 = _mm512_mul_ps(ifft6974, ifft6975);
__m512 ifft7070 = _mm512_mul_ps(ifft7062, ifft6975);
__m512 ifft6984 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft6985 = _mm512_fnmadd_ps(ifft6968, ifft6984, ifft6976);
__m512 ifft7071 = _mm512_fnmadd_ps(ifft7056, ifft6984, ifft7063);
__m512 ifft6986 = _mm512_fmadd_ps(ifft6967, ifft6984, ifft6977);
__m512 ifft7072 = _mm512_fmadd_ps(ifft7055, ifft6984, ifft7064);
__m512 ifft6987 = _mm512_fnmadd_ps(ifft6970, ifft6984, ifft6978);
__m512 ifft7073 = _mm512_fnmadd_ps(ifft7058, ifft6984, ifft7065);
__m512 ifft6988 = _mm512_fmadd_ps(ifft6969, ifft6984, ifft6979);
__m512 ifft7074 = _mm512_fmadd_ps(ifft7057, ifft6984, ifft7066);
__m512 ifft6989 = _mm512_fnmadd_ps(ifft6972, ifft6984, ifft6980);
__m512 ifft7075 = _mm512_fnmadd_ps(ifft7060, ifft6984, ifft7067);
__m512 ifft6990 = _mm512_fmadd_ps(ifft6971, ifft6984, ifft6981);
__m512 ifft7076 = _mm512_fmadd_ps(ifft7059, ifft6984, ifft7068);
__m512 ifft6991 = _mm512_fnmadd_ps(ifft6974, ifft6984, ifft6982);
__m512 ifft7077 = _mm512_fnmadd_ps(ifft7062, ifft6984, ifft7069);
__m512 ifft6992 = _mm512_fmadd_ps(ifft6973, ifft6984, ifft6983);
__m512 ifft7078 = _mm512_fmadd_ps(ifft7061, ifft6984, ifft7070);
__m512 ifft6993 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft6994 = _mm512_fmadd_ps(ifft6985, ifft6993, _mm512_shuffle_ps(ifft6985, ifft6985, 78));
__m512 ifft7079 = _mm512_fmadd_ps(ifft7071, ifft6993, _mm512_shuffle_ps(ifft7071, ifft7071, 78));
__m512 ifft6995 = _mm512_fmadd_ps(ifft6986, ifft6993, _mm512_shuffle_ps(ifft6986, ifft6986, 78));
__m512 ifft7080 = _mm512_fmadd_ps(ifft7072, ifft6993, _mm512_shuffle_ps(ifft7072, ifft7072, 78));
__m512 ifft6996 = _mm512_fmadd_ps(ifft6987, ifft6993, _mm512_shuffle_ps(ifft6987, ifft6987, 78));
__m512 ifft7081 = _mm512_fmadd_ps(ifft7073, ifft6993, _mm512_shuffle_ps(ifft7073, ifft7073, 78));
__m512 ifft6997 = _mm512_fmadd_ps(ifft6988, ifft6993, _mm512_shuffle_ps(ifft6988, ifft6988, 78));
__m512 ifft7082 = _mm512_fmadd_ps(ifft7074, ifft6993, _mm512_shuffle_ps(ifft7074, ifft7074, 78));
__m512 ifft6998 = _mm512_fmadd_ps(ifft6989, ifft6993, _mm512_shuffle_ps(ifft6989, ifft6989, 78));
__m512 ifft7083 = _mm512_fmadd_ps(ifft7075, ifft6993, _mm512_shuffle_ps(ifft7075, ifft7075, 78));
__m512 ifft6999 = _mm512_fmadd_ps(ifft6990, ifft6993, _mm512_shuffle_ps(ifft6990, ifft6990, 78));
__m512 ifft7084 = _mm512_fmadd_ps(ifft7076, ifft6993, _mm512_shuffle_ps(ifft7076, ifft7076, 78));
__m512 ifft7000 = _mm512_fmadd_ps(ifft6991, ifft6993, _mm512_shuffle_ps(ifft6991, ifft6991, 78));
__m512 ifft7085 = _mm512_fmadd_ps(ifft7077, ifft6993, _mm512_shuffle_ps(ifft7077, ifft7077, 78));
__m512 ifft7001 = _mm512_fmadd_ps(ifft6992, ifft6993, _mm512_shuffle_ps(ifft6992, ifft6992, 78));
__m512 ifft7086 = _mm512_fmadd_ps(ifft7078, ifft6993, _mm512_shuffle_ps(ifft7078, ifft7078, 78));
__m512 ifft7002 = _mm512_mask_sub_ps(ifft6994, 49344, _mm512_setzero_ps(), ifft6995);
__m512 ifft7087 = _mm512_mask_sub_ps(ifft7079, 49344, _mm512_setzero_ps(), ifft7080);
__m512 ifft7003 = _mm512_mask_mov_ps(ifft6995, 49344, ifft6994);
__m512 ifft7088 = _mm512_mask_mov_ps(ifft7080, 49344, ifft7079);
__m512 ifft7004 = _mm512_mask_sub_ps(ifft6996, 49344, _mm512_setzero_ps(), ifft6997);
__m512 ifft7089 = _mm512_mask_sub_ps(ifft7081, 49344, _mm512_setzero_ps(), ifft7082);
__m512 ifft7005 = _mm512_mask_mov_ps(ifft6997, 49344, ifft6996);
__m512 ifft7090 = _mm512_mask_mov_ps(ifft7082, 49344, ifft7081);
__m512 ifft7006 = _mm512_mask_sub_ps(ifft6998, 49344, _mm512_setzero_ps(), ifft6999);
__m512 ifft7091 = _mm512_mask_sub_ps(ifft7083, 49344, _mm512_setzero_ps(), ifft7084);
__m512 ifft7007 = _mm512_mask_mov_ps(ifft6999, 49344, ifft6998);
__m512 ifft7092 = _mm512_mask_mov_ps(ifft7084, 49344, ifft7083);
__m512 ifft7008 = _mm512_mask_sub_ps(ifft7000, 49344, _mm512_setzero_ps(), ifft7001);
__m512 ifft7093 = _mm512_mask_sub_ps(ifft7085, 49344, _mm512_setzero_ps(), ifft7086);
__m512 ifft7009 = _mm512_mask_mov_ps(ifft7001, 49344, ifft7000);
__m512 ifft7094 = _mm512_mask_mov_ps(ifft7086, 49344, ifft7085);
__m512 ifft7010 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft7011 = _mm512_fmadd_ps(ifft7002, ifft7010, _mm512_shuffle_f32x4(ifft7002, ifft7002, 177));
__m512 ifft7095 = _mm512_fmadd_ps(ifft7087, ifft7010, _mm512_shuffle_f32x4(ifft7087, ifft7087, 177));
__m512 ifft7012 = _mm512_fmadd_ps(ifft7003, ifft7010, _mm512_shuffle_f32x4(ifft7003, ifft7003, 177));
__m512 ifft7096 = _mm512_fmadd_ps(ifft7088, ifft7010, _mm512_shuffle_f32x4(ifft7088, ifft7088, 177));
__m512 ifft7013 = _mm512_fmadd_ps(ifft7004, ifft7010, _mm512_shuffle_f32x4(ifft7004, ifft7004, 177));
__m512 ifft7097 = _mm512_fmadd_ps(ifft7089, ifft7010, _mm512_shuffle_f32x4(ifft7089, ifft7089, 177));
__m512 ifft7014 = _mm512_fmadd_ps(ifft7005, ifft7010, _mm512_shuffle_f32x4(ifft7005, ifft7005, 177));
__m512 ifft7098 = _mm512_fmadd_ps(ifft7090, ifft7010, _mm512_shuffle_f32x4(ifft7090, ifft7090, 177));
__m512 ifft7015 = _mm512_fmadd_ps(ifft7006, ifft7010, _mm512_shuffle_f32x4(ifft7006, ifft7006, 177));
__m512 ifft7099 = _mm512_fmadd_ps(ifft7091, ifft7010, _mm512_shuffle_f32x4(ifft7091, ifft7091, 177));
__m512 ifft7016 = _mm512_fnmsub_ps(ifft7007, ifft7010, _mm512_shuffle_f32x4(ifft7007, ifft7007, 177));
__m512 ifft7100 = _mm512_fnmsub_ps(ifft7092, ifft7010, _mm512_shuffle_f32x4(ifft7092, ifft7092, 177));
__m512 ifft7017 = _mm512_fmadd_ps(ifft7008, ifft7010, _mm512_shuffle_f32x4(ifft7008, ifft7008, 177));
__m512 ifft7101 = _mm512_fmadd_ps(ifft7093, ifft7010, _mm512_shuffle_f32x4(ifft7093, ifft7093, 177));
__m512 ifft7018 = _mm512_fmadd_ps(ifft7009, ifft7010, _mm512_shuffle_f32x4(ifft7009, ifft7009, 177));
__m512 ifft7102 = _mm512_fmadd_ps(ifft7094, ifft7010, _mm512_shuffle_f32x4(ifft7094, ifft7094, 177));
__m512 ifft7019 = _mm512_add_ps(ifft7011, ifft7012);
__m512 ifft7103 = _mm512_add_ps(ifft7095, ifft7096);
__m512 ifft7020 = _mm512_sub_ps(ifft7011, ifft7012);
__m512 ifft7104 = _mm512_sub_ps(ifft7095, ifft7096);
__m512 ifft7021 = _mm512_sub_ps(ifft7013, ifft7017);
__m512 ifft7105 = _mm512_sub_ps(ifft7097, ifft7101);
__m512 ifft7022 = _mm512_add_ps(ifft7014, ifft7018);
__m512 ifft7106 = _mm512_add_ps(ifft7098, ifft7102);
__m512 ifft7023 = _mm512_add_ps(ifft7013, ifft7017);
__m512 ifft7107 = _mm512_add_ps(ifft7097, ifft7101);
__m512 ifft7024 = _mm512_sub_ps(ifft7014, ifft7018);
__m512 ifft7108 = _mm512_sub_ps(ifft7098, ifft7102);
__m512 ifft7025 = _mm512_mul_ps(ifft7015, _mm512_set1_ps(3.125e-02f));
__m512 ifft7109 = _mm512_mul_ps(ifft7099, _mm512_set1_ps(3.125e-02f));
__m512 ifft7026 = _mm512_mul_ps(ifft7016, _mm512_set1_ps(3.125e-02f));
__m512 ifft7110 = _mm512_mul_ps(ifft7100, _mm512_set1_ps(3.125e-02f));
__m512 ifft7027 = _mm512_fmadd_ps(ifft7019, _mm512_set1_ps(1.5625e-02f), ifft7025);
__m512 ifft7111 = _mm512_fmadd_ps(ifft7103, _mm512_set1_ps(1.5625e-02f), ifft7109);
__m512 ifft7028 = _mm512_fmsub_ps(ifft7019, _mm512_set1_ps(1.5625e-02f), ifft7025);
__m512 ifft7112 = _mm512_fmsub_ps(ifft7103, _mm512_set1_ps(1.5625e-02f), ifft7109);
__m512 ifft7029 = _mm512_fmadd_ps(ifft7020, _mm512_set1_ps(1.5625e-02f), ifft7026);
__m512 ifft7113 = _mm512_fmadd_ps(ifft7104, _mm512_set1_ps(1.5625e-02f), ifft7110);
__m512 ifft7030 = _mm512_fmsub_ps(ifft7020, _mm512_set1_ps(1.5625e-02f), ifft7026);
__m512 ifft7114 = _mm512_fmsub_ps(ifft7104, _mm512_set1_ps(1.5625e-02f), ifft7110);
__m512 ifft7031 = _mm512_add_ps(ifft7021, ifft7022);
__m512 ifft7115 = _mm512_add_ps(ifft7105, ifft7106);
__m512 ifft7032 = _mm512_sub_ps(ifft7021, ifft7022);
__m512 ifft7116 = _mm512_sub_ps(ifft7105, ifft7106);
__m512 ifft7033 = _mm512_fnmadd_ps(ifft7031, _mm512_set1_ps(7.0710677e-01f), ifft7023);
__m512 ifft7117 = _mm512_fnmadd_ps(ifft7115, _mm512_set1_ps(7.0710677e-01f), ifft7107);
__m512 ifft7034 = _mm512_fmadd_ps(ifft7031, _mm512_set1_ps(7.0710677e-01f), ifft7023);
__m512 ifft7118 = _mm512_fmadd_ps(ifft7115, _mm512_set1_ps(7.0710677e-01f), ifft7107);
__m512 ifft7035 = _mm512_fmadd_ps(ifft7032, _mm512_set1_ps(7.0710677e-01f), ifft7024);
__m512 ifft7119 = _mm512_fmadd_ps(ifft7116, _mm512_set1_ps(7.0710677e-01f), ifft7108);
__m512 ifft7036 = _mm512_fmsub_ps(ifft7032, _mm512_set1_ps(7.0710677e-01f), ifft7024);
__m512 ifft7120 = _mm512_fmsub_ps(ifft7116, _mm512_set1_ps(7.0710677e-01f), ifft7108);
__m512 ifft7037 = _mm512_add_ps(ifft7033, ifft7034);
__m512 ifft7121 = _mm512_add_ps(ifft7117, ifft7118);
__m512 ifft7038 = _mm512_sub_ps(ifft7033, ifft7034);
__m512 ifft7122 = _mm512_sub_ps(ifft7117, ifft7118);
__m512 ifft7039 = _mm512_add_ps(ifft7035, ifft7036);
__m512 ifft7123 = _mm512_add_ps(ifft7119, ifft7120);
__m512 ifft7040 = _mm512_sub_ps(ifft7035, ifft7036);
__m512 ifft7124 = _mm512_sub_ps(ifft7119, ifft7120);
__m512 ifft7041 = _mm512_fmadd_ps(ifft7037, _mm512_set1_ps(1.5625e-02f), ifft7027);
__m512 ifft7125 = _mm512_fmadd_ps(ifft7121, _mm512_set1_ps(1.5625e-02f), ifft7111);
__m512 ifft7042 = _mm512_fnmadd_ps(ifft7037, _mm512_set1_ps(1.5625e-02f), ifft7027);
__m512 ifft7126 = _mm512_fnmadd_ps(ifft7121, _mm512_set1_ps(1.5625e-02f), ifft7111);
__m512 ifft7043 = _mm512_fmadd_ps(ifft7039, _mm512_set1_ps(1.5625e-02f), ifft7029);
__m512 ifft7127 = _mm512_fmadd_ps(ifft7123, _mm512_set1_ps(1.5625e-02f), ifft7113);
__m512 ifft7044 = _mm512_fnmadd_ps(ifft7039, _mm512_set1_ps(1.5625e-02f), ifft7029);
__m512 ifft7128 = _mm512_fnmadd_ps(ifft7123, _mm512_set1_ps(1.5625e-02f), ifft7113);
__m512 ifft7045 = _mm512_fnmadd_ps(ifft7040, _mm512_set1_ps(1.5625e-02f), ifft7028);
__m512 ifft7129 = _mm512_fnmadd_ps(ifft7124, _mm512_set1_ps(1.5625e-02f), ifft7112);
__m512 ifft7046 = _mm512_fmadd_ps(ifft7040, _mm512_set1_ps(1.5625e-02f), ifft7028);
__m512 ifft7130 = _mm512_fmadd_ps(ifft7124, _mm512_set1_ps(1.5625e-02f), ifft7112);
__m512 ifft7047 = _mm512_fmadd_ps(ifft7038, _mm512_set1_ps(1.5625e-02f), ifft7030);
__m512 ifft7131 = _mm512_fmadd_ps(ifft7122, _mm512_set1_ps(1.5625e-02f), ifft7114);
__m512 ifft7048 = _mm512_fnmadd_ps(ifft7038, _mm512_set1_ps(1.5625e-02f), ifft7030);
__m512 ifft7132 = _mm512_fnmadd_ps(ifft7122, _mm512_set1_ps(1.5625e-02f), ifft7114);
__m512 dat1989 = ifft7041;
__m512 dat1996 = ifft7125;
__m512 dat1990 = ifft7043;
__m512 dat1997 = ifft7127;
__m512 dat1991 = ifft7045;
__m512 dat1998 = ifft7129;
__m512 dat1992 = ifft7047;
__m512 dat1999 = ifft7131;
__m512 dat1993 = ifft7042;
__m512 dat2000 = ifft7126;
__m512 dat1994 = ifft7044;
__m512 dat2001 = ifft7128;
__m512 dat1995 = ifft7046;
__m512 dat2002 = ifft7130;
(void)ifft7048;
(void)ifft7132;
__m512i pm171 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack373 = _mm512_permutex2var_ps(dat1989, pm171, dat1996);
__m512i pm172 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack374 = _mm512_permutex2var_ps(dat1989, pm172, dat1996);
__m512 pack375 = _mm512_permutex2var_ps(dat1990, pm171, dat1997);
__m512 pack376 = _mm512_permutex2var_ps(dat1990, pm172, dat1997);
__m512 pack377 = _mm512_permutex2var_ps(dat1991, pm171, dat1998);
__m512 pack378 = _mm512_permutex2var_ps(dat1991, pm172, dat1998);
__m512 pack379 = _mm512_permutex2var_ps(dat1992, pm171, dat1999);
__m512 pack380 = _mm512_permutex2var_ps(dat1992, pm172, dat1999);
__m512 pack381 = _mm512_permutex2var_ps(dat1993, pm171, dat2000);
__m512 pack382 = _mm512_permutex2var_ps(dat1993, pm172, dat2000);
__m512 pack383 = _mm512_permutex2var_ps(dat1994, pm171, dat2001);
__m512 pack384 = _mm512_permutex2var_ps(dat1994, pm172, dat2001);
__m512 pack385 = _mm512_permutex2var_ps(dat1995, pm171, dat2002);
__m512 pack386 = _mm512_permutex2var_ps(dat1995, pm172, dat2002);
pack373 = _mm512_max_ps(_mm512_setzero_ps(), pack373);
pack374 = _mm512_max_ps(_mm512_setzero_ps(), pack374);
pack375 = _mm512_max_ps(_mm512_setzero_ps(), pack375);
pack376 = _mm512_max_ps(_mm512_setzero_ps(), pack376);
pack377 = _mm512_max_ps(_mm512_setzero_ps(), pack377);
pack378 = _mm512_max_ps(_mm512_setzero_ps(), pack378);
pack379 = _mm512_max_ps(_mm512_setzero_ps(), pack379);
pack380 = _mm512_max_ps(_mm512_setzero_ps(), pack380);
pack381 = _mm512_max_ps(_mm512_setzero_ps(), pack381);
pack382 = _mm512_max_ps(_mm512_setzero_ps(), pack382);
pack383 = _mm512_max_ps(_mm512_setzero_ps(), pack383);
pack384 = _mm512_max_ps(_mm512_setzero_ps(), pack384);
pack385 = _mm512_max_ps(_mm512_setzero_ps(), pack385);
pack386 = _mm512_max_ps(_mm512_setzero_ps(), pack386);
_mm512_mask_storeu_ps(datPtr19+56+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack373);
_mm512_mask_storeu_ps(datPtr19+3192+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack374);
_mm512_mask_storeu_ps(datPtr19+168+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack375);
_mm512_mask_storeu_ps(datPtr19+3304+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack376);
_mm512_mask_storeu_ps(datPtr19+280+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack377);
_mm512_mask_storeu_ps(datPtr19+3416+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack378);
_mm512_mask_storeu_ps(datPtr19+392+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack379);
_mm512_mask_storeu_ps(datPtr19+3528+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack380);
_mm512_mask_storeu_ps(datPtr19+504+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack381);
_mm512_mask_storeu_ps(datPtr19+3640+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack382);
_mm512_mask_storeu_ps(datPtr19+616+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack383);
_mm512_mask_storeu_ps(datPtr19+3752+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack384);
_mm512_mask_storeu_ps(datPtr19+728+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack385);
_mm512_mask_storeu_ps(datPtr19+3864+25088*i41+12544*k116+6272*r23+112*toH40+4*toW40+56*t42, 16383, pack386);
}
}
++j34;
}
}

/*
 * Launches the ConsumeSums2 stage on a threader team.
 *
 * Fills in a ResNeXt50ThreaderTask1 that points at
 * ResNeXt50StriderConsumeSums2Callee1, stores the tensor pointer array
 * unchanged in the task's any1 field, declares three dimensions with
 * hull values {1, 1, 8}, and submits the task via ResNeXt50ThreaderDo1.
 *
 * team45    - threader team, passed straight through to ResNeXt50ThreaderDo1.
 * tensors63 - tensor pointer array, stored in the task's any1 field.
 *
 * NOTE(review): hull1[] presumably holds the per-dimension extent of the
 * work grid (1 x 1 x 8 points) - confirm against ResNeXt50ThreaderDo1.
 */
static void ResNeXt50StriderConsumeSums2(ResNeXt50ThreaderTeam1* team45, char** tensors63) {
    ResNeXt50ThreaderTask1 job;

    /* Shape of the task: three dimensions, hull 1 x 1 x 8. */
    job.nd1 = 3;
    job.hull1[2] = 8;
    job.hull1[1] = 1;
    job.hull1[0] = 1;

    /* Entry point and the opaque argument it is given. */
    job.any1 = tensors63;
    job.callee1 = ResNeXt50StriderConsumeSums2Callee1;

    ResNeXt50ThreaderDo1(team45, &job);
}

// Worker callee installed by ResNeXt50StriderArrangeFilts3 (task.callee1).
//
// Inputs:
//   task100 - task descriptor; task100->any1 is the tensor pointer table.
//   pt55    - coordinates of this work item: pt55[0] (b76) selects a run of
//             four consecutive j51 values (4*b76 .. 4*b76+3) and pt55[1]
//             (g31) is used directly as i59.
//
// Tensor table as used here:
//   tensors98[0] - source weights, read as 3-float rows (36 bytes per k153).
//   tensors98[1] - biases, two floats read per j51.
//   tensors98[2] - interleaved float pairs; even floats are used as
//                  multipliers and odd floats as addends (see the
//                  pmMul34/pmAdd34 permutations below).
//   tensors98[3] - destination: the first 2048 bytes receive the adjusted
//                  biases (bfPtr13), the rest receives the transformed
//                  weights as half-precision values (wfPtr13).
//
// For each j51 two weight slots are processed (byte offsets 0 and 576 inside
// the 1152-byte j51 stride). For each slot and each of 16 k153 values the
// three rows are scaled by the slot's multiplier, pushed through the generated
// fft* butterfly sequence, converted to fp16 and scattered into wfPtr13.
// NOTE(review): the fft* sequence is presumably a 2-D real FFT of the
// zero-padded 3x3 filter (inferred from variable names only) - confirm
// against the generator before relying on this.
static void ResNeXt50StriderArrangeFilts3Callee1(ResNeXt50ThreaderTask1* task100, int64_t* pt55) {
char** tensors98 = task100->any1;
ptrdiff_t b76 = pt55[0];
ptrdiff_t g31 = pt55[1];
// e28 is the constant 0 here, so every "*e28" term below vanishes and the
// "if (!e28)" branch further down is always taken.
ptrdiff_t e28 = 0;
// bfPtr13 and wfPtr13 both point into tensors98[3]; the weight area starts
// 2048 bytes after the bias area.
char*restrict bfPtr13 = tensors98[3]+2048*e28;
char*restrict wfPtr13 = tensors98[3]+2048+130023424*e28;
char*restrict wtPtr16 = tensors98[0]+17856*e28;
char*restrict biasPtr16 = tensors98[1];
char*restrict bnPtr17 = tensors98[2];
ptrdiff_t i59 = 1*g31;
// j51 walks 4*b76 .. jj51 inclusive; the loop leaves via the return at the
// bottom once j51 reaches jj51.
ptrdiff_t j51 = 4*b76;
ptrdiff_t jj51 = j51+3;
if (j51 < 8) {
for (; j51 != 8; ++j51) {
// Multipliers for the two slots handled by this j51: float index
// 2*(16*i59+2*j51) and 2*(16*i59+2*j51+1) in bnPtr17, broadcast to all lanes.
__m512 postMul50 = _mm512_set1_ps(((float*)bnPtr17+(ptrdiff_t)2*(0+16*i59+2*j51))[0]);
__m512 postMul51 = _mm512_set1_ps(((float*)bnPtr17+(ptrdiff_t)2*(1+16*i59+2*j51))[0]);
for (ptrdiff_t k153 = 0; k153 < 16; ++k153) {
// ---- First slot (byte offset 0 within the j51 stride) ----
// Mask 7 loads three floats into lanes 0..2 and zeroes the other lanes;
// the three loads are 12 bytes apart, i.e. three consecutive 3-float rows.
__m512 wt595 = _mm512_maskz_loadu_ps(7, wtPtr16+0+9216*i59+1152*j51+36*k153);
__m512 wt596 = _mm512_maskz_loadu_ps(7, wtPtr16+12+9216*i59+1152*j51+36*k153);
__m512 wt597 = _mm512_maskz_loadu_ps(7, wtPtr16+24+9216*i59+1152*j51+36*k153);
wt595 = _mm512_mul_ps(postMul50, wt595);
wt596 = _mm512_mul_ps(postMul50, wt596);
wt597 = _mm512_mul_ps(postMul50, wt597);
// First butterfly stage across rows. Only the three loaded rows are
// non-zero; every other operand of this stage is the constant zero vector.
__m512 fft9577 = _mm512_add_ps(wt595, _mm512_setzero_ps());
__m512 fft9665 = _mm512_add_ps(wt596, _mm512_setzero_ps());
__m512 fft9578 = _mm512_sub_ps(wt595, _mm512_setzero_ps());
__m512 fft9666 = _mm512_sub_ps(wt596, _mm512_setzero_ps());
__m512 fft9579 = _mm512_add_ps(wt597, _mm512_setzero_ps());
__m512 fft9667 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9580 = _mm512_sub_ps(wt597, _mm512_setzero_ps());
__m512 fft9668 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9581 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9669 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9582 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9670 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9583 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9671 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9584 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9672 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9585 = _mm512_add_ps(fft9577, fft9581);
__m512 fft9673 = _mm512_add_ps(fft9665, fft9669);
__m512 fft9586 = _mm512_sub_ps(fft9577, fft9581);
__m512 fft9674 = _mm512_sub_ps(fft9665, fft9669);
__m512 fft9587 = _mm512_add_ps(fft9579, fft9583);
__m512 fft9675 = _mm512_add_ps(fft9667, fft9671);
__m512 fft9588 = _mm512_sub_ps(fft9583, fft9579);
__m512 fft9676 = _mm512_sub_ps(fft9671, fft9667);
__m512 fft9589 = _mm512_sub_ps(fft9580, fft9584);
__m512 fft9677 = _mm512_sub_ps(fft9668, fft9672);
__m512 fft9590 = _mm512_add_ps(fft9580, fft9584);
__m512 fft9678 = _mm512_add_ps(fft9668, fft9672);
__m512 fft9591 = _mm512_add_ps(fft9585, fft9587);
__m512 fft9679 = _mm512_add_ps(fft9673, fft9675);
__m512 fft9592 = _mm512_sub_ps(fft9585, fft9587);
__m512 fft9680 = _mm512_sub_ps(fft9673, fft9675);
// 7.0710677e-01f is sqrt(2)/2 rounded to float.
__m512 fft9593 = _mm512_fmadd_ps(fft9589, _mm512_set1_ps(7.0710677e-01f), fft9578);
__m512 fft9681 = _mm512_fmadd_ps(fft9677, _mm512_set1_ps(7.0710677e-01f), fft9666);
__m512 fft9594 = _mm512_fnmsub_ps(fft9590, _mm512_set1_ps(7.0710677e-01f), fft9582);
__m512 fft9682 = _mm512_fnmsub_ps(fft9678, _mm512_set1_ps(7.0710677e-01f), fft9670);
__m512 fft9595 = _mm512_fnmadd_ps(fft9589, _mm512_set1_ps(7.0710677e-01f), fft9578);
__m512 fft9683 = _mm512_fnmadd_ps(fft9677, _mm512_set1_ps(7.0710677e-01f), fft9666);
__m512 fft9596 = _mm512_fnmadd_ps(fft9590, _mm512_set1_ps(7.0710677e-01f), fft9582);
__m512 fft9684 = _mm512_fnmadd_ps(fft9678, _mm512_set1_ps(7.0710677e-01f), fft9670);
// In-register butterfly between the two 256-bit halves: shuffle 78 swaps the
// halves, the sign vector is +1 in lanes 0..7 and -1 in lanes 8..15.
__m512 fft9597 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9598 = _mm512_fmadd_ps(fft9591, fft9597, _mm512_shuffle_f32x4(fft9591, fft9591, 78));
__m512 fft9685 = _mm512_fmadd_ps(fft9679, fft9597, _mm512_shuffle_f32x4(fft9679, fft9679, 78));
__m512 fft9599 = _mm512_fmadd_ps(fft9592, fft9597, _mm512_shuffle_f32x4(fft9592, fft9592, 78));
__m512 fft9686 = _mm512_fmadd_ps(fft9680, fft9597, _mm512_shuffle_f32x4(fft9680, fft9680, 78));
__m512 fft9600 = _mm512_fmadd_ps(fft9593, fft9597, _mm512_shuffle_f32x4(fft9593, fft9593, 78));
__m512 fft9687 = _mm512_fmadd_ps(fft9681, fft9597, _mm512_shuffle_f32x4(fft9681, fft9681, 78));
__m512 fft9601 = _mm512_fmadd_ps(fft9594, fft9597, _mm512_shuffle_f32x4(fft9594, fft9594, 78));
__m512 fft9688 = _mm512_fmadd_ps(fft9682, fft9597, _mm512_shuffle_f32x4(fft9682, fft9682, 78));
__m512 fft9602 = _mm512_fmadd_ps(fft9586, fft9597, _mm512_shuffle_f32x4(fft9586, fft9586, 78));
__m512 fft9689 = _mm512_fmadd_ps(fft9674, fft9597, _mm512_shuffle_f32x4(fft9674, fft9674, 78));
__m512 fft9603 = _mm512_fmadd_ps(fft9588, fft9597, _mm512_shuffle_f32x4(fft9588, fft9588, 78));
__m512 fft9690 = _mm512_fmadd_ps(fft9676, fft9597, _mm512_shuffle_f32x4(fft9676, fft9676, 78));
__m512 fft9604 = _mm512_fmadd_ps(fft9595, fft9597, _mm512_shuffle_f32x4(fft9595, fft9595, 78));
__m512 fft9691 = _mm512_fmadd_ps(fft9683, fft9597, _mm512_shuffle_f32x4(fft9683, fft9683, 78));
__m512 fft9605 = _mm512_fmadd_ps(fft9596, fft9597, _mm512_shuffle_f32x4(fft9596, fft9596, 78));
__m512 fft9692 = _mm512_fmadd_ps(fft9684, fft9597, _mm512_shuffle_f32x4(fft9684, fft9684, 78));
// Per-lane constant multiply combining each vector with its partner
// (fft9606 and fft9615 hold the two per-lane coefficient sets; lanes 0..7 pass through unchanged).
__m512 fft9606 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9607 = _mm512_mul_ps(fft9598, fft9606);
__m512 fft9693 = _mm512_mul_ps(fft9685, fft9606);
__m512 fft9608 = _mm512_mul_ps(fft9599, fft9606);
__m512 fft9694 = _mm512_mul_ps(fft9686, fft9606);
__m512 fft9609 = _mm512_mul_ps(fft9600, fft9606);
__m512 fft9695 = _mm512_mul_ps(fft9687, fft9606);
__m512 fft9610 = _mm512_mul_ps(fft9601, fft9606);
__m512 fft9696 = _mm512_mul_ps(fft9688, fft9606);
__m512 fft9611 = _mm512_mul_ps(fft9602, fft9606);
__m512 fft9697 = _mm512_mul_ps(fft9689, fft9606);
__m512 fft9612 = _mm512_mul_ps(fft9603, fft9606);
__m512 fft9698 = _mm512_mul_ps(fft9690, fft9606);
__m512 fft9613 = _mm512_mul_ps(fft9604, fft9606);
__m512 fft9699 = _mm512_mul_ps(fft9691, fft9606);
__m512 fft9614 = _mm512_mul_ps(fft9605, fft9606);
__m512 fft9700 = _mm512_mul_ps(fft9692, fft9606);
__m512 fft9615 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9616 = _mm512_fmadd_ps(fft9599, fft9615, fft9607);
__m512 fft9701 = _mm512_fmadd_ps(fft9686, fft9615, fft9693);
__m512 fft9617 = _mm512_fnmadd_ps(fft9598, fft9615, fft9608);
__m512 fft9702 = _mm512_fnmadd_ps(fft9685, fft9615, fft9694);
__m512 fft9618 = _mm512_fmadd_ps(fft9601, fft9615, fft9609);
__m512 fft9703 = _mm512_fmadd_ps(fft9688, fft9615, fft9695);
__m512 fft9619 = _mm512_fnmadd_ps(fft9600, fft9615, fft9610);
__m512 fft9704 = _mm512_fnmadd_ps(fft9687, fft9615, fft9696);
__m512 fft9620 = _mm512_fmadd_ps(fft9603, fft9615, fft9611);
__m512 fft9705 = _mm512_fmadd_ps(fft9690, fft9615, fft9697);
__m512 fft9621 = _mm512_fnmadd_ps(fft9602, fft9615, fft9612);
__m512 fft9706 = _mm512_fnmadd_ps(fft9689, fft9615, fft9698);
__m512 fft9622 = _mm512_fmadd_ps(fft9605, fft9615, fft9613);
__m512 fft9707 = _mm512_fmadd_ps(fft9692, fft9615, fft9699);
__m512 fft9623 = _mm512_fnmadd_ps(fft9604, fft9615, fft9614);
__m512 fft9708 = _mm512_fnmadd_ps(fft9691, fft9615, fft9700);
// Butterfly between adjacent 128-bit blocks: shuffle 177 swaps blocks
// 0<->1 and 2<->3, signs alternate +1/-1 per 4-lane block.
__m512 fft9624 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9625 = _mm512_fmadd_ps(fft9616, fft9624, _mm512_shuffle_f32x4(fft9616, fft9616, 177));
__m512 fft9709 = _mm512_fmadd_ps(fft9701, fft9624, _mm512_shuffle_f32x4(fft9701, fft9701, 177));
__m512 fft9626 = _mm512_fmadd_ps(fft9617, fft9624, _mm512_shuffle_f32x4(fft9617, fft9617, 177));
__m512 fft9710 = _mm512_fmadd_ps(fft9702, fft9624, _mm512_shuffle_f32x4(fft9702, fft9702, 177));
__m512 fft9627 = _mm512_fmadd_ps(fft9618, fft9624, _mm512_shuffle_f32x4(fft9618, fft9618, 177));
__m512 fft9711 = _mm512_fmadd_ps(fft9703, fft9624, _mm512_shuffle_f32x4(fft9703, fft9703, 177));
__m512 fft9628 = _mm512_fmadd_ps(fft9619, fft9624, _mm512_shuffle_f32x4(fft9619, fft9619, 177));
__m512 fft9712 = _mm512_fmadd_ps(fft9704, fft9624, _mm512_shuffle_f32x4(fft9704, fft9704, 177));
__m512 fft9629 = _mm512_fmadd_ps(fft9620, fft9624, _mm512_shuffle_f32x4(fft9620, fft9620, 177));
__m512 fft9713 = _mm512_fmadd_ps(fft9705, fft9624, _mm512_shuffle_f32x4(fft9705, fft9705, 177));
__m512 fft9630 = _mm512_fmadd_ps(fft9621, fft9624, _mm512_shuffle_f32x4(fft9621, fft9621, 177));
__m512 fft9714 = _mm512_fmadd_ps(fft9706, fft9624, _mm512_shuffle_f32x4(fft9706, fft9706, 177));
__m512 fft9631 = _mm512_fmadd_ps(fft9622, fft9624, _mm512_shuffle_f32x4(fft9622, fft9622, 177));
__m512 fft9715 = _mm512_fmadd_ps(fft9707, fft9624, _mm512_shuffle_f32x4(fft9707, fft9707, 177));
__m512 fft9632 = _mm512_fmadd_ps(fft9623, fft9624, _mm512_shuffle_f32x4(fft9623, fft9623, 177));
__m512 fft9716 = _mm512_fmadd_ps(fft9708, fft9624, _mm512_shuffle_f32x4(fft9708, fft9708, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15: in those lanes each pair
// (a, b) becomes (b, -a); all other lanes keep their value.
__m512 fft9633 = _mm512_mask_mov_ps(fft9625, 49344, fft9626);
__m512 fft9717 = _mm512_mask_mov_ps(fft9709, 49344, fft9710);
__m512 fft9634 = _mm512_mask_sub_ps(fft9626, 49344, _mm512_setzero_ps(), fft9625);
__m512 fft9718 = _mm512_mask_sub_ps(fft9710, 49344, _mm512_setzero_ps(), fft9709);
__m512 fft9635 = _mm512_mask_mov_ps(fft9627, 49344, fft9628);
__m512 fft9719 = _mm512_mask_mov_ps(fft9711, 49344, fft9712);
__m512 fft9636 = _mm512_mask_sub_ps(fft9628, 49344, _mm512_setzero_ps(), fft9627);
__m512 fft9720 = _mm512_mask_sub_ps(fft9712, 49344, _mm512_setzero_ps(), fft9711);
__m512 fft9637 = _mm512_mask_mov_ps(fft9629, 49344, fft9630);
__m512 fft9721 = _mm512_mask_mov_ps(fft9713, 49344, fft9714);
__m512 fft9638 = _mm512_mask_sub_ps(fft9630, 49344, _mm512_setzero_ps(), fft9629);
__m512 fft9722 = _mm512_mask_sub_ps(fft9714, 49344, _mm512_setzero_ps(), fft9713);
__m512 fft9639 = _mm512_mask_mov_ps(fft9631, 49344, fft9632);
__m512 fft9723 = _mm512_mask_mov_ps(fft9715, 49344, fft9716);
__m512 fft9640 = _mm512_mask_sub_ps(fft9632, 49344, _mm512_setzero_ps(), fft9631);
__m512 fft9724 = _mm512_mask_sub_ps(fft9716, 49344, _mm512_setzero_ps(), fft9715);
// Butterfly inside each 128-bit block: _mm512_shuffle_ps with 78 swaps the
// two lane pairs of every block, signs are +1,+1,-1,-1 per block.
__m512 fft9641 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9642 = _mm512_fmadd_ps(fft9633, fft9641, _mm512_shuffle_ps(fft9633, fft9633, 78));
__m512 fft9725 = _mm512_fmadd_ps(fft9717, fft9641, _mm512_shuffle_ps(fft9717, fft9717, 78));
__m512 fft9643 = _mm512_fmadd_ps(fft9634, fft9641, _mm512_shuffle_ps(fft9634, fft9634, 78));
__m512 fft9726 = _mm512_fmadd_ps(fft9718, fft9641, _mm512_shuffle_ps(fft9718, fft9718, 78));
__m512 fft9644 = _mm512_fmadd_ps(fft9635, fft9641, _mm512_shuffle_ps(fft9635, fft9635, 78));
__m512 fft9727 = _mm512_fmadd_ps(fft9719, fft9641, _mm512_shuffle_ps(fft9719, fft9719, 78));
__m512 fft9645 = _mm512_fmadd_ps(fft9636, fft9641, _mm512_shuffle_ps(fft9636, fft9636, 78));
__m512 fft9728 = _mm512_fmadd_ps(fft9720, fft9641, _mm512_shuffle_ps(fft9720, fft9720, 78));
__m512 fft9646 = _mm512_fmadd_ps(fft9637, fft9641, _mm512_shuffle_ps(fft9637, fft9637, 78));
__m512 fft9729 = _mm512_fmadd_ps(fft9721, fft9641, _mm512_shuffle_ps(fft9721, fft9721, 78));
__m512 fft9647 = _mm512_fmadd_ps(fft9638, fft9641, _mm512_shuffle_ps(fft9638, fft9638, 78));
__m512 fft9730 = _mm512_fmadd_ps(fft9722, fft9641, _mm512_shuffle_ps(fft9722, fft9722, 78));
__m512 fft9648 = _mm512_fmadd_ps(fft9639, fft9641, _mm512_shuffle_ps(fft9639, fft9639, 78));
__m512 fft9731 = _mm512_fmadd_ps(fft9723, fft9641, _mm512_shuffle_ps(fft9723, fft9723, 78));
__m512 fft9649 = _mm512_fmadd_ps(fft9640, fft9641, _mm512_shuffle_ps(fft9640, fft9640, 78));
__m512 fft9732 = _mm512_fmadd_ps(fft9724, fft9641, _mm512_shuffle_ps(fft9724, fft9724, 78));
// Extra post-processing applied only to the first vector pair of each chain
// (fft9642/fft9643 and fft9725/fft9726): lane permutations, a sign-weighted
// combine, masked merges, and a final 0.5 scale on the lanes in mask 64764.
__m512i fft9650 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9651 = _mm512_permutexvar_ps(fft9650, fft9642);
__m512 fft9733 = _mm512_permutexvar_ps(fft9650, fft9725);
__m512i fft9652 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9653 = _mm512_permutexvar_ps(fft9652, fft9642);
__m512 fft9734 = _mm512_permutexvar_ps(fft9652, fft9725);
__m512 fft9654 = _mm512_permutexvar_ps(fft9650, fft9643);
__m512 fft9735 = _mm512_permutexvar_ps(fft9650, fft9726);
__m512 fft9655 = _mm512_permutexvar_ps(fft9652, fft9643);
__m512 fft9736 = _mm512_permutexvar_ps(fft9652, fft9726);
__m512 fft9656 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9657 = _mm512_fmadd_ps(fft9651, fft9656, fft9653);
__m512 fft9737 = _mm512_fmadd_ps(fft9733, fft9656, fft9734);
__m512 fft9658 = _mm512_fnmadd_ps(fft9655, fft9656, fft9654);
__m512 fft9738 = _mm512_fnmadd_ps(fft9736, fft9656, fft9735);
__m512 fft9659 = _mm512_mask_mov_ps(fft9655, 21845, fft9657);
__m512 fft9739 = _mm512_mask_mov_ps(fft9736, 21845, fft9737);
__m512 fft9660 = _mm512_mask_mov_ps(fft9651, 43176, fft9657);
__m512 fft9740 = _mm512_mask_mov_ps(fft9733, 43176, fft9737);
__m512 fft9661 = _mm512_mask_mov_ps(fft9659, 43176, fft9658);
__m512 fft9741 = _mm512_mask_mov_ps(fft9739, 43176, fft9738);
__m512 fft9662 = _mm512_mask_mov_ps(fft9660, 22102, fft9658);
__m512 fft9742 = _mm512_mask_mov_ps(fft9740, 22102, fft9738);
__m512 fft9663 = _mm512_mask_mul_ps(fft9661, 64764, fft9661, _mm512_set1_ps(5e-01f));
__m512 fft9743 = _mm512_mask_mul_ps(fft9741, 64764, fft9741, _mm512_set1_ps(5e-01f));
__m512 fft9664 = _mm512_mask_mul_ps(fft9662, 64764, fft9662, _mm512_set1_ps(5e-01f));
__m512 fft9744 = _mm512_mask_mul_ps(fft9742, 64764, fft9742, _mm512_set1_ps(5e-01f));
// Collect the 16 result vectors of this slot under wf* names for the store phase.
__m512 wf137 = fft9663;
__m512 wf145 = fft9743;
__m512 wf138 = fft9664;
__m512 wf146 = fft9744;
__m512 wf139 = fft9644;
__m512 wf147 = fft9727;
__m512 wf140 = fft9645;
__m512 wf148 = fft9728;
__m512 wf141 = fft9646;
__m512 wf149 = fft9729;
__m512 wf142 = fft9647;
__m512 wf150 = fft9730;
__m512 wf143 = fft9648;
__m512 wf151 = fft9731;
__m512 wf144 = fft9649;
__m512 wf152 = fft9732;
// Split the slot index 2*j51 into destination coordinates:
// c48 = index/4, m58 = (index%4)/2, f62 = index%2 (always 0 for this even index).
ptrdiff_t c48 = (size_t)(0+2*j51)/4;
ptrdiff_t m58 = (size_t)(0+2*j51)%4/2;
ptrdiff_t f62 = (size_t)(0+2*j51)%2;
// eo58 moves even-indexed source lanes to lanes 0..7 and odd-indexed ones to
// lanes 8..15. It is applied to every wf vector except the four that went
// through the extra post-processing above (wf137, wf138, wf145, wf146).
__m512i eo58 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf139 = _mm512_permutexvar_ps(eo58, wf139);
wf140 = _mm512_permutexvar_ps(eo58, wf140);
// Two float vectors are rounded to fp16 (nearest) and packed into one
// 512-bit register: first vector in the low 256 bits, second in the high.
__m512i wfs45 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf139, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs45 = _mm512_inserti64x4(wfs45, _mm512_cvtps_ph(wf140, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
// Mask 3855 (0x0F0F) writes dwords 0..3 and 8..11; mask 61680 (0xF0F0) writes
// dwords 4..7 and 12..15. The second address is the first plus 1048576 minus
// 16 bytes, so dword 4 lands exactly 1048576 bytes after dword 0. The 16-byte
// gaps left between the written runs are where f==1 slots are stored.
_mm512_mask_storeu_epi32(wfPtr13+8192+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs45);
_mm512_mask_storeu_epi32(wfPtr13+1056752+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs45);
wf147 = _mm512_permutexvar_ps(eo58, wf147);
wf148 = _mm512_permutexvar_ps(eo58, wf148);
__m512i wfs46 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf147, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs46 = _mm512_inserti64x4(wfs46, _mm512_cvtps_ph(wf148, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2105344+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs46);
_mm512_mask_storeu_epi32(wfPtr13+3153904+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs46);
wf141 = _mm512_permutexvar_ps(eo58, wf141);
wf142 = _mm512_permutexvar_ps(eo58, wf142);
__m512i wfs47 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf141, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs47 = _mm512_inserti64x4(wfs47, _mm512_cvtps_ph(wf142, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+16384+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs47);
_mm512_mask_storeu_epi32(wfPtr13+1064944+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs47);
wf149 = _mm512_permutexvar_ps(eo58, wf149);
wf150 = _mm512_permutexvar_ps(eo58, wf150);
__m512i wfs48 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf149, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs48 = _mm512_inserti64x4(wfs48, _mm512_cvtps_ph(wf150, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2113536+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs48);
_mm512_mask_storeu_epi32(wfPtr13+3162096+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs48);
wf143 = _mm512_permutexvar_ps(eo58, wf143);
wf144 = _mm512_permutexvar_ps(eo58, wf144);
__m512i wfs49 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf143, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs49 = _mm512_inserti64x4(wfs49, _mm512_cvtps_ph(wf144, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+24576+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs49);
_mm512_mask_storeu_epi32(wfPtr13+1073136+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs49);
wf151 = _mm512_permutexvar_ps(eo58, wf151);
wf152 = _mm512_permutexvar_ps(eo58, wf152);
__m512i wfs50 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf151, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs50 = _mm512_inserti64x4(wfs50, _mm512_cvtps_ph(wf152, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2121728+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs50);
_mm512_mask_storeu_epi32(wfPtr13+3170288+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs50);
// The post-processed vectors are stored without the eo58 permutation.
__m512i wfs51 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf137, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs51 = _mm512_inserti64x4(wfs51, _mm512_cvtps_ph(wf138, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+0+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs51);
_mm512_mask_storeu_epi32(wfPtr13+1048560+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs51);
__m512i wfs52 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf145, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs52 = _mm512_inserti64x4(wfs52, _mm512_cvtps_ph(wf146, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2097152+32768*i59+2048*c48+128*k153+64*m58+16*f62, 3855, wfs52);
_mm512_mask_storeu_epi32(wfPtr13+3145712+32768*i59+2048*c48+128*k153+64*m58+16*f62, 61680, wfs52);
// ---- Second slot (byte offset 576 within the j51 stride) ----
// Same sequence as above, using postMul51 and slot index 1+2*j51.
__m512 wt598 = _mm512_maskz_loadu_ps(7, wtPtr16+576+9216*i59+1152*j51+36*k153);
__m512 wt599 = _mm512_maskz_loadu_ps(7, wtPtr16+588+9216*i59+1152*j51+36*k153);
__m512 wt600 = _mm512_maskz_loadu_ps(7, wtPtr16+600+9216*i59+1152*j51+36*k153);
wt598 = _mm512_mul_ps(postMul51, wt598);
wt599 = _mm512_mul_ps(postMul51, wt599);
wt600 = _mm512_mul_ps(postMul51, wt600);
__m512 fft9745 = _mm512_add_ps(wt598, _mm512_setzero_ps());
__m512 fft9833 = _mm512_add_ps(wt599, _mm512_setzero_ps());
__m512 fft9746 = _mm512_sub_ps(wt598, _mm512_setzero_ps());
__m512 fft9834 = _mm512_sub_ps(wt599, _mm512_setzero_ps());
__m512 fft9747 = _mm512_add_ps(wt600, _mm512_setzero_ps());
__m512 fft9835 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9748 = _mm512_sub_ps(wt600, _mm512_setzero_ps());
__m512 fft9836 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9749 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9837 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9750 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9838 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9751 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9839 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9752 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9840 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft9753 = _mm512_add_ps(fft9745, fft9749);
__m512 fft9841 = _mm512_add_ps(fft9833, fft9837);
__m512 fft9754 = _mm512_sub_ps(fft9745, fft9749);
__m512 fft9842 = _mm512_sub_ps(fft9833, fft9837);
__m512 fft9755 = _mm512_add_ps(fft9747, fft9751);
__m512 fft9843 = _mm512_add_ps(fft9835, fft9839);
__m512 fft9756 = _mm512_sub_ps(fft9751, fft9747);
__m512 fft9844 = _mm512_sub_ps(fft9839, fft9835);
__m512 fft9757 = _mm512_sub_ps(fft9748, fft9752);
__m512 fft9845 = _mm512_sub_ps(fft9836, fft9840);
__m512 fft9758 = _mm512_add_ps(fft9748, fft9752);
__m512 fft9846 = _mm512_add_ps(fft9836, fft9840);
__m512 fft9759 = _mm512_add_ps(fft9753, fft9755);
__m512 fft9847 = _mm512_add_ps(fft9841, fft9843);
__m512 fft9760 = _mm512_sub_ps(fft9753, fft9755);
__m512 fft9848 = _mm512_sub_ps(fft9841, fft9843);
__m512 fft9761 = _mm512_fmadd_ps(fft9757, _mm512_set1_ps(7.0710677e-01f), fft9746);
__m512 fft9849 = _mm512_fmadd_ps(fft9845, _mm512_set1_ps(7.0710677e-01f), fft9834);
__m512 fft9762 = _mm512_fnmsub_ps(fft9758, _mm512_set1_ps(7.0710677e-01f), fft9750);
__m512 fft9850 = _mm512_fnmsub_ps(fft9846, _mm512_set1_ps(7.0710677e-01f), fft9838);
__m512 fft9763 = _mm512_fnmadd_ps(fft9757, _mm512_set1_ps(7.0710677e-01f), fft9746);
__m512 fft9851 = _mm512_fnmadd_ps(fft9845, _mm512_set1_ps(7.0710677e-01f), fft9834);
__m512 fft9764 = _mm512_fnmadd_ps(fft9758, _mm512_set1_ps(7.0710677e-01f), fft9750);
__m512 fft9852 = _mm512_fnmadd_ps(fft9846, _mm512_set1_ps(7.0710677e-01f), fft9838);
__m512 fft9765 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9766 = _mm512_fmadd_ps(fft9759, fft9765, _mm512_shuffle_f32x4(fft9759, fft9759, 78));
__m512 fft9853 = _mm512_fmadd_ps(fft9847, fft9765, _mm512_shuffle_f32x4(fft9847, fft9847, 78));
__m512 fft9767 = _mm512_fmadd_ps(fft9760, fft9765, _mm512_shuffle_f32x4(fft9760, fft9760, 78));
__m512 fft9854 = _mm512_fmadd_ps(fft9848, fft9765, _mm512_shuffle_f32x4(fft9848, fft9848, 78));
__m512 fft9768 = _mm512_fmadd_ps(fft9761, fft9765, _mm512_shuffle_f32x4(fft9761, fft9761, 78));
__m512 fft9855 = _mm512_fmadd_ps(fft9849, fft9765, _mm512_shuffle_f32x4(fft9849, fft9849, 78));
__m512 fft9769 = _mm512_fmadd_ps(fft9762, fft9765, _mm512_shuffle_f32x4(fft9762, fft9762, 78));
__m512 fft9856 = _mm512_fmadd_ps(fft9850, fft9765, _mm512_shuffle_f32x4(fft9850, fft9850, 78));
__m512 fft9770 = _mm512_fmadd_ps(fft9754, fft9765, _mm512_shuffle_f32x4(fft9754, fft9754, 78));
__m512 fft9857 = _mm512_fmadd_ps(fft9842, fft9765, _mm512_shuffle_f32x4(fft9842, fft9842, 78));
__m512 fft9771 = _mm512_fmadd_ps(fft9756, fft9765, _mm512_shuffle_f32x4(fft9756, fft9756, 78));
__m512 fft9858 = _mm512_fmadd_ps(fft9844, fft9765, _mm512_shuffle_f32x4(fft9844, fft9844, 78));
__m512 fft9772 = _mm512_fmadd_ps(fft9763, fft9765, _mm512_shuffle_f32x4(fft9763, fft9763, 78));
__m512 fft9859 = _mm512_fmadd_ps(fft9851, fft9765, _mm512_shuffle_f32x4(fft9851, fft9851, 78));
__m512 fft9773 = _mm512_fmadd_ps(fft9764, fft9765, _mm512_shuffle_f32x4(fft9764, fft9764, 78));
__m512 fft9860 = _mm512_fmadd_ps(fft9852, fft9765, _mm512_shuffle_f32x4(fft9852, fft9852, 78));
__m512 fft9774 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9775 = _mm512_mul_ps(fft9766, fft9774);
__m512 fft9861 = _mm512_mul_ps(fft9853, fft9774);
__m512 fft9776 = _mm512_mul_ps(fft9767, fft9774);
__m512 fft9862 = _mm512_mul_ps(fft9854, fft9774);
__m512 fft9777 = _mm512_mul_ps(fft9768, fft9774);
__m512 fft9863 = _mm512_mul_ps(fft9855, fft9774);
__m512 fft9778 = _mm512_mul_ps(fft9769, fft9774);
__m512 fft9864 = _mm512_mul_ps(fft9856, fft9774);
__m512 fft9779 = _mm512_mul_ps(fft9770, fft9774);
__m512 fft9865 = _mm512_mul_ps(fft9857, fft9774);
__m512 fft9780 = _mm512_mul_ps(fft9771, fft9774);
__m512 fft9866 = _mm512_mul_ps(fft9858, fft9774);
__m512 fft9781 = _mm512_mul_ps(fft9772, fft9774);
__m512 fft9867 = _mm512_mul_ps(fft9859, fft9774);
__m512 fft9782 = _mm512_mul_ps(fft9773, fft9774);
__m512 fft9868 = _mm512_mul_ps(fft9860, fft9774);
__m512 fft9783 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9784 = _mm512_fmadd_ps(fft9767, fft9783, fft9775);
__m512 fft9869 = _mm512_fmadd_ps(fft9854, fft9783, fft9861);
__m512 fft9785 = _mm512_fnmadd_ps(fft9766, fft9783, fft9776);
__m512 fft9870 = _mm512_fnmadd_ps(fft9853, fft9783, fft9862);
__m512 fft9786 = _mm512_fmadd_ps(fft9769, fft9783, fft9777);
__m512 fft9871 = _mm512_fmadd_ps(fft9856, fft9783, fft9863);
__m512 fft9787 = _mm512_fnmadd_ps(fft9768, fft9783, fft9778);
__m512 fft9872 = _mm512_fnmadd_ps(fft9855, fft9783, fft9864);
__m512 fft9788 = _mm512_fmadd_ps(fft9771, fft9783, fft9779);
__m512 fft9873 = _mm512_fmadd_ps(fft9858, fft9783, fft9865);
__m512 fft9789 = _mm512_fnmadd_ps(fft9770, fft9783, fft9780);
__m512 fft9874 = _mm512_fnmadd_ps(fft9857, fft9783, fft9866);
__m512 fft9790 = _mm512_fmadd_ps(fft9773, fft9783, fft9781);
__m512 fft9875 = _mm512_fmadd_ps(fft9860, fft9783, fft9867);
__m512 fft9791 = _mm512_fnmadd_ps(fft9772, fft9783, fft9782);
__m512 fft9876 = _mm512_fnmadd_ps(fft9859, fft9783, fft9868);
__m512 fft9792 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9793 = _mm512_fmadd_ps(fft9784, fft9792, _mm512_shuffle_f32x4(fft9784, fft9784, 177));
__m512 fft9877 = _mm512_fmadd_ps(fft9869, fft9792, _mm512_shuffle_f32x4(fft9869, fft9869, 177));
__m512 fft9794 = _mm512_fmadd_ps(fft9785, fft9792, _mm512_shuffle_f32x4(fft9785, fft9785, 177));
__m512 fft9878 = _mm512_fmadd_ps(fft9870, fft9792, _mm512_shuffle_f32x4(fft9870, fft9870, 177));
__m512 fft9795 = _mm512_fmadd_ps(fft9786, fft9792, _mm512_shuffle_f32x4(fft9786, fft9786, 177));
__m512 fft9879 = _mm512_fmadd_ps(fft9871, fft9792, _mm512_shuffle_f32x4(fft9871, fft9871, 177));
__m512 fft9796 = _mm512_fmadd_ps(fft9787, fft9792, _mm512_shuffle_f32x4(fft9787, fft9787, 177));
__m512 fft9880 = _mm512_fmadd_ps(fft9872, fft9792, _mm512_shuffle_f32x4(fft9872, fft9872, 177));
__m512 fft9797 = _mm512_fmadd_ps(fft9788, fft9792, _mm512_shuffle_f32x4(fft9788, fft9788, 177));
__m512 fft9881 = _mm512_fmadd_ps(fft9873, fft9792, _mm512_shuffle_f32x4(fft9873, fft9873, 177));
__m512 fft9798 = _mm512_fmadd_ps(fft9789, fft9792, _mm512_shuffle_f32x4(fft9789, fft9789, 177));
__m512 fft9882 = _mm512_fmadd_ps(fft9874, fft9792, _mm512_shuffle_f32x4(fft9874, fft9874, 177));
__m512 fft9799 = _mm512_fmadd_ps(fft9790, fft9792, _mm512_shuffle_f32x4(fft9790, fft9790, 177));
__m512 fft9883 = _mm512_fmadd_ps(fft9875, fft9792, _mm512_shuffle_f32x4(fft9875, fft9875, 177));
__m512 fft9800 = _mm512_fmadd_ps(fft9791, fft9792, _mm512_shuffle_f32x4(fft9791, fft9791, 177));
__m512 fft9884 = _mm512_fmadd_ps(fft9876, fft9792, _mm512_shuffle_f32x4(fft9876, fft9876, 177));
__m512 fft9801 = _mm512_mask_mov_ps(fft9793, 49344, fft9794);
__m512 fft9885 = _mm512_mask_mov_ps(fft9877, 49344, fft9878);
__m512 fft9802 = _mm512_mask_sub_ps(fft9794, 49344, _mm512_setzero_ps(), fft9793);
__m512 fft9886 = _mm512_mask_sub_ps(fft9878, 49344, _mm512_setzero_ps(), fft9877);
__m512 fft9803 = _mm512_mask_mov_ps(fft9795, 49344, fft9796);
__m512 fft9887 = _mm512_mask_mov_ps(fft9879, 49344, fft9880);
__m512 fft9804 = _mm512_mask_sub_ps(fft9796, 49344, _mm512_setzero_ps(), fft9795);
__m512 fft9888 = _mm512_mask_sub_ps(fft9880, 49344, _mm512_setzero_ps(), fft9879);
__m512 fft9805 = _mm512_mask_mov_ps(fft9797, 49344, fft9798);
__m512 fft9889 = _mm512_mask_mov_ps(fft9881, 49344, fft9882);
__m512 fft9806 = _mm512_mask_sub_ps(fft9798, 49344, _mm512_setzero_ps(), fft9797);
__m512 fft9890 = _mm512_mask_sub_ps(fft9882, 49344, _mm512_setzero_ps(), fft9881);
__m512 fft9807 = _mm512_mask_mov_ps(fft9799, 49344, fft9800);
__m512 fft9891 = _mm512_mask_mov_ps(fft9883, 49344, fft9884);
__m512 fft9808 = _mm512_mask_sub_ps(fft9800, 49344, _mm512_setzero_ps(), fft9799);
__m512 fft9892 = _mm512_mask_sub_ps(fft9884, 49344, _mm512_setzero_ps(), fft9883);
__m512 fft9809 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9810 = _mm512_fmadd_ps(fft9801, fft9809, _mm512_shuffle_ps(fft9801, fft9801, 78));
__m512 fft9893 = _mm512_fmadd_ps(fft9885, fft9809, _mm512_shuffle_ps(fft9885, fft9885, 78));
__m512 fft9811 = _mm512_fmadd_ps(fft9802, fft9809, _mm512_shuffle_ps(fft9802, fft9802, 78));
__m512 fft9894 = _mm512_fmadd_ps(fft9886, fft9809, _mm512_shuffle_ps(fft9886, fft9886, 78));
__m512 fft9812 = _mm512_fmadd_ps(fft9803, fft9809, _mm512_shuffle_ps(fft9803, fft9803, 78));
__m512 fft9895 = _mm512_fmadd_ps(fft9887, fft9809, _mm512_shuffle_ps(fft9887, fft9887, 78));
__m512 fft9813 = _mm512_fmadd_ps(fft9804, fft9809, _mm512_shuffle_ps(fft9804, fft9804, 78));
__m512 fft9896 = _mm512_fmadd_ps(fft9888, fft9809, _mm512_shuffle_ps(fft9888, fft9888, 78));
__m512 fft9814 = _mm512_fmadd_ps(fft9805, fft9809, _mm512_shuffle_ps(fft9805, fft9805, 78));
__m512 fft9897 = _mm512_fmadd_ps(fft9889, fft9809, _mm512_shuffle_ps(fft9889, fft9889, 78));
__m512 fft9815 = _mm512_fmadd_ps(fft9806, fft9809, _mm512_shuffle_ps(fft9806, fft9806, 78));
__m512 fft9898 = _mm512_fmadd_ps(fft9890, fft9809, _mm512_shuffle_ps(fft9890, fft9890, 78));
__m512 fft9816 = _mm512_fmadd_ps(fft9807, fft9809, _mm512_shuffle_ps(fft9807, fft9807, 78));
__m512 fft9899 = _mm512_fmadd_ps(fft9891, fft9809, _mm512_shuffle_ps(fft9891, fft9891, 78));
__m512 fft9817 = _mm512_fmadd_ps(fft9808, fft9809, _mm512_shuffle_ps(fft9808, fft9808, 78));
__m512 fft9900 = _mm512_fmadd_ps(fft9892, fft9809, _mm512_shuffle_ps(fft9892, fft9892, 78));
__m512i fft9818 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9819 = _mm512_permutexvar_ps(fft9818, fft9810);
__m512 fft9901 = _mm512_permutexvar_ps(fft9818, fft9893);
__m512i fft9820 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9821 = _mm512_permutexvar_ps(fft9820, fft9810);
__m512 fft9902 = _mm512_permutexvar_ps(fft9820, fft9893);
__m512 fft9822 = _mm512_permutexvar_ps(fft9818, fft9811);
__m512 fft9903 = _mm512_permutexvar_ps(fft9818, fft9894);
__m512 fft9823 = _mm512_permutexvar_ps(fft9820, fft9811);
__m512 fft9904 = _mm512_permutexvar_ps(fft9820, fft9894);
__m512 fft9824 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9825 = _mm512_fmadd_ps(fft9819, fft9824, fft9821);
__m512 fft9905 = _mm512_fmadd_ps(fft9901, fft9824, fft9902);
__m512 fft9826 = _mm512_fnmadd_ps(fft9823, fft9824, fft9822);
__m512 fft9906 = _mm512_fnmadd_ps(fft9904, fft9824, fft9903);
__m512 fft9827 = _mm512_mask_mov_ps(fft9823, 21845, fft9825);
__m512 fft9907 = _mm512_mask_mov_ps(fft9904, 21845, fft9905);
__m512 fft9828 = _mm512_mask_mov_ps(fft9819, 43176, fft9825);
__m512 fft9908 = _mm512_mask_mov_ps(fft9901, 43176, fft9905);
__m512 fft9829 = _mm512_mask_mov_ps(fft9827, 43176, fft9826);
__m512 fft9909 = _mm512_mask_mov_ps(fft9907, 43176, fft9906);
__m512 fft9830 = _mm512_mask_mov_ps(fft9828, 22102, fft9826);
__m512 fft9910 = _mm512_mask_mov_ps(fft9908, 22102, fft9906);
__m512 fft9831 = _mm512_mask_mul_ps(fft9829, 64764, fft9829, _mm512_set1_ps(5e-01f));
__m512 fft9911 = _mm512_mask_mul_ps(fft9909, 64764, fft9909, _mm512_set1_ps(5e-01f));
__m512 fft9832 = _mm512_mask_mul_ps(fft9830, 64764, fft9830, _mm512_set1_ps(5e-01f));
__m512 fft9912 = _mm512_mask_mul_ps(fft9910, 64764, fft9910, _mm512_set1_ps(5e-01f));
__m512 wf153 = fft9831;
__m512 wf161 = fft9911;
__m512 wf154 = fft9832;
__m512 wf162 = fft9912;
__m512 wf155 = fft9812;
__m512 wf163 = fft9895;
__m512 wf156 = fft9813;
__m512 wf164 = fft9896;
__m512 wf157 = fft9814;
__m512 wf165 = fft9897;
__m512 wf158 = fft9815;
__m512 wf166 = fft9898;
__m512 wf159 = fft9816;
__m512 wf167 = fft9899;
__m512 wf160 = fft9817;
__m512 wf168 = fft9900;
// Destination coordinates for slot index 1+2*j51; f63 is always 1 for this
// odd index, so these stores fill the 16-byte gaps left by the first slot.
ptrdiff_t c49 = (size_t)(1+2*j51)/4;
ptrdiff_t m59 = (size_t)(1+2*j51)%4/2;
ptrdiff_t f63 = (size_t)(1+2*j51)%2;
__m512i eo59 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf155 = _mm512_permutexvar_ps(eo59, wf155);
wf156 = _mm512_permutexvar_ps(eo59, wf156);
__m512i wfs53 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf155, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs53 = _mm512_inserti64x4(wfs53, _mm512_cvtps_ph(wf156, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+8192+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs53);
_mm512_mask_storeu_epi32(wfPtr13+1056752+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs53);
wf163 = _mm512_permutexvar_ps(eo59, wf163);
wf164 = _mm512_permutexvar_ps(eo59, wf164);
__m512i wfs54 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf163, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs54 = _mm512_inserti64x4(wfs54, _mm512_cvtps_ph(wf164, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2105344+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs54);
_mm512_mask_storeu_epi32(wfPtr13+3153904+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs54);
wf157 = _mm512_permutexvar_ps(eo59, wf157);
wf158 = _mm512_permutexvar_ps(eo59, wf158);
__m512i wfs55 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf157, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs55 = _mm512_inserti64x4(wfs55, _mm512_cvtps_ph(wf158, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+16384+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs55);
_mm512_mask_storeu_epi32(wfPtr13+1064944+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs55);
wf165 = _mm512_permutexvar_ps(eo59, wf165);
wf166 = _mm512_permutexvar_ps(eo59, wf166);
__m512i wfs56 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf165, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs56 = _mm512_inserti64x4(wfs56, _mm512_cvtps_ph(wf166, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2113536+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs56);
_mm512_mask_storeu_epi32(wfPtr13+3162096+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs56);
wf159 = _mm512_permutexvar_ps(eo59, wf159);
wf160 = _mm512_permutexvar_ps(eo59, wf160);
__m512i wfs57 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf159, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs57 = _mm512_inserti64x4(wfs57, _mm512_cvtps_ph(wf160, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+24576+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs57);
_mm512_mask_storeu_epi32(wfPtr13+1073136+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs57);
wf167 = _mm512_permutexvar_ps(eo59, wf167);
wf168 = _mm512_permutexvar_ps(eo59, wf168);
__m512i wfs58 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf167, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs58 = _mm512_inserti64x4(wfs58, _mm512_cvtps_ph(wf168, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2121728+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs58);
_mm512_mask_storeu_epi32(wfPtr13+3170288+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs58);
__m512i wfs59 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf153, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs59 = _mm512_inserti64x4(wfs59, _mm512_cvtps_ph(wf154, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+0+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs59);
_mm512_mask_storeu_epi32(wfPtr13+1048560+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs59);
__m512i wfs60 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf161, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs60 = _mm512_inserti64x4(wfs60, _mm512_cvtps_ph(wf162, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr13+2097152+32768*i59+2048*c49+128*k153+64*m59+16*f63, 3855, wfs60);
_mm512_mask_storeu_epi32(wfPtr13+3145712+32768*i59+2048*c49+128*k153+64*m59+16*f63, 61680, wfs60);
}
// ---- Bias for the two slots of this j51 ----
__m512 bias6 = _mm512_setzero_ps();
if (!e28) {
// Mask 3 loads the two biases (float index 16*i59+2*j51 and the next one).
bias6 = _mm512_maskz_loadu_ps(3, biasPtr16-0+64*i59+8*j51);
// mas12 holds four floats from bnPtr17: two (even, odd) pairs. pmMul34 gathers
// the even-indexed floats (multipliers) and pmAdd34 the odd-indexed floats
// (addends) into lanes 0 and 1.
__m512i pmMul34 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd34 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas12 = _mm512_maskz_loadu_ps(15, bnPtr17+(ptrdiff_t)8*(0+16*i59+2*j51));
__m512 postMul52 = _mm512_permutexvar_ps(pmMul34, mas12);
__m512 postAdd34 = _mm512_permutexvar_ps(pmAdd34, mas12);
bias6 = _mm512_fmadd_ps(bias6, postMul52, postAdd34);
// NOTE(review): the factor 64 presumably compensates for the scaling of the
// transform used on the weights/data - confirm against the consumer kernel.
bias6 = _mm512_mul_ps(bias6, _mm512_set1_ps(6.4e+01f));
}
// Only lanes 0 and 1 are written (mask 3), at the same offset the biases were read from.
_mm512_mask_storeu_ps(bfPtr13-0+64*i59+8*j51, 3, bias6);
// Stop after the fourth j51 of this work item.
if (j51 >= jj51) return;
}
}
}

// Dispatches the filter-arrangement work to the thread team.
//
// Fills in a task descriptor that names ResNeXt50StriderArrangeFilts3Callee1
// as the work function, stores the tensors97 pointer array as the task's
// opaque payload (any1), declares a 3-axis hull with extents 2 x 32 x 1,
// and passes the descriptor to ResNeXt50ThreaderDo1 together with team62.
//
// NOTE(review): presumably the threader calls the callee once per point of
// the hull -- confirm against the ResNeXt50ThreaderDo1 implementation.
static void ResNeXt50StriderArrangeFilts3(ResNeXt50ThreaderTeam1* team62, char** tensors97) {
    ResNeXt50ThreaderTask1 arrangeTask;

    // Iteration space: three axes with extents 2, 32 and 1.
    arrangeTask.nd1 = 3;
    arrangeTask.hull1[0] = 2;
    arrangeTask.hull1[1] = 32;
    arrangeTask.hull1[2] = 1;

    // Work function and the payload stored alongside it.
    arrangeTask.callee1 = ResNeXt50StriderArrangeFilts3Callee1;
    arrangeTask.any1 = tensors97;

    ResNeXt50ThreaderDo1(team62, &arrangeTask);
}

static void ResNeXt50StriderArrangeDats3Callee1(ResNeXt50ThreaderTask1* task102, int64_t* pt56) {
char** tensors100 = task102->any1;
ptrdiff_t s57 = 0;
ptrdiff_t c50 = 0;
ptrdiff_t g32 = pt56[2];
ptrdiff_t e29 = 0;
char*restrict datPtr31 = tensors100[0]-116+1555456*e29;
char*restrict dfPtr13 = tensors100[1]+65011712*e29;
ptrdiff_t i60 = 2*g32;
ptrdiff_t ii44 = i60+1;
for (; i60 <= ii44; ++i60) {
ptrdiff_t j52 = 1*c50;
ptrdiff_t rel23 = j52-0;
ptrdiff_t base23 = 0;
ptrdiff_t h51 = base23+0;
ptrdiff_t w65 = 0;
ptrdiff_t k154 = 16*s57;
ptrdiff_t kk53 = k154+15;
for (; k154 <= kk53; ++k154) {
ptrdiff_t b77 = 0;
ptrdiff_t m60 = (size_t)b77/2;
ptrdiff_t f64 = (size_t)b77%2;
__m512 dat2364 = _mm512_maskz_loadu_ps(65534, datPtr31+112+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2365 = _mm512_maskz_loadu_ps(65534, datPtr31+224+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2366 = _mm512_maskz_loadu_ps(65534, datPtr31+336+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2367 = _mm512_maskz_loadu_ps(65534, datPtr31+448+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2368 = _mm512_maskz_loadu_ps(65534, datPtr31+560+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2369 = _mm512_maskz_loadu_ps(65534, datPtr31+672+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2370 = _mm512_maskz_loadu_ps(65534, datPtr31+784+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2371 = _mm512_maskz_loadu_ps(65534, datPtr31+896+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2372 = _mm512_maskz_loadu_ps(65534, datPtr31+1008+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2373 = _mm512_maskz_loadu_ps(65534, datPtr31+1120+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2374 = _mm512_maskz_loadu_ps(65534, datPtr31+1232+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2375 = _mm512_maskz_loadu_ps(65534, datPtr31+1344+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2376 = _mm512_maskz_loadu_ps(65534, datPtr31+1456+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2377 = _mm512_maskz_loadu_ps(65534, datPtr31+1568+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 dat2378 = _mm512_maskz_loadu_ps(65534, datPtr31+1680+50176*i60+3136*k154+112*h51+4*w65+0*b77);
__m512 fft9913 = _mm512_add_ps(_mm512_setzero_ps(), dat2371);
__m512 fft10001 = _mm512_add_ps(dat2364, dat2372);
__m512 fft9914 = _mm512_sub_ps(_mm512_setzero_ps(), dat2371);
__m512 fft10002 = _mm512_sub_ps(dat2364, dat2372);
__m512 fft9915 = _mm512_add_ps(dat2365, dat2373);
__m512 fft10003 = _mm512_add_ps(dat2366, dat2374);
__m512 fft9916 = _mm512_sub_ps(dat2365, dat2373);
__m512 fft10004 = _mm512_sub_ps(dat2366, dat2374);
__m512 fft9917 = _mm512_add_ps(dat2367, dat2375);
__m512 fft10005 = _mm512_add_ps(dat2368, dat2376);
__m512 fft9918 = _mm512_sub_ps(dat2367, dat2375);
__m512 fft10006 = _mm512_sub_ps(dat2368, dat2376);
__m512 fft9919 = _mm512_add_ps(dat2369, dat2377);
__m512 fft10007 = _mm512_add_ps(dat2370, dat2378);
__m512 fft9920 = _mm512_sub_ps(dat2369, dat2377);
__m512 fft10008 = _mm512_sub_ps(dat2370, dat2378);
__m512 fft9921 = _mm512_add_ps(fft9913, fft9917);
__m512 fft10009 = _mm512_add_ps(fft10001, fft10005);
__m512 fft9922 = _mm512_sub_ps(fft9913, fft9917);
__m512 fft10010 = _mm512_sub_ps(fft10001, fft10005);
__m512 fft9923 = _mm512_add_ps(fft9915, fft9919);
__m512 fft10011 = _mm512_add_ps(fft10003, fft10007);
__m512 fft9924 = _mm512_sub_ps(fft9919, fft9915);
__m512 fft10012 = _mm512_sub_ps(fft10007, fft10003);
__m512 fft9925 = _mm512_sub_ps(fft9916, fft9920);
__m512 fft10013 = _mm512_sub_ps(fft10004, fft10008);
__m512 fft9926 = _mm512_add_ps(fft9916, fft9920);
__m512 fft10014 = _mm512_add_ps(fft10004, fft10008);
__m512 fft9927 = _mm512_add_ps(fft9921, fft9923);
__m512 fft10015 = _mm512_add_ps(fft10009, fft10011);
__m512 fft9928 = _mm512_sub_ps(fft9921, fft9923);
__m512 fft10016 = _mm512_sub_ps(fft10009, fft10011);
__m512 fft9929 = _mm512_fmadd_ps(fft9925, _mm512_set1_ps(7.0710677e-01f), fft9914);
__m512 fft10017 = _mm512_fmadd_ps(fft10013, _mm512_set1_ps(7.0710677e-01f), fft10002);
__m512 fft9930 = _mm512_fnmsub_ps(fft9926, _mm512_set1_ps(7.0710677e-01f), fft9918);
__m512 fft10018 = _mm512_fnmsub_ps(fft10014, _mm512_set1_ps(7.0710677e-01f), fft10006);
__m512 fft9931 = _mm512_fnmadd_ps(fft9925, _mm512_set1_ps(7.0710677e-01f), fft9914);
__m512 fft10019 = _mm512_fnmadd_ps(fft10013, _mm512_set1_ps(7.0710677e-01f), fft10002);
__m512 fft9932 = _mm512_fnmadd_ps(fft9926, _mm512_set1_ps(7.0710677e-01f), fft9918);
__m512 fft10020 = _mm512_fnmadd_ps(fft10014, _mm512_set1_ps(7.0710677e-01f), fft10006);
__m512 fft9933 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9934 = _mm512_fmadd_ps(fft9927, fft9933, _mm512_shuffle_f32x4(fft9927, fft9927, 78));
__m512 fft10021 = _mm512_fmadd_ps(fft10015, fft9933, _mm512_shuffle_f32x4(fft10015, fft10015, 78));
__m512 fft9935 = _mm512_fmadd_ps(fft9928, fft9933, _mm512_shuffle_f32x4(fft9928, fft9928, 78));
__m512 fft10022 = _mm512_fmadd_ps(fft10016, fft9933, _mm512_shuffle_f32x4(fft10016, fft10016, 78));
__m512 fft9936 = _mm512_fmadd_ps(fft9929, fft9933, _mm512_shuffle_f32x4(fft9929, fft9929, 78));
__m512 fft10023 = _mm512_fmadd_ps(fft10017, fft9933, _mm512_shuffle_f32x4(fft10017, fft10017, 78));
__m512 fft9937 = _mm512_fmadd_ps(fft9930, fft9933, _mm512_shuffle_f32x4(fft9930, fft9930, 78));
__m512 fft10024 = _mm512_fmadd_ps(fft10018, fft9933, _mm512_shuffle_f32x4(fft10018, fft10018, 78));
__m512 fft9938 = _mm512_fmadd_ps(fft9922, fft9933, _mm512_shuffle_f32x4(fft9922, fft9922, 78));
__m512 fft10025 = _mm512_fmadd_ps(fft10010, fft9933, _mm512_shuffle_f32x4(fft10010, fft10010, 78));
__m512 fft9939 = _mm512_fmadd_ps(fft9924, fft9933, _mm512_shuffle_f32x4(fft9924, fft9924, 78));
__m512 fft10026 = _mm512_fmadd_ps(fft10012, fft9933, _mm512_shuffle_f32x4(fft10012, fft10012, 78));
__m512 fft9940 = _mm512_fmadd_ps(fft9931, fft9933, _mm512_shuffle_f32x4(fft9931, fft9931, 78));
__m512 fft10027 = _mm512_fmadd_ps(fft10019, fft9933, _mm512_shuffle_f32x4(fft10019, fft10019, 78));
__m512 fft9941 = _mm512_fmadd_ps(fft9932, fft9933, _mm512_shuffle_f32x4(fft9932, fft9932, 78));
__m512 fft10028 = _mm512_fmadd_ps(fft10020, fft9933, _mm512_shuffle_f32x4(fft10020, fft10020, 78));
__m512 fft9942 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft9943 = _mm512_mul_ps(fft9934, fft9942);
__m512 fft10029 = _mm512_mul_ps(fft10021, fft9942);
__m512 fft9944 = _mm512_mul_ps(fft9935, fft9942);
__m512 fft10030 = _mm512_mul_ps(fft10022, fft9942);
__m512 fft9945 = _mm512_mul_ps(fft9936, fft9942);
__m512 fft10031 = _mm512_mul_ps(fft10023, fft9942);
__m512 fft9946 = _mm512_mul_ps(fft9937, fft9942);
__m512 fft10032 = _mm512_mul_ps(fft10024, fft9942);
__m512 fft9947 = _mm512_mul_ps(fft9938, fft9942);
__m512 fft10033 = _mm512_mul_ps(fft10025, fft9942);
__m512 fft9948 = _mm512_mul_ps(fft9939, fft9942);
__m512 fft10034 = _mm512_mul_ps(fft10026, fft9942);
__m512 fft9949 = _mm512_mul_ps(fft9940, fft9942);
__m512 fft10035 = _mm512_mul_ps(fft10027, fft9942);
__m512 fft9950 = _mm512_mul_ps(fft9941, fft9942);
__m512 fft10036 = _mm512_mul_ps(fft10028, fft9942);
__m512 fft9951 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft9952 = _mm512_fmadd_ps(fft9935, fft9951, fft9943);
__m512 fft10037 = _mm512_fmadd_ps(fft10022, fft9951, fft10029);
__m512 fft9953 = _mm512_fnmadd_ps(fft9934, fft9951, fft9944);
__m512 fft10038 = _mm512_fnmadd_ps(fft10021, fft9951, fft10030);
__m512 fft9954 = _mm512_fmadd_ps(fft9937, fft9951, fft9945);
__m512 fft10039 = _mm512_fmadd_ps(fft10024, fft9951, fft10031);
__m512 fft9955 = _mm512_fnmadd_ps(fft9936, fft9951, fft9946);
__m512 fft10040 = _mm512_fnmadd_ps(fft10023, fft9951, fft10032);
__m512 fft9956 = _mm512_fmadd_ps(fft9939, fft9951, fft9947);
__m512 fft10041 = _mm512_fmadd_ps(fft10026, fft9951, fft10033);
__m512 fft9957 = _mm512_fnmadd_ps(fft9938, fft9951, fft9948);
__m512 fft10042 = _mm512_fnmadd_ps(fft10025, fft9951, fft10034);
__m512 fft9958 = _mm512_fmadd_ps(fft9941, fft9951, fft9949);
__m512 fft10043 = _mm512_fmadd_ps(fft10028, fft9951, fft10035);
__m512 fft9959 = _mm512_fnmadd_ps(fft9940, fft9951, fft9950);
__m512 fft10044 = _mm512_fnmadd_ps(fft10027, fft9951, fft10036);
__m512 fft9960 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft9961 = _mm512_fmadd_ps(fft9952, fft9960, _mm512_shuffle_f32x4(fft9952, fft9952, 177));
__m512 fft10045 = _mm512_fmadd_ps(fft10037, fft9960, _mm512_shuffle_f32x4(fft10037, fft10037, 177));
__m512 fft9962 = _mm512_fmadd_ps(fft9953, fft9960, _mm512_shuffle_f32x4(fft9953, fft9953, 177));
__m512 fft10046 = _mm512_fmadd_ps(fft10038, fft9960, _mm512_shuffle_f32x4(fft10038, fft10038, 177));
__m512 fft9963 = _mm512_fmadd_ps(fft9954, fft9960, _mm512_shuffle_f32x4(fft9954, fft9954, 177));
__m512 fft10047 = _mm512_fmadd_ps(fft10039, fft9960, _mm512_shuffle_f32x4(fft10039, fft10039, 177));
__m512 fft9964 = _mm512_fmadd_ps(fft9955, fft9960, _mm512_shuffle_f32x4(fft9955, fft9955, 177));
__m512 fft10048 = _mm512_fmadd_ps(fft10040, fft9960, _mm512_shuffle_f32x4(fft10040, fft10040, 177));
__m512 fft9965 = _mm512_fmadd_ps(fft9956, fft9960, _mm512_shuffle_f32x4(fft9956, fft9956, 177));
__m512 fft10049 = _mm512_fmadd_ps(fft10041, fft9960, _mm512_shuffle_f32x4(fft10041, fft10041, 177));
__m512 fft9966 = _mm512_fmadd_ps(fft9957, fft9960, _mm512_shuffle_f32x4(fft9957, fft9957, 177));
__m512 fft10050 = _mm512_fmadd_ps(fft10042, fft9960, _mm512_shuffle_f32x4(fft10042, fft10042, 177));
__m512 fft9967 = _mm512_fmadd_ps(fft9958, fft9960, _mm512_shuffle_f32x4(fft9958, fft9958, 177));
__m512 fft10051 = _mm512_fmadd_ps(fft10043, fft9960, _mm512_shuffle_f32x4(fft10043, fft10043, 177));
__m512 fft9968 = _mm512_fmadd_ps(fft9959, fft9960, _mm512_shuffle_f32x4(fft9959, fft9959, 177));
__m512 fft10052 = _mm512_fmadd_ps(fft10044, fft9960, _mm512_shuffle_f32x4(fft10044, fft10044, 177));
__m512 fft9969 = _mm512_mask_mov_ps(fft9961, 49344, fft9962);
__m512 fft10053 = _mm512_mask_mov_ps(fft10045, 49344, fft10046);
__m512 fft9970 = _mm512_mask_sub_ps(fft9962, 49344, _mm512_setzero_ps(), fft9961);
__m512 fft10054 = _mm512_mask_sub_ps(fft10046, 49344, _mm512_setzero_ps(), fft10045);
__m512 fft9971 = _mm512_mask_mov_ps(fft9963, 49344, fft9964);
__m512 fft10055 = _mm512_mask_mov_ps(fft10047, 49344, fft10048);
__m512 fft9972 = _mm512_mask_sub_ps(fft9964, 49344, _mm512_setzero_ps(), fft9963);
__m512 fft10056 = _mm512_mask_sub_ps(fft10048, 49344, _mm512_setzero_ps(), fft10047);
__m512 fft9973 = _mm512_mask_mov_ps(fft9965, 49344, fft9966);
__m512 fft10057 = _mm512_mask_mov_ps(fft10049, 49344, fft10050);
__m512 fft9974 = _mm512_mask_sub_ps(fft9966, 49344, _mm512_setzero_ps(), fft9965);
__m512 fft10058 = _mm512_mask_sub_ps(fft10050, 49344, _mm512_setzero_ps(), fft10049);
__m512 fft9975 = _mm512_mask_mov_ps(fft9967, 49344, fft9968);
__m512 fft10059 = _mm512_mask_mov_ps(fft10051, 49344, fft10052);
__m512 fft9976 = _mm512_mask_sub_ps(fft9968, 49344, _mm512_setzero_ps(), fft9967);
__m512 fft10060 = _mm512_mask_sub_ps(fft10052, 49344, _mm512_setzero_ps(), fft10051);
__m512 fft9977 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft9978 = _mm512_fmadd_ps(fft9969, fft9977, _mm512_shuffle_ps(fft9969, fft9969, 78));
__m512 fft10061 = _mm512_fmadd_ps(fft10053, fft9977, _mm512_shuffle_ps(fft10053, fft10053, 78));
__m512 fft9979 = _mm512_fmadd_ps(fft9970, fft9977, _mm512_shuffle_ps(fft9970, fft9970, 78));
__m512 fft10062 = _mm512_fmadd_ps(fft10054, fft9977, _mm512_shuffle_ps(fft10054, fft10054, 78));
__m512 fft9980 = _mm512_fmadd_ps(fft9971, fft9977, _mm512_shuffle_ps(fft9971, fft9971, 78));
__m512 fft10063 = _mm512_fmadd_ps(fft10055, fft9977, _mm512_shuffle_ps(fft10055, fft10055, 78));
__m512 fft9981 = _mm512_fmadd_ps(fft9972, fft9977, _mm512_shuffle_ps(fft9972, fft9972, 78));
__m512 fft10064 = _mm512_fmadd_ps(fft10056, fft9977, _mm512_shuffle_ps(fft10056, fft10056, 78));
__m512 fft9982 = _mm512_fmadd_ps(fft9973, fft9977, _mm512_shuffle_ps(fft9973, fft9973, 78));
__m512 fft10065 = _mm512_fmadd_ps(fft10057, fft9977, _mm512_shuffle_ps(fft10057, fft10057, 78));
__m512 fft9983 = _mm512_fmadd_ps(fft9974, fft9977, _mm512_shuffle_ps(fft9974, fft9974, 78));
__m512 fft10066 = _mm512_fmadd_ps(fft10058, fft9977, _mm512_shuffle_ps(fft10058, fft10058, 78));
__m512 fft9984 = _mm512_fmadd_ps(fft9975, fft9977, _mm512_shuffle_ps(fft9975, fft9975, 78));
__m512 fft10067 = _mm512_fmadd_ps(fft10059, fft9977, _mm512_shuffle_ps(fft10059, fft10059, 78));
__m512 fft9985 = _mm512_fmadd_ps(fft9976, fft9977, _mm512_shuffle_ps(fft9976, fft9976, 78));
__m512 fft10068 = _mm512_fmadd_ps(fft10060, fft9977, _mm512_shuffle_ps(fft10060, fft10060, 78));
__m512i fft9986 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft9987 = _mm512_permutexvar_ps(fft9986, fft9978);
__m512 fft10069 = _mm512_permutexvar_ps(fft9986, fft10061);
__m512i fft9988 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft9989 = _mm512_permutexvar_ps(fft9988, fft9978);
__m512 fft10070 = _mm512_permutexvar_ps(fft9988, fft10061);
__m512 fft9990 = _mm512_permutexvar_ps(fft9986, fft9979);
__m512 fft10071 = _mm512_permutexvar_ps(fft9986, fft10062);
__m512 fft9991 = _mm512_permutexvar_ps(fft9988, fft9979);
__m512 fft10072 = _mm512_permutexvar_ps(fft9988, fft10062);
__m512 fft9992 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft9993 = _mm512_fmadd_ps(fft9987, fft9992, fft9989);
__m512 fft10073 = _mm512_fmadd_ps(fft10069, fft9992, fft10070);
__m512 fft9994 = _mm512_fnmadd_ps(fft9991, fft9992, fft9990);
__m512 fft10074 = _mm512_fnmadd_ps(fft10072, fft9992, fft10071);
__m512 fft9995 = _mm512_mask_mov_ps(fft9991, 21845, fft9993);
__m512 fft10075 = _mm512_mask_mov_ps(fft10072, 21845, fft10073);
__m512 fft9996 = _mm512_mask_mov_ps(fft9987, 43176, fft9993);
__m512 fft10076 = _mm512_mask_mov_ps(fft10069, 43176, fft10073);
__m512 fft9997 = _mm512_mask_mov_ps(fft9995, 43176, fft9994);
__m512 fft10077 = _mm512_mask_mov_ps(fft10075, 43176, fft10074);
__m512 fft9998 = _mm512_mask_mov_ps(fft9996, 22102, fft9994);
__m512 fft10078 = _mm512_mask_mov_ps(fft10076, 22102, fft10074);
__m512 fft9999 = _mm512_mask_mul_ps(fft9997, 64764, fft9997, _mm512_set1_ps(5e-01f));
__m512 fft10079 = _mm512_mask_mul_ps(fft10077, 64764, fft10077, _mm512_set1_ps(5e-01f));
__m512 fft10000 = _mm512_mask_mul_ps(fft9998, 64764, fft9998, _mm512_set1_ps(5e-01f));
__m512 fft10080 = _mm512_mask_mul_ps(fft10078, 64764, fft10078, _mm512_set1_ps(5e-01f));
__m512 df876 = fft9999;
__m512 df884 = fft10079;
__m512 df877 = fft10000;
__m512 df885 = fft10080;
__m512 df878 = fft9980;
__m512 df886 = fft10063;
__m512 df879 = fft9981;
__m512 df887 = fft10064;
__m512 df880 = fft9982;
__m512 df888 = fft10065;
__m512 df881 = fft9983;
__m512 df889 = fft10066;
__m512 df882 = fft9984;
__m512 df890 = fft10067;
__m512 df883 = fft9985;
__m512 df891 = fft10068;
__m512i eo60 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df878 = _mm512_permutexvar_ps(eo60, df878);
df879 = _mm512_permutexvar_ps(eo60, df879);
_mm512_mask_storeu_ps(dfPtr13+4096+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df878);
_mm512_mask_storeu_ps(dfPtr13+4160+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df879);
_mm512_mask_storeu_ps(dfPtr13+528352+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df878);
_mm512_mask_storeu_ps(dfPtr13+528416+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df879);
df886 = _mm512_permutexvar_ps(eo60, df886);
df887 = _mm512_permutexvar_ps(eo60, df887);
_mm512_mask_storeu_ps(dfPtr13+1052672+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df886);
_mm512_mask_storeu_ps(dfPtr13+1052736+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df887);
_mm512_mask_storeu_ps(dfPtr13+1576928+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df886);
_mm512_mask_storeu_ps(dfPtr13+1576992+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df887);
df880 = _mm512_permutexvar_ps(eo60, df880);
df881 = _mm512_permutexvar_ps(eo60, df881);
_mm512_mask_storeu_ps(dfPtr13+8192+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df880);
_mm512_mask_storeu_ps(dfPtr13+8256+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df881);
_mm512_mask_storeu_ps(dfPtr13+532448+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df880);
_mm512_mask_storeu_ps(dfPtr13+532512+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df881);
df888 = _mm512_permutexvar_ps(eo60, df888);
df889 = _mm512_permutexvar_ps(eo60, df889);
_mm512_mask_storeu_ps(dfPtr13+1056768+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df888);
_mm512_mask_storeu_ps(dfPtr13+1056832+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df889);
_mm512_mask_storeu_ps(dfPtr13+1581024+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df888);
_mm512_mask_storeu_ps(dfPtr13+1581088+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df889);
df882 = _mm512_permutexvar_ps(eo60, df882);
df883 = _mm512_permutexvar_ps(eo60, df883);
_mm512_mask_storeu_ps(dfPtr13+12288+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df882);
_mm512_mask_storeu_ps(dfPtr13+12352+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df883);
_mm512_mask_storeu_ps(dfPtr13+536544+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df882);
_mm512_mask_storeu_ps(dfPtr13+536608+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df883);
df890 = _mm512_permutexvar_ps(eo60, df890);
df891 = _mm512_permutexvar_ps(eo60, df891);
_mm512_mask_storeu_ps(dfPtr13+1060864+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df890);
_mm512_mask_storeu_ps(dfPtr13+1060928+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df891);
_mm512_mask_storeu_ps(dfPtr13+1585120+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df890);
_mm512_mask_storeu_ps(dfPtr13+1585184+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df891);
_mm512_mask_storeu_ps(dfPtr13+0+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df876);
_mm512_mask_storeu_ps(dfPtr13+64+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df877);
_mm512_mask_storeu_ps(dfPtr13+524256+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df876);
_mm512_mask_storeu_ps(dfPtr13+524320+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df877);
_mm512_mask_storeu_ps(dfPtr13+1048576+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df884);
_mm512_mask_storeu_ps(dfPtr13+1048640+16384*i60+6144*j52+256*k154+128*m60+32*f64, 255, df885);
_mm512_mask_storeu_ps(dfPtr13+1572832+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df884);
_mm512_mask_storeu_ps(dfPtr13+1572896+16384*i60+6144*j52+256*k154+128*m60+32*f64, 65280, df885);
ptrdiff_t b78 = 1;
ptrdiff_t m61 = (size_t)b78/2;
ptrdiff_t f65 = (size_t)b78%2;
__m512 dat2379 = _mm512_maskz_loadu_ps(32767, datPtr31+168+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2380 = _mm512_maskz_loadu_ps(32767, datPtr31+280+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2381 = _mm512_maskz_loadu_ps(32767, datPtr31+392+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2382 = _mm512_maskz_loadu_ps(32767, datPtr31+504+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2383 = _mm512_maskz_loadu_ps(32767, datPtr31+616+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2384 = _mm512_maskz_loadu_ps(32767, datPtr31+728+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2385 = _mm512_maskz_loadu_ps(32767, datPtr31+840+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2386 = _mm512_maskz_loadu_ps(32767, datPtr31+952+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2387 = _mm512_maskz_loadu_ps(32767, datPtr31+1064+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2388 = _mm512_maskz_loadu_ps(32767, datPtr31+1176+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2389 = _mm512_maskz_loadu_ps(32767, datPtr31+1288+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2390 = _mm512_maskz_loadu_ps(32767, datPtr31+1400+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2391 = _mm512_maskz_loadu_ps(32767, datPtr31+1512+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2392 = _mm512_maskz_loadu_ps(32767, datPtr31+1624+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 dat2393 = _mm512_maskz_loadu_ps(32767, datPtr31+1736+50176*i60+3136*k154+112*h51+4*w65+0*b78);
__m512 fft10081 = _mm512_add_ps(_mm512_setzero_ps(), dat2386);
__m512 fft10169 = _mm512_add_ps(dat2379, dat2387);
__m512 fft10082 = _mm512_sub_ps(_mm512_setzero_ps(), dat2386);
__m512 fft10170 = _mm512_sub_ps(dat2379, dat2387);
__m512 fft10083 = _mm512_add_ps(dat2380, dat2388);
__m512 fft10171 = _mm512_add_ps(dat2381, dat2389);
__m512 fft10084 = _mm512_sub_ps(dat2380, dat2388);
__m512 fft10172 = _mm512_sub_ps(dat2381, dat2389);
__m512 fft10085 = _mm512_add_ps(dat2382, dat2390);
__m512 fft10173 = _mm512_add_ps(dat2383, dat2391);
__m512 fft10086 = _mm512_sub_ps(dat2382, dat2390);
__m512 fft10174 = _mm512_sub_ps(dat2383, dat2391);
__m512 fft10087 = _mm512_add_ps(dat2384, dat2392);
__m512 fft10175 = _mm512_add_ps(dat2385, dat2393);
__m512 fft10088 = _mm512_sub_ps(dat2384, dat2392);
__m512 fft10176 = _mm512_sub_ps(dat2385, dat2393);
__m512 fft10089 = _mm512_add_ps(fft10081, fft10085);
__m512 fft10177 = _mm512_add_ps(fft10169, fft10173);
__m512 fft10090 = _mm512_sub_ps(fft10081, fft10085);
__m512 fft10178 = _mm512_sub_ps(fft10169, fft10173);
__m512 fft10091 = _mm512_add_ps(fft10083, fft10087);
__m512 fft10179 = _mm512_add_ps(fft10171, fft10175);
__m512 fft10092 = _mm512_sub_ps(fft10087, fft10083);
__m512 fft10180 = _mm512_sub_ps(fft10175, fft10171);
__m512 fft10093 = _mm512_sub_ps(fft10084, fft10088);
__m512 fft10181 = _mm512_sub_ps(fft10172, fft10176);
__m512 fft10094 = _mm512_add_ps(fft10084, fft10088);
__m512 fft10182 = _mm512_add_ps(fft10172, fft10176);
__m512 fft10095 = _mm512_add_ps(fft10089, fft10091);
__m512 fft10183 = _mm512_add_ps(fft10177, fft10179);
__m512 fft10096 = _mm512_sub_ps(fft10089, fft10091);
__m512 fft10184 = _mm512_sub_ps(fft10177, fft10179);
__m512 fft10097 = _mm512_fmadd_ps(fft10093, _mm512_set1_ps(7.0710677e-01f), fft10082);
__m512 fft10185 = _mm512_fmadd_ps(fft10181, _mm512_set1_ps(7.0710677e-01f), fft10170);
__m512 fft10098 = _mm512_fnmsub_ps(fft10094, _mm512_set1_ps(7.0710677e-01f), fft10086);
__m512 fft10186 = _mm512_fnmsub_ps(fft10182, _mm512_set1_ps(7.0710677e-01f), fft10174);
__m512 fft10099 = _mm512_fnmadd_ps(fft10093, _mm512_set1_ps(7.0710677e-01f), fft10082);
__m512 fft10187 = _mm512_fnmadd_ps(fft10181, _mm512_set1_ps(7.0710677e-01f), fft10170);
__m512 fft10100 = _mm512_fnmadd_ps(fft10094, _mm512_set1_ps(7.0710677e-01f), fft10086);
__m512 fft10188 = _mm512_fnmadd_ps(fft10182, _mm512_set1_ps(7.0710677e-01f), fft10174);
__m512 fft10101 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10102 = _mm512_fmadd_ps(fft10095, fft10101, _mm512_shuffle_f32x4(fft10095, fft10095, 78));
__m512 fft10189 = _mm512_fmadd_ps(fft10183, fft10101, _mm512_shuffle_f32x4(fft10183, fft10183, 78));
__m512 fft10103 = _mm512_fmadd_ps(fft10096, fft10101, _mm512_shuffle_f32x4(fft10096, fft10096, 78));
__m512 fft10190 = _mm512_fmadd_ps(fft10184, fft10101, _mm512_shuffle_f32x4(fft10184, fft10184, 78));
__m512 fft10104 = _mm512_fmadd_ps(fft10097, fft10101, _mm512_shuffle_f32x4(fft10097, fft10097, 78));
__m512 fft10191 = _mm512_fmadd_ps(fft10185, fft10101, _mm512_shuffle_f32x4(fft10185, fft10185, 78));
__m512 fft10105 = _mm512_fmadd_ps(fft10098, fft10101, _mm512_shuffle_f32x4(fft10098, fft10098, 78));
__m512 fft10192 = _mm512_fmadd_ps(fft10186, fft10101, _mm512_shuffle_f32x4(fft10186, fft10186, 78));
__m512 fft10106 = _mm512_fmadd_ps(fft10090, fft10101, _mm512_shuffle_f32x4(fft10090, fft10090, 78));
__m512 fft10193 = _mm512_fmadd_ps(fft10178, fft10101, _mm512_shuffle_f32x4(fft10178, fft10178, 78));
__m512 fft10107 = _mm512_fmadd_ps(fft10092, fft10101, _mm512_shuffle_f32x4(fft10092, fft10092, 78));
__m512 fft10194 = _mm512_fmadd_ps(fft10180, fft10101, _mm512_shuffle_f32x4(fft10180, fft10180, 78));
__m512 fft10108 = _mm512_fmadd_ps(fft10099, fft10101, _mm512_shuffle_f32x4(fft10099, fft10099, 78));
__m512 fft10195 = _mm512_fmadd_ps(fft10187, fft10101, _mm512_shuffle_f32x4(fft10187, fft10187, 78));
__m512 fft10109 = _mm512_fmadd_ps(fft10100, fft10101, _mm512_shuffle_f32x4(fft10100, fft10100, 78));
__m512 fft10196 = _mm512_fmadd_ps(fft10188, fft10101, _mm512_shuffle_f32x4(fft10188, fft10188, 78));
__m512 fft10110 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10111 = _mm512_mul_ps(fft10102, fft10110);
__m512 fft10197 = _mm512_mul_ps(fft10189, fft10110);
__m512 fft10112 = _mm512_mul_ps(fft10103, fft10110);
__m512 fft10198 = _mm512_mul_ps(fft10190, fft10110);
__m512 fft10113 = _mm512_mul_ps(fft10104, fft10110);
__m512 fft10199 = _mm512_mul_ps(fft10191, fft10110);
__m512 fft10114 = _mm512_mul_ps(fft10105, fft10110);
__m512 fft10200 = _mm512_mul_ps(fft10192, fft10110);
__m512 fft10115 = _mm512_mul_ps(fft10106, fft10110);
__m512 fft10201 = _mm512_mul_ps(fft10193, fft10110);
__m512 fft10116 = _mm512_mul_ps(fft10107, fft10110);
__m512 fft10202 = _mm512_mul_ps(fft10194, fft10110);
__m512 fft10117 = _mm512_mul_ps(fft10108, fft10110);
__m512 fft10203 = _mm512_mul_ps(fft10195, fft10110);
__m512 fft10118 = _mm512_mul_ps(fft10109, fft10110);
__m512 fft10204 = _mm512_mul_ps(fft10196, fft10110);
__m512 fft10119 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft10120 = _mm512_fmadd_ps(fft10103, fft10119, fft10111);
__m512 fft10205 = _mm512_fmadd_ps(fft10190, fft10119, fft10197);
__m512 fft10121 = _mm512_fnmadd_ps(fft10102, fft10119, fft10112);
__m512 fft10206 = _mm512_fnmadd_ps(fft10189, fft10119, fft10198);
__m512 fft10122 = _mm512_fmadd_ps(fft10105, fft10119, fft10113);
__m512 fft10207 = _mm512_fmadd_ps(fft10192, fft10119, fft10199);
__m512 fft10123 = _mm512_fnmadd_ps(fft10104, fft10119, fft10114);
__m512 fft10208 = _mm512_fnmadd_ps(fft10191, fft10119, fft10200);
__m512 fft10124 = _mm512_fmadd_ps(fft10107, fft10119, fft10115);
__m512 fft10209 = _mm512_fmadd_ps(fft10194, fft10119, fft10201);
__m512 fft10125 = _mm512_fnmadd_ps(fft10106, fft10119, fft10116);
__m512 fft10210 = _mm512_fnmadd_ps(fft10193, fft10119, fft10202);
__m512 fft10126 = _mm512_fmadd_ps(fft10109, fft10119, fft10117);
__m512 fft10211 = _mm512_fmadd_ps(fft10196, fft10119, fft10203);
__m512 fft10127 = _mm512_fnmadd_ps(fft10108, fft10119, fft10118);
__m512 fft10212 = _mm512_fnmadd_ps(fft10195, fft10119, fft10204);
__m512 fft10128 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10129 = _mm512_fmadd_ps(fft10120, fft10128, _mm512_shuffle_f32x4(fft10120, fft10120, 177));
__m512 fft10213 = _mm512_fmadd_ps(fft10205, fft10128, _mm512_shuffle_f32x4(fft10205, fft10205, 177));
__m512 fft10130 = _mm512_fmadd_ps(fft10121, fft10128, _mm512_shuffle_f32x4(fft10121, fft10121, 177));
__m512 fft10214 = _mm512_fmadd_ps(fft10206, fft10128, _mm512_shuffle_f32x4(fft10206, fft10206, 177));
__m512 fft10131 = _mm512_fmadd_ps(fft10122, fft10128, _mm512_shuffle_f32x4(fft10122, fft10122, 177));
__m512 fft10215 = _mm512_fmadd_ps(fft10207, fft10128, _mm512_shuffle_f32x4(fft10207, fft10207, 177));
__m512 fft10132 = _mm512_fmadd_ps(fft10123, fft10128, _mm512_shuffle_f32x4(fft10123, fft10123, 177));
__m512 fft10216 = _mm512_fmadd_ps(fft10208, fft10128, _mm512_shuffle_f32x4(fft10208, fft10208, 177));
__m512 fft10133 = _mm512_fmadd_ps(fft10124, fft10128, _mm512_shuffle_f32x4(fft10124, fft10124, 177));
__m512 fft10217 = _mm512_fmadd_ps(fft10209, fft10128, _mm512_shuffle_f32x4(fft10209, fft10209, 177));
__m512 fft10134 = _mm512_fmadd_ps(fft10125, fft10128, _mm512_shuffle_f32x4(fft10125, fft10125, 177));
__m512 fft10218 = _mm512_fmadd_ps(fft10210, fft10128, _mm512_shuffle_f32x4(fft10210, fft10210, 177));
__m512 fft10135 = _mm512_fmadd_ps(fft10126, fft10128, _mm512_shuffle_f32x4(fft10126, fft10126, 177));
__m512 fft10219 = _mm512_fmadd_ps(fft10211, fft10128, _mm512_shuffle_f32x4(fft10211, fft10211, 177));
__m512 fft10136 = _mm512_fmadd_ps(fft10127, fft10128, _mm512_shuffle_f32x4(fft10127, fft10127, 177));
__m512 fft10220 = _mm512_fmadd_ps(fft10212, fft10128, _mm512_shuffle_f32x4(fft10212, fft10212, 177));
__m512 fft10137 = _mm512_mask_mov_ps(fft10129, 49344, fft10130);
__m512 fft10221 = _mm512_mask_mov_ps(fft10213, 49344, fft10214);
__m512 fft10138 = _mm512_mask_sub_ps(fft10130, 49344, _mm512_setzero_ps(), fft10129);
__m512 fft10222 = _mm512_mask_sub_ps(fft10214, 49344, _mm512_setzero_ps(), fft10213);
__m512 fft10139 = _mm512_mask_mov_ps(fft10131, 49344, fft10132);
__m512 fft10223 = _mm512_mask_mov_ps(fft10215, 49344, fft10216);
__m512 fft10140 = _mm512_mask_sub_ps(fft10132, 49344, _mm512_setzero_ps(), fft10131);
__m512 fft10224 = _mm512_mask_sub_ps(fft10216, 49344, _mm512_setzero_ps(), fft10215);
__m512 fft10141 = _mm512_mask_mov_ps(fft10133, 49344, fft10134);
__m512 fft10225 = _mm512_mask_mov_ps(fft10217, 49344, fft10218);
__m512 fft10142 = _mm512_mask_sub_ps(fft10134, 49344, _mm512_setzero_ps(), fft10133);
__m512 fft10226 = _mm512_mask_sub_ps(fft10218, 49344, _mm512_setzero_ps(), fft10217);
__m512 fft10143 = _mm512_mask_mov_ps(fft10135, 49344, fft10136);
__m512 fft10227 = _mm512_mask_mov_ps(fft10219, 49344, fft10220);
__m512 fft10144 = _mm512_mask_sub_ps(fft10136, 49344, _mm512_setzero_ps(), fft10135);
__m512 fft10228 = _mm512_mask_sub_ps(fft10220, 49344, _mm512_setzero_ps(), fft10219);
__m512 fft10145 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10146 = _mm512_fmadd_ps(fft10137, fft10145, _mm512_shuffle_ps(fft10137, fft10137, 78));
__m512 fft10229 = _mm512_fmadd_ps(fft10221, fft10145, _mm512_shuffle_ps(fft10221, fft10221, 78));
__m512 fft10147 = _mm512_fmadd_ps(fft10138, fft10145, _mm512_shuffle_ps(fft10138, fft10138, 78));
__m512 fft10230 = _mm512_fmadd_ps(fft10222, fft10145, _mm512_shuffle_ps(fft10222, fft10222, 78));
__m512 fft10148 = _mm512_fmadd_ps(fft10139, fft10145, _mm512_shuffle_ps(fft10139, fft10139, 78));
__m512 fft10231 = _mm512_fmadd_ps(fft10223, fft10145, _mm512_shuffle_ps(fft10223, fft10223, 78));
__m512 fft10149 = _mm512_fmadd_ps(fft10140, fft10145, _mm512_shuffle_ps(fft10140, fft10140, 78));
__m512 fft10232 = _mm512_fmadd_ps(fft10224, fft10145, _mm512_shuffle_ps(fft10224, fft10224, 78));
__m512 fft10150 = _mm512_fmadd_ps(fft10141, fft10145, _mm512_shuffle_ps(fft10141, fft10141, 78));
__m512 fft10233 = _mm512_fmadd_ps(fft10225, fft10145, _mm512_shuffle_ps(fft10225, fft10225, 78));
__m512 fft10151 = _mm512_fmadd_ps(fft10142, fft10145, _mm512_shuffle_ps(fft10142, fft10142, 78));
__m512 fft10234 = _mm512_fmadd_ps(fft10226, fft10145, _mm512_shuffle_ps(fft10226, fft10226, 78));
__m512 fft10152 = _mm512_fmadd_ps(fft10143, fft10145, _mm512_shuffle_ps(fft10143, fft10143, 78));
__m512 fft10235 = _mm512_fmadd_ps(fft10227, fft10145, _mm512_shuffle_ps(fft10227, fft10227, 78));
__m512 fft10153 = _mm512_fmadd_ps(fft10144, fft10145, _mm512_shuffle_ps(fft10144, fft10144, 78));
__m512 fft10236 = _mm512_fmadd_ps(fft10228, fft10145, _mm512_shuffle_ps(fft10228, fft10228, 78));
__m512i fft10154 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10155 = _mm512_permutexvar_ps(fft10154, fft10146);
__m512 fft10237 = _mm512_permutexvar_ps(fft10154, fft10229);
__m512i fft10156 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10157 = _mm512_permutexvar_ps(fft10156, fft10146);
__m512 fft10238 = _mm512_permutexvar_ps(fft10156, fft10229);
__m512 fft10158 = _mm512_permutexvar_ps(fft10154, fft10147);
__m512 fft10239 = _mm512_permutexvar_ps(fft10154, fft10230);
__m512 fft10159 = _mm512_permutexvar_ps(fft10156, fft10147);
__m512 fft10240 = _mm512_permutexvar_ps(fft10156, fft10230);
__m512 fft10160 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft10161 = _mm512_fmadd_ps(fft10155, fft10160, fft10157);
__m512 fft10241 = _mm512_fmadd_ps(fft10237, fft10160, fft10238);
__m512 fft10162 = _mm512_fnmadd_ps(fft10159, fft10160, fft10158);
__m512 fft10242 = _mm512_fnmadd_ps(fft10240, fft10160, fft10239);
__m512 fft10163 = _mm512_mask_mov_ps(fft10159, 21845, fft10161);
__m512 fft10243 = _mm512_mask_mov_ps(fft10240, 21845, fft10241);
__m512 fft10164 = _mm512_mask_mov_ps(fft10155, 43176, fft10161);
__m512 fft10244 = _mm512_mask_mov_ps(fft10237, 43176, fft10241);
__m512 fft10165 = _mm512_mask_mov_ps(fft10163, 43176, fft10162);
__m512 fft10245 = _mm512_mask_mov_ps(fft10243, 43176, fft10242);
__m512 fft10166 = _mm512_mask_mov_ps(fft10164, 22102, fft10162);
__m512 fft10246 = _mm512_mask_mov_ps(fft10244, 22102, fft10242);
__m512 fft10167 = _mm512_mask_mul_ps(fft10165, 64764, fft10165, _mm512_set1_ps(5e-01f));
__m512 fft10247 = _mm512_mask_mul_ps(fft10245, 64764, fft10245, _mm512_set1_ps(5e-01f));
__m512 fft10168 = _mm512_mask_mul_ps(fft10166, 64764, fft10166, _mm512_set1_ps(5e-01f));
__m512 fft10248 = _mm512_mask_mul_ps(fft10246, 64764, fft10246, _mm512_set1_ps(5e-01f));
__m512 df892 = fft10167;
__m512 df900 = fft10247;
__m512 df893 = fft10168;
__m512 df901 = fft10248;
__m512 df894 = fft10148;
__m512 df902 = fft10231;
__m512 df895 = fft10149;
__m512 df903 = fft10232;
__m512 df896 = fft10150;
__m512 df904 = fft10233;
__m512 df897 = fft10151;
__m512 df905 = fft10234;
__m512 df898 = fft10152;
__m512 df906 = fft10235;
__m512 df899 = fft10153;
__m512 df907 = fft10236;
__m512i eo61 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df894 = _mm512_permutexvar_ps(eo61, df894);
df895 = _mm512_permutexvar_ps(eo61, df895);
_mm512_mask_storeu_ps(dfPtr13+4096+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df894);
_mm512_mask_storeu_ps(dfPtr13+4160+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df895);
_mm512_mask_storeu_ps(dfPtr13+528352+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df894);
_mm512_mask_storeu_ps(dfPtr13+528416+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df895);
df902 = _mm512_permutexvar_ps(eo61, df902);
df903 = _mm512_permutexvar_ps(eo61, df903);
_mm512_mask_storeu_ps(dfPtr13+1052672+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df902);
_mm512_mask_storeu_ps(dfPtr13+1052736+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df903);
_mm512_mask_storeu_ps(dfPtr13+1576928+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df902);
_mm512_mask_storeu_ps(dfPtr13+1576992+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df903);
df896 = _mm512_permutexvar_ps(eo61, df896);
df897 = _mm512_permutexvar_ps(eo61, df897);
_mm512_mask_storeu_ps(dfPtr13+8192+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df896);
_mm512_mask_storeu_ps(dfPtr13+8256+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df897);
_mm512_mask_storeu_ps(dfPtr13+532448+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df896);
_mm512_mask_storeu_ps(dfPtr13+532512+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df897);
df904 = _mm512_permutexvar_ps(eo61, df904);
df905 = _mm512_permutexvar_ps(eo61, df905);
_mm512_mask_storeu_ps(dfPtr13+1056768+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df904);
_mm512_mask_storeu_ps(dfPtr13+1056832+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df905);
_mm512_mask_storeu_ps(dfPtr13+1581024+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df904);
_mm512_mask_storeu_ps(dfPtr13+1581088+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df905);
df898 = _mm512_permutexvar_ps(eo61, df898);
df899 = _mm512_permutexvar_ps(eo61, df899);
_mm512_mask_storeu_ps(dfPtr13+12288+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df898);
_mm512_mask_storeu_ps(dfPtr13+12352+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df899);
_mm512_mask_storeu_ps(dfPtr13+536544+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df898);
_mm512_mask_storeu_ps(dfPtr13+536608+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df899);
df906 = _mm512_permutexvar_ps(eo61, df906);
df907 = _mm512_permutexvar_ps(eo61, df907);
_mm512_mask_storeu_ps(dfPtr13+1060864+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df906);
_mm512_mask_storeu_ps(dfPtr13+1060928+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df907);
_mm512_mask_storeu_ps(dfPtr13+1585120+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df906);
_mm512_mask_storeu_ps(dfPtr13+1585184+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df907);
_mm512_mask_storeu_ps(dfPtr13+0+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df892);
_mm512_mask_storeu_ps(dfPtr13+64+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df893);
_mm512_mask_storeu_ps(dfPtr13+524256+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df892);
_mm512_mask_storeu_ps(dfPtr13+524320+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df893);
_mm512_mask_storeu_ps(dfPtr13+1048576+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df900);
_mm512_mask_storeu_ps(dfPtr13+1048640+16384*i60+6144*j52+256*k154+128*m61+32*f65, 255, df901);
_mm512_mask_storeu_ps(dfPtr13+1572832+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df900);
_mm512_mask_storeu_ps(dfPtr13+1572896+16384*i60+6144*j52+256*k154+128*m61+32*f65, 65280, df901);
// Unrolled step b79 = 2. m62 = b79/2 = 1 and f66 = b79%2 = 0; they select the
// 128*m62+32*f66 slot used in the store addresses of this step.
ptrdiff_t b79 = 2;
ptrdiff_t m62 = (size_t)b79/2;
ptrdiff_t f66 = (size_t)b79%2;
// Fifteen masked loads whose address constants rise by 112 from 1568 to 3136.
// Mask 65534 (0xFFFE) zeroes lane 0 and loads lanes 1..15.
// The trailing 0*b79 term adds nothing to the address.
// NOTE(review): presumably datPtr31 is a byte pointer, making 112 one row of 28 floats — confirm at its declaration.
__m512 dat2394 = _mm512_maskz_loadu_ps(65534, datPtr31+1568+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2395 = _mm512_maskz_loadu_ps(65534, datPtr31+1680+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2396 = _mm512_maskz_loadu_ps(65534, datPtr31+1792+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2397 = _mm512_maskz_loadu_ps(65534, datPtr31+1904+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2398 = _mm512_maskz_loadu_ps(65534, datPtr31+2016+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2399 = _mm512_maskz_loadu_ps(65534, datPtr31+2128+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2400 = _mm512_maskz_loadu_ps(65534, datPtr31+2240+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2401 = _mm512_maskz_loadu_ps(65534, datPtr31+2352+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2402 = _mm512_maskz_loadu_ps(65534, datPtr31+2464+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2403 = _mm512_maskz_loadu_ps(65534, datPtr31+2576+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2404 = _mm512_maskz_loadu_ps(65534, datPtr31+2688+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2405 = _mm512_maskz_loadu_ps(65534, datPtr31+2800+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2406 = _mm512_maskz_loadu_ps(65534, datPtr31+2912+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2407 = _mm512_maskz_loadu_ps(65534, datPtr31+3024+50176*i60+3136*k154+112*h51+4*w65+0*b79);
__m512 dat2408 = _mm512_maskz_loadu_ps(65534, datPtr31+3136+50176*i60+3136*k154+112*h51+4*w65+0*b79);
// Sum/difference stages over the fifteen loaded vectors dat2394..dat2408.
// Two independent chains are interleaved line by line: fft10249.. works on
// dat2394, dat2396, ... and fft10337.. works on dat2395, dat2397, ...
// NOTE(review): the fft* names suggest a real-input FFT across 16 rows — confirm against the generator.
//
// Stage 1: each vector is paired with the one loaded eight positions later.
__m512 fft10249 = _mm512_add_ps(dat2394, dat2402);
__m512 fft10337 = _mm512_add_ps(dat2395, dat2403);
__m512 fft10250 = _mm512_sub_ps(dat2394, dat2402);
__m512 fft10338 = _mm512_sub_ps(dat2395, dat2403);
__m512 fft10251 = _mm512_add_ps(dat2396, dat2404);
__m512 fft10339 = _mm512_add_ps(dat2397, dat2405);
__m512 fft10252 = _mm512_sub_ps(dat2396, dat2404);
__m512 fft10340 = _mm512_sub_ps(dat2397, dat2405);
__m512 fft10253 = _mm512_add_ps(dat2398, dat2406);
__m512 fft10341 = _mm512_add_ps(dat2399, dat2407);
__m512 fft10254 = _mm512_sub_ps(dat2398, dat2406);
__m512 fft10342 = _mm512_sub_ps(dat2399, dat2407);
__m512 fft10255 = _mm512_add_ps(dat2400, dat2408);
// Only fifteen vectors were loaded, so dat2401 has no partner and is paired with zero.
__m512 fft10343 = _mm512_add_ps(dat2401, _mm512_setzero_ps());
__m512 fft10256 = _mm512_sub_ps(dat2400, dat2408);
__m512 fft10344 = _mm512_sub_ps(dat2401, _mm512_setzero_ps());
// Stage 2: combine the stage-1 sums with each other and the stage-1 differences with each other.
// Note the operand order in fft10260/fft10348: (later term) - (earlier term).
__m512 fft10257 = _mm512_add_ps(fft10249, fft10253);
__m512 fft10345 = _mm512_add_ps(fft10337, fft10341);
__m512 fft10258 = _mm512_sub_ps(fft10249, fft10253);
__m512 fft10346 = _mm512_sub_ps(fft10337, fft10341);
__m512 fft10259 = _mm512_add_ps(fft10251, fft10255);
__m512 fft10347 = _mm512_add_ps(fft10339, fft10343);
__m512 fft10260 = _mm512_sub_ps(fft10255, fft10251);
__m512 fft10348 = _mm512_sub_ps(fft10343, fft10339);
__m512 fft10261 = _mm512_sub_ps(fft10252, fft10256);
__m512 fft10349 = _mm512_sub_ps(fft10340, fft10344);
__m512 fft10262 = _mm512_add_ps(fft10252, fft10256);
__m512 fft10350 = _mm512_add_ps(fft10340, fft10344);
// Stage 3: final sums/differences, plus terms scaled by 7.0710677e-01f (about 1/sqrt(2)).
__m512 fft10263 = _mm512_add_ps(fft10257, fft10259);
__m512 fft10351 = _mm512_add_ps(fft10345, fft10347);
__m512 fft10264 = _mm512_sub_ps(fft10257, fft10259);
__m512 fft10352 = _mm512_sub_ps(fft10345, fft10347);
// fmadd:  x*c + y
__m512 fft10265 = _mm512_fmadd_ps(fft10261, _mm512_set1_ps(7.0710677e-01f), fft10250);
__m512 fft10353 = _mm512_fmadd_ps(fft10349, _mm512_set1_ps(7.0710677e-01f), fft10338);
// fnmsub: -(x*c) - y
__m512 fft10266 = _mm512_fnmsub_ps(fft10262, _mm512_set1_ps(7.0710677e-01f), fft10254);
__m512 fft10354 = _mm512_fnmsub_ps(fft10350, _mm512_set1_ps(7.0710677e-01f), fft10342);
// fnmadd: -(x*c) + y
__m512 fft10267 = _mm512_fnmadd_ps(fft10261, _mm512_set1_ps(7.0710677e-01f), fft10250);
__m512 fft10355 = _mm512_fnmadd_ps(fft10349, _mm512_set1_ps(7.0710677e-01f), fft10338);
__m512 fft10268 = _mm512_fnmadd_ps(fft10262, _mm512_set1_ps(7.0710677e-01f), fft10254);
__m512 fft10356 = _mm512_fnmadd_ps(fft10350, _mm512_set1_ps(7.0710677e-01f), fft10342);
// Combine the two 256-bit halves of each vector.
// fft10269 is +1 in lanes 0..7 and -1 in lanes 8..15; _mm512_shuffle_f32x4(x, x, 78)
// swaps the lower and upper 256-bit halves of x. So x*fft10269 + swapped(x) puts
// (lower + upper) in lanes 0..7 and (lower - upper) in lanes 8..15.
__m512 fft10269 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10270 = _mm512_fmadd_ps(fft10263, fft10269, _mm512_shuffle_f32x4(fft10263, fft10263, 78));
__m512 fft10357 = _mm512_fmadd_ps(fft10351, fft10269, _mm512_shuffle_f32x4(fft10351, fft10351, 78));
__m512 fft10271 = _mm512_fmadd_ps(fft10264, fft10269, _mm512_shuffle_f32x4(fft10264, fft10264, 78));
__m512 fft10358 = _mm512_fmadd_ps(fft10352, fft10269, _mm512_shuffle_f32x4(fft10352, fft10352, 78));
__m512 fft10272 = _mm512_fmadd_ps(fft10265, fft10269, _mm512_shuffle_f32x4(fft10265, fft10265, 78));
__m512 fft10359 = _mm512_fmadd_ps(fft10353, fft10269, _mm512_shuffle_f32x4(fft10353, fft10353, 78));
__m512 fft10273 = _mm512_fmadd_ps(fft10266, fft10269, _mm512_shuffle_f32x4(fft10266, fft10266, 78));
__m512 fft10360 = _mm512_fmadd_ps(fft10354, fft10269, _mm512_shuffle_f32x4(fft10354, fft10354, 78));
__m512 fft10274 = _mm512_fmadd_ps(fft10258, fft10269, _mm512_shuffle_f32x4(fft10258, fft10258, 78));
__m512 fft10361 = _mm512_fmadd_ps(fft10346, fft10269, _mm512_shuffle_f32x4(fft10346, fft10346, 78));
__m512 fft10275 = _mm512_fmadd_ps(fft10260, fft10269, _mm512_shuffle_f32x4(fft10260, fft10260, 78));
__m512 fft10362 = _mm512_fmadd_ps(fft10348, fft10269, _mm512_shuffle_f32x4(fft10348, fft10348, 78));
__m512 fft10276 = _mm512_fmadd_ps(fft10267, fft10269, _mm512_shuffle_f32x4(fft10267, fft10267, 78));
__m512 fft10363 = _mm512_fmadd_ps(fft10355, fft10269, _mm512_shuffle_f32x4(fft10355, fft10355, 78));
__m512 fft10277 = _mm512_fmadd_ps(fft10268, fft10269, _mm512_shuffle_f32x4(fft10268, fft10268, 78));
__m512 fft10364 = _mm512_fmadd_ps(fft10356, fft10269, _mm512_shuffle_f32x4(fft10356, fft10356, 78));
// Per-lane coefficient step applied to consecutive pairs (a, b) such as
// (fft10270, fft10271): the pair becomes
//   (a*fft10278 + b*fft10287,  b*fft10278 - a*fft10287).
// fft10278 by lane: 1 in lanes 0..9, 7.0710677e-01f in 10..11, 0 in 12..13, -7.0710677e-01f in 14..15.
__m512 fft10278 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
// First the products with fft10278 ...
__m512 fft10279 = _mm512_mul_ps(fft10270, fft10278);
__m512 fft10365 = _mm512_mul_ps(fft10357, fft10278);
__m512 fft10280 = _mm512_mul_ps(fft10271, fft10278);
__m512 fft10366 = _mm512_mul_ps(fft10358, fft10278);
__m512 fft10281 = _mm512_mul_ps(fft10272, fft10278);
__m512 fft10367 = _mm512_mul_ps(fft10359, fft10278);
__m512 fft10282 = _mm512_mul_ps(fft10273, fft10278);
__m512 fft10368 = _mm512_mul_ps(fft10360, fft10278);
__m512 fft10283 = _mm512_mul_ps(fft10274, fft10278);
__m512 fft10369 = _mm512_mul_ps(fft10361, fft10278);
__m512 fft10284 = _mm512_mul_ps(fft10275, fft10278);
__m512 fft10370 = _mm512_mul_ps(fft10362, fft10278);
__m512 fft10285 = _mm512_mul_ps(fft10276, fft10278);
__m512 fft10371 = _mm512_mul_ps(fft10363, fft10278);
__m512 fft10286 = _mm512_mul_ps(fft10277, fft10278);
__m512 fft10372 = _mm512_mul_ps(fft10364, fft10278);
// ... then the cross terms with fft10287, which by lane is 0 in lanes 0..9,
// 7.0710677e-01f in 10..11, 1 in 12..13 and 7.0710677e-01f in 14..15.
// In lanes 0..9 the coefficients are therefore 1 and 0.
__m512 fft10287 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
// fmadd adds the partner's cross term; fnmadd subtracts it.
__m512 fft10288 = _mm512_fmadd_ps(fft10271, fft10287, fft10279);
__m512 fft10373 = _mm512_fmadd_ps(fft10358, fft10287, fft10365);
__m512 fft10289 = _mm512_fnmadd_ps(fft10270, fft10287, fft10280);
__m512 fft10374 = _mm512_fnmadd_ps(fft10357, fft10287, fft10366);
__m512 fft10290 = _mm512_fmadd_ps(fft10273, fft10287, fft10281);
__m512 fft10375 = _mm512_fmadd_ps(fft10360, fft10287, fft10367);
__m512 fft10291 = _mm512_fnmadd_ps(fft10272, fft10287, fft10282);
__m512 fft10376 = _mm512_fnmadd_ps(fft10359, fft10287, fft10368);
__m512 fft10292 = _mm512_fmadd_ps(fft10275, fft10287, fft10283);
__m512 fft10377 = _mm512_fmadd_ps(fft10362, fft10287, fft10369);
__m512 fft10293 = _mm512_fnmadd_ps(fft10274, fft10287, fft10284);
__m512 fft10378 = _mm512_fnmadd_ps(fft10361, fft10287, fft10370);
__m512 fft10294 = _mm512_fmadd_ps(fft10277, fft10287, fft10285);
__m512 fft10379 = _mm512_fmadd_ps(fft10364, fft10287, fft10371);
__m512 fft10295 = _mm512_fnmadd_ps(fft10276, fft10287, fft10286);
__m512 fft10380 = _mm512_fnmadd_ps(fft10363, fft10287, fft10372);
// Combine adjacent 128-bit blocks inside each 256-bit half.
// fft10296 is +1 in lanes 0..3 and 8..11, -1 in lanes 4..7 and 12..15;
// _mm512_shuffle_f32x4(x, x, 177) swaps 128-bit blocks 0<->1 and 2<->3.
// So x*fft10296 + swapped(x) leaves (block0 + block1) in the even blocks and
// (block0 - block1) in the odd blocks, and likewise for blocks 2 and 3.
__m512 fft10296 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10297 = _mm512_fmadd_ps(fft10288, fft10296, _mm512_shuffle_f32x4(fft10288, fft10288, 177));
__m512 fft10381 = _mm512_fmadd_ps(fft10373, fft10296, _mm512_shuffle_f32x4(fft10373, fft10373, 177));
__m512 fft10298 = _mm512_fmadd_ps(fft10289, fft10296, _mm512_shuffle_f32x4(fft10289, fft10289, 177));
__m512 fft10382 = _mm512_fmadd_ps(fft10374, fft10296, _mm512_shuffle_f32x4(fft10374, fft10374, 177));
__m512 fft10299 = _mm512_fmadd_ps(fft10290, fft10296, _mm512_shuffle_f32x4(fft10290, fft10290, 177));
__m512 fft10383 = _mm512_fmadd_ps(fft10375, fft10296, _mm512_shuffle_f32x4(fft10375, fft10375, 177));
__m512 fft10300 = _mm512_fmadd_ps(fft10291, fft10296, _mm512_shuffle_f32x4(fft10291, fft10291, 177));
__m512 fft10384 = _mm512_fmadd_ps(fft10376, fft10296, _mm512_shuffle_f32x4(fft10376, fft10376, 177));
__m512 fft10301 = _mm512_fmadd_ps(fft10292, fft10296, _mm512_shuffle_f32x4(fft10292, fft10292, 177));
__m512 fft10385 = _mm512_fmadd_ps(fft10377, fft10296, _mm512_shuffle_f32x4(fft10377, fft10377, 177));
__m512 fft10302 = _mm512_fmadd_ps(fft10293, fft10296, _mm512_shuffle_f32x4(fft10293, fft10293, 177));
__m512 fft10386 = _mm512_fmadd_ps(fft10378, fft10296, _mm512_shuffle_f32x4(fft10378, fft10378, 177));
__m512 fft10303 = _mm512_fmadd_ps(fft10294, fft10296, _mm512_shuffle_f32x4(fft10294, fft10294, 177));
__m512 fft10387 = _mm512_fmadd_ps(fft10379, fft10296, _mm512_shuffle_f32x4(fft10379, fft10379, 177));
__m512 fft10304 = _mm512_fmadd_ps(fft10295, fft10296, _mm512_shuffle_f32x4(fft10295, fft10295, 177));
__m512 fft10388 = _mm512_fmadd_ps(fft10380, fft10296, _mm512_shuffle_f32x4(fft10380, fft10380, 177));
// Mask 49344 (0xC0C0) selects lanes 6, 7, 14 and 15. In those lanes only, each
// pair (p, q) becomes (q, -p): mask_mov takes q into the first member and
// mask_sub writes 0 - p into the second. All other lanes keep their values.
__m512 fft10305 = _mm512_mask_mov_ps(fft10297, 49344, fft10298);
__m512 fft10389 = _mm512_mask_mov_ps(fft10381, 49344, fft10382);
__m512 fft10306 = _mm512_mask_sub_ps(fft10298, 49344, _mm512_setzero_ps(), fft10297);
__m512 fft10390 = _mm512_mask_sub_ps(fft10382, 49344, _mm512_setzero_ps(), fft10381);
__m512 fft10307 = _mm512_mask_mov_ps(fft10299, 49344, fft10300);
__m512 fft10391 = _mm512_mask_mov_ps(fft10383, 49344, fft10384);
__m512 fft10308 = _mm512_mask_sub_ps(fft10300, 49344, _mm512_setzero_ps(), fft10299);
__m512 fft10392 = _mm512_mask_sub_ps(fft10384, 49344, _mm512_setzero_ps(), fft10383);
__m512 fft10309 = _mm512_mask_mov_ps(fft10301, 49344, fft10302);
__m512 fft10393 = _mm512_mask_mov_ps(fft10385, 49344, fft10386);
__m512 fft10310 = _mm512_mask_sub_ps(fft10302, 49344, _mm512_setzero_ps(), fft10301);
__m512 fft10394 = _mm512_mask_sub_ps(fft10386, 49344, _mm512_setzero_ps(), fft10385);
__m512 fft10311 = _mm512_mask_mov_ps(fft10303, 49344, fft10304);
__m512 fft10395 = _mm512_mask_mov_ps(fft10387, 49344, fft10388);
__m512 fft10312 = _mm512_mask_sub_ps(fft10304, 49344, _mm512_setzero_ps(), fft10303);
__m512 fft10396 = _mm512_mask_sub_ps(fft10388, 49344, _mm512_setzero_ps(), fft10387);
// Combine the two 64-bit halves inside every 128-bit block.
// fft10313 is +1 in the lower two lanes of each block and -1 in the upper two;
// _mm512_shuffle_ps(x, x, 78) swaps those two-lane halves within each block.
// So x*fft10313 + swapped(x) gives (x0+x2, x1+x3, x0-x2, x1-x3) per block.
__m512 fft10313 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10314 = _mm512_fmadd_ps(fft10305, fft10313, _mm512_shuffle_ps(fft10305, fft10305, 78));
__m512 fft10397 = _mm512_fmadd_ps(fft10389, fft10313, _mm512_shuffle_ps(fft10389, fft10389, 78));
__m512 fft10315 = _mm512_fmadd_ps(fft10306, fft10313, _mm512_shuffle_ps(fft10306, fft10306, 78));
__m512 fft10398 = _mm512_fmadd_ps(fft10390, fft10313, _mm512_shuffle_ps(fft10390, fft10390, 78));
__m512 fft10316 = _mm512_fmadd_ps(fft10307, fft10313, _mm512_shuffle_ps(fft10307, fft10307, 78));
__m512 fft10399 = _mm512_fmadd_ps(fft10391, fft10313, _mm512_shuffle_ps(fft10391, fft10391, 78));
__m512 fft10317 = _mm512_fmadd_ps(fft10308, fft10313, _mm512_shuffle_ps(fft10308, fft10308, 78));
__m512 fft10400 = _mm512_fmadd_ps(fft10392, fft10313, _mm512_shuffle_ps(fft10392, fft10392, 78));
__m512 fft10318 = _mm512_fmadd_ps(fft10309, fft10313, _mm512_shuffle_ps(fft10309, fft10309, 78));
__m512 fft10401 = _mm512_fmadd_ps(fft10393, fft10313, _mm512_shuffle_ps(fft10393, fft10393, 78));
__m512 fft10319 = _mm512_fmadd_ps(fft10310, fft10313, _mm512_shuffle_ps(fft10310, fft10310, 78));
__m512 fft10402 = _mm512_fmadd_ps(fft10394, fft10313, _mm512_shuffle_ps(fft10394, fft10394, 78));
__m512 fft10320 = _mm512_fmadd_ps(fft10311, fft10313, _mm512_shuffle_ps(fft10311, fft10311, 78));
__m512 fft10403 = _mm512_fmadd_ps(fft10395, fft10313, _mm512_shuffle_ps(fft10395, fft10395, 78));
__m512 fft10321 = _mm512_fmadd_ps(fft10312, fft10313, _mm512_shuffle_ps(fft10312, fft10312, 78));
__m512 fft10404 = _mm512_fmadd_ps(fft10396, fft10313, _mm512_shuffle_ps(fft10396, fft10396, 78));
// Extra recombination applied only to the first output pair of each chain
// (fft10314/fft10315 and fft10397/fft10398); the remaining outputs
// fft10316..fft10321 and fft10399..fft10404 are not touched here.
//
// Two lane gathers, each duplicating every picked source lane into two
// adjacent destination lanes. From lane 0 upward, fft10322 picks source lanes
// 2,2,4,4,8,8,12,12,3,3,5,5,9,9,13,13.
__m512i fft10322 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10323 = _mm512_permutexvar_ps(fft10322, fft10314);
__m512 fft10405 = _mm512_permutexvar_ps(fft10322, fft10397);
// From lane 0 upward, fft10324 picks source lanes 0,0,6,6,14,14,10,10,1,1,7,7,15,15,11,11.
__m512i fft10324 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10325 = _mm512_permutexvar_ps(fft10324, fft10314);
__m512 fft10406 = _mm512_permutexvar_ps(fft10324, fft10397);
__m512 fft10326 = _mm512_permutexvar_ps(fft10322, fft10315);
__m512 fft10407 = _mm512_permutexvar_ps(fft10322, fft10398);
__m512 fft10327 = _mm512_permutexvar_ps(fft10324, fft10315);
__m512 fft10408 = _mm512_permutexvar_ps(fft10324, fft10398);
// fft10328 is 0 in lanes 0, 1, 8, 9; elsewhere +1 in even lanes and -1 in odd lanes.
__m512 fft10328 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
// fft10329 = gather322(first)*fft10328 + gather324(first)
__m512 fft10329 = _mm512_fmadd_ps(fft10323, fft10328, fft10325);
__m512 fft10409 = _mm512_fmadd_ps(fft10405, fft10328, fft10406);
// fft10330 = gather322(second) - gather324(second)*fft10328
__m512 fft10330 = _mm512_fnmadd_ps(fft10327, fft10328, fft10326);
__m512 fft10410 = _mm512_fnmadd_ps(fft10408, fft10328, fft10407);
// Lane-wise blends that assemble the two result vectors:
//   21845 (0x5555) = even lanes, 43176 (0xA8A8) = lanes 3,5,7,11,13,15,
//   22102 (0x5656) = lanes 1,2,4,6,9,10,12,14.
// Lanes set in the mask take the last argument; the rest keep the first.
__m512 fft10331 = _mm512_mask_mov_ps(fft10327, 21845, fft10329);
__m512 fft10411 = _mm512_mask_mov_ps(fft10408, 21845, fft10409);
__m512 fft10332 = _mm512_mask_mov_ps(fft10323, 43176, fft10329);
__m512 fft10412 = _mm512_mask_mov_ps(fft10405, 43176, fft10409);
__m512 fft10333 = _mm512_mask_mov_ps(fft10331, 43176, fft10330);
__m512 fft10413 = _mm512_mask_mov_ps(fft10411, 43176, fft10410);
__m512 fft10334 = _mm512_mask_mov_ps(fft10332, 22102, fft10330);
__m512 fft10414 = _mm512_mask_mov_ps(fft10412, 22102, fft10410);
// Mask 64764 (0xFCFC): halve lanes 2..7 and 10..15; lanes 0, 1, 8, 9 are left as they are.
__m512 fft10335 = _mm512_mask_mul_ps(fft10333, 64764, fft10333, _mm512_set1_ps(5e-01f));
__m512 fft10415 = _mm512_mask_mul_ps(fft10413, 64764, fft10413, _mm512_set1_ps(5e-01f));
__m512 fft10336 = _mm512_mask_mul_ps(fft10334, 64764, fft10334, _mm512_set1_ps(5e-01f));
__m512 fft10416 = _mm512_mask_mul_ps(fft10414, 64764, fft10414, _mm512_set1_ps(5e-01f));
// Rename the sixteen results of this step to df908..df923 before storing.
// df908/df909 and df916/df917 are the specially recombined first pairs;
// the others come straight from the last combine stage.
__m512 df908 = fft10335;
__m512 df916 = fft10415;
__m512 df909 = fft10336;
__m512 df917 = fft10416;
__m512 df910 = fft10316;
__m512 df918 = fft10399;
__m512 df911 = fft10317;
__m512 df919 = fft10400;
__m512 df912 = fft10318;
__m512 df920 = fft10401;
__m512 df913 = fft10319;
__m512 df921 = fft10402;
__m512 df914 = fft10320;
__m512 df922 = fft10403;
__m512 df915 = fft10321;
__m512 df923 = fft10404;
// eo62 gathers even-numbered source lanes into lanes 0..7 and odd-numbered
// source lanes into lanes 8..15.
__m512i eo62 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
// Store pattern for every pair below: after the eo62 permute, lanes 0..7
// (mask 255) go to one address and lanes 8..15 (mask 65280) go to a second
// address whose constant is 524256 larger. The second vector of a pair is
// written at constant +64 from the first. The second chain (df916..df923)
// uses constants 1048576 above those of the first chain (df908..df915).
// NOTE(review): a masked store writes lane 8 at 32 bytes past its base, so if dfPtr13 is a
// byte pointer the upper lanes land exactly 524288 past the lower lanes — confirm the pointer type.
df910 = _mm512_permutexvar_ps(eo62, df910);
df911 = _mm512_permutexvar_ps(eo62, df911);
_mm512_mask_storeu_ps(dfPtr13+4096+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df910);
_mm512_mask_storeu_ps(dfPtr13+4160+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df911);
_mm512_mask_storeu_ps(dfPtr13+528352+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df910);
_mm512_mask_storeu_ps(dfPtr13+528416+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df911);
df918 = _mm512_permutexvar_ps(eo62, df918);
df919 = _mm512_permutexvar_ps(eo62, df919);
_mm512_mask_storeu_ps(dfPtr13+1052672+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df918);
_mm512_mask_storeu_ps(dfPtr13+1052736+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df919);
_mm512_mask_storeu_ps(dfPtr13+1576928+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df918);
_mm512_mask_storeu_ps(dfPtr13+1576992+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df919);
df912 = _mm512_permutexvar_ps(eo62, df912);
df913 = _mm512_permutexvar_ps(eo62, df913);
_mm512_mask_storeu_ps(dfPtr13+8192+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df912);
_mm512_mask_storeu_ps(dfPtr13+8256+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df913);
_mm512_mask_storeu_ps(dfPtr13+532448+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df912);
_mm512_mask_storeu_ps(dfPtr13+532512+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df913);
df920 = _mm512_permutexvar_ps(eo62, df920);
df921 = _mm512_permutexvar_ps(eo62, df921);
_mm512_mask_storeu_ps(dfPtr13+1056768+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df920);
_mm512_mask_storeu_ps(dfPtr13+1056832+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df921);
_mm512_mask_storeu_ps(dfPtr13+1581024+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df920);
_mm512_mask_storeu_ps(dfPtr13+1581088+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df921);
df914 = _mm512_permutexvar_ps(eo62, df914);
df915 = _mm512_permutexvar_ps(eo62, df915);
_mm512_mask_storeu_ps(dfPtr13+12288+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df914);
_mm512_mask_storeu_ps(dfPtr13+12352+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df915);
_mm512_mask_storeu_ps(dfPtr13+536544+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df914);
_mm512_mask_storeu_ps(dfPtr13+536608+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df915);
df922 = _mm512_permutexvar_ps(eo62, df922);
df923 = _mm512_permutexvar_ps(eo62, df923);
_mm512_mask_storeu_ps(dfPtr13+1060864+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df922);
_mm512_mask_storeu_ps(dfPtr13+1060928+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df923);
_mm512_mask_storeu_ps(dfPtr13+1585120+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df922);
_mm512_mask_storeu_ps(dfPtr13+1585184+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df923);
// The first pairs (df908/df909 and df916/df917) are stored without the eo62
// permute; their lanes were already arranged by the earlier gather/blend step.
_mm512_mask_storeu_ps(dfPtr13+0+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df908);
_mm512_mask_storeu_ps(dfPtr13+64+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df909);
_mm512_mask_storeu_ps(dfPtr13+524256+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df908);
_mm512_mask_storeu_ps(dfPtr13+524320+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df909);
_mm512_mask_storeu_ps(dfPtr13+1048576+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df916);
_mm512_mask_storeu_ps(dfPtr13+1048640+16384*i60+6144*j52+256*k154+128*m62+32*f66, 255, df917);
_mm512_mask_storeu_ps(dfPtr13+1572832+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df916);
_mm512_mask_storeu_ps(dfPtr13+1572896+16384*i60+6144*j52+256*k154+128*m62+32*f66, 65280, df917);
ptrdiff_t b80 = 3;
ptrdiff_t m63 = (size_t)b80/2;
ptrdiff_t f67 = (size_t)b80%2;
__m512 dat2409 = _mm512_maskz_loadu_ps(32767, datPtr31+1624+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2410 = _mm512_maskz_loadu_ps(32767, datPtr31+1736+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2411 = _mm512_maskz_loadu_ps(32767, datPtr31+1848+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2412 = _mm512_maskz_loadu_ps(32767, datPtr31+1960+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2413 = _mm512_maskz_loadu_ps(32767, datPtr31+2072+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2414 = _mm512_maskz_loadu_ps(32767, datPtr31+2184+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2415 = _mm512_maskz_loadu_ps(32767, datPtr31+2296+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2416 = _mm512_maskz_loadu_ps(32767, datPtr31+2408+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2417 = _mm512_maskz_loadu_ps(32767, datPtr31+2520+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2418 = _mm512_maskz_loadu_ps(32767, datPtr31+2632+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2419 = _mm512_maskz_loadu_ps(32767, datPtr31+2744+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2420 = _mm512_maskz_loadu_ps(32767, datPtr31+2856+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2421 = _mm512_maskz_loadu_ps(32767, datPtr31+2968+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2422 = _mm512_maskz_loadu_ps(32767, datPtr31+3080+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 dat2423 = _mm512_maskz_loadu_ps(32767, datPtr31+3192+50176*i60+3136*k154+112*h51+4*w65+0*b80);
__m512 fft10417 = _mm512_add_ps(dat2409, dat2417);
__m512 fft10505 = _mm512_add_ps(dat2410, dat2418);
__m512 fft10418 = _mm512_sub_ps(dat2409, dat2417);
__m512 fft10506 = _mm512_sub_ps(dat2410, dat2418);
__m512 fft10419 = _mm512_add_ps(dat2411, dat2419);
__m512 fft10507 = _mm512_add_ps(dat2412, dat2420);
__m512 fft10420 = _mm512_sub_ps(dat2411, dat2419);
__m512 fft10508 = _mm512_sub_ps(dat2412, dat2420);
__m512 fft10421 = _mm512_add_ps(dat2413, dat2421);
__m512 fft10509 = _mm512_add_ps(dat2414, dat2422);
__m512 fft10422 = _mm512_sub_ps(dat2413, dat2421);
__m512 fft10510 = _mm512_sub_ps(dat2414, dat2422);
__m512 fft10423 = _mm512_add_ps(dat2415, dat2423);
__m512 fft10511 = _mm512_add_ps(dat2416, _mm512_setzero_ps());
__m512 fft10424 = _mm512_sub_ps(dat2415, dat2423);
__m512 fft10512 = _mm512_sub_ps(dat2416, _mm512_setzero_ps());
__m512 fft10425 = _mm512_add_ps(fft10417, fft10421);
__m512 fft10513 = _mm512_add_ps(fft10505, fft10509);
__m512 fft10426 = _mm512_sub_ps(fft10417, fft10421);
__m512 fft10514 = _mm512_sub_ps(fft10505, fft10509);
__m512 fft10427 = _mm512_add_ps(fft10419, fft10423);
__m512 fft10515 = _mm512_add_ps(fft10507, fft10511);
__m512 fft10428 = _mm512_sub_ps(fft10423, fft10419);
__m512 fft10516 = _mm512_sub_ps(fft10511, fft10507);
__m512 fft10429 = _mm512_sub_ps(fft10420, fft10424);
__m512 fft10517 = _mm512_sub_ps(fft10508, fft10512);
__m512 fft10430 = _mm512_add_ps(fft10420, fft10424);
__m512 fft10518 = _mm512_add_ps(fft10508, fft10512);
__m512 fft10431 = _mm512_add_ps(fft10425, fft10427);
__m512 fft10519 = _mm512_add_ps(fft10513, fft10515);
__m512 fft10432 = _mm512_sub_ps(fft10425, fft10427);
__m512 fft10520 = _mm512_sub_ps(fft10513, fft10515);
__m512 fft10433 = _mm512_fmadd_ps(fft10429, _mm512_set1_ps(7.0710677e-01f), fft10418);
__m512 fft10521 = _mm512_fmadd_ps(fft10517, _mm512_set1_ps(7.0710677e-01f), fft10506);
__m512 fft10434 = _mm512_fnmsub_ps(fft10430, _mm512_set1_ps(7.0710677e-01f), fft10422);
__m512 fft10522 = _mm512_fnmsub_ps(fft10518, _mm512_set1_ps(7.0710677e-01f), fft10510);
__m512 fft10435 = _mm512_fnmadd_ps(fft10429, _mm512_set1_ps(7.0710677e-01f), fft10418);
__m512 fft10523 = _mm512_fnmadd_ps(fft10517, _mm512_set1_ps(7.0710677e-01f), fft10506);
__m512 fft10436 = _mm512_fnmadd_ps(fft10430, _mm512_set1_ps(7.0710677e-01f), fft10422);
__m512 fft10524 = _mm512_fnmadd_ps(fft10518, _mm512_set1_ps(7.0710677e-01f), fft10510);
__m512 fft10437 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10438 = _mm512_fmadd_ps(fft10431, fft10437, _mm512_shuffle_f32x4(fft10431, fft10431, 78));
__m512 fft10525 = _mm512_fmadd_ps(fft10519, fft10437, _mm512_shuffle_f32x4(fft10519, fft10519, 78));
__m512 fft10439 = _mm512_fmadd_ps(fft10432, fft10437, _mm512_shuffle_f32x4(fft10432, fft10432, 78));
__m512 fft10526 = _mm512_fmadd_ps(fft10520, fft10437, _mm512_shuffle_f32x4(fft10520, fft10520, 78));
__m512 fft10440 = _mm512_fmadd_ps(fft10433, fft10437, _mm512_shuffle_f32x4(fft10433, fft10433, 78));
__m512 fft10527 = _mm512_fmadd_ps(fft10521, fft10437, _mm512_shuffle_f32x4(fft10521, fft10521, 78));
__m512 fft10441 = _mm512_fmadd_ps(fft10434, fft10437, _mm512_shuffle_f32x4(fft10434, fft10434, 78));
__m512 fft10528 = _mm512_fmadd_ps(fft10522, fft10437, _mm512_shuffle_f32x4(fft10522, fft10522, 78));
__m512 fft10442 = _mm512_fmadd_ps(fft10426, fft10437, _mm512_shuffle_f32x4(fft10426, fft10426, 78));
__m512 fft10529 = _mm512_fmadd_ps(fft10514, fft10437, _mm512_shuffle_f32x4(fft10514, fft10514, 78));
__m512 fft10443 = _mm512_fmadd_ps(fft10428, fft10437, _mm512_shuffle_f32x4(fft10428, fft10428, 78));
__m512 fft10530 = _mm512_fmadd_ps(fft10516, fft10437, _mm512_shuffle_f32x4(fft10516, fft10516, 78));
__m512 fft10444 = _mm512_fmadd_ps(fft10435, fft10437, _mm512_shuffle_f32x4(fft10435, fft10435, 78));
__m512 fft10531 = _mm512_fmadd_ps(fft10523, fft10437, _mm512_shuffle_f32x4(fft10523, fft10523, 78));
__m512 fft10445 = _mm512_fmadd_ps(fft10436, fft10437, _mm512_shuffle_f32x4(fft10436, fft10436, 78));
__m512 fft10532 = _mm512_fmadd_ps(fft10524, fft10437, _mm512_shuffle_f32x4(fft10524, fft10524, 78));
__m512 fft10446 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10447 = _mm512_mul_ps(fft10438, fft10446);
__m512 fft10533 = _mm512_mul_ps(fft10525, fft10446);
__m512 fft10448 = _mm512_mul_ps(fft10439, fft10446);
__m512 fft10534 = _mm512_mul_ps(fft10526, fft10446);
__m512 fft10449 = _mm512_mul_ps(fft10440, fft10446);
__m512 fft10535 = _mm512_mul_ps(fft10527, fft10446);
__m512 fft10450 = _mm512_mul_ps(fft10441, fft10446);
__m512 fft10536 = _mm512_mul_ps(fft10528, fft10446);
__m512 fft10451 = _mm512_mul_ps(fft10442, fft10446);
__m512 fft10537 = _mm512_mul_ps(fft10529, fft10446);
__m512 fft10452 = _mm512_mul_ps(fft10443, fft10446);
__m512 fft10538 = _mm512_mul_ps(fft10530, fft10446);
__m512 fft10453 = _mm512_mul_ps(fft10444, fft10446);
__m512 fft10539 = _mm512_mul_ps(fft10531, fft10446);
__m512 fft10454 = _mm512_mul_ps(fft10445, fft10446);
__m512 fft10540 = _mm512_mul_ps(fft10532, fft10446);
__m512 fft10455 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft10456 = _mm512_fmadd_ps(fft10439, fft10455, fft10447);
__m512 fft10541 = _mm512_fmadd_ps(fft10526, fft10455, fft10533);
__m512 fft10457 = _mm512_fnmadd_ps(fft10438, fft10455, fft10448);
__m512 fft10542 = _mm512_fnmadd_ps(fft10525, fft10455, fft10534);
__m512 fft10458 = _mm512_fmadd_ps(fft10441, fft10455, fft10449);
__m512 fft10543 = _mm512_fmadd_ps(fft10528, fft10455, fft10535);
__m512 fft10459 = _mm512_fnmadd_ps(fft10440, fft10455, fft10450);
__m512 fft10544 = _mm512_fnmadd_ps(fft10527, fft10455, fft10536);
__m512 fft10460 = _mm512_fmadd_ps(fft10443, fft10455, fft10451);
__m512 fft10545 = _mm512_fmadd_ps(fft10530, fft10455, fft10537);
__m512 fft10461 = _mm512_fnmadd_ps(fft10442, fft10455, fft10452);
__m512 fft10546 = _mm512_fnmadd_ps(fft10529, fft10455, fft10538);
__m512 fft10462 = _mm512_fmadd_ps(fft10445, fft10455, fft10453);
__m512 fft10547 = _mm512_fmadd_ps(fft10532, fft10455, fft10539);
__m512 fft10463 = _mm512_fnmadd_ps(fft10444, fft10455, fft10454);
__m512 fft10548 = _mm512_fnmadd_ps(fft10531, fft10455, fft10540);
__m512 fft10464 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10465 = _mm512_fmadd_ps(fft10456, fft10464, _mm512_shuffle_f32x4(fft10456, fft10456, 177));
__m512 fft10549 = _mm512_fmadd_ps(fft10541, fft10464, _mm512_shuffle_f32x4(fft10541, fft10541, 177));
__m512 fft10466 = _mm512_fmadd_ps(fft10457, fft10464, _mm512_shuffle_f32x4(fft10457, fft10457, 177));
__m512 fft10550 = _mm512_fmadd_ps(fft10542, fft10464, _mm512_shuffle_f32x4(fft10542, fft10542, 177));
__m512 fft10467 = _mm512_fmadd_ps(fft10458, fft10464, _mm512_shuffle_f32x4(fft10458, fft10458, 177));
__m512 fft10551 = _mm512_fmadd_ps(fft10543, fft10464, _mm512_shuffle_f32x4(fft10543, fft10543, 177));
__m512 fft10468 = _mm512_fmadd_ps(fft10459, fft10464, _mm512_shuffle_f32x4(fft10459, fft10459, 177));
__m512 fft10552 = _mm512_fmadd_ps(fft10544, fft10464, _mm512_shuffle_f32x4(fft10544, fft10544, 177));
__m512 fft10469 = _mm512_fmadd_ps(fft10460, fft10464, _mm512_shuffle_f32x4(fft10460, fft10460, 177));
__m512 fft10553 = _mm512_fmadd_ps(fft10545, fft10464, _mm512_shuffle_f32x4(fft10545, fft10545, 177));
__m512 fft10470 = _mm512_fmadd_ps(fft10461, fft10464, _mm512_shuffle_f32x4(fft10461, fft10461, 177));
__m512 fft10554 = _mm512_fmadd_ps(fft10546, fft10464, _mm512_shuffle_f32x4(fft10546, fft10546, 177));
__m512 fft10471 = _mm512_fmadd_ps(fft10462, fft10464, _mm512_shuffle_f32x4(fft10462, fft10462, 177));
__m512 fft10555 = _mm512_fmadd_ps(fft10547, fft10464, _mm512_shuffle_f32x4(fft10547, fft10547, 177));
__m512 fft10472 = _mm512_fmadd_ps(fft10463, fft10464, _mm512_shuffle_f32x4(fft10463, fft10463, 177));
__m512 fft10556 = _mm512_fmadd_ps(fft10548, fft10464, _mm512_shuffle_f32x4(fft10548, fft10548, 177));
__m512 fft10473 = _mm512_mask_mov_ps(fft10465, 49344, fft10466);
__m512 fft10557 = _mm512_mask_mov_ps(fft10549, 49344, fft10550);
__m512 fft10474 = _mm512_mask_sub_ps(fft10466, 49344, _mm512_setzero_ps(), fft10465);
__m512 fft10558 = _mm512_mask_sub_ps(fft10550, 49344, _mm512_setzero_ps(), fft10549);
__m512 fft10475 = _mm512_mask_mov_ps(fft10467, 49344, fft10468);
__m512 fft10559 = _mm512_mask_mov_ps(fft10551, 49344, fft10552);
__m512 fft10476 = _mm512_mask_sub_ps(fft10468, 49344, _mm512_setzero_ps(), fft10467);
__m512 fft10560 = _mm512_mask_sub_ps(fft10552, 49344, _mm512_setzero_ps(), fft10551);
__m512 fft10477 = _mm512_mask_mov_ps(fft10469, 49344, fft10470);
__m512 fft10561 = _mm512_mask_mov_ps(fft10553, 49344, fft10554);
__m512 fft10478 = _mm512_mask_sub_ps(fft10470, 49344, _mm512_setzero_ps(), fft10469);
__m512 fft10562 = _mm512_mask_sub_ps(fft10554, 49344, _mm512_setzero_ps(), fft10553);
__m512 fft10479 = _mm512_mask_mov_ps(fft10471, 49344, fft10472);
__m512 fft10563 = _mm512_mask_mov_ps(fft10555, 49344, fft10556);
__m512 fft10480 = _mm512_mask_sub_ps(fft10472, 49344, _mm512_setzero_ps(), fft10471);
__m512 fft10564 = _mm512_mask_sub_ps(fft10556, 49344, _mm512_setzero_ps(), fft10555);
__m512 fft10481 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10482 = _mm512_fmadd_ps(fft10473, fft10481, _mm512_shuffle_ps(fft10473, fft10473, 78));
__m512 fft10565 = _mm512_fmadd_ps(fft10557, fft10481, _mm512_shuffle_ps(fft10557, fft10557, 78));
__m512 fft10483 = _mm512_fmadd_ps(fft10474, fft10481, _mm512_shuffle_ps(fft10474, fft10474, 78));
__m512 fft10566 = _mm512_fmadd_ps(fft10558, fft10481, _mm512_shuffle_ps(fft10558, fft10558, 78));
__m512 fft10484 = _mm512_fmadd_ps(fft10475, fft10481, _mm512_shuffle_ps(fft10475, fft10475, 78));
__m512 fft10567 = _mm512_fmadd_ps(fft10559, fft10481, _mm512_shuffle_ps(fft10559, fft10559, 78));
__m512 fft10485 = _mm512_fmadd_ps(fft10476, fft10481, _mm512_shuffle_ps(fft10476, fft10476, 78));
__m512 fft10568 = _mm512_fmadd_ps(fft10560, fft10481, _mm512_shuffle_ps(fft10560, fft10560, 78));
__m512 fft10486 = _mm512_fmadd_ps(fft10477, fft10481, _mm512_shuffle_ps(fft10477, fft10477, 78));
__m512 fft10569 = _mm512_fmadd_ps(fft10561, fft10481, _mm512_shuffle_ps(fft10561, fft10561, 78));
__m512 fft10487 = _mm512_fmadd_ps(fft10478, fft10481, _mm512_shuffle_ps(fft10478, fft10478, 78));
__m512 fft10570 = _mm512_fmadd_ps(fft10562, fft10481, _mm512_shuffle_ps(fft10562, fft10562, 78));
__m512 fft10488 = _mm512_fmadd_ps(fft10479, fft10481, _mm512_shuffle_ps(fft10479, fft10479, 78));
__m512 fft10571 = _mm512_fmadd_ps(fft10563, fft10481, _mm512_shuffle_ps(fft10563, fft10563, 78));
__m512 fft10489 = _mm512_fmadd_ps(fft10480, fft10481, _mm512_shuffle_ps(fft10480, fft10480, 78));
__m512 fft10572 = _mm512_fmadd_ps(fft10564, fft10481, _mm512_shuffle_ps(fft10564, fft10564, 78));
__m512i fft10490 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10491 = _mm512_permutexvar_ps(fft10490, fft10482);
__m512 fft10573 = _mm512_permutexvar_ps(fft10490, fft10565);
__m512i fft10492 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10493 = _mm512_permutexvar_ps(fft10492, fft10482);
__m512 fft10574 = _mm512_permutexvar_ps(fft10492, fft10565);
__m512 fft10494 = _mm512_permutexvar_ps(fft10490, fft10483);
__m512 fft10575 = _mm512_permutexvar_ps(fft10490, fft10566);
__m512 fft10495 = _mm512_permutexvar_ps(fft10492, fft10483);
__m512 fft10576 = _mm512_permutexvar_ps(fft10492, fft10566);
__m512 fft10496 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft10497 = _mm512_fmadd_ps(fft10491, fft10496, fft10493);
__m512 fft10577 = _mm512_fmadd_ps(fft10573, fft10496, fft10574);
__m512 fft10498 = _mm512_fnmadd_ps(fft10495, fft10496, fft10494);
__m512 fft10578 = _mm512_fnmadd_ps(fft10576, fft10496, fft10575);
__m512 fft10499 = _mm512_mask_mov_ps(fft10495, 21845, fft10497);
__m512 fft10579 = _mm512_mask_mov_ps(fft10576, 21845, fft10577);
__m512 fft10500 = _mm512_mask_mov_ps(fft10491, 43176, fft10497);
__m512 fft10580 = _mm512_mask_mov_ps(fft10573, 43176, fft10577);
__m512 fft10501 = _mm512_mask_mov_ps(fft10499, 43176, fft10498);
__m512 fft10581 = _mm512_mask_mov_ps(fft10579, 43176, fft10578);
__m512 fft10502 = _mm512_mask_mov_ps(fft10500, 22102, fft10498);
__m512 fft10582 = _mm512_mask_mov_ps(fft10580, 22102, fft10578);
__m512 fft10503 = _mm512_mask_mul_ps(fft10501, 64764, fft10501, _mm512_set1_ps(5e-01f));
__m512 fft10583 = _mm512_mask_mul_ps(fft10581, 64764, fft10581, _mm512_set1_ps(5e-01f));
__m512 fft10504 = _mm512_mask_mul_ps(fft10502, 64764, fft10502, _mm512_set1_ps(5e-01f));
__m512 fft10584 = _mm512_mask_mul_ps(fft10582, 64764, fft10582, _mm512_set1_ps(5e-01f));
__m512 df924 = fft10503;
__m512 df932 = fft10583;
__m512 df925 = fft10504;
__m512 df933 = fft10584;
__m512 df926 = fft10484;
__m512 df934 = fft10567;
__m512 df927 = fft10485;
__m512 df935 = fft10568;
__m512 df928 = fft10486;
__m512 df936 = fft10569;
__m512 df929 = fft10487;
__m512 df937 = fft10570;
__m512 df930 = fft10488;
__m512 df938 = fft10571;
__m512 df931 = fft10489;
__m512 df939 = fft10572;
__m512i eo63 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df926 = _mm512_permutexvar_ps(eo63, df926);
df927 = _mm512_permutexvar_ps(eo63, df927);
_mm512_mask_storeu_ps(dfPtr13+4096+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df926);
_mm512_mask_storeu_ps(dfPtr13+4160+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df927);
_mm512_mask_storeu_ps(dfPtr13+528352+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df926);
_mm512_mask_storeu_ps(dfPtr13+528416+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df927);
df934 = _mm512_permutexvar_ps(eo63, df934);
df935 = _mm512_permutexvar_ps(eo63, df935);
_mm512_mask_storeu_ps(dfPtr13+1052672+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df934);
_mm512_mask_storeu_ps(dfPtr13+1052736+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df935);
_mm512_mask_storeu_ps(dfPtr13+1576928+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df934);
_mm512_mask_storeu_ps(dfPtr13+1576992+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df935);
df928 = _mm512_permutexvar_ps(eo63, df928);
df929 = _mm512_permutexvar_ps(eo63, df929);
_mm512_mask_storeu_ps(dfPtr13+8192+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df928);
_mm512_mask_storeu_ps(dfPtr13+8256+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df929);
_mm512_mask_storeu_ps(dfPtr13+532448+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df928);
_mm512_mask_storeu_ps(dfPtr13+532512+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df929);
df936 = _mm512_permutexvar_ps(eo63, df936);
df937 = _mm512_permutexvar_ps(eo63, df937);
_mm512_mask_storeu_ps(dfPtr13+1056768+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df936);
_mm512_mask_storeu_ps(dfPtr13+1056832+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df937);
_mm512_mask_storeu_ps(dfPtr13+1581024+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df936);
_mm512_mask_storeu_ps(dfPtr13+1581088+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df937);
df930 = _mm512_permutexvar_ps(eo63, df930);
df931 = _mm512_permutexvar_ps(eo63, df931);
_mm512_mask_storeu_ps(dfPtr13+12288+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df930);
_mm512_mask_storeu_ps(dfPtr13+12352+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df931);
_mm512_mask_storeu_ps(dfPtr13+536544+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df930);
_mm512_mask_storeu_ps(dfPtr13+536608+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df931);
df938 = _mm512_permutexvar_ps(eo63, df938);
df939 = _mm512_permutexvar_ps(eo63, df939);
_mm512_mask_storeu_ps(dfPtr13+1060864+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df938);
_mm512_mask_storeu_ps(dfPtr13+1060928+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df939);
_mm512_mask_storeu_ps(dfPtr13+1585120+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df938);
_mm512_mask_storeu_ps(dfPtr13+1585184+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df939);
_mm512_mask_storeu_ps(dfPtr13+0+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df924);
_mm512_mask_storeu_ps(dfPtr13+64+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df925);
_mm512_mask_storeu_ps(dfPtr13+524256+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df924);
_mm512_mask_storeu_ps(dfPtr13+524320+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df925);
_mm512_mask_storeu_ps(dfPtr13+1048576+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df932);
_mm512_mask_storeu_ps(dfPtr13+1048640+16384*i60+6144*j52+256*k154+128*m63+32*f67, 255, df933);
_mm512_mask_storeu_ps(dfPtr13+1572832+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df932);
_mm512_mask_storeu_ps(dfPtr13+1572896+16384*i60+6144*j52+256*k154+128*m63+32*f67, 65280, df933);
}
++j52;
}
}

// Builds a 4-dimensional threader task whose callee is
// ResNeXt50StriderArrangeDats3Callee1 and whose opaque argument is the
// tensor pointer array, then hands it to ResNeXt50ThreaderDo1 on team63.
// The task hull is 1 x 1 x 16 x 1.
static void ResNeXt50StriderArrangeDats3(ResNeXt50ThreaderTeam1* team63, char** tensors99) {
	// Extent of each hull dimension, in dimension order.
	const ptrdiff_t extents[4] = {1, 1, 16, 1};
	ResNeXt50ThreaderTask1 job;
	job.callee1 = ResNeXt50StriderArrangeDats3Callee1;
	job.any1 = tensors99;
	job.nd1 = 4;
	for (ptrdiff_t dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	ResNeXt50ThreaderDo1(team63, &job);
}

// Threader callee for the "produce sums" stage.
//
// For one point of the task hull it multiply-accumulates half-precision
// values read through wfPtr against float values read through dfPtr and
// writes the float results through sfPtr.
// NOTE(review): the bf/wf/df/sf prefixes presumably mean bias / weights /
// data / sums in a Fourier-domain layout -- confirm against the generator.
//
// Inputs:
//   task104->any1  void*[3] tuple built by the dispatcher:
//                    [0] tensor pointer array (char**),
//                    [2] an integer index passed through a pointer (z6).
//                  Slot [1] is not read here; e30 is hard-wired to 0.
//   pt57[3]        this task's coordinate along hull dimension 3 (g33).
//
// Per 512-bit weight load, the low 256 bits are converted from fp16 to the
// "Re" vector and the high 256 bits to the "Im" vector. For each lane the
// general update is
//   sfRe += wfRe*dfRe + wfIm*dfIm
//   sfIm += wfRe*dfIm - wfIm*dfRe
// (the product conj(w)*d of complex w and d).
//
// When j == 0 the update is masked with 64764 (0xFCFC, lanes 2..7 and
// 10..15). In the excluded lanes 0, 1, 8 and 9 the cross terms are dropped:
//   sfRe += wfRe*dfRe
//   sfIm += wfIm*dfIm   (wfMx carries wfIm in exactly those lanes)
// NOTE(review): presumably those lanes hold purely real frequency bins
// packed into the Re/Im slots -- confirm.
//
// Two top-level paths:
//   z6 == 0: accumulators start from zero (plus bf values for j == 0) and
//            the results overwrite the sf buffer.
//   z6 != 0: accumulators start from zero and the previous sf contents are
//            added before storing back.
static void ResNeXt50StriderProduceSums3Callee1(ResNeXt50ThreaderTask1* task104, int64_t* pt57) {
void** tuple6 = task104->any1;
char** tensors102 = tuple6[0];
ptrdiff_t e30 = 0;
ptrdiff_t z6 = (ptrdiff_t)tuple6[2];
ptrdiff_t g33 = pt57[3];
ptrdiff_t p3 = 0;
ptrdiff_t d20 = 0;
ptrdiff_t w66 = 0;
// First pass (e30 == 0 and z6 == 0): initialise the sums instead of
// accumulating into them. The branch is hinted as the unlikely one.
if (__builtin_expect(!(e30|z6), 0)) {
// z6 is already 0 on this path; the assignment is redundant.
z6 = 0;
// bf and wf share tensors102[0]; wf starts 2048 bytes in.
char*restrict bfPtr14 = tensors102[0]+2048*e30;
char*restrict wfPtr14 = tensors102[0]+2048+130023424*e30+1048576*z6;
char*restrict dfPtr14 = tensors102[1]+65011712*e30+524288*z6;
char*restrict sfPtr13 = tensors102[2];
ptrdiff_t i61 = 1*g33;
// j runs over 0..3; j == 0 is peeled off below because it needs the
// masked update and the bf seeding.
ptrdiff_t j53 = 4*p3;
ptrdiff_t jj52 = j53+3;
if (__builtin_expect(!j53, 0)) {
ptrdiff_t k155 = 1*d20;
ptrdiff_t l64 = 4*w66;
for (; l64 != 4; ++l64) {
__m512 sfRe461 = _mm512_setzero_ps();
__m512 sfIm461 = _mm512_setzero_ps();
__m512 sfRe465 = _mm512_setzero_ps();
__m512 sfIm465 = _mm512_setzero_ps();
// Seed lane 0 (mask 1) and lane 8 (mask 256) of the two base Re
// accumulators with four consecutive floats from the bf buffer.
sfRe461 = _mm512_mask_mov_ps(sfRe461, 1, _mm512_set1_ps(*(float*)(bfPtr14+0+64*i61+16*l64)));
sfRe461 = _mm512_mask_mov_ps(sfRe461, 256, _mm512_set1_ps(*(float*)(bfPtr14+4+64*i61+16*l64)));
sfRe465 = _mm512_mask_mov_ps(sfRe465, 1, _mm512_set1_ps(*(float*)(bfPtr14+8+64*i61+16*l64)));
sfRe465 = _mm512_mask_mov_ps(sfRe465, 256, _mm512_set1_ps(*(float*)(bfPtr14+12+64*i61+16*l64)));
// The remaining accumulators start as copies of the seeded ones:
// 461..464 pair with the first weight vector, 465..468 with the second.
__m512 sfRe462 = sfRe461;
__m512 sfIm462 = sfIm461;
__m512 sfRe463 = sfRe461;
__m512 sfIm463 = sfIm461;
__m512 sfRe464 = sfRe461;
__m512 sfIm464 = sfIm461;
__m512 sfRe466 = sfRe465;
__m512 sfIm466 = sfIm465;
__m512 sfRe467 = sfRe465;
__m512 sfIm467 = sfIm465;
__m512 sfRe468 = sfRe465;
__m512 sfIm468 = sfIm465;
// Reduction over 16 steps; each step consumes 128 bytes of wf and
// 256 bytes of df.
for (ptrdiff_t s58 = 0; s58 < 16; ++s58) {
// Split each 512-bit load into two fp16 halves and widen to fp32.
__m512i wfLd33 = _mm512_loadu_si512(wfPtr14+0+32768*i61+8192*j53+2048*l64+128*s58);
__m512 wfRe33 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd33));
__m512 wfIm33 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd33, 1));
// wfMx = wfRe in the masked lanes, wfIm in lanes 0, 1, 8, 9.
__m512 wfMx17 = _mm512_mask_mov_ps(wfIm33, 64764, wfRe33);
__m512i wfLd34 = _mm512_loadu_si512(wfPtr14+64+32768*i61+8192*j53+2048*l64+128*s58);
__m512 wfRe34 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd34));
__m512 wfIm34 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd34, 1));
__m512 wfMx18 = _mm512_mask_mov_ps(wfIm34, 64764, wfRe34);
__m512 dfRe37 = _mm512_loadu_ps(dfPtr14+0+16384*i61+4096*j53+6144*k155+256*s58);
__m512 dfIm37 = _mm512_loadu_ps(dfPtr14+64+16384*i61+4096*j53+6144*k155+256*s58);
sfRe461 = _mm512_fmadd_ps(wfRe33, dfRe37, sfRe461);
sfRe461 = _mm512_mask3_fmadd_ps(wfIm33, dfIm37, sfRe461, 64764);
sfIm461 = _mm512_fmadd_ps(wfMx17, dfIm37, sfIm461);
sfIm461 = _mm512_mask3_fnmadd_ps(wfIm33, dfRe37, sfIm461, 64764);
sfRe465 = _mm512_fmadd_ps(wfRe34, dfRe37, sfRe465);
sfRe465 = _mm512_mask3_fmadd_ps(wfIm34, dfIm37, sfRe465, 64764);
sfIm465 = _mm512_fmadd_ps(wfMx18, dfIm37, sfIm465);
sfIm465 = _mm512_mask3_fnmadd_ps(wfIm34, dfRe37, sfIm465, 64764);
// Immediate 78 swaps the two 256-bit halves of the data vectors so
// the same weights are applied to the other half.
dfRe37 = _mm512_shuffle_f32x4(dfRe37, dfRe37, 78);
dfIm37 = _mm512_shuffle_f32x4(dfIm37, dfIm37, 78);
sfRe462 = _mm512_fmadd_ps(wfRe33, dfRe37, sfRe462);
sfRe462 = _mm512_mask3_fmadd_ps(wfIm33, dfIm37, sfRe462, 64764);
sfIm462 = _mm512_fmadd_ps(wfMx17, dfIm37, sfIm462);
sfIm462 = _mm512_mask3_fnmadd_ps(wfIm33, dfRe37, sfIm462, 64764);
sfRe466 = _mm512_fmadd_ps(wfRe34, dfRe37, sfRe466);
sfRe466 = _mm512_mask3_fmadd_ps(wfIm34, dfIm37, sfRe466, 64764);
sfIm466 = _mm512_fmadd_ps(wfMx18, dfIm37, sfIm466);
sfIm466 = _mm512_mask3_fnmadd_ps(wfIm34, dfRe37, sfIm466, 64764);
// Second Re/Im data pair of this step (byte offsets +128 / +192).
__m512 dfRe38 = _mm512_loadu_ps(dfPtr14+128+16384*i61+4096*j53+6144*k155+256*s58);
__m512 dfIm38 = _mm512_loadu_ps(dfPtr14+192+16384*i61+4096*j53+6144*k155+256*s58);
sfRe463 = _mm512_fmadd_ps(wfRe33, dfRe38, sfRe463);
sfRe463 = _mm512_mask3_fmadd_ps(wfIm33, dfIm38, sfRe463, 64764);
sfIm463 = _mm512_fmadd_ps(wfMx17, dfIm38, sfIm463);
sfIm463 = _mm512_mask3_fnmadd_ps(wfIm33, dfRe38, sfIm463, 64764);
sfRe467 = _mm512_fmadd_ps(wfRe34, dfRe38, sfRe467);
sfRe467 = _mm512_mask3_fmadd_ps(wfIm34, dfIm38, sfRe467, 64764);
sfIm467 = _mm512_fmadd_ps(wfMx18, dfIm38, sfIm467);
sfIm467 = _mm512_mask3_fnmadd_ps(wfIm34, dfRe38, sfIm467, 64764);
dfRe38 = _mm512_shuffle_f32x4(dfRe38, dfRe38, 78);
dfIm38 = _mm512_shuffle_f32x4(dfIm38, dfIm38, 78);
sfRe464 = _mm512_fmadd_ps(wfRe33, dfRe38, sfRe464);
sfRe464 = _mm512_mask3_fmadd_ps(wfIm33, dfIm38, sfRe464, 64764);
sfIm464 = _mm512_fmadd_ps(wfMx17, dfIm38, sfIm464);
sfIm464 = _mm512_mask3_fnmadd_ps(wfIm33, dfRe38, sfIm464, 64764);
sfRe468 = _mm512_fmadd_ps(wfRe34, dfRe38, sfRe468);
sfRe468 = _mm512_mask3_fmadd_ps(wfIm34, dfIm38, sfRe468, 64764);
sfIm468 = _mm512_fmadd_ps(wfMx18, dfIm38, sfIm468);
sfIm468 = _mm512_mask3_fnmadd_ps(wfIm34, dfRe38, sfIm468, 64764);
}
// Write the 16 accumulators as one contiguous 1024-byte record,
// alternating Re and Im vectors. Plain stores: no read of old sums.
_mm512_storeu_ps(sfPtr13+0+16384*i61+4096*j53+6144*k155+1024*l64, sfRe461);
_mm512_storeu_ps(sfPtr13+64+16384*i61+4096*j53+6144*k155+1024*l64, sfIm461);
_mm512_storeu_ps(sfPtr13+128+16384*i61+4096*j53+6144*k155+1024*l64, sfRe462);
_mm512_storeu_ps(sfPtr13+192+16384*i61+4096*j53+6144*k155+1024*l64, sfIm462);
_mm512_storeu_ps(sfPtr13+256+16384*i61+4096*j53+6144*k155+1024*l64, sfRe463);
_mm512_storeu_ps(sfPtr13+320+16384*i61+4096*j53+6144*k155+1024*l64, sfIm463);
_mm512_storeu_ps(sfPtr13+384+16384*i61+4096*j53+6144*k155+1024*l64, sfRe464);
_mm512_storeu_ps(sfPtr13+448+16384*i61+4096*j53+6144*k155+1024*l64, sfIm464);
_mm512_storeu_ps(sfPtr13+512+16384*i61+4096*j53+6144*k155+1024*l64, sfRe465);
_mm512_storeu_ps(sfPtr13+576+16384*i61+4096*j53+6144*k155+1024*l64, sfIm465);
_mm512_storeu_ps(sfPtr13+640+16384*i61+4096*j53+6144*k155+1024*l64, sfRe466);
_mm512_storeu_ps(sfPtr13+704+16384*i61+4096*j53+6144*k155+1024*l64, sfIm466);
_mm512_storeu_ps(sfPtr13+768+16384*i61+4096*j53+6144*k155+1024*l64, sfRe467);
_mm512_storeu_ps(sfPtr13+832+16384*i61+4096*j53+6144*k155+1024*l64, sfIm467);
_mm512_storeu_ps(sfPtr13+896+16384*i61+4096*j53+6144*k155+1024*l64, sfRe468);
_mm512_storeu_ps(sfPtr13+960+16384*i61+4096*j53+6144*k155+1024*l64, sfIm468);
}
// j == 0 is done; continue with the generic loop from j == 1.
j53 = 1;
}
// Generic j (1..3 on this path): unmasked update in every lane,
// no bf seeding, results overwrite sf.
for (; j53 <= jj52; ++j53) {
ptrdiff_t k156 = 1*d20;
ptrdiff_t l65 = 4*w66;
for (; l65 != 4; ++l65) {
__m512 sfRe469 = _mm512_setzero_ps();
__m512 sfIm469 = _mm512_setzero_ps();
__m512 sfRe473 = _mm512_setzero_ps();
__m512 sfIm473 = _mm512_setzero_ps();
// bf is not read on this path; the cast only marks it as used.
(void)bfPtr14;
__m512 sfRe470 = sfRe469;
__m512 sfIm470 = sfIm469;
__m512 sfRe471 = sfRe469;
__m512 sfIm471 = sfIm469;
__m512 sfRe472 = sfRe469;
__m512 sfIm472 = sfIm469;
__m512 sfRe474 = sfRe473;
__m512 sfIm474 = sfIm473;
__m512 sfRe475 = sfRe473;
__m512 sfIm475 = sfIm473;
__m512 sfRe476 = sfRe473;
__m512 sfIm476 = sfIm473;
for (ptrdiff_t s59 = 0; s59 < 16; ++s59) {
__m512i wfLd35 = _mm512_loadu_si512(wfPtr14+0+32768*i61+8192*j53+2048*l65+128*s59);
__m512 wfRe35 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd35));
__m512 wfIm35 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd35, 1));
__m512i wfLd36 = _mm512_loadu_si512(wfPtr14+64+32768*i61+8192*j53+2048*l65+128*s59);
__m512 wfRe36 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd36));
__m512 wfIm36 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd36, 1));
__m512 dfRe39 = _mm512_loadu_ps(dfPtr14+0+16384*i61+4096*j53+6144*k156+256*s59);
__m512 dfIm39 = _mm512_loadu_ps(dfPtr14+64+16384*i61+4096*j53+6144*k156+256*s59);
sfRe469 = _mm512_fmadd_ps(wfRe35, dfRe39, sfRe469);
sfRe469 = _mm512_fmadd_ps(wfIm35, dfIm39, sfRe469);
sfIm469 = _mm512_fmadd_ps(wfRe35, dfIm39, sfIm469);
sfIm469 = _mm512_fnmadd_ps(wfIm35, dfRe39, sfIm469);
sfRe473 = _mm512_fmadd_ps(wfRe36, dfRe39, sfRe473);
sfRe473 = _mm512_fmadd_ps(wfIm36, dfIm39, sfRe473);
sfIm473 = _mm512_fmadd_ps(wfRe36, dfIm39, sfIm473);
sfIm473 = _mm512_fnmadd_ps(wfIm36, dfRe39, sfIm473);
// Swap the 256-bit halves of the data and repeat.
dfRe39 = _mm512_shuffle_f32x4(dfRe39, dfRe39, 78);
dfIm39 = _mm512_shuffle_f32x4(dfIm39, dfIm39, 78);
sfRe470 = _mm512_fmadd_ps(wfRe35, dfRe39, sfRe470);
sfRe470 = _mm512_fmadd_ps(wfIm35, dfIm39, sfRe470);
sfIm470 = _mm512_fmadd_ps(wfRe35, dfIm39, sfIm470);
sfIm470 = _mm512_fnmadd_ps(wfIm35, dfRe39, sfIm470);
sfRe474 = _mm512_fmadd_ps(wfRe36, dfRe39, sfRe474);
sfRe474 = _mm512_fmadd_ps(wfIm36, dfIm39, sfRe474);
sfIm474 = _mm512_fmadd_ps(wfRe36, dfIm39, sfIm474);
sfIm474 = _mm512_fnmadd_ps(wfIm36, dfRe39, sfIm474);
__m512 dfRe40 = _mm512_loadu_ps(dfPtr14+128+16384*i61+4096*j53+6144*k156+256*s59);
__m512 dfIm40 = _mm512_loadu_ps(dfPtr14+192+16384*i61+4096*j53+6144*k156+256*s59);
sfRe471 = _mm512_fmadd_ps(wfRe35, dfRe40, sfRe471);
sfRe471 = _mm512_fmadd_ps(wfIm35, dfIm40, sfRe471);
sfIm471 = _mm512_fmadd_ps(wfRe35, dfIm40, sfIm471);
sfIm471 = _mm512_fnmadd_ps(wfIm35, dfRe40, sfIm471);
sfRe475 = _mm512_fmadd_ps(wfRe36, dfRe40, sfRe475);
sfRe475 = _mm512_fmadd_ps(wfIm36, dfIm40, sfRe475);
sfIm475 = _mm512_fmadd_ps(wfRe36, dfIm40, sfIm475);
sfIm475 = _mm512_fnmadd_ps(wfIm36, dfRe40, sfIm475);
dfRe40 = _mm512_shuffle_f32x4(dfRe40, dfRe40, 78);
dfIm40 = _mm512_shuffle_f32x4(dfIm40, dfIm40, 78);
sfRe472 = _mm512_fmadd_ps(wfRe35, dfRe40, sfRe472);
sfRe472 = _mm512_fmadd_ps(wfIm35, dfIm40, sfRe472);
sfIm472 = _mm512_fmadd_ps(wfRe35, dfIm40, sfIm472);
sfIm472 = _mm512_fnmadd_ps(wfIm35, dfRe40, sfIm472);
sfRe476 = _mm512_fmadd_ps(wfRe36, dfRe40, sfRe476);
sfRe476 = _mm512_fmadd_ps(wfIm36, dfIm40, sfRe476);
sfIm476 = _mm512_fmadd_ps(wfRe36, dfIm40, sfIm476);
sfIm476 = _mm512_fnmadd_ps(wfIm36, dfRe40, sfIm476);
}
_mm512_storeu_ps(sfPtr13+0+16384*i61+4096*j53+6144*k156+1024*l65, sfRe469);
_mm512_storeu_ps(sfPtr13+64+16384*i61+4096*j53+6144*k156+1024*l65, sfIm469);
_mm512_storeu_ps(sfPtr13+128+16384*i61+4096*j53+6144*k156+1024*l65, sfRe470);
_mm512_storeu_ps(sfPtr13+192+16384*i61+4096*j53+6144*k156+1024*l65, sfIm470);
_mm512_storeu_ps(sfPtr13+256+16384*i61+4096*j53+6144*k156+1024*l65, sfRe471);
_mm512_storeu_ps(sfPtr13+320+16384*i61+4096*j53+6144*k156+1024*l65, sfIm471);
_mm512_storeu_ps(sfPtr13+384+16384*i61+4096*j53+6144*k156+1024*l65, sfRe472);
_mm512_storeu_ps(sfPtr13+448+16384*i61+4096*j53+6144*k156+1024*l65, sfIm472);
_mm512_storeu_ps(sfPtr13+512+16384*i61+4096*j53+6144*k156+1024*l65, sfRe473);
_mm512_storeu_ps(sfPtr13+576+16384*i61+4096*j53+6144*k156+1024*l65, sfIm473);
_mm512_storeu_ps(sfPtr13+640+16384*i61+4096*j53+6144*k156+1024*l65, sfRe474);
_mm512_storeu_ps(sfPtr13+704+16384*i61+4096*j53+6144*k156+1024*l65, sfIm474);
_mm512_storeu_ps(sfPtr13+768+16384*i61+4096*j53+6144*k156+1024*l65, sfRe475);
_mm512_storeu_ps(sfPtr13+832+16384*i61+4096*j53+6144*k156+1024*l65, sfIm475);
_mm512_storeu_ps(sfPtr13+896+16384*i61+4096*j53+6144*k156+1024*l65, sfRe476);
_mm512_storeu_ps(sfPtr13+960+16384*i61+4096*j53+6144*k156+1024*l65, sfIm476);
}
}
return;
}
// Later passes (z6 != 0): same arithmetic, but the wf/df bases are offset
// by z6 and the results are added to what is already stored in sf.
char*restrict bfPtr15 = tensors102[0]+2048*e30;
char*restrict wfPtr15 = tensors102[0]+2048+130023424*e30+1048576*z6;
char*restrict dfPtr15 = tensors102[1]+65011712*e30+524288*z6;
char*restrict sfPtr14 = tensors102[2];
ptrdiff_t i62 = 1*g33;
ptrdiff_t j54 = 4*p3;
ptrdiff_t jj53 = j54+3;
// j == 0 again uses the 0xFCFC-masked update, without bf seeding.
if (__builtin_expect(!j54, 0)) {
ptrdiff_t k157 = 1*d20;
ptrdiff_t l66 = 4*w66;
for (; l66 != 4; ++l66) {
__m512 sfRe477 = _mm512_setzero_ps();
__m512 sfIm477 = _mm512_setzero_ps();
__m512 sfRe481 = _mm512_setzero_ps();
__m512 sfIm481 = _mm512_setzero_ps();
(void)bfPtr15;
__m512 sfRe478 = sfRe477;
__m512 sfIm478 = sfIm477;
__m512 sfRe479 = sfRe477;
__m512 sfIm479 = sfIm477;
__m512 sfRe480 = sfRe477;
__m512 sfIm480 = sfIm477;
__m512 sfRe482 = sfRe481;
__m512 sfIm482 = sfIm481;
__m512 sfRe483 = sfRe481;
__m512 sfIm483 = sfIm481;
__m512 sfRe484 = sfRe481;
__m512 sfIm484 = sfIm481;
for (ptrdiff_t s60 = 0; s60 < 16; ++s60) {
__m512i wfLd37 = _mm512_loadu_si512(wfPtr15+0+32768*i62+8192*j54+2048*l66+128*s60);
__m512 wfRe37 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd37));
__m512 wfIm37 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd37, 1));
__m512 wfMx19 = _mm512_mask_mov_ps(wfIm37, 64764, wfRe37);
__m512i wfLd38 = _mm512_loadu_si512(wfPtr15+64+32768*i62+8192*j54+2048*l66+128*s60);
__m512 wfRe38 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd38));
__m512 wfIm38 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd38, 1));
__m512 wfMx20 = _mm512_mask_mov_ps(wfIm38, 64764, wfRe38);
__m512 dfRe41 = _mm512_loadu_ps(dfPtr15+0+16384*i62+4096*j54+6144*k157+256*s60);
__m512 dfIm41 = _mm512_loadu_ps(dfPtr15+64+16384*i62+4096*j54+6144*k157+256*s60);
sfRe477 = _mm512_fmadd_ps(wfRe37, dfRe41, sfRe477);
sfRe477 = _mm512_mask3_fmadd_ps(wfIm37, dfIm41, sfRe477, 64764);
sfIm477 = _mm512_fmadd_ps(wfMx19, dfIm41, sfIm477);
sfIm477 = _mm512_mask3_fnmadd_ps(wfIm37, dfRe41, sfIm477, 64764);
sfRe481 = _mm512_fmadd_ps(wfRe38, dfRe41, sfRe481);
sfRe481 = _mm512_mask3_fmadd_ps(wfIm38, dfIm41, sfRe481, 64764);
sfIm481 = _mm512_fmadd_ps(wfMx20, dfIm41, sfIm481);
sfIm481 = _mm512_mask3_fnmadd_ps(wfIm38, dfRe41, sfIm481, 64764);
dfRe41 = _mm512_shuffle_f32x4(dfRe41, dfRe41, 78);
dfIm41 = _mm512_shuffle_f32x4(dfIm41, dfIm41, 78);
sfRe478 = _mm512_fmadd_ps(wfRe37, dfRe41, sfRe478);
sfRe478 = _mm512_mask3_fmadd_ps(wfIm37, dfIm41, sfRe478, 64764);
sfIm478 = _mm512_fmadd_ps(wfMx19, dfIm41, sfIm478);
sfIm478 = _mm512_mask3_fnmadd_ps(wfIm37, dfRe41, sfIm478, 64764);
sfRe482 = _mm512_fmadd_ps(wfRe38, dfRe41, sfRe482);
sfRe482 = _mm512_mask3_fmadd_ps(wfIm38, dfIm41, sfRe482, 64764);
sfIm482 = _mm512_fmadd_ps(wfMx20, dfIm41, sfIm482);
sfIm482 = _mm512_mask3_fnmadd_ps(wfIm38, dfRe41, sfIm482, 64764);
__m512 dfRe42 = _mm512_loadu_ps(dfPtr15+128+16384*i62+4096*j54+6144*k157+256*s60);
__m512 dfIm42 = _mm512_loadu_ps(dfPtr15+192+16384*i62+4096*j54+6144*k157+256*s60);
sfRe479 = _mm512_fmadd_ps(wfRe37, dfRe42, sfRe479);
sfRe479 = _mm512_mask3_fmadd_ps(wfIm37, dfIm42, sfRe479, 64764);
sfIm479 = _mm512_fmadd_ps(wfMx19, dfIm42, sfIm479);
sfIm479 = _mm512_mask3_fnmadd_ps(wfIm37, dfRe42, sfIm479, 64764);
sfRe483 = _mm512_fmadd_ps(wfRe38, dfRe42, sfRe483);
sfRe483 = _mm512_mask3_fmadd_ps(wfIm38, dfIm42, sfRe483, 64764);
sfIm483 = _mm512_fmadd_ps(wfMx20, dfIm42, sfIm483);
sfIm483 = _mm512_mask3_fnmadd_ps(wfIm38, dfRe42, sfIm483, 64764);
dfRe42 = _mm512_shuffle_f32x4(dfRe42, dfRe42, 78);
dfIm42 = _mm512_shuffle_f32x4(dfIm42, dfIm42, 78);
sfRe480 = _mm512_fmadd_ps(wfRe37, dfRe42, sfRe480);
sfRe480 = _mm512_mask3_fmadd_ps(wfIm37, dfIm42, sfRe480, 64764);
sfIm480 = _mm512_fmadd_ps(wfMx19, dfIm42, sfIm480);
sfIm480 = _mm512_mask3_fnmadd_ps(wfIm37, dfRe42, sfIm480, 64764);
sfRe484 = _mm512_fmadd_ps(wfRe38, dfRe42, sfRe484);
sfRe484 = _mm512_mask3_fmadd_ps(wfIm38, dfIm42, sfRe484, 64764);
sfIm484 = _mm512_fmadd_ps(wfMx20, dfIm42, sfIm484);
sfIm484 = _mm512_mask3_fnmadd_ps(wfIm38, dfRe42, sfIm484, 64764);
}
// Fold in the sums written by earlier passes, then store back to the
// same locations (read-modify-write of the 1024-byte record).
sfRe477 = _mm512_add_ps(sfRe477, _mm512_loadu_ps(sfPtr14+0+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm477 = _mm512_add_ps(sfIm477, _mm512_loadu_ps(sfPtr14+64+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe478 = _mm512_add_ps(sfRe478, _mm512_loadu_ps(sfPtr14+128+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm478 = _mm512_add_ps(sfIm478, _mm512_loadu_ps(sfPtr14+192+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe479 = _mm512_add_ps(sfRe479, _mm512_loadu_ps(sfPtr14+256+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm479 = _mm512_add_ps(sfIm479, _mm512_loadu_ps(sfPtr14+320+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe480 = _mm512_add_ps(sfRe480, _mm512_loadu_ps(sfPtr14+384+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm480 = _mm512_add_ps(sfIm480, _mm512_loadu_ps(sfPtr14+448+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe481 = _mm512_add_ps(sfRe481, _mm512_loadu_ps(sfPtr14+512+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm481 = _mm512_add_ps(sfIm481, _mm512_loadu_ps(sfPtr14+576+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe482 = _mm512_add_ps(sfRe482, _mm512_loadu_ps(sfPtr14+640+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm482 = _mm512_add_ps(sfIm482, _mm512_loadu_ps(sfPtr14+704+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe483 = _mm512_add_ps(sfRe483, _mm512_loadu_ps(sfPtr14+768+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm483 = _mm512_add_ps(sfIm483, _mm512_loadu_ps(sfPtr14+832+16384*i62+4096*j54+6144*k157+1024*l66));
sfRe484 = _mm512_add_ps(sfRe484, _mm512_loadu_ps(sfPtr14+896+16384*i62+4096*j54+6144*k157+1024*l66));
sfIm484 = _mm512_add_ps(sfIm484, _mm512_loadu_ps(sfPtr14+960+16384*i62+4096*j54+6144*k157+1024*l66));
_mm512_storeu_ps(sfPtr14+0+16384*i62+4096*j54+6144*k157+1024*l66, sfRe477);
_mm512_storeu_ps(sfPtr14+64+16384*i62+4096*j54+6144*k157+1024*l66, sfIm477);
_mm512_storeu_ps(sfPtr14+128+16384*i62+4096*j54+6144*k157+1024*l66, sfRe478);
_mm512_storeu_ps(sfPtr14+192+16384*i62+4096*j54+6144*k157+1024*l66, sfIm478);
_mm512_storeu_ps(sfPtr14+256+16384*i62+4096*j54+6144*k157+1024*l66, sfRe479);
_mm512_storeu_ps(sfPtr14+320+16384*i62+4096*j54+6144*k157+1024*l66, sfIm479);
_mm512_storeu_ps(sfPtr14+384+16384*i62+4096*j54+6144*k157+1024*l66, sfRe480);
_mm512_storeu_ps(sfPtr14+448+16384*i62+4096*j54+6144*k157+1024*l66, sfIm480);
_mm512_storeu_ps(sfPtr14+512+16384*i62+4096*j54+6144*k157+1024*l66, sfRe481);
_mm512_storeu_ps(sfPtr14+576+16384*i62+4096*j54+6144*k157+1024*l66, sfIm481);
_mm512_storeu_ps(sfPtr14+640+16384*i62+4096*j54+6144*k157+1024*l66, sfRe482);
_mm512_storeu_ps(sfPtr14+704+16384*i62+4096*j54+6144*k157+1024*l66, sfIm482);
_mm512_storeu_ps(sfPtr14+768+16384*i62+4096*j54+6144*k157+1024*l66, sfRe483);
_mm512_storeu_ps(sfPtr14+832+16384*i62+4096*j54+6144*k157+1024*l66, sfIm483);
_mm512_storeu_ps(sfPtr14+896+16384*i62+4096*j54+6144*k157+1024*l66, sfRe484);
_mm512_storeu_ps(sfPtr14+960+16384*i62+4096*j54+6144*k157+1024*l66, sfIm484);
}
j54 = 1;
}
// Generic j (1..3): unmasked update, accumulated into the existing sums.
for (; j54 <= jj53; ++j54) {
ptrdiff_t k158 = 1*d20;
ptrdiff_t l67 = 4*w66;
for (; l67 != 4; ++l67) {
__m512 sfRe485 = _mm512_setzero_ps();
__m512 sfIm485 = _mm512_setzero_ps();
__m512 sfRe489 = _mm512_setzero_ps();
__m512 sfIm489 = _mm512_setzero_ps();
(void)bfPtr15;
__m512 sfRe486 = sfRe485;
__m512 sfIm486 = sfIm485;
__m512 sfRe487 = sfRe485;
__m512 sfIm487 = sfIm485;
__m512 sfRe488 = sfRe485;
__m512 sfIm488 = sfIm485;
__m512 sfRe490 = sfRe489;
__m512 sfIm490 = sfIm489;
__m512 sfRe491 = sfRe489;
__m512 sfIm491 = sfIm489;
__m512 sfRe492 = sfRe489;
__m512 sfIm492 = sfIm489;
for (ptrdiff_t s61 = 0; s61 < 16; ++s61) {
__m512i wfLd39 = _mm512_loadu_si512(wfPtr15+0+32768*i62+8192*j54+2048*l67+128*s61);
__m512 wfRe39 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd39));
__m512 wfIm39 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd39, 1));
__m512i wfLd40 = _mm512_loadu_si512(wfPtr15+64+32768*i62+8192*j54+2048*l67+128*s61);
__m512 wfRe40 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd40));
__m512 wfIm40 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd40, 1));
__m512 dfRe43 = _mm512_loadu_ps(dfPtr15+0+16384*i62+4096*j54+6144*k158+256*s61);
__m512 dfIm43 = _mm512_loadu_ps(dfPtr15+64+16384*i62+4096*j54+6144*k158+256*s61);
sfRe485 = _mm512_fmadd_ps(wfRe39, dfRe43, sfRe485);
sfRe485 = _mm512_fmadd_ps(wfIm39, dfIm43, sfRe485);
sfIm485 = _mm512_fmadd_ps(wfRe39, dfIm43, sfIm485);
sfIm485 = _mm512_fnmadd_ps(wfIm39, dfRe43, sfIm485);
sfRe489 = _mm512_fmadd_ps(wfRe40, dfRe43, sfRe489);
sfRe489 = _mm512_fmadd_ps(wfIm40, dfIm43, sfRe489);
sfIm489 = _mm512_fmadd_ps(wfRe40, dfIm43, sfIm489);
sfIm489 = _mm512_fnmadd_ps(wfIm40, dfRe43, sfIm489);
dfRe43 = _mm512_shuffle_f32x4(dfRe43, dfRe43, 78);
dfIm43 = _mm512_shuffle_f32x4(dfIm43, dfIm43, 78);
sfRe486 = _mm512_fmadd_ps(wfRe39, dfRe43, sfRe486);
sfRe486 = _mm512_fmadd_ps(wfIm39, dfIm43, sfRe486);
sfIm486 = _mm512_fmadd_ps(wfRe39, dfIm43, sfIm486);
sfIm486 = _mm512_fnmadd_ps(wfIm39, dfRe43, sfIm486);
sfRe490 = _mm512_fmadd_ps(wfRe40, dfRe43, sfRe490);
sfRe490 = _mm512_fmadd_ps(wfIm40, dfIm43, sfRe490);
sfIm490 = _mm512_fmadd_ps(wfRe40, dfIm43, sfIm490);
sfIm490 = _mm512_fnmadd_ps(wfIm40, dfRe43, sfIm490);
__m512 dfRe44 = _mm512_loadu_ps(dfPtr15+128+16384*i62+4096*j54+6144*k158+256*s61);
__m512 dfIm44 = _mm512_loadu_ps(dfPtr15+192+16384*i62+4096*j54+6144*k158+256*s61);
sfRe487 = _mm512_fmadd_ps(wfRe39, dfRe44, sfRe487);
sfRe487 = _mm512_fmadd_ps(wfIm39, dfIm44, sfRe487);
sfIm487 = _mm512_fmadd_ps(wfRe39, dfIm44, sfIm487);
sfIm487 = _mm512_fnmadd_ps(wfIm39, dfRe44, sfIm487);
sfRe491 = _mm512_fmadd_ps(wfRe40, dfRe44, sfRe491);
sfRe491 = _mm512_fmadd_ps(wfIm40, dfIm44, sfRe491);
sfIm491 = _mm512_fmadd_ps(wfRe40, dfIm44, sfIm491);
sfIm491 = _mm512_fnmadd_ps(wfIm40, dfRe44, sfIm491);
dfRe44 = _mm512_shuffle_f32x4(dfRe44, dfRe44, 78);
dfIm44 = _mm512_shuffle_f32x4(dfIm44, dfIm44, 78);
sfRe488 = _mm512_fmadd_ps(wfRe39, dfRe44, sfRe488);
sfRe488 = _mm512_fmadd_ps(wfIm39, dfIm44, sfRe488);
sfIm488 = _mm512_fmadd_ps(wfRe39, dfIm44, sfIm488);
sfIm488 = _mm512_fnmadd_ps(wfIm39, dfRe44, sfIm488);
sfRe492 = _mm512_fmadd_ps(wfRe40, dfRe44, sfRe492);
sfRe492 = _mm512_fmadd_ps(wfIm40, dfIm44, sfRe492);
sfIm492 = _mm512_fmadd_ps(wfRe40, dfIm44, sfIm492);
sfIm492 = _mm512_fnmadd_ps(wfIm40, dfRe44, sfIm492);
}
sfRe485 = _mm512_add_ps(sfRe485, _mm512_loadu_ps(sfPtr14+0+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm485 = _mm512_add_ps(sfIm485, _mm512_loadu_ps(sfPtr14+64+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe486 = _mm512_add_ps(sfRe486, _mm512_loadu_ps(sfPtr14+128+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm486 = _mm512_add_ps(sfIm486, _mm512_loadu_ps(sfPtr14+192+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe487 = _mm512_add_ps(sfRe487, _mm512_loadu_ps(sfPtr14+256+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm487 = _mm512_add_ps(sfIm487, _mm512_loadu_ps(sfPtr14+320+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe488 = _mm512_add_ps(sfRe488, _mm512_loadu_ps(sfPtr14+384+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm488 = _mm512_add_ps(sfIm488, _mm512_loadu_ps(sfPtr14+448+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe489 = _mm512_add_ps(sfRe489, _mm512_loadu_ps(sfPtr14+512+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm489 = _mm512_add_ps(sfIm489, _mm512_loadu_ps(sfPtr14+576+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe490 = _mm512_add_ps(sfRe490, _mm512_loadu_ps(sfPtr14+640+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm490 = _mm512_add_ps(sfIm490, _mm512_loadu_ps(sfPtr14+704+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe491 = _mm512_add_ps(sfRe491, _mm512_loadu_ps(sfPtr14+768+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm491 = _mm512_add_ps(sfIm491, _mm512_loadu_ps(sfPtr14+832+16384*i62+4096*j54+6144*k158+1024*l67));
sfRe492 = _mm512_add_ps(sfRe492, _mm512_loadu_ps(sfPtr14+896+16384*i62+4096*j54+6144*k158+1024*l67));
sfIm492 = _mm512_add_ps(sfIm492, _mm512_loadu_ps(sfPtr14+960+16384*i62+4096*j54+6144*k158+1024*l67));
_mm512_storeu_ps(sfPtr14+0+16384*i62+4096*j54+6144*k158+1024*l67, sfRe485);
_mm512_storeu_ps(sfPtr14+64+16384*i62+4096*j54+6144*k158+1024*l67, sfIm485);
_mm512_storeu_ps(sfPtr14+128+16384*i62+4096*j54+6144*k158+1024*l67, sfRe486);
_mm512_storeu_ps(sfPtr14+192+16384*i62+4096*j54+6144*k158+1024*l67, sfIm486);
_mm512_storeu_ps(sfPtr14+256+16384*i62+4096*j54+6144*k158+1024*l67, sfRe487);
_mm512_storeu_ps(sfPtr14+320+16384*i62+4096*j54+6144*k158+1024*l67, sfIm487);
_mm512_storeu_ps(sfPtr14+384+16384*i62+4096*j54+6144*k158+1024*l67, sfRe488);
_mm512_storeu_ps(sfPtr14+448+16384*i62+4096*j54+6144*k158+1024*l67, sfIm488);
_mm512_storeu_ps(sfPtr14+512+16384*i62+4096*j54+6144*k158+1024*l67, sfRe489);
_mm512_storeu_ps(sfPtr14+576+16384*i62+4096*j54+6144*k158+1024*l67, sfIm489);
_mm512_storeu_ps(sfPtr14+640+16384*i62+4096*j54+6144*k158+1024*l67, sfRe490);
_mm512_storeu_ps(sfPtr14+704+16384*i62+4096*j54+6144*k158+1024*l67, sfIm490);
_mm512_storeu_ps(sfPtr14+768+16384*i62+4096*j54+6144*k158+1024*l67, sfRe491);
_mm512_storeu_ps(sfPtr14+832+16384*i62+4096*j54+6144*k158+1024*l67, sfIm491);
_mm512_storeu_ps(sfPtr14+896+16384*i62+4096*j54+6144*k158+1024*l67, sfRe492);
_mm512_storeu_ps(sfPtr14+960+16384*i62+4096*j54+6144*k158+1024*l67, sfIm492);
}
}
}

// Submits the ProduceSums3 callee to the thread team once per
// (outer, inner) index pair: outer takes the single value 0 and inner
// runs over 0..3, so four tasks are issued in total, one after another.
//
// team64:     thread team passed unchanged to ResNeXt50ThreaderDo1.
// tensors101: tensor pointer array; stored as slot 0 of the argument
//             bundle that each task exposes to the callee through any1.
static void ResNeXt50StriderProduceSums3(ResNeXt50ThreaderTeam1* team64, char** tensors101) {
	// Argument bundle reused by every task: {tensors, outer, inner}.
	// The two indices are carried as integers cast to void*.
	void* args[3];
	args[0] = tensors101;
	ptrdiff_t outer = 0;
	while (outer < 1) {
		args[1] = (void*)outer;
		ptrdiff_t inner = 0;
		while (inner < 4) {
			args[2] = (void*)inner;
			// A fresh task descriptor is filled in for each submission.
			ResNeXt50ThreaderTask1 job;
			job.callee1 = ResNeXt50StriderProduceSums3Callee1;
			job.any1 = args;
			// Four hull entries: the first three are 1, the last is 32.
			// NOTE(review): presumably a 4-D iteration space whose last
			// axis is split across threads -- confirm against the Threader.
			job.nd1 = 4;
			for (int axis = 0; axis < 3; ++axis) {
				job.hull1[axis] = 1;
			}
			job.hull1[3] = 32;
			ResNeXt50ThreaderDo1(team64, &job);
			++inner;
		}
		++outer;
	}
}

static void ResNeXt50StriderConsumeSums3Callee1(ResNeXt50ThreaderTask1* task106, int64_t* pt58) {
char** tensors104 = task106->any1;
ptrdiff_t w67 = 0;
ptrdiff_t d21 = 0;
ptrdiff_t g34 = pt58[2];
char*restrict sfPtr15 = tensors104[0];
char*restrict datPtr32 = tensors104[1];
ptrdiff_t i63 = 8*g34;
ptrdiff_t ii45 = i63+7;
for (; i63 <= ii45; ++i63) {
ptrdiff_t j55 = 1*d21;
ptrdiff_t rel24 = j55-0;
ptrdiff_t base24 = 0;
ptrdiff_t toH46 = base24+0;
ptrdiff_t toW46 = 0;
ptrdiff_t k159 = 4*w67;
for (; k159 != 4; ++k159) {
ptrdiff_t r24 = 0;
for (; r24 != 2; ++r24) {
ptrdiff_t t43 = 0;
__m512 sfRe493 = _mm512_loadu_ps(sfPtr15+0+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm493 = _mm512_loadu_ps(sfPtr15+64+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe497 = _mm512_loadu_ps(sfPtr15+128+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm497 = _mm512_loadu_ps(sfPtr15+192+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe494 = _mm512_loadu_ps(sfPtr15+4096+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm494 = _mm512_loadu_ps(sfPtr15+4160+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe498 = _mm512_loadu_ps(sfPtr15+4224+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm498 = _mm512_loadu_ps(sfPtr15+4288+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe495 = _mm512_loadu_ps(sfPtr15+8192+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm495 = _mm512_loadu_ps(sfPtr15+8256+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe499 = _mm512_loadu_ps(sfPtr15+8320+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm499 = _mm512_loadu_ps(sfPtr15+8384+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe496 = _mm512_loadu_ps(sfPtr15+12288+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm496 = _mm512_loadu_ps(sfPtr15+12352+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfRe500 = _mm512_loadu_ps(sfPtr15+12416+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512 sfIm500 = _mm512_loadu_ps(sfPtr15+12480+16384*i63+6144*j55+1024*k159+512*r24+256*t43);
__m512i ifft7133 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft7134 = _mm512_permutexvar_ps(ifft7133, sfRe493);
__m512 ifft7225 = _mm512_permutexvar_ps(ifft7133, sfRe497);
__m512i ifft7135 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft7136 = _mm512_permutexvar_ps(ifft7135, sfRe493);
__m512 ifft7226 = _mm512_permutexvar_ps(ifft7135, sfRe497);
__m512 ifft7137 = _mm512_permutexvar_ps(ifft7133, sfIm493);
__m512 ifft7227 = _mm512_permutexvar_ps(ifft7133, sfIm497);
__m512 ifft7138 = _mm512_permutexvar_ps(ifft7135, sfIm493);
__m512 ifft7228 = _mm512_permutexvar_ps(ifft7135, sfIm497);
__m512 ifft7139 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft7140 = _mm512_mask_fmadd_ps(ifft7138, 65021, ifft7139, ifft7134);
__m512 ifft7229 = _mm512_mask_fmadd_ps(ifft7228, 65021, ifft7139, ifft7225);
__m512 ifft7141 = _mm512_mask_fnmadd_ps(ifft7137, 65021, ifft7139, ifft7136);
__m512 ifft7230 = _mm512_mask_fnmadd_ps(ifft7227, 65021, ifft7139, ifft7226);
__m512 ifft7142 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft7143 = _mm512_fmadd_ps(ifft7140, ifft7142, _mm512_shuffle_ps(ifft7140, ifft7140, 177));
__m512 ifft7231 = _mm512_fmadd_ps(ifft7229, ifft7142, _mm512_shuffle_ps(ifft7229, ifft7229, 177));
__m512 ifft7144 = _mm512_fmadd_ps(ifft7141, ifft7142, _mm512_shuffle_ps(ifft7141, ifft7141, 177));
__m512 ifft7232 = _mm512_fmadd_ps(ifft7230, ifft7142, _mm512_shuffle_ps(ifft7230, ifft7230, 177));
__m512 ifft7145 = _mm512_fmadd_ps(sfRe494, ifft7142, _mm512_shuffle_ps(sfRe494, sfRe494, 177));
__m512 ifft7233 = _mm512_fmadd_ps(sfRe498, ifft7142, _mm512_shuffle_ps(sfRe498, sfRe498, 177));
__m512 ifft7146 = _mm512_fmadd_ps(sfIm494, ifft7142, _mm512_shuffle_ps(sfIm494, sfIm494, 177));
__m512 ifft7234 = _mm512_fmadd_ps(sfIm498, ifft7142, _mm512_shuffle_ps(sfIm498, sfIm498, 177));
__m512 ifft7147 = _mm512_fmadd_ps(sfRe495, ifft7142, _mm512_shuffle_ps(sfRe495, sfRe495, 177));
__m512 ifft7235 = _mm512_fmadd_ps(sfRe499, ifft7142, _mm512_shuffle_ps(sfRe499, sfRe499, 177));
__m512 ifft7148 = _mm512_fmadd_ps(sfIm495, ifft7142, _mm512_shuffle_ps(sfIm495, sfIm495, 177));
__m512 ifft7236 = _mm512_fmadd_ps(sfIm499, ifft7142, _mm512_shuffle_ps(sfIm499, sfIm499, 177));
__m512 ifft7149 = _mm512_fmadd_ps(sfRe496, ifft7142, _mm512_shuffle_ps(sfRe496, sfRe496, 177));
__m512 ifft7237 = _mm512_fmadd_ps(sfRe500, ifft7142, _mm512_shuffle_ps(sfRe500, sfRe500, 177));
__m512 ifft7150 = _mm512_fmadd_ps(sfIm496, ifft7142, _mm512_shuffle_ps(sfIm496, sfIm496, 177));
__m512 ifft7238 = _mm512_fmadd_ps(sfIm500, ifft7142, _mm512_shuffle_ps(sfIm500, sfIm500, 177));
__m512 ifft7151 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft7152 = _mm512_mul_ps(ifft7143, ifft7151);
__m512 ifft7239 = _mm512_mul_ps(ifft7231, ifft7151);
__m512 ifft7153 = _mm512_mul_ps(ifft7144, ifft7151);
__m512 ifft7240 = _mm512_mul_ps(ifft7232, ifft7151);
__m512 ifft7154 = _mm512_mul_ps(ifft7145, ifft7151);
__m512 ifft7241 = _mm512_mul_ps(ifft7233, ifft7151);
__m512 ifft7155 = _mm512_mul_ps(ifft7146, ifft7151);
__m512 ifft7242 = _mm512_mul_ps(ifft7234, ifft7151);
__m512 ifft7156 = _mm512_mul_ps(ifft7147, ifft7151);
__m512 ifft7243 = _mm512_mul_ps(ifft7235, ifft7151);
__m512 ifft7157 = _mm512_mul_ps(ifft7148, ifft7151);
__m512 ifft7244 = _mm512_mul_ps(ifft7236, ifft7151);
__m512 ifft7158 = _mm512_mul_ps(ifft7149, ifft7151);
__m512 ifft7245 = _mm512_mul_ps(ifft7237, ifft7151);
__m512 ifft7159 = _mm512_mul_ps(ifft7150, ifft7151);
__m512 ifft7246 = _mm512_mul_ps(ifft7238, ifft7151);
__m512 ifft7160 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft7161 = _mm512_fnmadd_ps(ifft7144, ifft7160, ifft7152);
__m512 ifft7247 = _mm512_fnmadd_ps(ifft7232, ifft7160, ifft7239);
__m512 ifft7162 = _mm512_fmadd_ps(ifft7143, ifft7160, ifft7153);
__m512 ifft7248 = _mm512_fmadd_ps(ifft7231, ifft7160, ifft7240);
__m512 ifft7163 = _mm512_fnmadd_ps(ifft7146, ifft7160, ifft7154);
__m512 ifft7249 = _mm512_fnmadd_ps(ifft7234, ifft7160, ifft7241);
__m512 ifft7164 = _mm512_fmadd_ps(ifft7145, ifft7160, ifft7155);
__m512 ifft7250 = _mm512_fmadd_ps(ifft7233, ifft7160, ifft7242);
__m512 ifft7165 = _mm512_fnmadd_ps(ifft7148, ifft7160, ifft7156);
__m512 ifft7251 = _mm512_fnmadd_ps(ifft7236, ifft7160, ifft7243);
__m512 ifft7166 = _mm512_fmadd_ps(ifft7147, ifft7160, ifft7157);
__m512 ifft7252 = _mm512_fmadd_ps(ifft7235, ifft7160, ifft7244);
__m512 ifft7167 = _mm512_fnmadd_ps(ifft7150, ifft7160, ifft7158);
__m512 ifft7253 = _mm512_fnmadd_ps(ifft7238, ifft7160, ifft7245);
__m512 ifft7168 = _mm512_fmadd_ps(ifft7149, ifft7160, ifft7159);
__m512 ifft7254 = _mm512_fmadd_ps(ifft7237, ifft7160, ifft7246);
__m512 ifft7169 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft7170 = _mm512_fmadd_ps(ifft7161, ifft7169, _mm512_shuffle_ps(ifft7161, ifft7161, 78));
__m512 ifft7255 = _mm512_fmadd_ps(ifft7247, ifft7169, _mm512_shuffle_ps(ifft7247, ifft7247, 78));
__m512 ifft7171 = _mm512_fmadd_ps(ifft7162, ifft7169, _mm512_shuffle_ps(ifft7162, ifft7162, 78));
__m512 ifft7256 = _mm512_fmadd_ps(ifft7248, ifft7169, _mm512_shuffle_ps(ifft7248, ifft7248, 78));
__m512 ifft7172 = _mm512_fmadd_ps(ifft7163, ifft7169, _mm512_shuffle_ps(ifft7163, ifft7163, 78));
__m512 ifft7257 = _mm512_fmadd_ps(ifft7249, ifft7169, _mm512_shuffle_ps(ifft7249, ifft7249, 78));
__m512 ifft7173 = _mm512_fmadd_ps(ifft7164, ifft7169, _mm512_shuffle_ps(ifft7164, ifft7164, 78));
__m512 ifft7258 = _mm512_fmadd_ps(ifft7250, ifft7169, _mm512_shuffle_ps(ifft7250, ifft7250, 78));
__m512 ifft7174 = _mm512_fmadd_ps(ifft7165, ifft7169, _mm512_shuffle_ps(ifft7165, ifft7165, 78));
__m512 ifft7259 = _mm512_fmadd_ps(ifft7251, ifft7169, _mm512_shuffle_ps(ifft7251, ifft7251, 78));
__m512 ifft7175 = _mm512_fmadd_ps(ifft7166, ifft7169, _mm512_shuffle_ps(ifft7166, ifft7166, 78));
__m512 ifft7260 = _mm512_fmadd_ps(ifft7252, ifft7169, _mm512_shuffle_ps(ifft7252, ifft7252, 78));
__m512 ifft7176 = _mm512_fmadd_ps(ifft7167, ifft7169, _mm512_shuffle_ps(ifft7167, ifft7167, 78));
__m512 ifft7261 = _mm512_fmadd_ps(ifft7253, ifft7169, _mm512_shuffle_ps(ifft7253, ifft7253, 78));
__m512 ifft7177 = _mm512_fmadd_ps(ifft7168, ifft7169, _mm512_shuffle_ps(ifft7168, ifft7168, 78));
__m512 ifft7262 = _mm512_fmadd_ps(ifft7254, ifft7169, _mm512_shuffle_ps(ifft7254, ifft7254, 78));
__m512 ifft7178 = _mm512_mask_sub_ps(ifft7170, 49344, _mm512_setzero_ps(), ifft7171);
__m512 ifft7263 = _mm512_mask_sub_ps(ifft7255, 49344, _mm512_setzero_ps(), ifft7256);
__m512 ifft7179 = _mm512_mask_mov_ps(ifft7171, 49344, ifft7170);
__m512 ifft7264 = _mm512_mask_mov_ps(ifft7256, 49344, ifft7255);
__m512 ifft7180 = _mm512_mask_sub_ps(ifft7172, 49344, _mm512_setzero_ps(), ifft7173);
__m512 ifft7265 = _mm512_mask_sub_ps(ifft7257, 49344, _mm512_setzero_ps(), ifft7258);
__m512 ifft7181 = _mm512_mask_mov_ps(ifft7173, 49344, ifft7172);
__m512 ifft7266 = _mm512_mask_mov_ps(ifft7258, 49344, ifft7257);
__m512 ifft7182 = _mm512_mask_sub_ps(ifft7174, 49344, _mm512_setzero_ps(), ifft7175);
__m512 ifft7267 = _mm512_mask_sub_ps(ifft7259, 49344, _mm512_setzero_ps(), ifft7260);
__m512 ifft7183 = _mm512_mask_mov_ps(ifft7175, 49344, ifft7174);
__m512 ifft7268 = _mm512_mask_mov_ps(ifft7260, 49344, ifft7259);
__m512 ifft7184 = _mm512_mask_sub_ps(ifft7176, 49344, _mm512_setzero_ps(), ifft7177);
__m512 ifft7269 = _mm512_mask_sub_ps(ifft7261, 49344, _mm512_setzero_ps(), ifft7262);
__m512 ifft7185 = _mm512_mask_mov_ps(ifft7177, 49344, ifft7176);
__m512 ifft7270 = _mm512_mask_mov_ps(ifft7262, 49344, ifft7261);
__m512 ifft7186 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft7187 = _mm512_fmadd_ps(ifft7178, ifft7186, _mm512_shuffle_f32x4(ifft7178, ifft7178, 177));
__m512 ifft7271 = _mm512_fmadd_ps(ifft7263, ifft7186, _mm512_shuffle_f32x4(ifft7263, ifft7263, 177));
__m512 ifft7188 = _mm512_fmadd_ps(ifft7179, ifft7186, _mm512_shuffle_f32x4(ifft7179, ifft7179, 177));
__m512 ifft7272 = _mm512_fmadd_ps(ifft7264, ifft7186, _mm512_shuffle_f32x4(ifft7264, ifft7264, 177));
__m512 ifft7189 = _mm512_fmadd_ps(ifft7180, ifft7186, _mm512_shuffle_f32x4(ifft7180, ifft7180, 177));
__m512 ifft7273 = _mm512_fmadd_ps(ifft7265, ifft7186, _mm512_shuffle_f32x4(ifft7265, ifft7265, 177));
__m512 ifft7190 = _mm512_fmadd_ps(ifft7181, ifft7186, _mm512_shuffle_f32x4(ifft7181, ifft7181, 177));
__m512 ifft7274 = _mm512_fmadd_ps(ifft7266, ifft7186, _mm512_shuffle_f32x4(ifft7266, ifft7266, 177));
__m512 ifft7191 = _mm512_fmadd_ps(ifft7182, ifft7186, _mm512_shuffle_f32x4(ifft7182, ifft7182, 177));
__m512 ifft7275 = _mm512_fmadd_ps(ifft7267, ifft7186, _mm512_shuffle_f32x4(ifft7267, ifft7267, 177));
__m512 ifft7192 = _mm512_fnmsub_ps(ifft7183, ifft7186, _mm512_shuffle_f32x4(ifft7183, ifft7183, 177));
__m512 ifft7276 = _mm512_fnmsub_ps(ifft7268, ifft7186, _mm512_shuffle_f32x4(ifft7268, ifft7268, 177));
__m512 ifft7193 = _mm512_fmadd_ps(ifft7184, ifft7186, _mm512_shuffle_f32x4(ifft7184, ifft7184, 177));
__m512 ifft7277 = _mm512_fmadd_ps(ifft7269, ifft7186, _mm512_shuffle_f32x4(ifft7269, ifft7269, 177));
__m512 ifft7194 = _mm512_fmadd_ps(ifft7185, ifft7186, _mm512_shuffle_f32x4(ifft7185, ifft7185, 177));
__m512 ifft7278 = _mm512_fmadd_ps(ifft7270, ifft7186, _mm512_shuffle_f32x4(ifft7270, ifft7270, 177));
__m512 ifft7195 = _mm512_add_ps(ifft7187, ifft7188);
__m512 ifft7279 = _mm512_add_ps(ifft7271, ifft7272);
__m512 ifft7196 = _mm512_sub_ps(ifft7187, ifft7188);
__m512 ifft7280 = _mm512_sub_ps(ifft7271, ifft7272);
__m512 ifft7197 = _mm512_sub_ps(ifft7189, ifft7193);
__m512 ifft7281 = _mm512_sub_ps(ifft7273, ifft7277);
__m512 ifft7198 = _mm512_add_ps(ifft7190, ifft7194);
__m512 ifft7282 = _mm512_add_ps(ifft7274, ifft7278);
__m512 ifft7199 = _mm512_add_ps(ifft7189, ifft7193);
__m512 ifft7283 = _mm512_add_ps(ifft7273, ifft7277);
__m512 ifft7200 = _mm512_sub_ps(ifft7190, ifft7194);
__m512 ifft7284 = _mm512_sub_ps(ifft7274, ifft7278);
__m512 ifft7201 = _mm512_mul_ps(ifft7191, _mm512_set1_ps(3.125e-02f));
__m512 ifft7285 = _mm512_mul_ps(ifft7275, _mm512_set1_ps(3.125e-02f));
__m512 ifft7202 = _mm512_mul_ps(ifft7192, _mm512_set1_ps(3.125e-02f));
__m512 ifft7286 = _mm512_mul_ps(ifft7276, _mm512_set1_ps(3.125e-02f));
__m512 ifft7203 = _mm512_fmadd_ps(ifft7195, _mm512_set1_ps(1.5625e-02f), ifft7201);
__m512 ifft7287 = _mm512_fmadd_ps(ifft7279, _mm512_set1_ps(1.5625e-02f), ifft7285);
__m512 ifft7204 = _mm512_fmsub_ps(ifft7195, _mm512_set1_ps(1.5625e-02f), ifft7201);
__m512 ifft7288 = _mm512_fmsub_ps(ifft7279, _mm512_set1_ps(1.5625e-02f), ifft7285);
__m512 ifft7205 = _mm512_fmadd_ps(ifft7196, _mm512_set1_ps(1.5625e-02f), ifft7202);
__m512 ifft7289 = _mm512_fmadd_ps(ifft7280, _mm512_set1_ps(1.5625e-02f), ifft7286);
__m512 ifft7206 = _mm512_fmsub_ps(ifft7196, _mm512_set1_ps(1.5625e-02f), ifft7202);
__m512 ifft7290 = _mm512_fmsub_ps(ifft7280, _mm512_set1_ps(1.5625e-02f), ifft7286);
__m512 ifft7207 = _mm512_add_ps(ifft7197, ifft7198);
__m512 ifft7291 = _mm512_add_ps(ifft7281, ifft7282);
__m512 ifft7208 = _mm512_sub_ps(ifft7197, ifft7198);
__m512 ifft7292 = _mm512_sub_ps(ifft7281, ifft7282);
__m512 ifft7209 = _mm512_fnmadd_ps(ifft7207, _mm512_set1_ps(7.0710677e-01f), ifft7199);
__m512 ifft7293 = _mm512_fnmadd_ps(ifft7291, _mm512_set1_ps(7.0710677e-01f), ifft7283);
__m512 ifft7210 = _mm512_fmadd_ps(ifft7207, _mm512_set1_ps(7.0710677e-01f), ifft7199);
__m512 ifft7294 = _mm512_fmadd_ps(ifft7291, _mm512_set1_ps(7.0710677e-01f), ifft7283);
__m512 ifft7211 = _mm512_fmadd_ps(ifft7208, _mm512_set1_ps(7.0710677e-01f), ifft7200);
__m512 ifft7295 = _mm512_fmadd_ps(ifft7292, _mm512_set1_ps(7.0710677e-01f), ifft7284);
__m512 ifft7212 = _mm512_fmsub_ps(ifft7208, _mm512_set1_ps(7.0710677e-01f), ifft7200);
__m512 ifft7296 = _mm512_fmsub_ps(ifft7292, _mm512_set1_ps(7.0710677e-01f), ifft7284);
__m512 ifft7213 = _mm512_add_ps(ifft7209, ifft7210);
__m512 ifft7297 = _mm512_add_ps(ifft7293, ifft7294);
__m512 ifft7214 = _mm512_sub_ps(ifft7209, ifft7210);
__m512 ifft7298 = _mm512_sub_ps(ifft7293, ifft7294);
__m512 ifft7215 = _mm512_add_ps(ifft7211, ifft7212);
__m512 ifft7299 = _mm512_add_ps(ifft7295, ifft7296);
__m512 ifft7216 = _mm512_sub_ps(ifft7211, ifft7212);
__m512 ifft7300 = _mm512_sub_ps(ifft7295, ifft7296);
__m512 ifft7217 = _mm512_fmadd_ps(ifft7213, _mm512_set1_ps(1.5625e-02f), ifft7203);
__m512 ifft7301 = _mm512_fmadd_ps(ifft7297, _mm512_set1_ps(1.5625e-02f), ifft7287);
__m512 ifft7218 = _mm512_fnmadd_ps(ifft7213, _mm512_set1_ps(1.5625e-02f), ifft7203);
__m512 ifft7302 = _mm512_fnmadd_ps(ifft7297, _mm512_set1_ps(1.5625e-02f), ifft7287);
__m512 ifft7219 = _mm512_fmadd_ps(ifft7215, _mm512_set1_ps(1.5625e-02f), ifft7205);
__m512 ifft7303 = _mm512_fmadd_ps(ifft7299, _mm512_set1_ps(1.5625e-02f), ifft7289);
__m512 ifft7220 = _mm512_fnmadd_ps(ifft7215, _mm512_set1_ps(1.5625e-02f), ifft7205);
__m512 ifft7304 = _mm512_fnmadd_ps(ifft7299, _mm512_set1_ps(1.5625e-02f), ifft7289);
__m512 ifft7221 = _mm512_fnmadd_ps(ifft7216, _mm512_set1_ps(1.5625e-02f), ifft7204);
__m512 ifft7305 = _mm512_fnmadd_ps(ifft7300, _mm512_set1_ps(1.5625e-02f), ifft7288);
__m512 ifft7222 = _mm512_fmadd_ps(ifft7216, _mm512_set1_ps(1.5625e-02f), ifft7204);
__m512 ifft7306 = _mm512_fmadd_ps(ifft7300, _mm512_set1_ps(1.5625e-02f), ifft7288);
__m512 ifft7223 = _mm512_fmadd_ps(ifft7214, _mm512_set1_ps(1.5625e-02f), ifft7206);
__m512 ifft7307 = _mm512_fmadd_ps(ifft7298, _mm512_set1_ps(1.5625e-02f), ifft7290);
__m512 ifft7224 = _mm512_fnmadd_ps(ifft7214, _mm512_set1_ps(1.5625e-02f), ifft7206);
__m512 ifft7308 = _mm512_fnmadd_ps(ifft7298, _mm512_set1_ps(1.5625e-02f), ifft7290);
__m512 dat2424 = ifft7217;
__m512 dat2431 = ifft7301;
__m512 dat2425 = ifft7219;
__m512 dat2432 = ifft7303;
__m512 dat2426 = ifft7221;
__m512 dat2433 = ifft7305;
__m512 dat2427 = ifft7223;
__m512 dat2434 = ifft7307;
__m512 dat2428 = ifft7218;
__m512 dat2435 = ifft7302;
__m512 dat2429 = ifft7220;
__m512 dat2436 = ifft7304;
__m512 dat2430 = ifft7222;
__m512 dat2437 = ifft7306;
(void)ifft7224;
(void)ifft7308;
__m512i pm207 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack387 = _mm512_permutex2var_ps(dat2424, pm207, dat2431);
__m512i pm208 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack388 = _mm512_permutex2var_ps(dat2424, pm208, dat2431);
__m512 pack389 = _mm512_permutex2var_ps(dat2425, pm207, dat2432);
__m512 pack390 = _mm512_permutex2var_ps(dat2425, pm208, dat2432);
__m512 pack391 = _mm512_permutex2var_ps(dat2426, pm207, dat2433);
__m512 pack392 = _mm512_permutex2var_ps(dat2426, pm208, dat2433);
__m512 pack393 = _mm512_permutex2var_ps(dat2427, pm207, dat2434);
__m512 pack394 = _mm512_permutex2var_ps(dat2427, pm208, dat2434);
__m512 pack395 = _mm512_permutex2var_ps(dat2428, pm207, dat2435);
__m512 pack396 = _mm512_permutex2var_ps(dat2428, pm208, dat2435);
__m512 pack397 = _mm512_permutex2var_ps(dat2429, pm207, dat2436);
__m512 pack398 = _mm512_permutex2var_ps(dat2429, pm208, dat2436);
__m512 pack399 = _mm512_permutex2var_ps(dat2430, pm207, dat2437);
__m512 pack400 = _mm512_permutex2var_ps(dat2430, pm208, dat2437);
pack387 = _mm512_max_ps(_mm512_setzero_ps(), pack387);
pack388 = _mm512_max_ps(_mm512_setzero_ps(), pack388);
pack389 = _mm512_max_ps(_mm512_setzero_ps(), pack389);
pack390 = _mm512_max_ps(_mm512_setzero_ps(), pack390);
pack391 = _mm512_max_ps(_mm512_setzero_ps(), pack391);
pack392 = _mm512_max_ps(_mm512_setzero_ps(), pack392);
pack393 = _mm512_max_ps(_mm512_setzero_ps(), pack393);
pack394 = _mm512_max_ps(_mm512_setzero_ps(), pack394);
pack395 = _mm512_max_ps(_mm512_setzero_ps(), pack395);
pack396 = _mm512_max_ps(_mm512_setzero_ps(), pack396);
pack397 = _mm512_max_ps(_mm512_setzero_ps(), pack397);
pack398 = _mm512_max_ps(_mm512_setzero_ps(), pack398);
pack399 = _mm512_max_ps(_mm512_setzero_ps(), pack399);
pack400 = _mm512_max_ps(_mm512_setzero_ps(), pack400);
_mm512_mask_storeu_ps(datPtr32+0+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack387);
_mm512_mask_storeu_ps(datPtr32+832+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack388);
_mm512_mask_storeu_ps(datPtr32+56+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack389);
_mm512_mask_storeu_ps(datPtr32+888+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack390);
_mm512_mask_storeu_ps(datPtr32+112+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack391);
_mm512_mask_storeu_ps(datPtr32+944+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack392);
_mm512_mask_storeu_ps(datPtr32+168+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack393);
_mm512_mask_storeu_ps(datPtr32+1000+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack394);
_mm512_mask_storeu_ps(datPtr32+224+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack395);
_mm512_mask_storeu_ps(datPtr32+1056+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack396);
_mm512_mask_storeu_ps(datPtr32+280+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack397);
_mm512_mask_storeu_ps(datPtr32+1112+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack398);
_mm512_mask_storeu_ps(datPtr32+336+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack399);
_mm512_mask_storeu_ps(datPtr32+1168+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t43, 16383, pack400);
ptrdiff_t t44 = 0;
__m512 sfRe501 = _mm512_loadu_ps(sfPtr15+256+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm501 = _mm512_loadu_ps(sfPtr15+320+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe505 = _mm512_loadu_ps(sfPtr15+384+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm505 = _mm512_loadu_ps(sfPtr15+448+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe502 = _mm512_loadu_ps(sfPtr15+4352+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm502 = _mm512_loadu_ps(sfPtr15+4416+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe506 = _mm512_loadu_ps(sfPtr15+4480+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm506 = _mm512_loadu_ps(sfPtr15+4544+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe503 = _mm512_loadu_ps(sfPtr15+8448+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm503 = _mm512_loadu_ps(sfPtr15+8512+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe507 = _mm512_loadu_ps(sfPtr15+8576+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm507 = _mm512_loadu_ps(sfPtr15+8640+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe504 = _mm512_loadu_ps(sfPtr15+12544+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm504 = _mm512_loadu_ps(sfPtr15+12608+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfRe508 = _mm512_loadu_ps(sfPtr15+12672+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512 sfIm508 = _mm512_loadu_ps(sfPtr15+12736+16384*i63+6144*j55+1024*k159+512*r24+256*t44);
__m512i ifft7309 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft7310 = _mm512_permutexvar_ps(ifft7309, sfRe501);
__m512 ifft7401 = _mm512_permutexvar_ps(ifft7309, sfRe505);
__m512i ifft7311 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft7312 = _mm512_permutexvar_ps(ifft7311, sfRe501);
__m512 ifft7402 = _mm512_permutexvar_ps(ifft7311, sfRe505);
__m512 ifft7313 = _mm512_permutexvar_ps(ifft7309, sfIm501);
__m512 ifft7403 = _mm512_permutexvar_ps(ifft7309, sfIm505);
__m512 ifft7314 = _mm512_permutexvar_ps(ifft7311, sfIm501);
__m512 ifft7404 = _mm512_permutexvar_ps(ifft7311, sfIm505);
__m512 ifft7315 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft7316 = _mm512_mask_fmadd_ps(ifft7314, 65021, ifft7315, ifft7310);
__m512 ifft7405 = _mm512_mask_fmadd_ps(ifft7404, 65021, ifft7315, ifft7401);
__m512 ifft7317 = _mm512_mask_fnmadd_ps(ifft7313, 65021, ifft7315, ifft7312);
__m512 ifft7406 = _mm512_mask_fnmadd_ps(ifft7403, 65021, ifft7315, ifft7402);
__m512 ifft7318 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft7319 = _mm512_fmadd_ps(ifft7316, ifft7318, _mm512_shuffle_ps(ifft7316, ifft7316, 177));
__m512 ifft7407 = _mm512_fmadd_ps(ifft7405, ifft7318, _mm512_shuffle_ps(ifft7405, ifft7405, 177));
__m512 ifft7320 = _mm512_fmadd_ps(ifft7317, ifft7318, _mm512_shuffle_ps(ifft7317, ifft7317, 177));
__m512 ifft7408 = _mm512_fmadd_ps(ifft7406, ifft7318, _mm512_shuffle_ps(ifft7406, ifft7406, 177));
__m512 ifft7321 = _mm512_fmadd_ps(sfRe502, ifft7318, _mm512_shuffle_ps(sfRe502, sfRe502, 177));
__m512 ifft7409 = _mm512_fmadd_ps(sfRe506, ifft7318, _mm512_shuffle_ps(sfRe506, sfRe506, 177));
__m512 ifft7322 = _mm512_fmadd_ps(sfIm502, ifft7318, _mm512_shuffle_ps(sfIm502, sfIm502, 177));
__m512 ifft7410 = _mm512_fmadd_ps(sfIm506, ifft7318, _mm512_shuffle_ps(sfIm506, sfIm506, 177));
__m512 ifft7323 = _mm512_fmadd_ps(sfRe503, ifft7318, _mm512_shuffle_ps(sfRe503, sfRe503, 177));
__m512 ifft7411 = _mm512_fmadd_ps(sfRe507, ifft7318, _mm512_shuffle_ps(sfRe507, sfRe507, 177));
__m512 ifft7324 = _mm512_fmadd_ps(sfIm503, ifft7318, _mm512_shuffle_ps(sfIm503, sfIm503, 177));
__m512 ifft7412 = _mm512_fmadd_ps(sfIm507, ifft7318, _mm512_shuffle_ps(sfIm507, sfIm507, 177));
__m512 ifft7325 = _mm512_fmadd_ps(sfRe504, ifft7318, _mm512_shuffle_ps(sfRe504, sfRe504, 177));
__m512 ifft7413 = _mm512_fmadd_ps(sfRe508, ifft7318, _mm512_shuffle_ps(sfRe508, sfRe508, 177));
__m512 ifft7326 = _mm512_fmadd_ps(sfIm504, ifft7318, _mm512_shuffle_ps(sfIm504, sfIm504, 177));
__m512 ifft7414 = _mm512_fmadd_ps(sfIm508, ifft7318, _mm512_shuffle_ps(sfIm508, sfIm508, 177));
__m512 ifft7327 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft7328 = _mm512_mul_ps(ifft7319, ifft7327);
__m512 ifft7415 = _mm512_mul_ps(ifft7407, ifft7327);
__m512 ifft7329 = _mm512_mul_ps(ifft7320, ifft7327);
__m512 ifft7416 = _mm512_mul_ps(ifft7408, ifft7327);
__m512 ifft7330 = _mm512_mul_ps(ifft7321, ifft7327);
__m512 ifft7417 = _mm512_mul_ps(ifft7409, ifft7327);
__m512 ifft7331 = _mm512_mul_ps(ifft7322, ifft7327);
__m512 ifft7418 = _mm512_mul_ps(ifft7410, ifft7327);
__m512 ifft7332 = _mm512_mul_ps(ifft7323, ifft7327);
__m512 ifft7419 = _mm512_mul_ps(ifft7411, ifft7327);
__m512 ifft7333 = _mm512_mul_ps(ifft7324, ifft7327);
__m512 ifft7420 = _mm512_mul_ps(ifft7412, ifft7327);
__m512 ifft7334 = _mm512_mul_ps(ifft7325, ifft7327);
__m512 ifft7421 = _mm512_mul_ps(ifft7413, ifft7327);
__m512 ifft7335 = _mm512_mul_ps(ifft7326, ifft7327);
__m512 ifft7422 = _mm512_mul_ps(ifft7414, ifft7327);
__m512 ifft7336 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft7337 = _mm512_fnmadd_ps(ifft7320, ifft7336, ifft7328);
__m512 ifft7423 = _mm512_fnmadd_ps(ifft7408, ifft7336, ifft7415);
__m512 ifft7338 = _mm512_fmadd_ps(ifft7319, ifft7336, ifft7329);
__m512 ifft7424 = _mm512_fmadd_ps(ifft7407, ifft7336, ifft7416);
__m512 ifft7339 = _mm512_fnmadd_ps(ifft7322, ifft7336, ifft7330);
__m512 ifft7425 = _mm512_fnmadd_ps(ifft7410, ifft7336, ifft7417);
__m512 ifft7340 = _mm512_fmadd_ps(ifft7321, ifft7336, ifft7331);
__m512 ifft7426 = _mm512_fmadd_ps(ifft7409, ifft7336, ifft7418);
__m512 ifft7341 = _mm512_fnmadd_ps(ifft7324, ifft7336, ifft7332);
__m512 ifft7427 = _mm512_fnmadd_ps(ifft7412, ifft7336, ifft7419);
__m512 ifft7342 = _mm512_fmadd_ps(ifft7323, ifft7336, ifft7333);
__m512 ifft7428 = _mm512_fmadd_ps(ifft7411, ifft7336, ifft7420);
__m512 ifft7343 = _mm512_fnmadd_ps(ifft7326, ifft7336, ifft7334);
__m512 ifft7429 = _mm512_fnmadd_ps(ifft7414, ifft7336, ifft7421);
__m512 ifft7344 = _mm512_fmadd_ps(ifft7325, ifft7336, ifft7335);
__m512 ifft7430 = _mm512_fmadd_ps(ifft7413, ifft7336, ifft7422);
__m512 ifft7345 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft7346 = _mm512_fmadd_ps(ifft7337, ifft7345, _mm512_shuffle_ps(ifft7337, ifft7337, 78));
__m512 ifft7431 = _mm512_fmadd_ps(ifft7423, ifft7345, _mm512_shuffle_ps(ifft7423, ifft7423, 78));
__m512 ifft7347 = _mm512_fmadd_ps(ifft7338, ifft7345, _mm512_shuffle_ps(ifft7338, ifft7338, 78));
__m512 ifft7432 = _mm512_fmadd_ps(ifft7424, ifft7345, _mm512_shuffle_ps(ifft7424, ifft7424, 78));
__m512 ifft7348 = _mm512_fmadd_ps(ifft7339, ifft7345, _mm512_shuffle_ps(ifft7339, ifft7339, 78));
__m512 ifft7433 = _mm512_fmadd_ps(ifft7425, ifft7345, _mm512_shuffle_ps(ifft7425, ifft7425, 78));
__m512 ifft7349 = _mm512_fmadd_ps(ifft7340, ifft7345, _mm512_shuffle_ps(ifft7340, ifft7340, 78));
__m512 ifft7434 = _mm512_fmadd_ps(ifft7426, ifft7345, _mm512_shuffle_ps(ifft7426, ifft7426, 78));
__m512 ifft7350 = _mm512_fmadd_ps(ifft7341, ifft7345, _mm512_shuffle_ps(ifft7341, ifft7341, 78));
__m512 ifft7435 = _mm512_fmadd_ps(ifft7427, ifft7345, _mm512_shuffle_ps(ifft7427, ifft7427, 78));
__m512 ifft7351 = _mm512_fmadd_ps(ifft7342, ifft7345, _mm512_shuffle_ps(ifft7342, ifft7342, 78));
__m512 ifft7436 = _mm512_fmadd_ps(ifft7428, ifft7345, _mm512_shuffle_ps(ifft7428, ifft7428, 78));
__m512 ifft7352 = _mm512_fmadd_ps(ifft7343, ifft7345, _mm512_shuffle_ps(ifft7343, ifft7343, 78));
__m512 ifft7437 = _mm512_fmadd_ps(ifft7429, ifft7345, _mm512_shuffle_ps(ifft7429, ifft7429, 78));
__m512 ifft7353 = _mm512_fmadd_ps(ifft7344, ifft7345, _mm512_shuffle_ps(ifft7344, ifft7344, 78));
__m512 ifft7438 = _mm512_fmadd_ps(ifft7430, ifft7345, _mm512_shuffle_ps(ifft7430, ifft7430, 78));
__m512 ifft7354 = _mm512_mask_sub_ps(ifft7346, 49344, _mm512_setzero_ps(), ifft7347);
__m512 ifft7439 = _mm512_mask_sub_ps(ifft7431, 49344, _mm512_setzero_ps(), ifft7432);
__m512 ifft7355 = _mm512_mask_mov_ps(ifft7347, 49344, ifft7346);
__m512 ifft7440 = _mm512_mask_mov_ps(ifft7432, 49344, ifft7431);
__m512 ifft7356 = _mm512_mask_sub_ps(ifft7348, 49344, _mm512_setzero_ps(), ifft7349);
__m512 ifft7441 = _mm512_mask_sub_ps(ifft7433, 49344, _mm512_setzero_ps(), ifft7434);
__m512 ifft7357 = _mm512_mask_mov_ps(ifft7349, 49344, ifft7348);
__m512 ifft7442 = _mm512_mask_mov_ps(ifft7434, 49344, ifft7433);
__m512 ifft7358 = _mm512_mask_sub_ps(ifft7350, 49344, _mm512_setzero_ps(), ifft7351);
__m512 ifft7443 = _mm512_mask_sub_ps(ifft7435, 49344, _mm512_setzero_ps(), ifft7436);
__m512 ifft7359 = _mm512_mask_mov_ps(ifft7351, 49344, ifft7350);
__m512 ifft7444 = _mm512_mask_mov_ps(ifft7436, 49344, ifft7435);
__m512 ifft7360 = _mm512_mask_sub_ps(ifft7352, 49344, _mm512_setzero_ps(), ifft7353);
__m512 ifft7445 = _mm512_mask_sub_ps(ifft7437, 49344, _mm512_setzero_ps(), ifft7438);
__m512 ifft7361 = _mm512_mask_mov_ps(ifft7353, 49344, ifft7352);
__m512 ifft7446 = _mm512_mask_mov_ps(ifft7438, 49344, ifft7437);
__m512 ifft7362 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft7363 = _mm512_fmadd_ps(ifft7354, ifft7362, _mm512_shuffle_f32x4(ifft7354, ifft7354, 177));
__m512 ifft7447 = _mm512_fmadd_ps(ifft7439, ifft7362, _mm512_shuffle_f32x4(ifft7439, ifft7439, 177));
__m512 ifft7364 = _mm512_fmadd_ps(ifft7355, ifft7362, _mm512_shuffle_f32x4(ifft7355, ifft7355, 177));
__m512 ifft7448 = _mm512_fmadd_ps(ifft7440, ifft7362, _mm512_shuffle_f32x4(ifft7440, ifft7440, 177));
__m512 ifft7365 = _mm512_fmadd_ps(ifft7356, ifft7362, _mm512_shuffle_f32x4(ifft7356, ifft7356, 177));
__m512 ifft7449 = _mm512_fmadd_ps(ifft7441, ifft7362, _mm512_shuffle_f32x4(ifft7441, ifft7441, 177));
__m512 ifft7366 = _mm512_fmadd_ps(ifft7357, ifft7362, _mm512_shuffle_f32x4(ifft7357, ifft7357, 177));
__m512 ifft7450 = _mm512_fmadd_ps(ifft7442, ifft7362, _mm512_shuffle_f32x4(ifft7442, ifft7442, 177));
__m512 ifft7367 = _mm512_fmadd_ps(ifft7358, ifft7362, _mm512_shuffle_f32x4(ifft7358, ifft7358, 177));
__m512 ifft7451 = _mm512_fmadd_ps(ifft7443, ifft7362, _mm512_shuffle_f32x4(ifft7443, ifft7443, 177));
__m512 ifft7368 = _mm512_fnmsub_ps(ifft7359, ifft7362, _mm512_shuffle_f32x4(ifft7359, ifft7359, 177));
__m512 ifft7452 = _mm512_fnmsub_ps(ifft7444, ifft7362, _mm512_shuffle_f32x4(ifft7444, ifft7444, 177));
__m512 ifft7369 = _mm512_fmadd_ps(ifft7360, ifft7362, _mm512_shuffle_f32x4(ifft7360, ifft7360, 177));
__m512 ifft7453 = _mm512_fmadd_ps(ifft7445, ifft7362, _mm512_shuffle_f32x4(ifft7445, ifft7445, 177));
__m512 ifft7370 = _mm512_fmadd_ps(ifft7361, ifft7362, _mm512_shuffle_f32x4(ifft7361, ifft7361, 177));
__m512 ifft7454 = _mm512_fmadd_ps(ifft7446, ifft7362, _mm512_shuffle_f32x4(ifft7446, ifft7446, 177));
__m512 ifft7371 = _mm512_add_ps(ifft7363, ifft7364);
__m512 ifft7455 = _mm512_add_ps(ifft7447, ifft7448);
__m512 ifft7372 = _mm512_sub_ps(ifft7363, ifft7364);
__m512 ifft7456 = _mm512_sub_ps(ifft7447, ifft7448);
__m512 ifft7373 = _mm512_sub_ps(ifft7365, ifft7369);
__m512 ifft7457 = _mm512_sub_ps(ifft7449, ifft7453);
__m512 ifft7374 = _mm512_add_ps(ifft7366, ifft7370);
__m512 ifft7458 = _mm512_add_ps(ifft7450, ifft7454);
__m512 ifft7375 = _mm512_add_ps(ifft7365, ifft7369);
__m512 ifft7459 = _mm512_add_ps(ifft7449, ifft7453);
__m512 ifft7376 = _mm512_sub_ps(ifft7366, ifft7370);
__m512 ifft7460 = _mm512_sub_ps(ifft7450, ifft7454);
__m512 ifft7377 = _mm512_mul_ps(ifft7367, _mm512_set1_ps(3.125e-02f));
__m512 ifft7461 = _mm512_mul_ps(ifft7451, _mm512_set1_ps(3.125e-02f));
__m512 ifft7378 = _mm512_mul_ps(ifft7368, _mm512_set1_ps(3.125e-02f));
__m512 ifft7462 = _mm512_mul_ps(ifft7452, _mm512_set1_ps(3.125e-02f));
__m512 ifft7379 = _mm512_fmadd_ps(ifft7371, _mm512_set1_ps(1.5625e-02f), ifft7377);
__m512 ifft7463 = _mm512_fmadd_ps(ifft7455, _mm512_set1_ps(1.5625e-02f), ifft7461);
__m512 ifft7380 = _mm512_fmsub_ps(ifft7371, _mm512_set1_ps(1.5625e-02f), ifft7377);
__m512 ifft7464 = _mm512_fmsub_ps(ifft7455, _mm512_set1_ps(1.5625e-02f), ifft7461);
__m512 ifft7381 = _mm512_fmadd_ps(ifft7372, _mm512_set1_ps(1.5625e-02f), ifft7378);
__m512 ifft7465 = _mm512_fmadd_ps(ifft7456, _mm512_set1_ps(1.5625e-02f), ifft7462);
__m512 ifft7382 = _mm512_fmsub_ps(ifft7372, _mm512_set1_ps(1.5625e-02f), ifft7378);
__m512 ifft7466 = _mm512_fmsub_ps(ifft7456, _mm512_set1_ps(1.5625e-02f), ifft7462);
__m512 ifft7383 = _mm512_add_ps(ifft7373, ifft7374);
__m512 ifft7467 = _mm512_add_ps(ifft7457, ifft7458);
__m512 ifft7384 = _mm512_sub_ps(ifft7373, ifft7374);
__m512 ifft7468 = _mm512_sub_ps(ifft7457, ifft7458);
__m512 ifft7385 = _mm512_fnmadd_ps(ifft7383, _mm512_set1_ps(7.0710677e-01f), ifft7375);
__m512 ifft7469 = _mm512_fnmadd_ps(ifft7467, _mm512_set1_ps(7.0710677e-01f), ifft7459);
__m512 ifft7386 = _mm512_fmadd_ps(ifft7383, _mm512_set1_ps(7.0710677e-01f), ifft7375);
__m512 ifft7470 = _mm512_fmadd_ps(ifft7467, _mm512_set1_ps(7.0710677e-01f), ifft7459);
__m512 ifft7387 = _mm512_fmadd_ps(ifft7384, _mm512_set1_ps(7.0710677e-01f), ifft7376);
__m512 ifft7471 = _mm512_fmadd_ps(ifft7468, _mm512_set1_ps(7.0710677e-01f), ifft7460);
__m512 ifft7388 = _mm512_fmsub_ps(ifft7384, _mm512_set1_ps(7.0710677e-01f), ifft7376);
__m512 ifft7472 = _mm512_fmsub_ps(ifft7468, _mm512_set1_ps(7.0710677e-01f), ifft7460);
__m512 ifft7389 = _mm512_add_ps(ifft7385, ifft7386);
__m512 ifft7473 = _mm512_add_ps(ifft7469, ifft7470);
__m512 ifft7390 = _mm512_sub_ps(ifft7385, ifft7386);
__m512 ifft7474 = _mm512_sub_ps(ifft7469, ifft7470);
__m512 ifft7391 = _mm512_add_ps(ifft7387, ifft7388);
__m512 ifft7475 = _mm512_add_ps(ifft7471, ifft7472);
__m512 ifft7392 = _mm512_sub_ps(ifft7387, ifft7388);
__m512 ifft7476 = _mm512_sub_ps(ifft7471, ifft7472);
__m512 ifft7393 = _mm512_fmadd_ps(ifft7389, _mm512_set1_ps(1.5625e-02f), ifft7379);
__m512 ifft7477 = _mm512_fmadd_ps(ifft7473, _mm512_set1_ps(1.5625e-02f), ifft7463);
__m512 ifft7394 = _mm512_fnmadd_ps(ifft7389, _mm512_set1_ps(1.5625e-02f), ifft7379);
__m512 ifft7478 = _mm512_fnmadd_ps(ifft7473, _mm512_set1_ps(1.5625e-02f), ifft7463);
__m512 ifft7395 = _mm512_fmadd_ps(ifft7391, _mm512_set1_ps(1.5625e-02f), ifft7381);
__m512 ifft7479 = _mm512_fmadd_ps(ifft7475, _mm512_set1_ps(1.5625e-02f), ifft7465);
__m512 ifft7396 = _mm512_fnmadd_ps(ifft7391, _mm512_set1_ps(1.5625e-02f), ifft7381);
__m512 ifft7480 = _mm512_fnmadd_ps(ifft7475, _mm512_set1_ps(1.5625e-02f), ifft7465);
__m512 ifft7397 = _mm512_fnmadd_ps(ifft7392, _mm512_set1_ps(1.5625e-02f), ifft7380);
__m512 ifft7481 = _mm512_fnmadd_ps(ifft7476, _mm512_set1_ps(1.5625e-02f), ifft7464);
__m512 ifft7398 = _mm512_fmadd_ps(ifft7392, _mm512_set1_ps(1.5625e-02f), ifft7380);
__m512 ifft7482 = _mm512_fmadd_ps(ifft7476, _mm512_set1_ps(1.5625e-02f), ifft7464);
__m512 ifft7399 = _mm512_fmadd_ps(ifft7390, _mm512_set1_ps(1.5625e-02f), ifft7382);
__m512 ifft7483 = _mm512_fmadd_ps(ifft7474, _mm512_set1_ps(1.5625e-02f), ifft7466);
__m512 ifft7400 = _mm512_fnmadd_ps(ifft7390, _mm512_set1_ps(1.5625e-02f), ifft7382);
__m512 ifft7484 = _mm512_fnmadd_ps(ifft7474, _mm512_set1_ps(1.5625e-02f), ifft7466);
__m512 dat2438 = ifft7393;
__m512 dat2445 = ifft7477;
__m512 dat2439 = ifft7395;
__m512 dat2446 = ifft7479;
__m512 dat2440 = ifft7397;
__m512 dat2447 = ifft7481;
__m512 dat2441 = ifft7399;
__m512 dat2448 = ifft7483;
__m512 dat2442 = ifft7394;
__m512 dat2449 = ifft7478;
__m512 dat2443 = ifft7396;
__m512 dat2450 = ifft7480;
__m512 dat2444 = ifft7398;
__m512 dat2451 = ifft7482;
(void)ifft7400;
(void)ifft7484;
__m512i pm209 = _mm512_set_epi32(1, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
__m512 pack401 = _mm512_permutex2var_ps(dat2438, pm209, dat2445);
__m512i pm210 = _mm512_set_epi32(25, 24, 14, 13, 12, 11, 10, 9, 8, 30, 29, 28, 27, 26, 25, 24);
__m512 pack402 = _mm512_permutex2var_ps(dat2438, pm210, dat2445);
__m512 pack403 = _mm512_permutex2var_ps(dat2439, pm209, dat2446);
__m512 pack404 = _mm512_permutex2var_ps(dat2439, pm210, dat2446);
__m512 pack405 = _mm512_permutex2var_ps(dat2440, pm209, dat2447);
__m512 pack406 = _mm512_permutex2var_ps(dat2440, pm210, dat2447);
__m512 pack407 = _mm512_permutex2var_ps(dat2441, pm209, dat2448);
__m512 pack408 = _mm512_permutex2var_ps(dat2441, pm210, dat2448);
__m512 pack409 = _mm512_permutex2var_ps(dat2442, pm209, dat2449);
__m512 pack410 = _mm512_permutex2var_ps(dat2442, pm210, dat2449);
__m512 pack411 = _mm512_permutex2var_ps(dat2443, pm209, dat2450);
__m512 pack412 = _mm512_permutex2var_ps(dat2443, pm210, dat2450);
__m512 pack413 = _mm512_permutex2var_ps(dat2444, pm209, dat2451);
__m512 pack414 = _mm512_permutex2var_ps(dat2444, pm210, dat2451);
pack401 = _mm512_max_ps(_mm512_setzero_ps(), pack401);
pack402 = _mm512_max_ps(_mm512_setzero_ps(), pack402);
pack403 = _mm512_max_ps(_mm512_setzero_ps(), pack403);
pack404 = _mm512_max_ps(_mm512_setzero_ps(), pack404);
pack405 = _mm512_max_ps(_mm512_setzero_ps(), pack405);
pack406 = _mm512_max_ps(_mm512_setzero_ps(), pack406);
pack407 = _mm512_max_ps(_mm512_setzero_ps(), pack407);
pack408 = _mm512_max_ps(_mm512_setzero_ps(), pack408);
pack409 = _mm512_max_ps(_mm512_setzero_ps(), pack409);
pack410 = _mm512_max_ps(_mm512_setzero_ps(), pack410);
pack411 = _mm512_max_ps(_mm512_setzero_ps(), pack411);
pack412 = _mm512_max_ps(_mm512_setzero_ps(), pack412);
pack413 = _mm512_max_ps(_mm512_setzero_ps(), pack413);
pack414 = _mm512_max_ps(_mm512_setzero_ps(), pack414);
_mm512_mask_storeu_ps(datPtr32+392+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack401);
_mm512_mask_storeu_ps(datPtr32+1224+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack402);
_mm512_mask_storeu_ps(datPtr32+448+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack403);
_mm512_mask_storeu_ps(datPtr32+1280+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack404);
_mm512_mask_storeu_ps(datPtr32+504+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack405);
_mm512_mask_storeu_ps(datPtr32+1336+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack406);
_mm512_mask_storeu_ps(datPtr32+560+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack407);
_mm512_mask_storeu_ps(datPtr32+1392+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack408);
_mm512_mask_storeu_ps(datPtr32+616+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack409);
_mm512_mask_storeu_ps(datPtr32+1448+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack410);
_mm512_mask_storeu_ps(datPtr32+672+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack411);
_mm512_mask_storeu_ps(datPtr32+1504+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack412);
_mm512_mask_storeu_ps(datPtr32+728+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack413);
_mm512_mask_storeu_ps(datPtr32+1560+13312*i63+3328*k159+1664*r24+56*toH46+4*toW46+0*t44, 16383, pack414);
}
}
++j55;
}
}

/*
 * Dispatch wrapper for ResNeXt50StriderConsumeSums3Callee1.
 *
 * Fills in a task descriptor and passes it, together with the caller's
 * team, to ResNeXt50ThreaderDo1.
 *
 * team65     - threader team forwarded unchanged to ResNeXt50ThreaderDo1.
 * tensors103 - array of tensor base pointers; stored in the task's any1
 *              field so the callee can retrieve it.
 */
static void ResNeXt50StriderConsumeSums3(ResNeXt50ThreaderTeam1* team65, char** tensors103) {
ResNeXt50ThreaderTask1 job;
/* Three dimensions with extents 1 x 1 x 4. */
/* NOTE(review): presumably these are per-dimension iteration counts consumed by ResNeXt50ThreaderDo1 - confirm there. */
job.nd1 = 3;
job.hull1[2] = 4;
job.hull1[1] = 1;
job.hull1[0] = 1;
/* Payload and entry point for the task. */
job.any1 = tensors103;
job.callee1 = ResNeXt50StriderConsumeSums3Callee1;
ResNeXt50ThreaderDo1(team65, &job);
}

static void ResNeXt50StriderArrangeFilts4Callee1(ResNeXt50ThreaderTask1* task140, int64_t* pt75) {
char** tensors138 = task140->any1;
ptrdiff_t b88 = pt75[0];
ptrdiff_t g43 = pt75[1];
ptrdiff_t e39 = 0;
char*restrict bfPtr18 = tensors138[3]+4096*e39;
char*restrict wfPtr18 = tensors138[3]+4096+346554368*e39;
char*restrict wtPtr22 = tensors138[0]+23796*e39;
char*restrict biasPtr22 = tensors138[1];
char*restrict bnPtr23 = tensors138[2];
ptrdiff_t i81 = 1*g43;
ptrdiff_t j72 = 2*b88;
ptrdiff_t jj65 = j72+1;
if (j72 < 16) {
for (; j72 != 16; ++j72) {
__m512 postMul70 = _mm512_set1_ps(((float*)bnPtr23+(ptrdiff_t)2*(0+32*i81+2*j72))[0]);
__m512 postMul71 = _mm512_set1_ps(((float*)bnPtr23+(ptrdiff_t)2*(1+32*i81+2*j72))[0]);
for (ptrdiff_t k188 = 0; k188 < 32; ++k188) {
__m512 wt861 = _mm512_maskz_loadu_ps(7, wtPtr22+0+36864*i81+2304*j72+36*k188);
__m512 wt862 = _mm512_maskz_loadu_ps(7, wtPtr22+12+36864*i81+2304*j72+36*k188);
__m512 wt863 = _mm512_maskz_loadu_ps(7, wtPtr22+24+36864*i81+2304*j72+36*k188);
wt861 = _mm512_mul_ps(postMul70, wt861);
wt862 = _mm512_mul_ps(postMul70, wt862);
wt863 = _mm512_mul_ps(postMul70, wt863);
__m512 fft10585 = _mm512_add_ps(wt861, _mm512_setzero_ps());
__m512 fft10673 = _mm512_add_ps(wt862, _mm512_setzero_ps());
__m512 fft10586 = _mm512_sub_ps(wt861, _mm512_setzero_ps());
__m512 fft10674 = _mm512_sub_ps(wt862, _mm512_setzero_ps());
__m512 fft10587 = _mm512_add_ps(wt863, _mm512_setzero_ps());
__m512 fft10675 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10588 = _mm512_sub_ps(wt863, _mm512_setzero_ps());
__m512 fft10676 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10589 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10677 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10590 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10678 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10591 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10679 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10592 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10680 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10593 = _mm512_add_ps(fft10585, fft10589);
__m512 fft10681 = _mm512_add_ps(fft10673, fft10677);
__m512 fft10594 = _mm512_sub_ps(fft10585, fft10589);
__m512 fft10682 = _mm512_sub_ps(fft10673, fft10677);
__m512 fft10595 = _mm512_add_ps(fft10587, fft10591);
__m512 fft10683 = _mm512_add_ps(fft10675, fft10679);
__m512 fft10596 = _mm512_sub_ps(fft10591, fft10587);
__m512 fft10684 = _mm512_sub_ps(fft10679, fft10675);
__m512 fft10597 = _mm512_sub_ps(fft10588, fft10592);
__m512 fft10685 = _mm512_sub_ps(fft10676, fft10680);
__m512 fft10598 = _mm512_add_ps(fft10588, fft10592);
__m512 fft10686 = _mm512_add_ps(fft10676, fft10680);
__m512 fft10599 = _mm512_add_ps(fft10593, fft10595);
__m512 fft10687 = _mm512_add_ps(fft10681, fft10683);
__m512 fft10600 = _mm512_sub_ps(fft10593, fft10595);
__m512 fft10688 = _mm512_sub_ps(fft10681, fft10683);
__m512 fft10601 = _mm512_fmadd_ps(fft10597, _mm512_set1_ps(7.0710677e-01f), fft10586);
__m512 fft10689 = _mm512_fmadd_ps(fft10685, _mm512_set1_ps(7.0710677e-01f), fft10674);
__m512 fft10602 = _mm512_fnmsub_ps(fft10598, _mm512_set1_ps(7.0710677e-01f), fft10590);
__m512 fft10690 = _mm512_fnmsub_ps(fft10686, _mm512_set1_ps(7.0710677e-01f), fft10678);
__m512 fft10603 = _mm512_fnmadd_ps(fft10597, _mm512_set1_ps(7.0710677e-01f), fft10586);
__m512 fft10691 = _mm512_fnmadd_ps(fft10685, _mm512_set1_ps(7.0710677e-01f), fft10674);
__m512 fft10604 = _mm512_fnmadd_ps(fft10598, _mm512_set1_ps(7.0710677e-01f), fft10590);
__m512 fft10692 = _mm512_fnmadd_ps(fft10686, _mm512_set1_ps(7.0710677e-01f), fft10678);
__m512 fft10605 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10606 = _mm512_fmadd_ps(fft10599, fft10605, _mm512_shuffle_f32x4(fft10599, fft10599, 78));
__m512 fft10693 = _mm512_fmadd_ps(fft10687, fft10605, _mm512_shuffle_f32x4(fft10687, fft10687, 78));
__m512 fft10607 = _mm512_fmadd_ps(fft10600, fft10605, _mm512_shuffle_f32x4(fft10600, fft10600, 78));
__m512 fft10694 = _mm512_fmadd_ps(fft10688, fft10605, _mm512_shuffle_f32x4(fft10688, fft10688, 78));
__m512 fft10608 = _mm512_fmadd_ps(fft10601, fft10605, _mm512_shuffle_f32x4(fft10601, fft10601, 78));
__m512 fft10695 = _mm512_fmadd_ps(fft10689, fft10605, _mm512_shuffle_f32x4(fft10689, fft10689, 78));
__m512 fft10609 = _mm512_fmadd_ps(fft10602, fft10605, _mm512_shuffle_f32x4(fft10602, fft10602, 78));
__m512 fft10696 = _mm512_fmadd_ps(fft10690, fft10605, _mm512_shuffle_f32x4(fft10690, fft10690, 78));
__m512 fft10610 = _mm512_fmadd_ps(fft10594, fft10605, _mm512_shuffle_f32x4(fft10594, fft10594, 78));
__m512 fft10697 = _mm512_fmadd_ps(fft10682, fft10605, _mm512_shuffle_f32x4(fft10682, fft10682, 78));
__m512 fft10611 = _mm512_fmadd_ps(fft10596, fft10605, _mm512_shuffle_f32x4(fft10596, fft10596, 78));
__m512 fft10698 = _mm512_fmadd_ps(fft10684, fft10605, _mm512_shuffle_f32x4(fft10684, fft10684, 78));
__m512 fft10612 = _mm512_fmadd_ps(fft10603, fft10605, _mm512_shuffle_f32x4(fft10603, fft10603, 78));
__m512 fft10699 = _mm512_fmadd_ps(fft10691, fft10605, _mm512_shuffle_f32x4(fft10691, fft10691, 78));
__m512 fft10613 = _mm512_fmadd_ps(fft10604, fft10605, _mm512_shuffle_f32x4(fft10604, fft10604, 78));
__m512 fft10700 = _mm512_fmadd_ps(fft10692, fft10605, _mm512_shuffle_f32x4(fft10692, fft10692, 78));
__m512 fft10614 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10615 = _mm512_mul_ps(fft10606, fft10614);
__m512 fft10701 = _mm512_mul_ps(fft10693, fft10614);
__m512 fft10616 = _mm512_mul_ps(fft10607, fft10614);
__m512 fft10702 = _mm512_mul_ps(fft10694, fft10614);
__m512 fft10617 = _mm512_mul_ps(fft10608, fft10614);
__m512 fft10703 = _mm512_mul_ps(fft10695, fft10614);
__m512 fft10618 = _mm512_mul_ps(fft10609, fft10614);
__m512 fft10704 = _mm512_mul_ps(fft10696, fft10614);
__m512 fft10619 = _mm512_mul_ps(fft10610, fft10614);
__m512 fft10705 = _mm512_mul_ps(fft10697, fft10614);
__m512 fft10620 = _mm512_mul_ps(fft10611, fft10614);
__m512 fft10706 = _mm512_mul_ps(fft10698, fft10614);
__m512 fft10621 = _mm512_mul_ps(fft10612, fft10614);
__m512 fft10707 = _mm512_mul_ps(fft10699, fft10614);
__m512 fft10622 = _mm512_mul_ps(fft10613, fft10614);
__m512 fft10708 = _mm512_mul_ps(fft10700, fft10614);
__m512 fft10623 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft10624 = _mm512_fmadd_ps(fft10607, fft10623, fft10615);
__m512 fft10709 = _mm512_fmadd_ps(fft10694, fft10623, fft10701);
__m512 fft10625 = _mm512_fnmadd_ps(fft10606, fft10623, fft10616);
__m512 fft10710 = _mm512_fnmadd_ps(fft10693, fft10623, fft10702);
__m512 fft10626 = _mm512_fmadd_ps(fft10609, fft10623, fft10617);
__m512 fft10711 = _mm512_fmadd_ps(fft10696, fft10623, fft10703);
__m512 fft10627 = _mm512_fnmadd_ps(fft10608, fft10623, fft10618);
__m512 fft10712 = _mm512_fnmadd_ps(fft10695, fft10623, fft10704);
__m512 fft10628 = _mm512_fmadd_ps(fft10611, fft10623, fft10619);
__m512 fft10713 = _mm512_fmadd_ps(fft10698, fft10623, fft10705);
__m512 fft10629 = _mm512_fnmadd_ps(fft10610, fft10623, fft10620);
__m512 fft10714 = _mm512_fnmadd_ps(fft10697, fft10623, fft10706);
__m512 fft10630 = _mm512_fmadd_ps(fft10613, fft10623, fft10621);
__m512 fft10715 = _mm512_fmadd_ps(fft10700, fft10623, fft10707);
__m512 fft10631 = _mm512_fnmadd_ps(fft10612, fft10623, fft10622);
__m512 fft10716 = _mm512_fnmadd_ps(fft10699, fft10623, fft10708);
__m512 fft10632 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10633 = _mm512_fmadd_ps(fft10624, fft10632, _mm512_shuffle_f32x4(fft10624, fft10624, 177));
__m512 fft10717 = _mm512_fmadd_ps(fft10709, fft10632, _mm512_shuffle_f32x4(fft10709, fft10709, 177));
__m512 fft10634 = _mm512_fmadd_ps(fft10625, fft10632, _mm512_shuffle_f32x4(fft10625, fft10625, 177));
__m512 fft10718 = _mm512_fmadd_ps(fft10710, fft10632, _mm512_shuffle_f32x4(fft10710, fft10710, 177));
__m512 fft10635 = _mm512_fmadd_ps(fft10626, fft10632, _mm512_shuffle_f32x4(fft10626, fft10626, 177));
__m512 fft10719 = _mm512_fmadd_ps(fft10711, fft10632, _mm512_shuffle_f32x4(fft10711, fft10711, 177));
__m512 fft10636 = _mm512_fmadd_ps(fft10627, fft10632, _mm512_shuffle_f32x4(fft10627, fft10627, 177));
__m512 fft10720 = _mm512_fmadd_ps(fft10712, fft10632, _mm512_shuffle_f32x4(fft10712, fft10712, 177));
__m512 fft10637 = _mm512_fmadd_ps(fft10628, fft10632, _mm512_shuffle_f32x4(fft10628, fft10628, 177));
__m512 fft10721 = _mm512_fmadd_ps(fft10713, fft10632, _mm512_shuffle_f32x4(fft10713, fft10713, 177));
__m512 fft10638 = _mm512_fmadd_ps(fft10629, fft10632, _mm512_shuffle_f32x4(fft10629, fft10629, 177));
__m512 fft10722 = _mm512_fmadd_ps(fft10714, fft10632, _mm512_shuffle_f32x4(fft10714, fft10714, 177));
__m512 fft10639 = _mm512_fmadd_ps(fft10630, fft10632, _mm512_shuffle_f32x4(fft10630, fft10630, 177));
__m512 fft10723 = _mm512_fmadd_ps(fft10715, fft10632, _mm512_shuffle_f32x4(fft10715, fft10715, 177));
__m512 fft10640 = _mm512_fmadd_ps(fft10631, fft10632, _mm512_shuffle_f32x4(fft10631, fft10631, 177));
__m512 fft10724 = _mm512_fmadd_ps(fft10716, fft10632, _mm512_shuffle_f32x4(fft10716, fft10716, 177));
__m512 fft10641 = _mm512_mask_mov_ps(fft10633, 49344, fft10634);
__m512 fft10725 = _mm512_mask_mov_ps(fft10717, 49344, fft10718);
__m512 fft10642 = _mm512_mask_sub_ps(fft10634, 49344, _mm512_setzero_ps(), fft10633);
__m512 fft10726 = _mm512_mask_sub_ps(fft10718, 49344, _mm512_setzero_ps(), fft10717);
__m512 fft10643 = _mm512_mask_mov_ps(fft10635, 49344, fft10636);
__m512 fft10727 = _mm512_mask_mov_ps(fft10719, 49344, fft10720);
__m512 fft10644 = _mm512_mask_sub_ps(fft10636, 49344, _mm512_setzero_ps(), fft10635);
__m512 fft10728 = _mm512_mask_sub_ps(fft10720, 49344, _mm512_setzero_ps(), fft10719);
__m512 fft10645 = _mm512_mask_mov_ps(fft10637, 49344, fft10638);
__m512 fft10729 = _mm512_mask_mov_ps(fft10721, 49344, fft10722);
__m512 fft10646 = _mm512_mask_sub_ps(fft10638, 49344, _mm512_setzero_ps(), fft10637);
__m512 fft10730 = _mm512_mask_sub_ps(fft10722, 49344, _mm512_setzero_ps(), fft10721);
__m512 fft10647 = _mm512_mask_mov_ps(fft10639, 49344, fft10640);
__m512 fft10731 = _mm512_mask_mov_ps(fft10723, 49344, fft10724);
__m512 fft10648 = _mm512_mask_sub_ps(fft10640, 49344, _mm512_setzero_ps(), fft10639);
__m512 fft10732 = _mm512_mask_sub_ps(fft10724, 49344, _mm512_setzero_ps(), fft10723);
__m512 fft10649 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10650 = _mm512_fmadd_ps(fft10641, fft10649, _mm512_shuffle_ps(fft10641, fft10641, 78));
__m512 fft10733 = _mm512_fmadd_ps(fft10725, fft10649, _mm512_shuffle_ps(fft10725, fft10725, 78));
__m512 fft10651 = _mm512_fmadd_ps(fft10642, fft10649, _mm512_shuffle_ps(fft10642, fft10642, 78));
__m512 fft10734 = _mm512_fmadd_ps(fft10726, fft10649, _mm512_shuffle_ps(fft10726, fft10726, 78));
__m512 fft10652 = _mm512_fmadd_ps(fft10643, fft10649, _mm512_shuffle_ps(fft10643, fft10643, 78));
__m512 fft10735 = _mm512_fmadd_ps(fft10727, fft10649, _mm512_shuffle_ps(fft10727, fft10727, 78));
__m512 fft10653 = _mm512_fmadd_ps(fft10644, fft10649, _mm512_shuffle_ps(fft10644, fft10644, 78));
__m512 fft10736 = _mm512_fmadd_ps(fft10728, fft10649, _mm512_shuffle_ps(fft10728, fft10728, 78));
__m512 fft10654 = _mm512_fmadd_ps(fft10645, fft10649, _mm512_shuffle_ps(fft10645, fft10645, 78));
__m512 fft10737 = _mm512_fmadd_ps(fft10729, fft10649, _mm512_shuffle_ps(fft10729, fft10729, 78));
__m512 fft10655 = _mm512_fmadd_ps(fft10646, fft10649, _mm512_shuffle_ps(fft10646, fft10646, 78));
__m512 fft10738 = _mm512_fmadd_ps(fft10730, fft10649, _mm512_shuffle_ps(fft10730, fft10730, 78));
__m512 fft10656 = _mm512_fmadd_ps(fft10647, fft10649, _mm512_shuffle_ps(fft10647, fft10647, 78));
__m512 fft10739 = _mm512_fmadd_ps(fft10731, fft10649, _mm512_shuffle_ps(fft10731, fft10731, 78));
__m512 fft10657 = _mm512_fmadd_ps(fft10648, fft10649, _mm512_shuffle_ps(fft10648, fft10648, 78));
__m512 fft10740 = _mm512_fmadd_ps(fft10732, fft10649, _mm512_shuffle_ps(fft10732, fft10732, 78));
__m512i fft10658 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10659 = _mm512_permutexvar_ps(fft10658, fft10650);
__m512 fft10741 = _mm512_permutexvar_ps(fft10658, fft10733);
__m512i fft10660 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10661 = _mm512_permutexvar_ps(fft10660, fft10650);
__m512 fft10742 = _mm512_permutexvar_ps(fft10660, fft10733);
__m512 fft10662 = _mm512_permutexvar_ps(fft10658, fft10651);
__m512 fft10743 = _mm512_permutexvar_ps(fft10658, fft10734);
__m512 fft10663 = _mm512_permutexvar_ps(fft10660, fft10651);
__m512 fft10744 = _mm512_permutexvar_ps(fft10660, fft10734);
__m512 fft10664 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft10665 = _mm512_fmadd_ps(fft10659, fft10664, fft10661);
__m512 fft10745 = _mm512_fmadd_ps(fft10741, fft10664, fft10742);
__m512 fft10666 = _mm512_fnmadd_ps(fft10663, fft10664, fft10662);
__m512 fft10746 = _mm512_fnmadd_ps(fft10744, fft10664, fft10743);
__m512 fft10667 = _mm512_mask_mov_ps(fft10663, 21845, fft10665);
__m512 fft10747 = _mm512_mask_mov_ps(fft10744, 21845, fft10745);
__m512 fft10668 = _mm512_mask_mov_ps(fft10659, 43176, fft10665);
__m512 fft10748 = _mm512_mask_mov_ps(fft10741, 43176, fft10745);
__m512 fft10669 = _mm512_mask_mov_ps(fft10667, 43176, fft10666);
__m512 fft10749 = _mm512_mask_mov_ps(fft10747, 43176, fft10746);
__m512 fft10670 = _mm512_mask_mov_ps(fft10668, 22102, fft10666);
__m512 fft10750 = _mm512_mask_mov_ps(fft10748, 22102, fft10746);
__m512 fft10671 = _mm512_mask_mul_ps(fft10669, 64764, fft10669, _mm512_set1_ps(5e-01f));
__m512 fft10751 = _mm512_mask_mul_ps(fft10749, 64764, fft10749, _mm512_set1_ps(5e-01f));
__m512 fft10672 = _mm512_mask_mul_ps(fft10670, 64764, fft10670, _mm512_set1_ps(5e-01f));
__m512 fft10752 = _mm512_mask_mul_ps(fft10750, 64764, fft10750, _mm512_set1_ps(5e-01f));
__m512 wf193 = fft10671;
__m512 wf201 = fft10751;
__m512 wf194 = fft10672;
__m512 wf202 = fft10752;
__m512 wf195 = fft10652;
__m512 wf203 = fft10735;
__m512 wf196 = fft10653;
__m512 wf204 = fft10736;
__m512 wf197 = fft10654;
__m512 wf205 = fft10737;
__m512 wf198 = fft10655;
__m512 wf206 = fft10738;
__m512 wf199 = fft10656;
__m512 wf207 = fft10739;
__m512 wf200 = fft10657;
__m512 wf208 = fft10740;
ptrdiff_t c67 = (size_t)(0+2*j72)/4;
ptrdiff_t m64 = (size_t)(0+2*j72)%4/2;
ptrdiff_t f69 = (size_t)(0+2*j72)%2;
__m512i eo64 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf195 = _mm512_permutexvar_ps(eo64, wf195);
wf196 = _mm512_permutexvar_ps(eo64, wf196);
__m512i wfs65 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf195, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs65 = _mm512_inserti64x4(wfs65, _mm512_cvtps_ph(wf196, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+32768+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs65);
_mm512_mask_storeu_epi32(wfPtr18+4227056+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs65);
wf203 = _mm512_permutexvar_ps(eo64, wf203);
wf204 = _mm512_permutexvar_ps(eo64, wf204);
__m512i wfs66 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf203, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs66 = _mm512_inserti64x4(wfs66, _mm512_cvtps_ph(wf204, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8421376+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs66);
_mm512_mask_storeu_epi32(wfPtr18+12615664+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs66);
wf197 = _mm512_permutexvar_ps(eo64, wf197);
wf198 = _mm512_permutexvar_ps(eo64, wf198);
__m512i wfs67 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf197, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs67 = _mm512_inserti64x4(wfs67, _mm512_cvtps_ph(wf198, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+65536+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs67);
_mm512_mask_storeu_epi32(wfPtr18+4259824+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs67);
wf205 = _mm512_permutexvar_ps(eo64, wf205);
wf206 = _mm512_permutexvar_ps(eo64, wf206);
__m512i wfs68 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf205, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs68 = _mm512_inserti64x4(wfs68, _mm512_cvtps_ph(wf206, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8454144+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs68);
_mm512_mask_storeu_epi32(wfPtr18+12648432+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs68);
wf199 = _mm512_permutexvar_ps(eo64, wf199);
wf200 = _mm512_permutexvar_ps(eo64, wf200);
__m512i wfs69 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf199, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs69 = _mm512_inserti64x4(wfs69, _mm512_cvtps_ph(wf200, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+98304+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs69);
_mm512_mask_storeu_epi32(wfPtr18+4292592+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs69);
wf207 = _mm512_permutexvar_ps(eo64, wf207);
wf208 = _mm512_permutexvar_ps(eo64, wf208);
__m512i wfs70 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf207, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs70 = _mm512_inserti64x4(wfs70, _mm512_cvtps_ph(wf208, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8486912+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs70);
_mm512_mask_storeu_epi32(wfPtr18+12681200+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs70);
__m512i wfs71 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf193, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs71 = _mm512_inserti64x4(wfs71, _mm512_cvtps_ph(wf194, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+0+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs71);
_mm512_mask_storeu_epi32(wfPtr18+4194288+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs71);
__m512i wfs72 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf201, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs72 = _mm512_inserti64x4(wfs72, _mm512_cvtps_ph(wf202, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8388608+131072*i81+4096*c67+128*k188+64*m64+16*f69, 3855, wfs72);
_mm512_mask_storeu_epi32(wfPtr18+12582896+131072*i81+4096*c67+128*k188+64*m64+16*f69, 61680, wfs72);
__m512 wt864 = _mm512_maskz_loadu_ps(7, wtPtr22+1152+36864*i81+2304*j72+36*k188);
__m512 wt865 = _mm512_maskz_loadu_ps(7, wtPtr22+1164+36864*i81+2304*j72+36*k188);
__m512 wt866 = _mm512_maskz_loadu_ps(7, wtPtr22+1176+36864*i81+2304*j72+36*k188);
wt864 = _mm512_mul_ps(postMul71, wt864);
wt865 = _mm512_mul_ps(postMul71, wt865);
wt866 = _mm512_mul_ps(postMul71, wt866);
__m512 fft10753 = _mm512_add_ps(wt864, _mm512_setzero_ps());
__m512 fft10841 = _mm512_add_ps(wt865, _mm512_setzero_ps());
__m512 fft10754 = _mm512_sub_ps(wt864, _mm512_setzero_ps());
__m512 fft10842 = _mm512_sub_ps(wt865, _mm512_setzero_ps());
__m512 fft10755 = _mm512_add_ps(wt866, _mm512_setzero_ps());
__m512 fft10843 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10756 = _mm512_sub_ps(wt866, _mm512_setzero_ps());
__m512 fft10844 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10757 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10845 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10758 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10846 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10759 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10847 = _mm512_add_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10760 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10848 = _mm512_sub_ps(_mm512_setzero_ps(), _mm512_setzero_ps());
__m512 fft10761 = _mm512_add_ps(fft10753, fft10757);
__m512 fft10849 = _mm512_add_ps(fft10841, fft10845);
__m512 fft10762 = _mm512_sub_ps(fft10753, fft10757);
__m512 fft10850 = _mm512_sub_ps(fft10841, fft10845);
__m512 fft10763 = _mm512_add_ps(fft10755, fft10759);
__m512 fft10851 = _mm512_add_ps(fft10843, fft10847);
__m512 fft10764 = _mm512_sub_ps(fft10759, fft10755);
__m512 fft10852 = _mm512_sub_ps(fft10847, fft10843);
__m512 fft10765 = _mm512_sub_ps(fft10756, fft10760);
__m512 fft10853 = _mm512_sub_ps(fft10844, fft10848);
__m512 fft10766 = _mm512_add_ps(fft10756, fft10760);
__m512 fft10854 = _mm512_add_ps(fft10844, fft10848);
__m512 fft10767 = _mm512_add_ps(fft10761, fft10763);
__m512 fft10855 = _mm512_add_ps(fft10849, fft10851);
__m512 fft10768 = _mm512_sub_ps(fft10761, fft10763);
__m512 fft10856 = _mm512_sub_ps(fft10849, fft10851);
__m512 fft10769 = _mm512_fmadd_ps(fft10765, _mm512_set1_ps(7.0710677e-01f), fft10754);
__m512 fft10857 = _mm512_fmadd_ps(fft10853, _mm512_set1_ps(7.0710677e-01f), fft10842);
__m512 fft10770 = _mm512_fnmsub_ps(fft10766, _mm512_set1_ps(7.0710677e-01f), fft10758);
__m512 fft10858 = _mm512_fnmsub_ps(fft10854, _mm512_set1_ps(7.0710677e-01f), fft10846);
__m512 fft10771 = _mm512_fnmadd_ps(fft10765, _mm512_set1_ps(7.0710677e-01f), fft10754);
__m512 fft10859 = _mm512_fnmadd_ps(fft10853, _mm512_set1_ps(7.0710677e-01f), fft10842);
__m512 fft10772 = _mm512_fnmadd_ps(fft10766, _mm512_set1_ps(7.0710677e-01f), fft10758);
__m512 fft10860 = _mm512_fnmadd_ps(fft10854, _mm512_set1_ps(7.0710677e-01f), fft10846);
__m512 fft10773 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10774 = _mm512_fmadd_ps(fft10767, fft10773, _mm512_shuffle_f32x4(fft10767, fft10767, 78));
__m512 fft10861 = _mm512_fmadd_ps(fft10855, fft10773, _mm512_shuffle_f32x4(fft10855, fft10855, 78));
__m512 fft10775 = _mm512_fmadd_ps(fft10768, fft10773, _mm512_shuffle_f32x4(fft10768, fft10768, 78));
__m512 fft10862 = _mm512_fmadd_ps(fft10856, fft10773, _mm512_shuffle_f32x4(fft10856, fft10856, 78));
__m512 fft10776 = _mm512_fmadd_ps(fft10769, fft10773, _mm512_shuffle_f32x4(fft10769, fft10769, 78));
__m512 fft10863 = _mm512_fmadd_ps(fft10857, fft10773, _mm512_shuffle_f32x4(fft10857, fft10857, 78));
__m512 fft10777 = _mm512_fmadd_ps(fft10770, fft10773, _mm512_shuffle_f32x4(fft10770, fft10770, 78));
__m512 fft10864 = _mm512_fmadd_ps(fft10858, fft10773, _mm512_shuffle_f32x4(fft10858, fft10858, 78));
__m512 fft10778 = _mm512_fmadd_ps(fft10762, fft10773, _mm512_shuffle_f32x4(fft10762, fft10762, 78));
__m512 fft10865 = _mm512_fmadd_ps(fft10850, fft10773, _mm512_shuffle_f32x4(fft10850, fft10850, 78));
__m512 fft10779 = _mm512_fmadd_ps(fft10764, fft10773, _mm512_shuffle_f32x4(fft10764, fft10764, 78));
__m512 fft10866 = _mm512_fmadd_ps(fft10852, fft10773, _mm512_shuffle_f32x4(fft10852, fft10852, 78));
__m512 fft10780 = _mm512_fmadd_ps(fft10771, fft10773, _mm512_shuffle_f32x4(fft10771, fft10771, 78));
__m512 fft10867 = _mm512_fmadd_ps(fft10859, fft10773, _mm512_shuffle_f32x4(fft10859, fft10859, 78));
__m512 fft10781 = _mm512_fmadd_ps(fft10772, fft10773, _mm512_shuffle_f32x4(fft10772, fft10772, 78));
__m512 fft10868 = _mm512_fmadd_ps(fft10860, fft10773, _mm512_shuffle_f32x4(fft10860, fft10860, 78));
__m512 fft10782 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10783 = _mm512_mul_ps(fft10774, fft10782);
__m512 fft10869 = _mm512_mul_ps(fft10861, fft10782);
__m512 fft10784 = _mm512_mul_ps(fft10775, fft10782);
__m512 fft10870 = _mm512_mul_ps(fft10862, fft10782);
__m512 fft10785 = _mm512_mul_ps(fft10776, fft10782);
__m512 fft10871 = _mm512_mul_ps(fft10863, fft10782);
__m512 fft10786 = _mm512_mul_ps(fft10777, fft10782);
__m512 fft10872 = _mm512_mul_ps(fft10864, fft10782);
__m512 fft10787 = _mm512_mul_ps(fft10778, fft10782);
__m512 fft10873 = _mm512_mul_ps(fft10865, fft10782);
__m512 fft10788 = _mm512_mul_ps(fft10779, fft10782);
__m512 fft10874 = _mm512_mul_ps(fft10866, fft10782);
__m512 fft10789 = _mm512_mul_ps(fft10780, fft10782);
__m512 fft10875 = _mm512_mul_ps(fft10867, fft10782);
__m512 fft10790 = _mm512_mul_ps(fft10781, fft10782);
__m512 fft10876 = _mm512_mul_ps(fft10868, fft10782);
__m512 fft10791 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft10792 = _mm512_fmadd_ps(fft10775, fft10791, fft10783);
__m512 fft10877 = _mm512_fmadd_ps(fft10862, fft10791, fft10869);
__m512 fft10793 = _mm512_fnmadd_ps(fft10774, fft10791, fft10784);
__m512 fft10878 = _mm512_fnmadd_ps(fft10861, fft10791, fft10870);
__m512 fft10794 = _mm512_fmadd_ps(fft10777, fft10791, fft10785);
__m512 fft10879 = _mm512_fmadd_ps(fft10864, fft10791, fft10871);
__m512 fft10795 = _mm512_fnmadd_ps(fft10776, fft10791, fft10786);
__m512 fft10880 = _mm512_fnmadd_ps(fft10863, fft10791, fft10872);
__m512 fft10796 = _mm512_fmadd_ps(fft10779, fft10791, fft10787);
__m512 fft10881 = _mm512_fmadd_ps(fft10866, fft10791, fft10873);
__m512 fft10797 = _mm512_fnmadd_ps(fft10778, fft10791, fft10788);
__m512 fft10882 = _mm512_fnmadd_ps(fft10865, fft10791, fft10874);
__m512 fft10798 = _mm512_fmadd_ps(fft10781, fft10791, fft10789);
__m512 fft10883 = _mm512_fmadd_ps(fft10868, fft10791, fft10875);
__m512 fft10799 = _mm512_fnmadd_ps(fft10780, fft10791, fft10790);
__m512 fft10884 = _mm512_fnmadd_ps(fft10867, fft10791, fft10876);
__m512 fft10800 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10801 = _mm512_fmadd_ps(fft10792, fft10800, _mm512_shuffle_f32x4(fft10792, fft10792, 177));
__m512 fft10885 = _mm512_fmadd_ps(fft10877, fft10800, _mm512_shuffle_f32x4(fft10877, fft10877, 177));
__m512 fft10802 = _mm512_fmadd_ps(fft10793, fft10800, _mm512_shuffle_f32x4(fft10793, fft10793, 177));
__m512 fft10886 = _mm512_fmadd_ps(fft10878, fft10800, _mm512_shuffle_f32x4(fft10878, fft10878, 177));
__m512 fft10803 = _mm512_fmadd_ps(fft10794, fft10800, _mm512_shuffle_f32x4(fft10794, fft10794, 177));
__m512 fft10887 = _mm512_fmadd_ps(fft10879, fft10800, _mm512_shuffle_f32x4(fft10879, fft10879, 177));
__m512 fft10804 = _mm512_fmadd_ps(fft10795, fft10800, _mm512_shuffle_f32x4(fft10795, fft10795, 177));
__m512 fft10888 = _mm512_fmadd_ps(fft10880, fft10800, _mm512_shuffle_f32x4(fft10880, fft10880, 177));
__m512 fft10805 = _mm512_fmadd_ps(fft10796, fft10800, _mm512_shuffle_f32x4(fft10796, fft10796, 177));
__m512 fft10889 = _mm512_fmadd_ps(fft10881, fft10800, _mm512_shuffle_f32x4(fft10881, fft10881, 177));
__m512 fft10806 = _mm512_fmadd_ps(fft10797, fft10800, _mm512_shuffle_f32x4(fft10797, fft10797, 177));
__m512 fft10890 = _mm512_fmadd_ps(fft10882, fft10800, _mm512_shuffle_f32x4(fft10882, fft10882, 177));
__m512 fft10807 = _mm512_fmadd_ps(fft10798, fft10800, _mm512_shuffle_f32x4(fft10798, fft10798, 177));
__m512 fft10891 = _mm512_fmadd_ps(fft10883, fft10800, _mm512_shuffle_f32x4(fft10883, fft10883, 177));
__m512 fft10808 = _mm512_fmadd_ps(fft10799, fft10800, _mm512_shuffle_f32x4(fft10799, fft10799, 177));
__m512 fft10892 = _mm512_fmadd_ps(fft10884, fft10800, _mm512_shuffle_f32x4(fft10884, fft10884, 177));
__m512 fft10809 = _mm512_mask_mov_ps(fft10801, 49344, fft10802);
__m512 fft10893 = _mm512_mask_mov_ps(fft10885, 49344, fft10886);
__m512 fft10810 = _mm512_mask_sub_ps(fft10802, 49344, _mm512_setzero_ps(), fft10801);
__m512 fft10894 = _mm512_mask_sub_ps(fft10886, 49344, _mm512_setzero_ps(), fft10885);
__m512 fft10811 = _mm512_mask_mov_ps(fft10803, 49344, fft10804);
__m512 fft10895 = _mm512_mask_mov_ps(fft10887, 49344, fft10888);
__m512 fft10812 = _mm512_mask_sub_ps(fft10804, 49344, _mm512_setzero_ps(), fft10803);
__m512 fft10896 = _mm512_mask_sub_ps(fft10888, 49344, _mm512_setzero_ps(), fft10887);
__m512 fft10813 = _mm512_mask_mov_ps(fft10805, 49344, fft10806);
__m512 fft10897 = _mm512_mask_mov_ps(fft10889, 49344, fft10890);
__m512 fft10814 = _mm512_mask_sub_ps(fft10806, 49344, _mm512_setzero_ps(), fft10805);
__m512 fft10898 = _mm512_mask_sub_ps(fft10890, 49344, _mm512_setzero_ps(), fft10889);
__m512 fft10815 = _mm512_mask_mov_ps(fft10807, 49344, fft10808);
__m512 fft10899 = _mm512_mask_mov_ps(fft10891, 49344, fft10892);
__m512 fft10816 = _mm512_mask_sub_ps(fft10808, 49344, _mm512_setzero_ps(), fft10807);
__m512 fft10900 = _mm512_mask_sub_ps(fft10892, 49344, _mm512_setzero_ps(), fft10891);
__m512 fft10817 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10818 = _mm512_fmadd_ps(fft10809, fft10817, _mm512_shuffle_ps(fft10809, fft10809, 78));
__m512 fft10901 = _mm512_fmadd_ps(fft10893, fft10817, _mm512_shuffle_ps(fft10893, fft10893, 78));
__m512 fft10819 = _mm512_fmadd_ps(fft10810, fft10817, _mm512_shuffle_ps(fft10810, fft10810, 78));
__m512 fft10902 = _mm512_fmadd_ps(fft10894, fft10817, _mm512_shuffle_ps(fft10894, fft10894, 78));
__m512 fft10820 = _mm512_fmadd_ps(fft10811, fft10817, _mm512_shuffle_ps(fft10811, fft10811, 78));
__m512 fft10903 = _mm512_fmadd_ps(fft10895, fft10817, _mm512_shuffle_ps(fft10895, fft10895, 78));
__m512 fft10821 = _mm512_fmadd_ps(fft10812, fft10817, _mm512_shuffle_ps(fft10812, fft10812, 78));
__m512 fft10904 = _mm512_fmadd_ps(fft10896, fft10817, _mm512_shuffle_ps(fft10896, fft10896, 78));
__m512 fft10822 = _mm512_fmadd_ps(fft10813, fft10817, _mm512_shuffle_ps(fft10813, fft10813, 78));
__m512 fft10905 = _mm512_fmadd_ps(fft10897, fft10817, _mm512_shuffle_ps(fft10897, fft10897, 78));
__m512 fft10823 = _mm512_fmadd_ps(fft10814, fft10817, _mm512_shuffle_ps(fft10814, fft10814, 78));
__m512 fft10906 = _mm512_fmadd_ps(fft10898, fft10817, _mm512_shuffle_ps(fft10898, fft10898, 78));
__m512 fft10824 = _mm512_fmadd_ps(fft10815, fft10817, _mm512_shuffle_ps(fft10815, fft10815, 78));
__m512 fft10907 = _mm512_fmadd_ps(fft10899, fft10817, _mm512_shuffle_ps(fft10899, fft10899, 78));
__m512 fft10825 = _mm512_fmadd_ps(fft10816, fft10817, _mm512_shuffle_ps(fft10816, fft10816, 78));
__m512 fft10908 = _mm512_fmadd_ps(fft10900, fft10817, _mm512_shuffle_ps(fft10900, fft10900, 78));
__m512i fft10826 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10827 = _mm512_permutexvar_ps(fft10826, fft10818);
__m512 fft10909 = _mm512_permutexvar_ps(fft10826, fft10901);
__m512i fft10828 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10829 = _mm512_permutexvar_ps(fft10828, fft10818);
__m512 fft10910 = _mm512_permutexvar_ps(fft10828, fft10901);
__m512 fft10830 = _mm512_permutexvar_ps(fft10826, fft10819);
__m512 fft10911 = _mm512_permutexvar_ps(fft10826, fft10902);
__m512 fft10831 = _mm512_permutexvar_ps(fft10828, fft10819);
__m512 fft10912 = _mm512_permutexvar_ps(fft10828, fft10902);
__m512 fft10832 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft10833 = _mm512_fmadd_ps(fft10827, fft10832, fft10829);
__m512 fft10913 = _mm512_fmadd_ps(fft10909, fft10832, fft10910);
__m512 fft10834 = _mm512_fnmadd_ps(fft10831, fft10832, fft10830);
__m512 fft10914 = _mm512_fnmadd_ps(fft10912, fft10832, fft10911);
__m512 fft10835 = _mm512_mask_mov_ps(fft10831, 21845, fft10833);
__m512 fft10915 = _mm512_mask_mov_ps(fft10912, 21845, fft10913);
__m512 fft10836 = _mm512_mask_mov_ps(fft10827, 43176, fft10833);
__m512 fft10916 = _mm512_mask_mov_ps(fft10909, 43176, fft10913);
__m512 fft10837 = _mm512_mask_mov_ps(fft10835, 43176, fft10834);
__m512 fft10917 = _mm512_mask_mov_ps(fft10915, 43176, fft10914);
__m512 fft10838 = _mm512_mask_mov_ps(fft10836, 22102, fft10834);
__m512 fft10918 = _mm512_mask_mov_ps(fft10916, 22102, fft10914);
__m512 fft10839 = _mm512_mask_mul_ps(fft10837, 64764, fft10837, _mm512_set1_ps(5e-01f));
__m512 fft10919 = _mm512_mask_mul_ps(fft10917, 64764, fft10917, _mm512_set1_ps(5e-01f));
__m512 fft10840 = _mm512_mask_mul_ps(fft10838, 64764, fft10838, _mm512_set1_ps(5e-01f));
__m512 fft10920 = _mm512_mask_mul_ps(fft10918, 64764, fft10918, _mm512_set1_ps(5e-01f));
__m512 wf209 = fft10839;
__m512 wf217 = fft10919;
__m512 wf210 = fft10840;
__m512 wf218 = fft10920;
__m512 wf211 = fft10820;
__m512 wf219 = fft10903;
__m512 wf212 = fft10821;
__m512 wf220 = fft10904;
__m512 wf213 = fft10822;
__m512 wf221 = fft10905;
__m512 wf214 = fft10823;
__m512 wf222 = fft10906;
__m512 wf215 = fft10824;
__m512 wf223 = fft10907;
__m512 wf216 = fft10825;
__m512 wf224 = fft10908;
ptrdiff_t c68 = (size_t)(1+2*j72)/4;
ptrdiff_t m65 = (size_t)(1+2*j72)%4/2;
ptrdiff_t f70 = (size_t)(1+2*j72)%2;
__m512i eo65 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
wf211 = _mm512_permutexvar_ps(eo65, wf211);
wf212 = _mm512_permutexvar_ps(eo65, wf212);
__m512i wfs73 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf211, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs73 = _mm512_inserti64x4(wfs73, _mm512_cvtps_ph(wf212, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+32768+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs73);
_mm512_mask_storeu_epi32(wfPtr18+4227056+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs73);
wf219 = _mm512_permutexvar_ps(eo65, wf219);
wf220 = _mm512_permutexvar_ps(eo65, wf220);
__m512i wfs74 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf219, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs74 = _mm512_inserti64x4(wfs74, _mm512_cvtps_ph(wf220, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8421376+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs74);
_mm512_mask_storeu_epi32(wfPtr18+12615664+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs74);
wf213 = _mm512_permutexvar_ps(eo65, wf213);
wf214 = _mm512_permutexvar_ps(eo65, wf214);
__m512i wfs75 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf213, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs75 = _mm512_inserti64x4(wfs75, _mm512_cvtps_ph(wf214, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+65536+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs75);
_mm512_mask_storeu_epi32(wfPtr18+4259824+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs75);
wf221 = _mm512_permutexvar_ps(eo65, wf221);
wf222 = _mm512_permutexvar_ps(eo65, wf222);
__m512i wfs76 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf221, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs76 = _mm512_inserti64x4(wfs76, _mm512_cvtps_ph(wf222, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8454144+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs76);
_mm512_mask_storeu_epi32(wfPtr18+12648432+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs76);
wf215 = _mm512_permutexvar_ps(eo65, wf215);
wf216 = _mm512_permutexvar_ps(eo65, wf216);
__m512i wfs77 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf215, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs77 = _mm512_inserti64x4(wfs77, _mm512_cvtps_ph(wf216, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+98304+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs77);
_mm512_mask_storeu_epi32(wfPtr18+4292592+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs77);
wf223 = _mm512_permutexvar_ps(eo65, wf223);
wf224 = _mm512_permutexvar_ps(eo65, wf224);
__m512i wfs78 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf223, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs78 = _mm512_inserti64x4(wfs78, _mm512_cvtps_ph(wf224, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8486912+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs78);
_mm512_mask_storeu_epi32(wfPtr18+12681200+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs78);
__m512i wfs79 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf209, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs79 = _mm512_inserti64x4(wfs79, _mm512_cvtps_ph(wf210, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+0+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs79);
_mm512_mask_storeu_epi32(wfPtr18+4194288+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs79);
__m512i wfs80 = _mm512_castsi256_si512(_mm512_cvtps_ph(wf217, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
wfs80 = _mm512_inserti64x4(wfs80, _mm512_cvtps_ph(wf218, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC), 1);
_mm512_mask_storeu_epi32(wfPtr18+8388608+131072*i81+4096*c68+128*k188+64*m65+16*f70, 3855, wfs80);
_mm512_mask_storeu_epi32(wfPtr18+12582896+131072*i81+4096*c68+128*k188+64*m65+16*f70, 61680, wfs80);
}
__m512 bias8 = _mm512_setzero_ps();
if (!e39) {
bias8 = _mm512_maskz_loadu_ps(3, biasPtr22-0+128*i81+8*j72);
__m512i pmMul48 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i pmAdd48 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
__m512 mas15 = _mm512_maskz_loadu_ps(15, bnPtr23+(ptrdiff_t)8*(0+32*i81+2*j72));
__m512 postMul72 = _mm512_permutexvar_ps(pmMul48, mas15);
__m512 postAdd48 = _mm512_permutexvar_ps(pmAdd48, mas15);
bias8 = _mm512_fmadd_ps(bias8, postMul72, postAdd48);
bias8 = _mm512_mul_ps(bias8, _mm512_set1_ps(6.4e+01f));
}
_mm512_mask_storeu_ps(bfPtr18-0+128*i81+8*j72, 3, bias8);
if (j72 >= jj65) return;
}
}
}

// Hands ResNeXt50StriderArrangeFilts4Callee1 to the threader team.
// The task carries tensors137 unchanged in its any1 slot and declares
// nd1 = 3 with hull1 = {8, 32, 1} (presumably the per-dimension extents of
// the task grid -- confirm against ResNeXt50ThreaderDo1).
static void ResNeXt50StriderArrangeFilts4(ResNeXt50ThreaderTeam1* team82, char** tensors137) {
    ResNeXt50ThreaderTask1 job;

    // Grid description.
    job.nd1 = 3;
    job.hull1[2] = 1;
    job.hull1[1] = 32;
    job.hull1[0] = 8;

    // Work function and its opaque argument.
    job.any1 = tensors137;
    job.callee1 = ResNeXt50StriderArrangeFilts4Callee1;

    ResNeXt50ThreaderDo1(team82, &job);
}

// Task callee used by ResNeXt50StriderArrangeDats4.
//
// task142->any1 is interpreted as a char** of tensor base pointers:
//   tensors140[0] is read from (14 masked row loads per iteration),
//   tensors140[1] is written to (32 full-width vector stores per iteration).
// pt76[2] selects a group g44; the function handles the four consecutive
// i82 values 4*g44 .. 4*g44+3 and, for each of them, k189 = 0 .. 31.
//
// Every (i82, k189) iteration loads a 14-row x 14-lane block, treats it as
// the interior of a 16x16 tile whose outer rows/lanes are zero, pushes it
// through the add/sub/FMA network below (the generator names these values
// "fft"; presumably a 2-D real FFT of the tile -- confirm against the
// generator), and scatters the results into tensors140[1].
static void ResNeXt50StriderArrangeDats4Callee1(ResNeXt50ThreaderTask1* task142, int64_t* pt76) {
char** tensors140 = task142->any1;
// s82, c69 and e40 are fixed at 0 in this instantiation, so every term
// multiplied by them below vanishes.
ptrdiff_t s82 = 0;
ptrdiff_t c69 = 0;
ptrdiff_t g44 = pt76[2];
ptrdiff_t e40 = 0;
// The -60 bias pairs with the +56.. constants in the loads: the first load
// address is tensors140[0]-4, and because lane 0 is masked off the first
// float actually read sits at tensors140[0]+0 (plus the i82/k189 offsets).
char*restrict datPtr44 = tensors140[0]-60+549952*e40;
char*restrict dfPtr18 = tensors140[1]+43319296*e40;
ptrdiff_t i82 = 4*g44;
ptrdiff_t ii61 = i82+3;
for (; i82 <= ii61; ++i82) {
// j73, h55 and w76 all evaluate to 0 here; rel27 is not used afterwards.
ptrdiff_t j73 = 1*c69;
ptrdiff_t rel27 = j73-0;
ptrdiff_t base27 = 0;
ptrdiff_t h55 = base27+0;
ptrdiff_t w76 = 0;
ptrdiff_t k189 = 32*s82;
ptrdiff_t kk68 = k189+31;
for (; k189 <= kk68; ++k189) {
// b89 is 0, hence m66 and f71 are 0 and drop out of the store addresses.
ptrdiff_t b89 = 0;
ptrdiff_t m66 = (size_t)b89/2;
ptrdiff_t f71 = (size_t)b89%2;
// Load 14 rows spaced 56 bytes (14 floats) apart. Mask 32766 (0x7FFE)
// keeps lanes 1..14 and zeroes lanes 0 and 15.
__m512 dat2638 = _mm512_maskz_loadu_ps(32766, datPtr44+56+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2639 = _mm512_maskz_loadu_ps(32766, datPtr44+112+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2640 = _mm512_maskz_loadu_ps(32766, datPtr44+168+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2641 = _mm512_maskz_loadu_ps(32766, datPtr44+224+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2642 = _mm512_maskz_loadu_ps(32766, datPtr44+280+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2643 = _mm512_maskz_loadu_ps(32766, datPtr44+336+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2644 = _mm512_maskz_loadu_ps(32766, datPtr44+392+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2645 = _mm512_maskz_loadu_ps(32766, datPtr44+448+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2646 = _mm512_maskz_loadu_ps(32766, datPtr44+504+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2647 = _mm512_maskz_loadu_ps(32766, datPtr44+560+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2648 = _mm512_maskz_loadu_ps(32766, datPtr44+616+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2649 = _mm512_maskz_loadu_ps(32766, datPtr44+672+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2650 = _mm512_maskz_loadu_ps(32766, datPtr44+728+26624*i82+832*k189+56*h55+4*w76+0*b89);
__m512 dat2651 = _mm512_maskz_loadu_ps(32766, datPtr44+784+26624*i82+832*k189+56*h55+4*w76+0*b89);
// Row-direction stage 1: sum and difference of rows that are 8 apart in a
// 16-row tile. The setzero operands stand in for the two rows that are not
// loaded (the row before dat2638 and the row after dat2651). Two
// independent chains are interleaved from here on (fft109xx and fft110xx).
__m512 fft10921 = _mm512_add_ps(_mm512_setzero_ps(), dat2645);
__m512 fft11009 = _mm512_add_ps(dat2638, dat2646);
__m512 fft10922 = _mm512_sub_ps(_mm512_setzero_ps(), dat2645);
__m512 fft11010 = _mm512_sub_ps(dat2638, dat2646);
__m512 fft10923 = _mm512_add_ps(dat2639, dat2647);
__m512 fft11011 = _mm512_add_ps(dat2640, dat2648);
__m512 fft10924 = _mm512_sub_ps(dat2639, dat2647);
__m512 fft11012 = _mm512_sub_ps(dat2640, dat2648);
__m512 fft10925 = _mm512_add_ps(dat2641, dat2649);
__m512 fft11013 = _mm512_add_ps(dat2642, dat2650);
__m512 fft10926 = _mm512_sub_ps(dat2641, dat2649);
__m512 fft11014 = _mm512_sub_ps(dat2642, dat2650);
__m512 fft10927 = _mm512_add_ps(dat2643, dat2651);
__m512 fft11015 = _mm512_add_ps(dat2644, _mm512_setzero_ps());
__m512 fft10928 = _mm512_sub_ps(dat2643, dat2651);
__m512 fft11016 = _mm512_sub_ps(dat2644, _mm512_setzero_ps());
// Row-direction stage 2: combine the stage-1 sums/differences pairwise.
__m512 fft10929 = _mm512_add_ps(fft10921, fft10925);
__m512 fft11017 = _mm512_add_ps(fft11009, fft11013);
__m512 fft10930 = _mm512_sub_ps(fft10921, fft10925);
__m512 fft11018 = _mm512_sub_ps(fft11009, fft11013);
__m512 fft10931 = _mm512_add_ps(fft10923, fft10927);
__m512 fft11019 = _mm512_add_ps(fft11011, fft11015);
__m512 fft10932 = _mm512_sub_ps(fft10927, fft10923);
__m512 fft11020 = _mm512_sub_ps(fft11015, fft11011);
__m512 fft10933 = _mm512_sub_ps(fft10924, fft10928);
__m512 fft11021 = _mm512_sub_ps(fft11012, fft11016);
__m512 fft10934 = _mm512_add_ps(fft10924, fft10928);
__m512 fft11022 = _mm512_add_ps(fft11012, fft11016);
// Row-direction stage 3: final sums/differences plus the terms scaled by
// 7.0710677e-01f (approximately 1/sqrt(2)).
__m512 fft10935 = _mm512_add_ps(fft10929, fft10931);
__m512 fft11023 = _mm512_add_ps(fft11017, fft11019);
__m512 fft10936 = _mm512_sub_ps(fft10929, fft10931);
__m512 fft11024 = _mm512_sub_ps(fft11017, fft11019);
__m512 fft10937 = _mm512_fmadd_ps(fft10933, _mm512_set1_ps(7.0710677e-01f), fft10922);
__m512 fft11025 = _mm512_fmadd_ps(fft11021, _mm512_set1_ps(7.0710677e-01f), fft11010);
__m512 fft10938 = _mm512_fnmsub_ps(fft10934, _mm512_set1_ps(7.0710677e-01f), fft10926);
__m512 fft11026 = _mm512_fnmsub_ps(fft11022, _mm512_set1_ps(7.0710677e-01f), fft11014);
__m512 fft10939 = _mm512_fnmadd_ps(fft10933, _mm512_set1_ps(7.0710677e-01f), fft10922);
__m512 fft11027 = _mm512_fnmadd_ps(fft11021, _mm512_set1_ps(7.0710677e-01f), fft11010);
__m512 fft10940 = _mm512_fnmadd_ps(fft10934, _mm512_set1_ps(7.0710677e-01f), fft10926);
__m512 fft11028 = _mm512_fnmadd_ps(fft11022, _mm512_set1_ps(7.0710677e-01f), fft11014);
// Lane-direction stage 1: shuffle_f32x4 with imm 78 swaps the two 256-bit
// halves; with signs +1 (lanes 0..7) / -1 (lanes 8..15) the result holds
// lo+hi in the low half and lo-hi in the high half.
__m512 fft10941 = _mm512_set_ps(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10942 = _mm512_fmadd_ps(fft10935, fft10941, _mm512_shuffle_f32x4(fft10935, fft10935, 78));
__m512 fft11029 = _mm512_fmadd_ps(fft11023, fft10941, _mm512_shuffle_f32x4(fft11023, fft11023, 78));
__m512 fft10943 = _mm512_fmadd_ps(fft10936, fft10941, _mm512_shuffle_f32x4(fft10936, fft10936, 78));
__m512 fft11030 = _mm512_fmadd_ps(fft11024, fft10941, _mm512_shuffle_f32x4(fft11024, fft11024, 78));
__m512 fft10944 = _mm512_fmadd_ps(fft10937, fft10941, _mm512_shuffle_f32x4(fft10937, fft10937, 78));
__m512 fft11031 = _mm512_fmadd_ps(fft11025, fft10941, _mm512_shuffle_f32x4(fft11025, fft11025, 78));
__m512 fft10945 = _mm512_fmadd_ps(fft10938, fft10941, _mm512_shuffle_f32x4(fft10938, fft10938, 78));
__m512 fft11032 = _mm512_fmadd_ps(fft11026, fft10941, _mm512_shuffle_f32x4(fft11026, fft11026, 78));
__m512 fft10946 = _mm512_fmadd_ps(fft10930, fft10941, _mm512_shuffle_f32x4(fft10930, fft10930, 78));
__m512 fft11033 = _mm512_fmadd_ps(fft11018, fft10941, _mm512_shuffle_f32x4(fft11018, fft11018, 78));
__m512 fft10947 = _mm512_fmadd_ps(fft10932, fft10941, _mm512_shuffle_f32x4(fft10932, fft10932, 78));
__m512 fft11034 = _mm512_fmadd_ps(fft11020, fft10941, _mm512_shuffle_f32x4(fft11020, fft11020, 78));
__m512 fft10948 = _mm512_fmadd_ps(fft10939, fft10941, _mm512_shuffle_f32x4(fft10939, fft10939, 78));
__m512 fft11035 = _mm512_fmadd_ps(fft11027, fft10941, _mm512_shuffle_f32x4(fft11027, fft11027, 78));
__m512 fft10949 = _mm512_fmadd_ps(fft10940, fft10941, _mm512_shuffle_f32x4(fft10940, fft10940, 78));
__m512 fft11036 = _mm512_fmadd_ps(fft11028, fft10941, _mm512_shuffle_f32x4(fft11028, fft11028, 78));
// Per-lane weighting of each vector pair (a, b): a' = a*fft10950 + b*fft10959
// and b' = b*fft10950 - a*fft10959. Lanes 0..9 have weights (1, 0) and pass
// through unchanged; lanes 10..15 mix the two vectors of the pair
// (presumably twiddle-factor rotations -- confirm against the generator).
__m512 fft10950 = _mm512_set_ps(-7.0710677e-01f, -7.0710677e-01f, 0, 0, 7.0710677e-01f, 7.0710677e-01f, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
__m512 fft10951 = _mm512_mul_ps(fft10942, fft10950);
__m512 fft11037 = _mm512_mul_ps(fft11029, fft10950);
__m512 fft10952 = _mm512_mul_ps(fft10943, fft10950);
__m512 fft11038 = _mm512_mul_ps(fft11030, fft10950);
__m512 fft10953 = _mm512_mul_ps(fft10944, fft10950);
__m512 fft11039 = _mm512_mul_ps(fft11031, fft10950);
__m512 fft10954 = _mm512_mul_ps(fft10945, fft10950);
__m512 fft11040 = _mm512_mul_ps(fft11032, fft10950);
__m512 fft10955 = _mm512_mul_ps(fft10946, fft10950);
__m512 fft11041 = _mm512_mul_ps(fft11033, fft10950);
__m512 fft10956 = _mm512_mul_ps(fft10947, fft10950);
__m512 fft11042 = _mm512_mul_ps(fft11034, fft10950);
__m512 fft10957 = _mm512_mul_ps(fft10948, fft10950);
__m512 fft11043 = _mm512_mul_ps(fft11035, fft10950);
__m512 fft10958 = _mm512_mul_ps(fft10949, fft10950);
__m512 fft11044 = _mm512_mul_ps(fft11036, fft10950);
__m512 fft10959 = _mm512_set_ps(7.0710677e-01f, 7.0710677e-01f, 1, 1, 7.0710677e-01f, 7.0710677e-01f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 fft10960 = _mm512_fmadd_ps(fft10943, fft10959, fft10951);
__m512 fft11045 = _mm512_fmadd_ps(fft11030, fft10959, fft11037);
__m512 fft10961 = _mm512_fnmadd_ps(fft10942, fft10959, fft10952);
__m512 fft11046 = _mm512_fnmadd_ps(fft11029, fft10959, fft11038);
__m512 fft10962 = _mm512_fmadd_ps(fft10945, fft10959, fft10953);
__m512 fft11047 = _mm512_fmadd_ps(fft11032, fft10959, fft11039);
__m512 fft10963 = _mm512_fnmadd_ps(fft10944, fft10959, fft10954);
__m512 fft11048 = _mm512_fnmadd_ps(fft11031, fft10959, fft11040);
__m512 fft10964 = _mm512_fmadd_ps(fft10947, fft10959, fft10955);
__m512 fft11049 = _mm512_fmadd_ps(fft11034, fft10959, fft11041);
__m512 fft10965 = _mm512_fnmadd_ps(fft10946, fft10959, fft10956);
__m512 fft11050 = _mm512_fnmadd_ps(fft11033, fft10959, fft11042);
__m512 fft10966 = _mm512_fmadd_ps(fft10949, fft10959, fft10957);
__m512 fft11051 = _mm512_fmadd_ps(fft11036, fft10959, fft11043);
__m512 fft10967 = _mm512_fnmadd_ps(fft10948, fft10959, fft10958);
__m512 fft11052 = _mm512_fnmadd_ps(fft11035, fft10959, fft11044);
// Lane-direction stage 2: shuffle_f32x4 with imm 177 swaps adjacent 128-bit
// lanes; the +1/-1 pattern yields sum and difference within each 256-bit half.
__m512 fft10968 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 fft10969 = _mm512_fmadd_ps(fft10960, fft10968, _mm512_shuffle_f32x4(fft10960, fft10960, 177));
__m512 fft11053 = _mm512_fmadd_ps(fft11045, fft10968, _mm512_shuffle_f32x4(fft11045, fft11045, 177));
__m512 fft10970 = _mm512_fmadd_ps(fft10961, fft10968, _mm512_shuffle_f32x4(fft10961, fft10961, 177));
__m512 fft11054 = _mm512_fmadd_ps(fft11046, fft10968, _mm512_shuffle_f32x4(fft11046, fft11046, 177));
__m512 fft10971 = _mm512_fmadd_ps(fft10962, fft10968, _mm512_shuffle_f32x4(fft10962, fft10962, 177));
__m512 fft11055 = _mm512_fmadd_ps(fft11047, fft10968, _mm512_shuffle_f32x4(fft11047, fft11047, 177));
__m512 fft10972 = _mm512_fmadd_ps(fft10963, fft10968, _mm512_shuffle_f32x4(fft10963, fft10963, 177));
__m512 fft11056 = _mm512_fmadd_ps(fft11048, fft10968, _mm512_shuffle_f32x4(fft11048, fft11048, 177));
__m512 fft10973 = _mm512_fmadd_ps(fft10964, fft10968, _mm512_shuffle_f32x4(fft10964, fft10964, 177));
__m512 fft11057 = _mm512_fmadd_ps(fft11049, fft10968, _mm512_shuffle_f32x4(fft11049, fft11049, 177));
__m512 fft10974 = _mm512_fmadd_ps(fft10965, fft10968, _mm512_shuffle_f32x4(fft10965, fft10965, 177));
__m512 fft11058 = _mm512_fmadd_ps(fft11050, fft10968, _mm512_shuffle_f32x4(fft11050, fft11050, 177));
__m512 fft10975 = _mm512_fmadd_ps(fft10966, fft10968, _mm512_shuffle_f32x4(fft10966, fft10966, 177));
__m512 fft11059 = _mm512_fmadd_ps(fft11051, fft10968, _mm512_shuffle_f32x4(fft11051, fft11051, 177));
__m512 fft10976 = _mm512_fmadd_ps(fft10967, fft10968, _mm512_shuffle_f32x4(fft10967, fft10967, 177));
__m512 fft11060 = _mm512_fmadd_ps(fft11052, fft10968, _mm512_shuffle_f32x4(fft11052, fft11052, 177));
// Cross-exchange within each pair on the lanes selected by 49344 (0xC0C0,
// lanes 6,7,14,15): the first vector takes the second's values there, and
// the second takes the negated first's values.
__m512 fft10977 = _mm512_mask_mov_ps(fft10969, 49344, fft10970);
__m512 fft11061 = _mm512_mask_mov_ps(fft11053, 49344, fft11054);
__m512 fft10978 = _mm512_mask_sub_ps(fft10970, 49344, _mm512_setzero_ps(), fft10969);
__m512 fft11062 = _mm512_mask_sub_ps(fft11054, 49344, _mm512_setzero_ps(), fft11053);
__m512 fft10979 = _mm512_mask_mov_ps(fft10971, 49344, fft10972);
__m512 fft11063 = _mm512_mask_mov_ps(fft11055, 49344, fft11056);
__m512 fft10980 = _mm512_mask_sub_ps(fft10972, 49344, _mm512_setzero_ps(), fft10971);
__m512 fft11064 = _mm512_mask_sub_ps(fft11056, 49344, _mm512_setzero_ps(), fft11055);
__m512 fft10981 = _mm512_mask_mov_ps(fft10973, 49344, fft10974);
__m512 fft11065 = _mm512_mask_mov_ps(fft11057, 49344, fft11058);
__m512 fft10982 = _mm512_mask_sub_ps(fft10974, 49344, _mm512_setzero_ps(), fft10973);
__m512 fft11066 = _mm512_mask_sub_ps(fft11058, 49344, _mm512_setzero_ps(), fft11057);
__m512 fft10983 = _mm512_mask_mov_ps(fft10975, 49344, fft10976);
__m512 fft11067 = _mm512_mask_mov_ps(fft11059, 49344, fft11060);
__m512 fft10984 = _mm512_mask_sub_ps(fft10976, 49344, _mm512_setzero_ps(), fft10975);
__m512 fft11068 = _mm512_mask_sub_ps(fft11060, 49344, _mm512_setzero_ps(), fft11059);
// Lane-direction stage 3: shuffle_ps with imm 78 swaps the two 64-bit pairs
// inside every 128-bit lane; the +1/-1 pattern yields sum and difference.
__m512 fft10985 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 fft10986 = _mm512_fmadd_ps(fft10977, fft10985, _mm512_shuffle_ps(fft10977, fft10977, 78));
__m512 fft11069 = _mm512_fmadd_ps(fft11061, fft10985, _mm512_shuffle_ps(fft11061, fft11061, 78));
__m512 fft10987 = _mm512_fmadd_ps(fft10978, fft10985, _mm512_shuffle_ps(fft10978, fft10978, 78));
__m512 fft11070 = _mm512_fmadd_ps(fft11062, fft10985, _mm512_shuffle_ps(fft11062, fft11062, 78));
__m512 fft10988 = _mm512_fmadd_ps(fft10979, fft10985, _mm512_shuffle_ps(fft10979, fft10979, 78));
__m512 fft11071 = _mm512_fmadd_ps(fft11063, fft10985, _mm512_shuffle_ps(fft11063, fft11063, 78));
__m512 fft10989 = _mm512_fmadd_ps(fft10980, fft10985, _mm512_shuffle_ps(fft10980, fft10980, 78));
__m512 fft11072 = _mm512_fmadd_ps(fft11064, fft10985, _mm512_shuffle_ps(fft11064, fft11064, 78));
__m512 fft10990 = _mm512_fmadd_ps(fft10981, fft10985, _mm512_shuffle_ps(fft10981, fft10981, 78));
__m512 fft11073 = _mm512_fmadd_ps(fft11065, fft10985, _mm512_shuffle_ps(fft11065, fft11065, 78));
__m512 fft10991 = _mm512_fmadd_ps(fft10982, fft10985, _mm512_shuffle_ps(fft10982, fft10982, 78));
__m512 fft11074 = _mm512_fmadd_ps(fft11066, fft10985, _mm512_shuffle_ps(fft11066, fft11066, 78));
__m512 fft10992 = _mm512_fmadd_ps(fft10983, fft10985, _mm512_shuffle_ps(fft10983, fft10983, 78));
__m512 fft11075 = _mm512_fmadd_ps(fft11067, fft10985, _mm512_shuffle_ps(fft11067, fft11067, 78));
__m512 fft10993 = _mm512_fmadd_ps(fft10984, fft10985, _mm512_shuffle_ps(fft10984, fft10984, 78));
__m512 fft11076 = _mm512_fmadd_ps(fft11068, fft10985, _mm512_shuffle_ps(fft11068, fft11068, 78));
// Extra recombination applied only to the first vector pair of each chain
// (fft10986/fft10987 and fft11069/fft11070): two lane permutations, a
// signed combine, masked merges, and finally a 0.5 scale on the lanes
// selected by 64764 (0xFCFC, i.e. every lane except 0, 1, 8 and 9).
__m512i fft10994 = _mm512_set_epi32(13, 13, 9, 9, 5, 5, 3, 3, 12, 12, 8, 8, 4, 4, 2, 2);
__m512 fft10995 = _mm512_permutexvar_ps(fft10994, fft10986);
__m512 fft11077 = _mm512_permutexvar_ps(fft10994, fft11069);
__m512i fft10996 = _mm512_set_epi32(11, 11, 15, 15, 7, 7, 1, 1, 10, 10, 14, 14, 6, 6, 0, 0);
__m512 fft10997 = _mm512_permutexvar_ps(fft10996, fft10986);
__m512 fft11078 = _mm512_permutexvar_ps(fft10996, fft11069);
__m512 fft10998 = _mm512_permutexvar_ps(fft10994, fft10987);
__m512 fft11079 = _mm512_permutexvar_ps(fft10994, fft11070);
__m512 fft10999 = _mm512_permutexvar_ps(fft10996, fft10987);
__m512 fft11080 = _mm512_permutexvar_ps(fft10996, fft11070);
__m512 fft11000 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, 0, 0, -1, 1, -1, 1, -1, 1, 0, 0);
__m512 fft11001 = _mm512_fmadd_ps(fft10995, fft11000, fft10997);
__m512 fft11081 = _mm512_fmadd_ps(fft11077, fft11000, fft11078);
__m512 fft11002 = _mm512_fnmadd_ps(fft10999, fft11000, fft10998);
__m512 fft11082 = _mm512_fnmadd_ps(fft11080, fft11000, fft11079);
// Merge masks: 21845 = 0x5555 (even lanes), 43176 = 0xA8A8, 22102 = 0x5656.
__m512 fft11003 = _mm512_mask_mov_ps(fft10999, 21845, fft11001);
__m512 fft11083 = _mm512_mask_mov_ps(fft11080, 21845, fft11081);
__m512 fft11004 = _mm512_mask_mov_ps(fft10995, 43176, fft11001);
__m512 fft11084 = _mm512_mask_mov_ps(fft11077, 43176, fft11081);
__m512 fft11005 = _mm512_mask_mov_ps(fft11003, 43176, fft11002);
__m512 fft11085 = _mm512_mask_mov_ps(fft11083, 43176, fft11082);
__m512 fft11006 = _mm512_mask_mov_ps(fft11004, 22102, fft11002);
__m512 fft11086 = _mm512_mask_mov_ps(fft11084, 22102, fft11082);
__m512 fft11007 = _mm512_mask_mul_ps(fft11005, 64764, fft11005, _mm512_set1_ps(5e-01f));
__m512 fft11087 = _mm512_mask_mul_ps(fft11085, 64764, fft11085, _mm512_set1_ps(5e-01f));
__m512 fft11008 = _mm512_mask_mul_ps(fft11006, 64764, fft11006, _mm512_set1_ps(5e-01f));
__m512 fft11088 = _mm512_mask_mul_ps(fft11086, 64764, fft11086, _mm512_set1_ps(5e-01f));
// Rename the 16 result vectors for the store phase. df949/df950 and
// df957/df958 are the specially recombined pairs from the step above.
__m512 df949 = fft11007;
__m512 df957 = fft11087;
__m512 df950 = fft11008;
__m512 df958 = fft11088;
__m512 df951 = fft10988;
__m512 df959 = fft11071;
__m512 df952 = fft10989;
__m512 df960 = fft11072;
__m512 df953 = fft10990;
__m512 df961 = fft11073;
__m512 df954 = fft10991;
__m512 df962 = fft11074;
__m512 df955 = fft10992;
__m512 df963 = fft11075;
__m512 df956 = fft10993;
__m512 df964 = fft11076;
// Store phase. eo66 gathers the even-indexed lanes into the low 256 bits and
// the odd-indexed lanes into the high 256 bits. shuffle_f32x4 with imm 68
// then duplicates the low 256 bits across the whole vector, and imm 238
// duplicates the high 256 bits; the two duplicates of a vector are stored
// 524288 bytes apart. Mask 65535 writes all 16 lanes. The 12288*j73,
// 128*m66 and 32*f71 address terms are all zero in this instantiation.
__m512i eo66 = _mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
df951 = _mm512_permutexvar_ps(eo66, df951);
df952 = _mm512_permutexvar_ps(eo66, df952);
__m512 rep33 = _mm512_shuffle_f32x4(df951, df951, 68);
_mm512_mask_storeu_ps(dfPtr18+4096+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep33);
__m512 rep34 = _mm512_shuffle_f32x4(df952, df952, 68);
_mm512_mask_storeu_ps(dfPtr18+4160+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep34);
__m512 rep35 = _mm512_shuffle_f32x4(df951, df951, 238);
_mm512_mask_storeu_ps(dfPtr18+528384+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep35);
__m512 rep36 = _mm512_shuffle_f32x4(df952, df952, 238);
_mm512_mask_storeu_ps(dfPtr18+528448+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep36);
df959 = _mm512_permutexvar_ps(eo66, df959);
df960 = _mm512_permutexvar_ps(eo66, df960);
__m512 rep37 = _mm512_shuffle_f32x4(df959, df959, 68);
_mm512_mask_storeu_ps(dfPtr18+1052672+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep37);
__m512 rep38 = _mm512_shuffle_f32x4(df960, df960, 68);
_mm512_mask_storeu_ps(dfPtr18+1052736+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep38);
__m512 rep39 = _mm512_shuffle_f32x4(df959, df959, 238);
_mm512_mask_storeu_ps(dfPtr18+1576960+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep39);
__m512 rep40 = _mm512_shuffle_f32x4(df960, df960, 238);
_mm512_mask_storeu_ps(dfPtr18+1577024+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep40);
df953 = _mm512_permutexvar_ps(eo66, df953);
df954 = _mm512_permutexvar_ps(eo66, df954);
__m512 rep41 = _mm512_shuffle_f32x4(df953, df953, 68);
_mm512_mask_storeu_ps(dfPtr18+8192+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep41);
__m512 rep42 = _mm512_shuffle_f32x4(df954, df954, 68);
_mm512_mask_storeu_ps(dfPtr18+8256+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep42);
__m512 rep43 = _mm512_shuffle_f32x4(df953, df953, 238);
_mm512_mask_storeu_ps(dfPtr18+532480+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep43);
__m512 rep44 = _mm512_shuffle_f32x4(df954, df954, 238);
_mm512_mask_storeu_ps(dfPtr18+532544+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep44);
df961 = _mm512_permutexvar_ps(eo66, df961);
df962 = _mm512_permutexvar_ps(eo66, df962);
__m512 rep45 = _mm512_shuffle_f32x4(df961, df961, 68);
_mm512_mask_storeu_ps(dfPtr18+1056768+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep45);
__m512 rep46 = _mm512_shuffle_f32x4(df962, df962, 68);
_mm512_mask_storeu_ps(dfPtr18+1056832+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep46);
__m512 rep47 = _mm512_shuffle_f32x4(df961, df961, 238);
_mm512_mask_storeu_ps(dfPtr18+1581056+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep47);
__m512 rep48 = _mm512_shuffle_f32x4(df962, df962, 238);
_mm512_mask_storeu_ps(dfPtr18+1581120+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep48);
df955 = _mm512_permutexvar_ps(eo66, df955);
df956 = _mm512_permutexvar_ps(eo66, df956);
__m512 rep49 = _mm512_shuffle_f32x4(df955, df955, 68);
_mm512_mask_storeu_ps(dfPtr18+12288+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep49);
__m512 rep50 = _mm512_shuffle_f32x4(df956, df956, 68);
_mm512_mask_storeu_ps(dfPtr18+12352+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep50);
__m512 rep51 = _mm512_shuffle_f32x4(df955, df955, 238);
_mm512_mask_storeu_ps(dfPtr18+536576+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep51);
__m512 rep52 = _mm512_shuffle_f32x4(df956, df956, 238);
_mm512_mask_storeu_ps(dfPtr18+536640+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep52);
df963 = _mm512_permutexvar_ps(eo66, df963);
df964 = _mm512_permutexvar_ps(eo66, df964);
__m512 rep53 = _mm512_shuffle_f32x4(df963, df963, 68);
_mm512_mask_storeu_ps(dfPtr18+1060864+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep53);
__m512 rep54 = _mm512_shuffle_f32x4(df964, df964, 68);
_mm512_mask_storeu_ps(dfPtr18+1060928+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep54);
__m512 rep55 = _mm512_shuffle_f32x4(df963, df963, 238);
_mm512_mask_storeu_ps(dfPtr18+1585152+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep55);
__m512 rep56 = _mm512_shuffle_f32x4(df964, df964, 238);
_mm512_mask_storeu_ps(dfPtr18+1585216+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep56);
// The four specially recombined vectors are stored without the eo66
// even/odd permutation, only with the 256-bit duplication.
__m512 rep57 = _mm512_shuffle_f32x4(df949, df949, 68);
_mm512_mask_storeu_ps(dfPtr18+0+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep57);
__m512 rep58 = _mm512_shuffle_f32x4(df950, df950, 68);
_mm512_mask_storeu_ps(dfPtr18+64+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep58);
__m512 rep59 = _mm512_shuffle_f32x4(df949, df949, 238);
_mm512_mask_storeu_ps(dfPtr18+524288+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep59);
__m512 rep60 = _mm512_shuffle_f32x4(df950, df950, 238);
_mm512_mask_storeu_ps(dfPtr18+524352+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep60);
__m512 rep61 = _mm512_shuffle_f32x4(df957, df957, 68);
_mm512_mask_storeu_ps(dfPtr18+1048576+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep61);
__m512 rep62 = _mm512_shuffle_f32x4(df958, df958, 68);
_mm512_mask_storeu_ps(dfPtr18+1048640+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep62);
__m512 rep63 = _mm512_shuffle_f32x4(df957, df957, 238);
_mm512_mask_storeu_ps(dfPtr18+1572864+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep63);
__m512 rep64 = _mm512_shuffle_f32x4(df958, df958, 238);
_mm512_mask_storeu_ps(dfPtr18+1572928+16384*i82+12288*j73+128*k189+128*m66+32*f71, 65535, rep64);
}
// No observable effect: j73 is re-declared at the top of every i82 iteration.
++j73;
}
}

// Runs ResNeXt50StriderArrangeDats4Callee1 on the given thread team over a
// 4-dimensional task hull with extents 1 x 1 x 8 x 1.
//   team83     - thread team forwarded to ResNeXt50ThreaderDo1.
//   tensors139 - tensor pointer array, passed to the callee through any1.
static void ResNeXt50StriderArrangeDats4(ResNeXt50ThreaderTeam1* team83, char** tensors139) {
	static const int extents[4] = {1, 1, 8, 1};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 4;
	for (int dim = 0; dim < 4; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors139;
	job.callee1 = ResNeXt50StriderArrangeDats4Callee1;
	ResNeXt50ThreaderDo1(team83, &job);
}

// Worker callee dispatched by ResNeXt50StriderProduceSums4.
//
// task144->any1 is the 3-slot tuple built by that caller: slot 0 is the
// tensor pointer array, slot 2 is the caller's z index (0..3). Slot 1 is not
// read here. pt77[2] and pt77[3] are presumably this invocation's coordinates
// inside the task hull (extents 4 and 32 in the caller) -- confirm against
// ResNeXt50ThreaderDo1.
//
// For one (i, j) pair and each of 8 values of l, the function runs a 32-step
// multiply-accumulate of half-precision weights (widened to float) against
// float data and produces four 16-lane vectors (two Re/Im pairs) in the sums
// buffer. With the Re/Im pairs read as complex numbers the update is
//   Re += wRe*dRe + wIm*dIm,   Im += wRe*dIm - wIm*dRe.
// When z == 0 the results overwrite the sums buffer; for z != 0 they are added
// to the values already stored there.
static void ResNeXt50StriderProduceSums4Callee1(ResNeXt50ThreaderTask1* task144, int64_t* pt77) {
void** tuple8 = task144->any1;
char** tensors142 = tuple8[0];
// e41 is fixed at 0 (the caller's outer loop runs exactly once).
ptrdiff_t e41 = 0;
ptrdiff_t z8 = (ptrdiff_t)tuple8[2];
ptrdiff_t g45 = pt77[3];
ptrdiff_t p4 = pt77[2];
ptrdiff_t d28 = 0;
ptrdiff_t w77 = 0;
// ---- z == 0: first pass, accumulators are stored without reading sfPtr ----
if (__builtin_expect(!(e41|z8), 0)) {
z8 = 0;
// tensors142[0]: 4096 bytes read through bfPtr19 (presumably biases),
// followed by the half-precision weights. tensors142[1]: float data.
// tensors142[2]: sums buffer (output).
char*restrict bfPtr19 = tensors142[0]+4096*e41;
char*restrict wfPtr19 = tensors142[0]+4096+346554368*e41+4194304*z8;
char*restrict dfPtr19 = tensors142[1]+43319296*e41+524288*z8;
char*restrict sfPtr18 = tensors142[2];
ptrdiff_t i83 = 1*g45;
ptrdiff_t j74 = 1*p4;
ptrdiff_t jj66 = j74+0;
// j == 0 slice: uses the masked kernel and seeds the accumulators from bfPtr19.
if (__builtin_expect(!j74, 0)) {
ptrdiff_t k190 = 1*d28;
ptrdiff_t l80 = 8*w77;
for (; l80 != 8; ++l80) {
__m512 sfRe509 = _mm512_setzero_ps();
__m512 sfIm509 = _mm512_setzero_ps();
__m512 sfRe510 = _mm512_setzero_ps();
__m512 sfIm510 = _mm512_setzero_ps();
// Place four consecutive floats from bfPtr19 into lane 0 (mask 1) and
// lane 8 (mask 256) of the two Re accumulators; all other lanes stay 0.
sfRe509 = _mm512_mask_mov_ps(sfRe509, 1, _mm512_set1_ps(*(float*)(bfPtr19+0+128*i83+16*l80)));
sfRe509 = _mm512_mask_mov_ps(sfRe509, 256, _mm512_set1_ps(*(float*)(bfPtr19+4+128*i83+16*l80)));
sfRe510 = _mm512_mask_mov_ps(sfRe510, 1, _mm512_set1_ps(*(float*)(bfPtr19+8+128*i83+16*l80)));
sfRe510 = _mm512_mask_mov_ps(sfRe510, 256, _mm512_set1_ps(*(float*)(bfPtr19+12+128*i83+16*l80)));
for (ptrdiff_t s83 = 0; s83 < 32; ++s83) {
// Each 64-byte weight load holds 16 half-floats for Re (low 256 bits)
// and 16 for Im (high 256 bits); both are widened to float.
__m512i wfLd41 = _mm512_loadu_si512(wfPtr19+0+131072*i83+32768*j74+4096*l80+128*s83);
__m512 wfRe41 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd41));
__m512 wfIm41 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd41, 1));
// Mask 64764 (0xFCFC) selects every lane except 0, 1, 8 and 9.
// wfMx takes wfRe in the selected lanes and wfIm in lanes 0, 1, 8, 9.
__m512 wfMx21 = _mm512_mask_mov_ps(wfIm41, 64764, wfRe41);
__m512i wfLd42 = _mm512_loadu_si512(wfPtr19+64+131072*i83+32768*j74+4096*l80+128*s83);
__m512 wfRe42 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd42));
__m512 wfIm42 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd42, 1));
__m512 wfMx22 = _mm512_mask_mov_ps(wfIm42, 64764, wfRe42);
__m512 dfRe45 = _mm512_loadu_ps(dfPtr19+0+16384*i83+4096*j74+12288*k190+128*s83);
__m512 dfIm45 = _mm512_loadu_ps(dfPtr19+64+16384*i83+4096*j74+12288*k190+128*s83);
// Selected lanes: Re += wRe*dRe + wIm*dIm, Im += wRe*dIm - wIm*dRe.
// Lanes 0, 1, 8, 9: Re += wRe*dRe and Im += wIm*dIm only
// (two independent real products instead of a complex one).
sfRe509 = _mm512_fmadd_ps(wfRe41, dfRe45, sfRe509);
sfRe509 = _mm512_mask3_fmadd_ps(wfIm41, dfIm45, sfRe509, 64764);
sfIm509 = _mm512_fmadd_ps(wfMx21, dfIm45, sfIm509);
sfIm509 = _mm512_mask3_fnmadd_ps(wfIm41, dfRe45, sfIm509, 64764);
sfRe510 = _mm512_fmadd_ps(wfRe42, dfRe45, sfRe510);
sfRe510 = _mm512_mask3_fmadd_ps(wfIm42, dfIm45, sfRe510, 64764);
sfIm510 = _mm512_fmadd_ps(wfMx22, dfIm45, sfIm510);
sfIm510 = _mm512_mask3_fnmadd_ps(wfIm42, dfRe45, sfIm510, 64764);
}
// First pass: plain store of the four result vectors (256 bytes per l).
_mm512_storeu_ps(sfPtr18+0+8192*i83+2048*j74+12288*k190+256*l80, sfRe509);
_mm512_storeu_ps(sfPtr18+64+8192*i83+2048*j74+12288*k190+256*l80, sfIm509);
_mm512_storeu_ps(sfPtr18+128+8192*i83+2048*j74+12288*k190+256*l80, sfRe510);
_mm512_storeu_ps(sfPtr18+192+8192*i83+2048*j74+12288*k190+256*l80, sfIm510);
}
// j == 0 is done; since jj66 is 0 here, the loop below is skipped.
j74 = 1;
}
// j != 0 slices: unmasked kernel on all 16 lanes, no seeding from bfPtr19.
for (; j74 <= jj66; ++j74) {
ptrdiff_t k191 = 1*d28;
ptrdiff_t l81 = 8*w77;
for (; l81 != 8; ++l81) {
__m512 sfRe511 = _mm512_setzero_ps();
__m512 sfIm511 = _mm512_setzero_ps();
__m512 sfRe512 = _mm512_setzero_ps();
__m512 sfIm512 = _mm512_setzero_ps();
(void)bfPtr19;
for (ptrdiff_t s84 = 0; s84 < 32; ++s84) {
__m512i wfLd43 = _mm512_loadu_si512(wfPtr19+0+131072*i83+32768*j74+4096*l81+128*s84);
__m512 wfRe43 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd43));
__m512 wfIm43 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd43, 1));
__m512i wfLd44 = _mm512_loadu_si512(wfPtr19+64+131072*i83+32768*j74+4096*l81+128*s84);
__m512 wfRe44 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd44));
__m512 wfIm44 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd44, 1));
__m512 dfRe46 = _mm512_loadu_ps(dfPtr19+0+16384*i83+4096*j74+12288*k191+128*s84);
__m512 dfIm46 = _mm512_loadu_ps(dfPtr19+64+16384*i83+4096*j74+12288*k191+128*s84);
// Re += wRe*dRe + wIm*dIm, Im += wRe*dIm - wIm*dRe on every lane.
sfRe511 = _mm512_fmadd_ps(wfRe43, dfRe46, sfRe511);
sfRe511 = _mm512_fmadd_ps(wfIm43, dfIm46, sfRe511);
sfIm511 = _mm512_fmadd_ps(wfRe43, dfIm46, sfIm511);
sfIm511 = _mm512_fnmadd_ps(wfIm43, dfRe46, sfIm511);
sfRe512 = _mm512_fmadd_ps(wfRe44, dfRe46, sfRe512);
sfRe512 = _mm512_fmadd_ps(wfIm44, dfIm46, sfRe512);
sfIm512 = _mm512_fmadd_ps(wfRe44, dfIm46, sfIm512);
sfIm512 = _mm512_fnmadd_ps(wfIm44, dfRe46, sfIm512);
}
_mm512_storeu_ps(sfPtr18+0+8192*i83+2048*j74+12288*k191+256*l81, sfRe511);
_mm512_storeu_ps(sfPtr18+64+8192*i83+2048*j74+12288*k191+256*l81, sfIm511);
_mm512_storeu_ps(sfPtr18+128+8192*i83+2048*j74+12288*k191+256*l81, sfRe512);
_mm512_storeu_ps(sfPtr18+192+8192*i83+2048*j74+12288*k191+256*l81, sfIm512);
}
}
return;
}
// ---- z != 0: later passes, results are added to the sums already stored ----
char*restrict bfPtr20 = tensors142[0]+4096*e41;
char*restrict wfPtr20 = tensors142[0]+4096+346554368*e41+4194304*z8;
char*restrict dfPtr20 = tensors142[1]+43319296*e41+524288*z8;
char*restrict sfPtr19 = tensors142[2];
ptrdiff_t i84 = 1*g45;
ptrdiff_t j75 = 1*p4;
ptrdiff_t jj67 = j75+0;
// j == 0 slice: masked kernel as in the first pass, but the accumulators
// start at zero (bfPtr20 is not read) and are added to the stored sums.
if (__builtin_expect(!j75, 0)) {
ptrdiff_t k192 = 1*d28;
ptrdiff_t l82 = 8*w77;
for (; l82 != 8; ++l82) {
__m512 sfRe513 = _mm512_setzero_ps();
__m512 sfIm513 = _mm512_setzero_ps();
__m512 sfRe514 = _mm512_setzero_ps();
__m512 sfIm514 = _mm512_setzero_ps();
(void)bfPtr20;
for (ptrdiff_t s85 = 0; s85 < 32; ++s85) {
__m512i wfLd45 = _mm512_loadu_si512(wfPtr20+0+131072*i84+32768*j75+4096*l82+128*s85);
__m512 wfRe45 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd45));
__m512 wfIm45 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd45, 1));
__m512 wfMx23 = _mm512_mask_mov_ps(wfIm45, 64764, wfRe45);
__m512i wfLd46 = _mm512_loadu_si512(wfPtr20+64+131072*i84+32768*j75+4096*l82+128*s85);
__m512 wfRe46 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd46));
__m512 wfIm46 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd46, 1));
__m512 wfMx24 = _mm512_mask_mov_ps(wfIm46, 64764, wfRe46);
__m512 dfRe47 = _mm512_loadu_ps(dfPtr20+0+16384*i84+4096*j75+12288*k192+128*s85);
__m512 dfIm47 = _mm512_loadu_ps(dfPtr20+64+16384*i84+4096*j75+12288*k192+128*s85);
sfRe513 = _mm512_fmadd_ps(wfRe45, dfRe47, sfRe513);
sfRe513 = _mm512_mask3_fmadd_ps(wfIm45, dfIm47, sfRe513, 64764);
sfIm513 = _mm512_fmadd_ps(wfMx23, dfIm47, sfIm513);
sfIm513 = _mm512_mask3_fnmadd_ps(wfIm45, dfRe47, sfIm513, 64764);
sfRe514 = _mm512_fmadd_ps(wfRe46, dfRe47, sfRe514);
sfRe514 = _mm512_mask3_fmadd_ps(wfIm46, dfIm47, sfRe514, 64764);
sfIm514 = _mm512_fmadd_ps(wfMx24, dfIm47, sfIm514);
sfIm514 = _mm512_mask3_fnmadd_ps(wfIm46, dfRe47, sfIm514, 64764);
}
// Read-modify-write: add this pass's partial sums to the stored ones.
sfRe513 = _mm512_add_ps(sfRe513, _mm512_loadu_ps(sfPtr19+0+8192*i84+2048*j75+12288*k192+256*l82));
sfIm513 = _mm512_add_ps(sfIm513, _mm512_loadu_ps(sfPtr19+64+8192*i84+2048*j75+12288*k192+256*l82));
sfRe514 = _mm512_add_ps(sfRe514, _mm512_loadu_ps(sfPtr19+128+8192*i84+2048*j75+12288*k192+256*l82));
sfIm514 = _mm512_add_ps(sfIm514, _mm512_loadu_ps(sfPtr19+192+8192*i84+2048*j75+12288*k192+256*l82));
_mm512_storeu_ps(sfPtr19+0+8192*i84+2048*j75+12288*k192+256*l82, sfRe513);
_mm512_storeu_ps(sfPtr19+64+8192*i84+2048*j75+12288*k192+256*l82, sfIm513);
_mm512_storeu_ps(sfPtr19+128+8192*i84+2048*j75+12288*k192+256*l82, sfRe514);
_mm512_storeu_ps(sfPtr19+192+8192*i84+2048*j75+12288*k192+256*l82, sfIm514);
}
j75 = 1;
}
// j != 0 slices: unmasked kernel, accumulated into the stored sums.
for (; j75 <= jj67; ++j75) {
ptrdiff_t k193 = 1*d28;
ptrdiff_t l83 = 8*w77;
for (; l83 != 8; ++l83) {
__m512 sfRe515 = _mm512_setzero_ps();
__m512 sfIm515 = _mm512_setzero_ps();
__m512 sfRe516 = _mm512_setzero_ps();
__m512 sfIm516 = _mm512_setzero_ps();
(void)bfPtr20;
for (ptrdiff_t s86 = 0; s86 < 32; ++s86) {
__m512i wfLd47 = _mm512_loadu_si512(wfPtr20+0+131072*i84+32768*j75+4096*l83+128*s86);
__m512 wfRe47 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd47));
__m512 wfIm47 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd47, 1));
__m512i wfLd48 = _mm512_loadu_si512(wfPtr20+64+131072*i84+32768*j75+4096*l83+128*s86);
__m512 wfRe48 = _mm512_cvtph_ps(_mm512_castsi512_si256(wfLd48));
__m512 wfIm48 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(wfLd48, 1));
__m512 dfRe48 = _mm512_loadu_ps(dfPtr20+0+16384*i84+4096*j75+12288*k193+128*s86);
__m512 dfIm48 = _mm512_loadu_ps(dfPtr20+64+16384*i84+4096*j75+12288*k193+128*s86);
sfRe515 = _mm512_fmadd_ps(wfRe47, dfRe48, sfRe515);
sfRe515 = _mm512_fmadd_ps(wfIm47, dfIm48, sfRe515);
sfIm515 = _mm512_fmadd_ps(wfRe47, dfIm48, sfIm515);
sfIm515 = _mm512_fnmadd_ps(wfIm47, dfRe48, sfIm515);
sfRe516 = _mm512_fmadd_ps(wfRe48, dfRe48, sfRe516);
sfRe516 = _mm512_fmadd_ps(wfIm48, dfIm48, sfRe516);
sfIm516 = _mm512_fmadd_ps(wfRe48, dfIm48, sfIm516);
sfIm516 = _mm512_fnmadd_ps(wfIm48, dfRe48, sfIm516);
}
sfRe515 = _mm512_add_ps(sfRe515, _mm512_loadu_ps(sfPtr19+0+8192*i84+2048*j75+12288*k193+256*l83));
sfIm515 = _mm512_add_ps(sfIm515, _mm512_loadu_ps(sfPtr19+64+8192*i84+2048*j75+12288*k193+256*l83));
sfRe516 = _mm512_add_ps(sfRe516, _mm512_loadu_ps(sfPtr19+128+8192*i84+2048*j75+12288*k193+256*l83));
sfIm516 = _mm512_add_ps(sfIm516, _mm512_loadu_ps(sfPtr19+192+8192*i84+2048*j75+12288*k193+256*l83));
_mm512_storeu_ps(sfPtr19+0+8192*i84+2048*j75+12288*k193+256*l83, sfRe515);
_mm512_storeu_ps(sfPtr19+64+8192*i84+2048*j75+12288*k193+256*l83, sfIm515);
_mm512_storeu_ps(sfPtr19+128+8192*i84+2048*j75+12288*k193+256*l83, sfRe516);
_mm512_storeu_ps(sfPtr19+192+8192*i84+2048*j75+12288*k193+256*l83, sfIm516);
}
}
}

// Drives ResNeXt50StriderProduceSums4Callee1 over the whole sums buffer.
// For the single epoch (index 0) and each of 4 z partitions it issues one
// dispatch on the thread team with a 1 x 1 x 4 x 32 task hull. The callee
// receives a 3-slot tuple: {tensor array, epoch index, z index}.
//   team84     - thread team forwarded to ResNeXt50ThreaderDo1.
//   tensors141 - tensor pointer array placed in tuple slot 0.
static void ResNeXt50StriderProduceSums4(ResNeXt50ThreaderTeam1* team84, char** tensors141) {
	void* args[3];
	args[0] = tensors141;
	ptrdiff_t epoch = 0;
	while (epoch < 1) {
		args[1] = (void*)epoch;
		ptrdiff_t part = 0;
		while (part < 4) {
			args[2] = (void*)part;
			// A fresh task descriptor is filled in for every dispatch.
			ResNeXt50ThreaderTask1 job;
			job.nd1 = 4;
			job.hull1[0] = 1;
			job.hull1[1] = 1;
			job.hull1[2] = 4;
			job.hull1[3] = 32;
			job.any1 = args;
			job.callee1 = ResNeXt50StriderProduceSums4Callee1;
			ResNeXt50ThreaderDo1(team84, &job);
			++part;
		}
		++epoch;
	}
}

// Worker callee dispatched by ResNeXt50StriderConsumeSums4.
//
// task146->any1 is the tensor pointer array: [0] is the sums buffer that is
// read, [1] is the float output buffer that is written. pt78[2] selects a
// block of 16 consecutive i values (i = 16*g .. 16*g+15).
//
// For every (i, k in 0..7, r in 0..1) the function loads four Re/Im vector
// pairs from the sums buffer, runs them through the chain of permutes and
// butterflies held in the ifft* temporaries (presumably an inverse FFT, going
// by the names), keeps seven of the eight resulting vectors, clamps them at
// zero, and stores lanes 0..6 and lanes 8..14 of each vector to the output.
static void ResNeXt50StriderConsumeSums4Callee1(ResNeXt50ThreaderTask1* task146, int64_t* pt78) {
char** tensors144 = task146->any1;
ptrdiff_t w78 = 0;
ptrdiff_t d29 = 0;
ptrdiff_t g46 = pt78[2];
char*restrict sfPtr20 = tensors144[0];
char*restrict datPtr45 = tensors144[1];
ptrdiff_t i85 = 16*g46;
ptrdiff_t ii62 = i85+15;
for (; i85 <= ii62; ++i85) {
// j76, base28, toH49 and toW49 are all 0 here; rel28 is not used below.
ptrdiff_t j76 = 1*d29;
ptrdiff_t rel28 = j76-0;
ptrdiff_t base28 = 0;
ptrdiff_t toH49 = base28+0;
ptrdiff_t toW49 = 0;
ptrdiff_t k194 = 8*w78;
for (; k194 != 8; ++k194) {
ptrdiff_t r25 = 0;
for (; r25 != 2; ++r25) {
// t45 is always 0 and only appears multiplied by 0.
ptrdiff_t t45 = 0;
// Four Re/Im pairs, 2048 bytes apart in the sums buffer.
__m512 sfRe517 = _mm512_loadu_ps(sfPtr20+0+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfIm517 = _mm512_loadu_ps(sfPtr20+64+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfRe518 = _mm512_loadu_ps(sfPtr20+2048+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfIm518 = _mm512_loadu_ps(sfPtr20+2112+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfRe519 = _mm512_loadu_ps(sfPtr20+4096+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfIm519 = _mm512_loadu_ps(sfPtr20+4160+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfRe520 = _mm512_loadu_ps(sfPtr20+6144+8192*i85+12288*j76+256*k194+128*r25+0*t45);
__m512 sfIm520 = _mm512_loadu_ps(sfPtr20+6208+8192*i85+12288*j76+256*k194+128*r25+0*t45);
// The first pair gets an extra lane permutation and a masked combine
// (mask 65021 = all lanes except 1 and 9) before joining the others.
__m512i ifft7485 = _mm512_set_epi32(12, 14, 14, 12, 10, 10, 9, 8, 4, 6, 6, 4, 2, 2, 1, 0);
__m512 ifft7486 = _mm512_permutexvar_ps(ifft7485, sfRe517);
__m512i ifft7487 = _mm512_set_epi32(13, 15, 15, 13, 11, 11, 8, 9, 5, 7, 7, 5, 3, 3, 0, 1);
__m512 ifft7488 = _mm512_permutexvar_ps(ifft7487, sfRe517);
__m512 ifft7489 = _mm512_permutexvar_ps(ifft7485, sfIm517);
__m512 ifft7490 = _mm512_permutexvar_ps(ifft7487, sfIm517);
__m512 ifft7491 = _mm512_set_ps(1, -1, 1, -1, 1, -1, 0, 0, 1, -1, 1, -1, 1, -1, 0, 0);
__m512 ifft7492 = _mm512_mask_fmadd_ps(ifft7490, 65021, ifft7491, ifft7486);
__m512 ifft7493 = _mm512_mask_fnmadd_ps(ifft7489, 65021, ifft7491, ifft7488);
// Butterfly between adjacent lanes (shuffle 177 swaps each lane pair).
__m512 ifft7494 = _mm512_set_ps(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
__m512 ifft7495 = _mm512_fmadd_ps(ifft7492, ifft7494, _mm512_shuffle_ps(ifft7492, ifft7492, 177));
__m512 ifft7496 = _mm512_fmadd_ps(ifft7493, ifft7494, _mm512_shuffle_ps(ifft7493, ifft7493, 177));
__m512 ifft7497 = _mm512_fmadd_ps(sfRe518, ifft7494, _mm512_shuffle_ps(sfRe518, sfRe518, 177));
__m512 ifft7498 = _mm512_fmadd_ps(sfIm518, ifft7494, _mm512_shuffle_ps(sfIm518, sfIm518, 177));
__m512 ifft7499 = _mm512_fmadd_ps(sfRe519, ifft7494, _mm512_shuffle_ps(sfRe519, sfRe519, 177));
__m512 ifft7500 = _mm512_fmadd_ps(sfIm519, ifft7494, _mm512_shuffle_ps(sfIm519, sfIm519, 177));
__m512 ifft7501 = _mm512_fmadd_ps(sfRe520, ifft7494, _mm512_shuffle_ps(sfRe520, sfRe520, 177));
__m512 ifft7502 = _mm512_fmadd_ps(sfIm520, ifft7494, _mm512_shuffle_ps(sfIm520, sfIm520, 177));
// Per-lane multiply by constants 0, 1 and +/-0.70710677, applied across
// the Re/Im pairs (presumably twiddle factors -- confirm).
__m512 ifft7503 = _mm512_set_ps(-7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1, -7.0710677e-01f, 1, 7.0710677e-01f, 1, 0, 1, 1, 1);
__m512 ifft7504 = _mm512_mul_ps(ifft7495, ifft7503);
__m512 ifft7505 = _mm512_mul_ps(ifft7496, ifft7503);
__m512 ifft7506 = _mm512_mul_ps(ifft7497, ifft7503);
__m512 ifft7507 = _mm512_mul_ps(ifft7498, ifft7503);
__m512 ifft7508 = _mm512_mul_ps(ifft7499, ifft7503);
__m512 ifft7509 = _mm512_mul_ps(ifft7500, ifft7503);
__m512 ifft7510 = _mm512_mul_ps(ifft7501, ifft7503);
__m512 ifft7511 = _mm512_mul_ps(ifft7502, ifft7503);
__m512 ifft7512 = _mm512_set_ps(7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0, 7.0710677e-01f, 0, 7.0710677e-01f, 0, 1, 0, 0, 0);
__m512 ifft7513 = _mm512_fnmadd_ps(ifft7496, ifft7512, ifft7504);
__m512 ifft7514 = _mm512_fmadd_ps(ifft7495, ifft7512, ifft7505);
__m512 ifft7515 = _mm512_fnmadd_ps(ifft7498, ifft7512, ifft7506);
__m512 ifft7516 = _mm512_fmadd_ps(ifft7497, ifft7512, ifft7507);
__m512 ifft7517 = _mm512_fnmadd_ps(ifft7500, ifft7512, ifft7508);
__m512 ifft7518 = _mm512_fmadd_ps(ifft7499, ifft7512, ifft7509);
__m512 ifft7519 = _mm512_fnmadd_ps(ifft7502, ifft7512, ifft7510);
__m512 ifft7520 = _mm512_fmadd_ps(ifft7501, ifft7512, ifft7511);
// Butterfly between lane pairs two apart (shuffle 78 swaps 64-bit halves
// of each 128-bit block).
__m512 ifft7521 = _mm512_set_ps(-1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1);
__m512 ifft7522 = _mm512_fmadd_ps(ifft7513, ifft7521, _mm512_shuffle_ps(ifft7513, ifft7513, 78));
__m512 ifft7523 = _mm512_fmadd_ps(ifft7514, ifft7521, _mm512_shuffle_ps(ifft7514, ifft7514, 78));
__m512 ifft7524 = _mm512_fmadd_ps(ifft7515, ifft7521, _mm512_shuffle_ps(ifft7515, ifft7515, 78));
__m512 ifft7525 = _mm512_fmadd_ps(ifft7516, ifft7521, _mm512_shuffle_ps(ifft7516, ifft7516, 78));
__m512 ifft7526 = _mm512_fmadd_ps(ifft7517, ifft7521, _mm512_shuffle_ps(ifft7517, ifft7517, 78));
__m512 ifft7527 = _mm512_fmadd_ps(ifft7518, ifft7521, _mm512_shuffle_ps(ifft7518, ifft7518, 78));
__m512 ifft7528 = _mm512_fmadd_ps(ifft7519, ifft7521, _mm512_shuffle_ps(ifft7519, ifft7519, 78));
__m512 ifft7529 = _mm512_fmadd_ps(ifft7520, ifft7521, _mm512_shuffle_ps(ifft7520, ifft7520, 78));
// In lanes 6, 7, 14, 15 (mask 49344) each pair (a, b) becomes (-b, a).
__m512 ifft7530 = _mm512_mask_sub_ps(ifft7522, 49344, _mm512_setzero_ps(), ifft7523);
__m512 ifft7531 = _mm512_mask_mov_ps(ifft7523, 49344, ifft7522);
__m512 ifft7532 = _mm512_mask_sub_ps(ifft7524, 49344, _mm512_setzero_ps(), ifft7525);
__m512 ifft7533 = _mm512_mask_mov_ps(ifft7525, 49344, ifft7524);
__m512 ifft7534 = _mm512_mask_sub_ps(ifft7526, 49344, _mm512_setzero_ps(), ifft7527);
__m512 ifft7535 = _mm512_mask_mov_ps(ifft7527, 49344, ifft7526);
__m512 ifft7536 = _mm512_mask_sub_ps(ifft7528, 49344, _mm512_setzero_ps(), ifft7529);
__m512 ifft7537 = _mm512_mask_mov_ps(ifft7529, 49344, ifft7528);
// Butterfly between adjacent 128-bit blocks (shuffle_f32x4 177). The two
// 256-bit halves of each vector are never mixed with each other.
// Note the single fnmsub (negated result) for ifft7535.
__m512 ifft7538 = _mm512_set_ps(-1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1);
__m512 ifft7539 = _mm512_fmadd_ps(ifft7530, ifft7538, _mm512_shuffle_f32x4(ifft7530, ifft7530, 177));
__m512 ifft7540 = _mm512_fmadd_ps(ifft7531, ifft7538, _mm512_shuffle_f32x4(ifft7531, ifft7531, 177));
__m512 ifft7541 = _mm512_fmadd_ps(ifft7532, ifft7538, _mm512_shuffle_f32x4(ifft7532, ifft7532, 177));
__m512 ifft7542 = _mm512_fmadd_ps(ifft7533, ifft7538, _mm512_shuffle_f32x4(ifft7533, ifft7533, 177));
__m512 ifft7543 = _mm512_fmadd_ps(ifft7534, ifft7538, _mm512_shuffle_f32x4(ifft7534, ifft7534, 177));
__m512 ifft7544 = _mm512_fnmsub_ps(ifft7535, ifft7538, _mm512_shuffle_f32x4(ifft7535, ifft7535, 177));
__m512 ifft7545 = _mm512_fmadd_ps(ifft7536, ifft7538, _mm512_shuffle_f32x4(ifft7536, ifft7536, 177));
__m512 ifft7546 = _mm512_fmadd_ps(ifft7537, ifft7538, _mm512_shuffle_f32x4(ifft7537, ifft7537, 177));
// Final stage combines the eight vectors with each other. The scaling
// constants are 1.5625e-02 = 1/64 and 3.125e-02 = 1/32.
__m512 ifft7547 = _mm512_add_ps(ifft7539, ifft7540);
__m512 ifft7548 = _mm512_sub_ps(ifft7539, ifft7540);
__m512 ifft7549 = _mm512_sub_ps(ifft7541, ifft7545);
__m512 ifft7550 = _mm512_add_ps(ifft7542, ifft7546);
__m512 ifft7551 = _mm512_add_ps(ifft7541, ifft7545);
__m512 ifft7552 = _mm512_sub_ps(ifft7542, ifft7546);
__m512 ifft7553 = _mm512_mul_ps(ifft7543, _mm512_set1_ps(3.125e-02f));
__m512 ifft7554 = _mm512_mul_ps(ifft7544, _mm512_set1_ps(3.125e-02f));
__m512 ifft7555 = _mm512_fmadd_ps(ifft7547, _mm512_set1_ps(1.5625e-02f), ifft7553);
__m512 ifft7556 = _mm512_fmsub_ps(ifft7547, _mm512_set1_ps(1.5625e-02f), ifft7553);
__m512 ifft7557 = _mm512_fmadd_ps(ifft7548, _mm512_set1_ps(1.5625e-02f), ifft7554);
__m512 ifft7558 = _mm512_fmsub_ps(ifft7548, _mm512_set1_ps(1.5625e-02f), ifft7554);
__m512 ifft7559 = _mm512_add_ps(ifft7549, ifft7550);
__m512 ifft7560 = _mm512_sub_ps(ifft7549, ifft7550);
__m512 ifft7561 = _mm512_fnmadd_ps(ifft7559, _mm512_set1_ps(7.0710677e-01f), ifft7551);
__m512 ifft7562 = _mm512_fmadd_ps(ifft7559, _mm512_set1_ps(7.0710677e-01f), ifft7551);
__m512 ifft7563 = _mm512_fmadd_ps(ifft7560, _mm512_set1_ps(7.0710677e-01f), ifft7552);
__m512 ifft7564 = _mm512_fmsub_ps(ifft7560, _mm512_set1_ps(7.0710677e-01f), ifft7552);
__m512 ifft7565 = _mm512_add_ps(ifft7561, ifft7562);
__m512 ifft7566 = _mm512_sub_ps(ifft7561, ifft7562);
__m512 ifft7567 = _mm512_add_ps(ifft7563, ifft7564);
__m512 ifft7568 = _mm512_sub_ps(ifft7563, ifft7564);
__m512 ifft7569 = _mm512_fmadd_ps(ifft7565, _mm512_set1_ps(1.5625e-02f), ifft7555);
__m512 ifft7570 = _mm512_fnmadd_ps(ifft7565, _mm512_set1_ps(1.5625e-02f), ifft7555);
__m512 ifft7571 = _mm512_fmadd_ps(ifft7567, _mm512_set1_ps(1.5625e-02f), ifft7557);
__m512 ifft7572 = _mm512_fnmadd_ps(ifft7567, _mm512_set1_ps(1.5625e-02f), ifft7557);
__m512 ifft7573 = _mm512_fnmadd_ps(ifft7568, _mm512_set1_ps(1.5625e-02f), ifft7556);
__m512 ifft7574 = _mm512_fmadd_ps(ifft7568, _mm512_set1_ps(1.5625e-02f), ifft7556);
__m512 ifft7575 = _mm512_fmadd_ps(ifft7566, _mm512_set1_ps(1.5625e-02f), ifft7558);
__m512 ifft7576 = _mm512_fnmadd_ps(ifft7566, _mm512_set1_ps(1.5625e-02f), ifft7558);
// Seven of the eight results are kept, in this interleaved order;
// ifft7576 is computed but discarded.
__m512 dat2652 = ifft7569;
__m512 dat2653 = ifft7571;
__m512 dat2654 = ifft7573;
__m512 dat2655 = ifft7575;
__m512 dat2656 = ifft7570;
__m512 dat2657 = ifft7572;
__m512 dat2658 = ifft7574;
(void)ifft7576;
// Clamp negatives to zero (max with 0, i.e. ReLU).
dat2652 = _mm512_max_ps(_mm512_setzero_ps(), dat2652);
dat2653 = _mm512_max_ps(_mm512_setzero_ps(), dat2653);
dat2654 = _mm512_max_ps(_mm512_setzero_ps(), dat2654);
dat2655 = _mm512_max_ps(_mm512_setzero_ps(), dat2655);
dat2656 = _mm512_max_ps(_mm512_setzero_ps(), dat2656);
dat2657 = _mm512_max_ps(_mm512_setzero_ps(), dat2657);
dat2658 = _mm512_max_ps(_mm512_setzero_ps(), dat2658);
// Each vector is written twice: mask 127 stores lanes 0..6 (7 floats) at
// the given offset; mask 32512 stores lanes 8..14, which land 32 bytes
// past the pointer passed in, i.e. 320 bytes after the first group.
// Successive vectors are 28 bytes (7 floats) apart.
_mm512_mask_storeu_ps(datPtr45+0+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2652);
_mm512_mask_storeu_ps(datPtr45+288+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2652);
_mm512_mask_storeu_ps(datPtr45+28+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2653);
_mm512_mask_storeu_ps(datPtr45+316+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2653);
_mm512_mask_storeu_ps(datPtr45+56+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2654);
_mm512_mask_storeu_ps(datPtr45+344+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2654);
_mm512_mask_storeu_ps(datPtr45+84+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2655);
_mm512_mask_storeu_ps(datPtr45+372+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2655);
_mm512_mask_storeu_ps(datPtr45+112+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2656);
_mm512_mask_storeu_ps(datPtr45+400+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2656);
_mm512_mask_storeu_ps(datPtr45+140+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2657);
_mm512_mask_storeu_ps(datPtr45+428+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2657);
_mm512_mask_storeu_ps(datPtr45+168+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 127, dat2658);
_mm512_mask_storeu_ps(datPtr45+456+10240*i85+1280*k194+640*r25+28*toH49+4*toW49+0*t45, 32512, dat2658);
}
}
// No effect: j76 is re-initialised at the top of every i iteration.
++j76;
}
}

// Runs ResNeXt50StriderConsumeSums4Callee1 on the given thread team over a
// 3-dimensional task hull with extents 1 x 1 x 2.
//   team85     - thread team forwarded to ResNeXt50ThreaderDo1.
//   tensors143 - tensor pointer array, passed to the callee through any1.
static void ResNeXt50StriderConsumeSums4(ResNeXt50ThreaderTeam1* team85, char** tensors143) {
	static const int extents[3] = {1, 1, 2};
	ResNeXt50ThreaderTask1 job;
	job.nd1 = 3;
	for (int dim = 0; dim < 3; ++dim) {
		job.hull1[dim] = extents[dim];
	}
	job.any1 = tensors143;
	job.callee1 = ResNeXt50StriderConsumeSums4Callee1;
	ResNeXt50ThreaderDo1(team85, &job);
}

// Network handle: the object type behind ResNeXt50NetCreate's net1 output
// parameter, and the object released by ResNeXt50NetDestroy.
struct ResNeXt50Net {
// Heap block owned by the network; ResNeXt50NetDestroy passes it to free().
char* alloc1;
// NOTE(review): presumably alloc1 rounded up to a 64-byte boundary
// (ResNeXt50NetCreate computes such a pointer from its malloc result);
// confirm where this field is assigned.
char* align1;
};

// Releases a network object: frees the backing allocation held in alloc1,
// then the ResNeXt50Net object itself.
//   net2 - network to destroy (presumably obtained from ResNeXt50NetCreate).
//          A null pointer is accepted and ignored, mirroring free(NULL), so
//          cleanup paths may call this unconditionally on a handle that was
//          never filled in.
void ResNeXt50NetDestroy(ResNeXt50Net* net2) {
	if (!net2) {
		// Nothing to release; avoid dereferencing a null handle.
		return;
	}
	free(net2->alloc1);
	free(net2);
}

char* ResNeXt50NetCreate(
ResNeXt50Net** net1,
ResNeXt50Params* params1,
ptrdiff_t threads1
) {
if (__builtin_expect(!__builtin_cpu_supports("avx512f"), 0)) {
return ResNeXt50Errmsg1(__LINE__, "CPU does not support AVX512F");
}
char* alloc3 = malloc(126936351);
if (__builtin_expect(!alloc3, 0)) {
return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
}
char* align3 = (void*)(((size_t)alloc3+63)&-64);
char* tmpAlloc1 = malloc(16447);
if (__builtin_expect(!tmpAlloc1, 0)) {
char* msg6 = ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg6;
}
char* tmpAlign1 = (void*)(((size_t)tmpAlloc1+63)&-64);
ResNeXt50ThreaderTeam1* team12 = 0;
char* err8 = ResNeXt50ThreaderCreate1(&team12, threads1);
if (__builtin_expect(!!err8, 0)) {
free(tmpAlloc1);
free(alloc3);
return err8;
}
{
ResNeXt50BnSimplify1(
params1->bn1Means,
params1->bn1Variances,
params1->bn1Scales,
params1->bn1Shifts,
align3+0
);
ResNeXt50BnSimplify2(
params1->bn2Means,
params1->bn2Variances,
params1->bn2Scales,
params1->bn2Shifts,
tmpAlign1+0
);
char* tensors299[] = {
(char*)params1->sevenDSWeights,
(char*)params1->sevenDSBiases,
tmpAlign1+0,
align3+64
};
ResNeXt50StriderArrangeFilts1(team12, tensors299);
}
{
ResNeXt50BnSimplify3(
params1->bn3Means,
params1->bn3Variances,
params1->bn3Scales,
params1->bn3Shifts,
tmpAlign1+0
);
ResNeXt50BnSimplify4(
params1->bn4Means,
params1->bn4Variances,
params1->bn4Scales,
params1->bn4Shifts,
tmpAlign1+2048
);
char* tensors300[] = {
(char*)params1->one1Weights,
(char*)params1->one1Biases,
tmpAlign1+0,
(char*)params1->one2Weights,
(char*)params1->one2Biases,
tmpAlign1+2048,
align3+98624
};
ResNeXt50OneArrangeWts1(team12, tensors300);
}
{
ResNeXt50BnSimplify4(
params1->bn5Means,
params1->bn5Variances,
params1->bn5Scales,
params1->bn5Shifts,
tmpAlign1+0
);
char* tensors301[] = {
(char*)params1->three1Weights,
(char*)params1->three1Biases,
tmpAlign1+0,
align3+198464
};
ResNeXt50ThreeArrangeFilts1(team12, tensors301);
}
{
ResNeXt50BnSimplify3(
params1->bn6Means,
params1->bn6Variances,
params1->bn6Scales,
params1->bn6Shifts,
tmpAlign1+0
);
char* tensors302[] = {
(char*)params1->one3Weights,
(char*)params1->one3Biases,
tmpAlign1+0,
align3+264512
};
ResNeXt50OneArrangeWts2(team12, tensors302);
}
{
ResNeXt50BnSimplify4(
params1->bn7Means,
params1->bn7Variances,
params1->bn7Scales,
params1->bn7Shifts,
tmpAlign1+0
);
char* tensors303[] = {
(char*)params1->one4Weights,
(char*)params1->one4Biases,
tmpAlign1+0,
align3+396608
};
ResNeXt50OneArrangeWts3(team12, tensors303);
}
{
ResNeXt50BnSimplify4(
params1->bn8Means,
params1->bn8Variances,
params1->bn8Scales,
params1->bn8Shifts,
tmpAlign1+0
);
char* tensors304[] = {
(char*)params1->three2Weights,
(char*)params1->three2Biases,
tmpAlign1+0,
align3+528192
};
ResNeXt50ThreeArrangeFilts2(team12, tensors304);
}
{
ResNeXt50BnSimplify3(
params1->bn9Means,
params1->bn9Variances,
params1->bn9Scales,
params1->bn9Shifts,
tmpAlign1+0
);
char* tensors305[] = {
(char*)params1->one5Weights,
(char*)params1->one5Biases,
tmpAlign1+0,
align3+594240
};
ResNeXt50OneArrangeWts2(team12, tensors305);
}
{
ResNeXt50BnSimplify4(
params1->bn10Means,
params1->bn10Variances,
params1->bn10Scales,
params1->bn10Shifts,
tmpAlign1+0
);
char* tensors306[] = {
(char*)params1->one6Weights,
(char*)params1->one6Biases,
tmpAlign1+0,
align3+726336
};
ResNeXt50OneArrangeWts3(team12, tensors306);
}
{
ResNeXt50BnSimplify4(
params1->bn11Means,
params1->bn11Variances,
params1->bn11Scales,
params1->bn11Shifts,
tmpAlign1+0
);
char* tensors307[] = {
(char*)params1->three3Weights,
(char*)params1->three3Biases,
tmpAlign1+0,
align3+857920
};
ResNeXt50ThreeArrangeFilts2(team12, tensors307);
}
{
ResNeXt50BnSimplify3(
params1->bn12Means,
params1->bn12Variances,
params1->bn12Scales,
params1->bn12Shifts,
tmpAlign1+0
);
char* tensors308[] = {
(char*)params1->one7Weights,
(char*)params1->one7Biases,
tmpAlign1+0,
align3+923968
};
ResNeXt50OneArrangeWts2(team12, tensors308);
}
{
ResNeXt50BnSimplify5(
params1->bn13Means,
params1->bn13Variances,
params1->bn13Scales,
params1->bn13Shifts,
tmpAlign1+0
);
char* tensors309[] = {
(char*)params1->oneDS1Weights,
(char*)params1->oneDS1Biases,
tmpAlign1+0,
align3+1056064
};
ResNeXt50OneArrangeWts4(team12, tensors309);
}
{
ResNeXt50BnSimplify3(
params1->bn14Means,
params1->bn14Variances,
params1->bn14Scales,
params1->bn14Shifts,
tmpAlign1+0
);
char* tensors310[] = {
(char*)params1->one8Weights,
(char*)params1->one8Biases,
tmpAlign1+0,
align3+1582400
};
ResNeXt50OneArrangeWts5(team12, tensors310);
}
{
ResNeXt50BnSimplify3(
params1->bn15Means,
params1->bn15Variances,
params1->bn15Scales,
params1->bn15Shifts,
tmpAlign1+0
);
char* tensors311[] = {
(char*)params1->threeDS1Weights,
(char*)params1->threeDS1Biases,
tmpAlign1+0,
align3+1845568
};
ResNeXt50StriderArrangeFilts2(team12, tensors311);
}
{
ResNeXt50BnSimplify5(
params1->bn16Means,
params1->bn16Variances,
params1->bn16Scales,
params1->bn16Shifts,
tmpAlign1+0
);
char* tensors312[] = {
(char*)params1->one9Weights,
(char*)params1->one9Biases,
tmpAlign1+0,
align3+2895168
};
ResNeXt50OneArrangeWts6(team12, tensors312);
}
{
ResNeXt50BnSimplify3(
params1->bn17Means,
params1->bn17Variances,
params1->bn17Scales,
params1->bn17Shifts,
tmpAlign1+0
);
char* tensors313[] = {
(char*)params1->one10Weights,
(char*)params1->one10Biases,
tmpAlign1+0,
align3+3421504
};
ResNeXt50OneArrangeWts7(team12, tensors313);
}
{
ResNeXt50BnSimplify3(
params1->bn18Means,
params1->bn18Variances,
params1->bn18Scales,
params1->bn18Shifts,
tmpAlign1+0
);
char* tensors314[] = {
(char*)params1->three4Weights,
(char*)params1->three4Biases,
tmpAlign1+0,
align3+3946816
};
ResNeXt50ThreeArrangeFilts3(team12, tensors314);
}
{
ResNeXt50BnSimplify5(
params1->bn19Means,
params1->bn19Variances,
params1->bn19Scales,
params1->bn19Shifts,
tmpAlign1+0
);
char* tensors315[] = {
(char*)params1->one11Weights,
(char*)params1->one11Biases,
tmpAlign1+0,
align3+4209984
};
ResNeXt50OneArrangeWts6(team12, tensors315);
}
{
ResNeXt50BnSimplify3(
params1->bn20Means,
params1->bn20Variances,
params1->bn20Scales,
params1->bn20Shifts,
tmpAlign1+0
);
char* tensors316[] = {
(char*)params1->one12Weights,
(char*)params1->one12Biases,
tmpAlign1+0,
align3+4736320
};
ResNeXt50OneArrangeWts7(team12, tensors316);
}
{
ResNeXt50BnSimplify3(
params1->bn21Means,
params1->bn21Variances,
params1->bn21Scales,
params1->bn21Shifts,
tmpAlign1+0
);
char* tensors317[] = {
(char*)params1->three5Weights,
(char*)params1->three5Biases,
tmpAlign1+0,
align3+5261632
};
ResNeXt50ThreeArrangeFilts3(team12, tensors317);
}
{
ResNeXt50BnSimplify5(
params1->bn22Means,
params1->bn22Variances,
params1->bn22Scales,
params1->bn22Shifts,
tmpAlign1+0
);
char* tensors318[] = {
(char*)params1->one13Weights,
(char*)params1->one13Biases,
tmpAlign1+0,
align3+5524800
};
ResNeXt50OneArrangeWts6(team12, tensors318);
}
{
ResNeXt50BnSimplify3(
params1->bn23Means,
params1->bn23Variances,
params1->bn23Scales,
params1->bn23Shifts,
tmpAlign1+0
);
char* tensors319[] = {
(char*)params1->one14Weights,
(char*)params1->one14Biases,
tmpAlign1+0,
align3+6051136
};
ResNeXt50OneArrangeWts7(team12, tensors319);
}
{
ResNeXt50BnSimplify3(
params1->bn24Means,
params1->bn24Variances,
params1->bn24Scales,
params1->bn24Shifts,
tmpAlign1+0
);
char* tensors320[] = {
(char*)params1->three6Weights,
(char*)params1->three6Biases,
tmpAlign1+0,
align3+6576448
};
ResNeXt50ThreeArrangeFilts3(team12, tensors320);
}
{
ResNeXt50BnSimplify5(
params1->bn25Means,
params1->bn25Variances,
params1->bn25Scales,
params1->bn25Shifts,
tmpAlign1+0
);
char* tensors321[] = {
(char*)params1->one15Weights,
(char*)params1->one15Biases,
tmpAlign1+0,
align3+6839616
};
ResNeXt50OneArrangeWts6(team12, tensors321);
}
{
ResNeXt50BnSimplify6(
params1->bn26Means,
params1->bn26Variances,
params1->bn26Scales,
params1->bn26Shifts,
tmpAlign1+0
);
char* tensors322[] = {
(char*)params1->oneDS2Weights,
(char*)params1->oneDS2Biases,
tmpAlign1+0,
align3+7365952
};
ResNeXt50OneArrangeWts8(team12, tensors322);
}
{
ResNeXt50BnSimplify5(
params1->bn27Means,
params1->bn27Variances,
params1->bn27Scales,
params1->bn27Shifts,
tmpAlign1+0
);
char* tensors323[] = {
(char*)params1->one16Weights,
(char*)params1->one16Biases,
tmpAlign1+0,
align3+9467200
};
ResNeXt50OneArrangeWts9(team12, tensors323);
}
{
ResNeXt50BnSimplify5(
params1->bn28Means,
params1->bn28Variances,
params1->bn28Scales,
params1->bn28Shifts,
tmpAlign1+0
);
char* tensors324[] = {
(char*)params1->threeDS2Weights,
(char*)params1->threeDS2Biases,
tmpAlign1+0,
align3+10517824
};
ResNeXt50StriderArrangeFilts3(team12, tensors324);
}
{
ResNeXt50BnSimplify6(
params1->bn29Means,
params1->bn29Variances,
params1->bn29Scales,
params1->bn29Shifts,
tmpAlign1+0
);
char* tensors325[] = {
(char*)params1->one17Weights,
(char*)params1->one17Biases,
tmpAlign1+0,
align3+14714176
};
ResNeXt50OneArrangeWts10(team12, tensors325);
}
{
ResNeXt50BnSimplify5(
params1->bn30Means,
params1->bn30Variances,
params1->bn30Scales,
params1->bn30Shifts,
tmpAlign1+0
);
char* tensors326[] = {
(char*)params1->one18Weights,
(char*)params1->one18Biases,
tmpAlign1+0,
align3+16815424
};
ResNeXt50OneArrangeWts11(team12, tensors326);
}
{
ResNeXt50BnSimplify5(
params1->bn31Means,
params1->bn31Variances,
params1->bn31Scales,
params1->bn31Shifts,
tmpAlign1+0
);
char* tensors327[] = {
(char*)params1->three7Weights,
(char*)params1->three7Biases,
tmpAlign1+0,
align3+18914624
};
ResNeXt50ThreeArrangeFilts4(team12, tensors327);
}
{
ResNeXt50BnSimplify6(
params1->bn32Means,
params1->bn32Variances,
params1->bn32Scales,
params1->bn32Shifts,
tmpAlign1+0
);
char* tensors328[] = {
(char*)params1->one19Weights,
(char*)params1->one19Biases,
tmpAlign1+0,
align3+19965248
};
ResNeXt50OneArrangeWts10(team12, tensors328);
}
{
ResNeXt50BnSimplify5(
params1->bn33Means,
params1->bn33Variances,
params1->bn33Scales,
params1->bn33Shifts,
tmpAlign1+0
);
char* tensors329[] = {
(char*)params1->one20Weights,
(char*)params1->one20Biases,
tmpAlign1+0,
align3+22066496
};
ResNeXt50OneArrangeWts11(team12, tensors329);
}
{
ResNeXt50BnSimplify5(
params1->bn34Means,
params1->bn34Variances,
params1->bn34Scales,
params1->bn34Shifts,
tmpAlign1+0
);
char* tensors330[] = {
(char*)params1->three8Weights,
(char*)params1->three8Biases,
tmpAlign1+0,
align3+24165696
};
ResNeXt50ThreeArrangeFilts4(team12, tensors330);
}
{
ResNeXt50BnSimplify6(
params1->bn35Means,
params1->bn35Variances,
params1->bn35Scales,
params1->bn35Shifts,
tmpAlign1+0
);
char* tensors331[] = {
(char*)params1->one21Weights,
(char*)params1->one21Biases,
tmpAlign1+0,
align3+25216320
};
ResNeXt50OneArrangeWts10(team12, tensors331);
}
{
ResNeXt50BnSimplify5(
params1->bn36Means,
params1->bn36Variances,
params1->bn36Scales,
params1->bn36Shifts,
tmpAlign1+0
);
char* tensors332[] = {
(char*)params1->one22Weights,
(char*)params1->one22Biases,
tmpAlign1+0,
align3+27317568
};
ResNeXt50OneArrangeWts11(team12, tensors332);
}
{
ResNeXt50BnSimplify5(
params1->bn37Means,
params1->bn37Variances,
params1->bn37Scales,
params1->bn37Shifts,
tmpAlign1+0
);
char* tensors333[] = {
(char*)params1->three9Weights,
(char*)params1->three9Biases,
tmpAlign1+0,
align3+29416768
};
ResNeXt50ThreeArrangeFilts4(team12, tensors333);
}
{
ResNeXt50BnSimplify6(
params1->bn38Means,
params1->bn38Variances,
params1->bn38Scales,
params1->bn38Shifts,
tmpAlign1+0
);
char* tensors334[] = {
(char*)params1->one23Weights,
(char*)params1->one23Biases,
tmpAlign1+0,
align3+30467392
};
ResNeXt50OneArrangeWts10(team12, tensors334);
}
{
ResNeXt50BnSimplify5(
params1->bn39Means,
params1->bn39Variances,
params1->bn39Scales,
params1->bn39Shifts,
tmpAlign1+0
);
char* tensors335[] = {
(char*)params1->one24Weights,
(char*)params1->one24Biases,
tmpAlign1+0,
align3+32568640
};
ResNeXt50OneArrangeWts11(team12, tensors335);
}
{
ResNeXt50BnSimplify5(
params1->bn40Means,
params1->bn40Variances,
params1->bn40Scales,
params1->bn40Shifts,
tmpAlign1+0
);
char* tensors336[] = {
(char*)params1->three10Weights,
(char*)params1->three10Biases,
tmpAlign1+0,
align3+34667840
};
ResNeXt50ThreeArrangeFilts4(team12, tensors336);
}
{
ResNeXt50BnSimplify6(
params1->bn41Means,
params1->bn41Variances,
params1->bn41Scales,
params1->bn41Shifts,
tmpAlign1+0
);
char* tensors337[] = {
(char*)params1->one25Weights,
(char*)params1->one25Biases,
tmpAlign1+0,
align3+35718464
};
ResNeXt50OneArrangeWts10(team12, tensors337);
}
{
ResNeXt50BnSimplify5(
params1->bn42Means,
params1->bn42Variances,
params1->bn42Scales,
params1->bn42Shifts,
tmpAlign1+0
);
char* tensors338[] = {
(char*)params1->one26Weights,
(char*)params1->one26Biases,
tmpAlign1+0,
align3+37819712
};
ResNeXt50OneArrangeWts11(team12, tensors338);
}
{
ResNeXt50BnSimplify5(
params1->bn43Means,
params1->bn43Variances,
params1->bn43Scales,
params1->bn43Shifts,
tmpAlign1+0
);
char* tensors339[] = {
(char*)params1->three11Weights,
(char*)params1->three11Biases,
tmpAlign1+0,
align3+39918912
};
ResNeXt50ThreeArrangeFilts4(team12, tensors339);
}
{
ResNeXt50BnSimplify6(
params1->bn44Means,
params1->bn44Variances,
params1->bn44Scales,
params1->bn44Shifts,
tmpAlign1+0
);
char* tensors340[] = {
(char*)params1->one27Weights,
(char*)params1->one27Biases,
tmpAlign1+0,
align3+40969536
};
ResNeXt50OneArrangeWts10(team12, tensors340);
}
{
ResNeXt50BnSimplify7(
params1->bn45Means,
params1->bn45Variances,
params1->bn45Scales,
params1->bn45Shifts,
tmpAlign1+0
);
char* tensors341[] = {
(char*)params1->oneDS3Weights,
(char*)params1->oneDS3Biases,
tmpAlign1+0,
align3+43070784
};
ResNeXt50OneArrangeWts12(team12, tensors341);
}
{
ResNeXt50BnSimplify6(
params1->bn46Means,
params1->bn46Variances,
params1->bn46Scales,
params1->bn46Shifts,
tmpAlign1+0
);
char* tensors342[] = {
(char*)params1->one28Weights,
(char*)params1->one28Biases,
tmpAlign1+0,
align3+51467584
};
ResNeXt50OneArrangeWts13(team12, tensors342);
}
{
ResNeXt50BnSimplify6(
params1->bn47Means,
params1->bn47Variances,
params1->bn47Scales,
params1->bn47Shifts,
tmpAlign1+0
);
char* tensors343[] = {
(char*)params1->threeDS3Weights,
(char*)params1->threeDS3Biases,
tmpAlign1+0,
align3+55665984
};
ResNeXt50StriderArrangeFilts4(team12, tensors343);
}
{
ResNeXt50BnSimplify7(
params1->bn48Means,
params1->bn48Variances,
params1->bn48Scales,
params1->bn48Shifts,
tmpAlign1+0
);
char* tensors344[] = {
(char*)params1->one29Weights,
(char*)params1->one29Biases,
tmpAlign1+0,
align3+72447296
};
ResNeXt50OneArrangeWts14(team12, tensors344);
}
{
ResNeXt50BnSimplify6(
params1->bn49Means,
params1->bn49Variances,
params1->bn49Scales,
params1->bn49Shifts,
tmpAlign1+0
);
char* tensors345[] = {
(char*)params1->one30Weights,
(char*)params1->one30Biases,
tmpAlign1+0,
align3+80844096
};
ResNeXt50OneArrangeWts15(team12, tensors345);
}
{
ResNeXt50BnSimplify6(
params1->bn50Means,
params1->bn50Variances,
params1->bn50Scales,
params1->bn50Shifts,
tmpAlign1+0
);
char* tensors346[] = {
(char*)params1->three12Weights,
(char*)params1->three12Biases,
tmpAlign1+0,
align3+89244992
};
ResNeXt50ThreeArrangeFilts5(team12, tensors346);
}
{
ResNeXt50BnSimplify7(
params1->bn51Means,
params1->bn51Variances,
params1->bn51Scales,
params1->bn51Shifts,
tmpAlign1+0
);
char* tensors347[] = {
(char*)params1->one31Weights,
(char*)params1->one31Biases,
tmpAlign1+0,
align3+93443392
};
ResNeXt50OneArrangeWts14(team12, tensors347);
}
{
ResNeXt50BnSimplify6(
params1->bn52Means,
params1->bn52Variances,
params1->bn52Scales,
params1->bn52Shifts,
tmpAlign1+0
);
char* tensors348[] = {
(char*)params1->one32Weights,
(char*)params1->one32Biases,
tmpAlign1+0,
align3+101840192
};
ResNeXt50OneArrangeWts15(team12, tensors348);
}
{
ResNeXt50BnSimplify6(
params1->bn53Means,
params1->bn53Variances,
params1->bn53Scales,
params1->bn53Shifts,
tmpAlign1+0
);
char* tensors349[] = {
(char*)params1->three13Weights,
(char*)params1->three13Biases,
tmpAlign1+0,
align3+110241088
};
ResNeXt50ThreeArrangeFilts5(team12, tensors349);
}
{
ResNeXt50BnSimplify7(
params1->bn54Means,
params1->bn54Variances,
params1->bn54Scales,
params1->bn54Shifts,
tmpAlign1+0
);
char* tensors350[] = {
(char*)params1->one33Weights,
(char*)params1->one33Biases,
tmpAlign1+0,
align3+114439488
};
ResNeXt50OneArrangeWts14(team12, tensors350);
}
{
char* tensors351[] = {
(char*)params1->fcWeights,
(char*)params1->fcBiases,
align3+122836288
};
ResNeXt50FcArrange1(team12, tensors351);
}
ResNeXt50ThreaderDestroy1(team12);
free(tmpAlloc1);
ResNeXt50Net* net5 = malloc(sizeof(ResNeXt50Net));
if (__builtin_expect(!net5, 0)) {
char* msg7 = ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
free(alloc3);
return msg7;
}
net5->alloc1 = alloc3;
net5->align1 = align3;
*net1 = net5;
return 0;
}

// Per-engine inference state: the network to run, the thread team that runs
// it, and a private scratch buffer for intermediate results.
struct ResNeXt50Engine {
// Network supplied to ResNeXt50EngineCreate. ResNeXt50EngineDestroy does not
// free it; ResNeXt50EngineInference reads its align1 buffer.
ResNeXt50Net* net3;
// Thread team created in ResNeXt50EngineCreate and torn down in
// ResNeXt50EngineDestroy.
ResNeXt50ThreaderTeam1* team11;
// Raw pointer returned by malloc for the scratch buffer; this is the pointer
// that ResNeXt50EngineDestroy passes to free().
char* alloc2;
// alloc2 rounded up to the next 64-byte boundary. Every scratch offset used
// by ResNeXt50EngineInference is relative to this address.
char* align2;
};

// Looks up the pthread_t of one thread in the engine's team by forwarding to
// ResNeXt50ThreaderPthreadT1.
//
// eng2: engine whose thread team is queried.
// idx2: index of the thread within that team.
// to1:  destination handed through to the threader call.
//
// Returns the threader's result unchanged (presumably 0 on success or an
// error message, like the other char*-returning functions here -- confirm).
char* ResNeXt50EnginePthreadT(
ResNeXt50Engine* eng2,
ptrdiff_t idx2,
pthread_t* to1
) {
    ResNeXt50ThreaderTeam1* workers = eng2->team11;
    char* outcome = ResNeXt50ThreaderPthreadT1(to1, workers, idx2);
    return outcome;
}

// Releases everything the engine owns: its thread team, its scratch buffer,
// and the engine object itself. The network the engine points at is left
// alone. eng3 must not be used after this call.
void ResNeXt50EngineDestroy(ResNeXt50Engine* eng3) {
    // Tear down the thread team before the memory it may be using goes away.
    ResNeXt50ThreaderDestroy1(eng3->team11);

    // Free the raw (unaligned) scratch allocation, then the engine struct.
    char* scratch = eng3->alloc2;
    free(scratch);
    free(eng3);
}

// Builds an inference engine for an existing network.
//
// eng4:     receives the new engine on success; left untouched on failure.
// net4:     network the engine will run; stored by pointer, not copied.
// threads2: passed through to ResNeXt50ThreaderCreate1 when creating the team.
//
// Returns 0 on success. On failure returns an error message (from
// ResNeXt50Errmsg1 for allocation failures, or from ResNeXt50ThreaderCreate1)
// after releasing whatever had been allocated so far.
char* ResNeXt50EngineCreate(
ResNeXt50Engine** eng4,
ResNeXt50Net* net4,
ptrdiff_t threads2
) {
    ResNeXt50Engine* engine = malloc(sizeof(*engine));
    if (__builtin_expect(!engine, 0)) {
        return ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
    }

    // Scratch buffer: 63 spare bytes are included so the start can be rounded
    // up to a 64-byte boundary below.
    char* scratch = malloc(19978495);
    if (__builtin_expect(!scratch, 0)) {
        // Build the message first so errno is read before free() runs.
        char* oom = ResNeXt50Errmsg1(__LINE__, "errno %d", errno);
        free(engine);
        return oom;
    }
    engine->alloc2 = scratch;
    engine->align2 = (void*)(((size_t)scratch + 63) & ~(size_t)63);

    char* failure = ResNeXt50ThreaderCreate1(&engine->team11, threads2);
    if (__builtin_expect(!!failure, 0)) {
        free(engine);
        free(scratch);
        return failure;
    }

    engine->net3 = net4;
    *eng4 = engine;
    return 0;
}

// Runs one forward pass of the network on the engine's thread team.
//
// eng1:      engine created by ResNeXt50EngineCreate.
// imageData: input; only the first kernel call receives it. Per the graph's
//            Input line this is presumably a 3x224x224 float tensor -- confirm
//            the expected layout.
// probData:  output; only the final call (ResNeXt50Softmax1) receives it.
//
// The body is a fixed sequence of kernel calls. Each call gets the thread team
// and a small array of byte pointers taken from:
//   - netAlign1 + offset: the network's prepared parameter buffer;
//   - align4 + offset:    the engine's 64-byte-aligned scratch buffer.
// Scratch regions (for example align4+9683136) are reused by successive
// stages, so reordering calls or changing offsets is presumably unsafe.
// Judging by how offsets chain from one call to the next, the trailing
// pointer(s) of each array look like outputs -- TODO confirm against the
// kernel definitions.
//
// Layer names in the comments below (one18, three7, ...) come from the
// net-construction code earlier in this file, which passes the same net offset
// to the matching *ArrangeWts*/*ArrangeFilts* routine together with that
// layer's parameters. Net offsets below 16815424 are set up outside the part
// of that code reviewed here and are labelled by kernel name only.
//
// NOTE(review): every stage writes into eng1's single scratch buffer, so
// concurrent calls on the same engine presumably conflict -- confirm.
void ResNeXt50EngineInference(
ResNeXt50Engine* eng1,
float* imageData,
float* probData
) {
// Base of the network's parameter buffer.
char* netAlign1 = eng1->net3->align1;
ResNeXt50ThreaderTeam1* team14 = eng1->team11;
// Base of the engine's scratch buffer.
char* align4 = eng1->align2;
// Strider kernel group 1 (arrange data, produce sums, consume sums) on the
// input image; presumably the graph's first conv (sevenDS) -- confirm.
{
char* tensors174[] = {
(char*)imageData,
netAlign1+0,
align4+9683136
};
ResNeXt50StriderArrangeDats1(team14, tensors174);
char* tensors175[] = {
netAlign1+64,
align4+9683136,
align4+11311296
};
ResNeXt50StriderProduceSums1(team14, tensors175);
char* tensors176[] = {
align4+11311296,
align4+0
};
ResNeXt50StriderConsumeSums1(team14, tensors176);
}
// Thrpl1; presumably the graph's Max3x3Stride2 pooling (pool1) -- confirm.
{
char* tensors177[] = {
align4+0,
align4+4841536
};
ResNeXt50Thrpl1(team14, tensors177);
}
// One kernel pair 1 (arrange data + apply), net offset 98624.
{
char* tensors178[] = {
align4+4841536,
align4+9683136
};
ResNeXt50OneArrangeDats1(team14, tensors178);
char* tensors179[] = {
netAlign1+98624,
align4+9683136,
align4+0
};
ResNeXt50OneApply1(team14, tensors179);
}
// Three kernel group 1, net offset 198464. Note the source region starts at
// scratch +3227648, inside the region the previous stage was given at +0.
{
char* tensors180[] = {
align4+3227648,
align4+9683136
};
ResNeXt50ThreeArrangeDats1(team14, tensors180);
char* tensors181[] = {
netAlign1+198464,
align4+9683136,
align4+12959936
};
ResNeXt50ThreeProduceSums1(team14, tensors181);
char* tensors182[] = {
align4+12959936,
align4+8069248
};
ResNeXt50ThreeConsumeSums1(team14, tensors182);
}
// One kernel pair 2, net offset 264512. The apply call takes two trailing
// scratch pointers; presumably one is the residual branch for the graph's
// Add -- confirm.
{
char* tensors183[] = {
align4+8069248,
align4+9683136
};
ResNeXt50OneArrangeDats2(team14, tensors183);
char* tensors184[] = {
netAlign1+264512,
align4+9683136,
align4+0,
align4+4841536
};
ResNeXt50OneApply2(team14, tensors184);
}
// One kernel pair 3, net offset 396608.
{
char* tensors185[] = {
align4+4841536,
align4+9683136
};
ResNeXt50OneArrangeDats3(team14, tensors185);
char* tensors186[] = {
netAlign1+396608,
align4+9683136,
align4+0
};
ResNeXt50OneApply3(team14, tensors186);
}
// Three kernel group 2, net offset 528192.
{
char* tensors187[] = {
align4+0,
align4+9683136
};
ResNeXt50ThreeArrangeDats2(team14, tensors187);
char* tensors188[] = {
netAlign1+528192,
align4+9683136,
align4+12959936
};
ResNeXt50ThreeProduceSums2(team14, tensors188);
char* tensors189[] = {
align4+12959936,
align4+8069248
};
ResNeXt50ThreeConsumeSums2(team14, tensors189);
}
// One kernel pair 2 (four-pointer apply), net offset 594240.
{
char* tensors190[] = {
align4+8069248,
align4+9683136
};
ResNeXt50OneArrangeDats2(team14, tensors190);
char* tensors191[] = {
netAlign1+594240,
align4+9683136,
align4+4841536,
align4+0
};
ResNeXt50OneApply2(team14, tensors191);
}
// One kernel pair 3, net offset 726336.
{
char* tensors192[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats3(team14, tensors192);
char* tensors193[] = {
netAlign1+726336,
align4+9683136,
align4+3227712
};
ResNeXt50OneApply3(team14, tensors193);
}
// Three kernel group 2, net offset 857920.
{
char* tensors194[] = {
align4+3227712,
align4+9683136
};
ResNeXt50ThreeArrangeDats2(team14, tensors194);
char* tensors195[] = {
netAlign1+857920,
align4+9683136,
align4+12959936
};
ResNeXt50ThreeProduceSums2(team14, tensors195);
char* tensors196[] = {
align4+12959936,
align4+6455424
};
ResNeXt50ThreeConsumeSums2(team14, tensors196);
}
// One kernel pair 2 (four-pointer apply), net offset 923968.
{
char* tensors197[] = {
align4+6455424,
align4+9683136
};
ResNeXt50OneArrangeDats2(team14, tensors197);
char* tensors198[] = {
netAlign1+923968,
align4+9683136,
align4+0,
align4+3227712
};
ResNeXt50OneApply2(team14, tensors198);
}
// One kernel pair 4, net offset 1056064. This stage and the next both read
// scratch +3227712.
{
char* tensors199[] = {
align4+3227712,
align4+9683136
};
ResNeXt50OneArrangeDats4(team14, tensors199);
char* tensors200[] = {
netAlign1+1056064,
align4+9683136,
align4+6455424
};
ResNeXt50OneApply4(team14, tensors200);
}
// One kernel pair 5, net offset 1582400.
{
char* tensors201[] = {
align4+3227712,
align4+9683136
};
ResNeXt50OneArrangeDats5(team14, tensors201);
char* tensors202[] = {
netAlign1+1582400,
align4+9683136,
align4+0
};
ResNeXt50OneApply5(team14, tensors202);
}
// Strider kernel group 2, net offset 1845568.
{
char* tensors203[] = {
align4+0,
align4+9683136
};
ResNeXt50StriderArrangeDats2(team14, tensors203);
char* tensors204[] = {
netAlign1+1845568,
align4+9683136,
align4+13877440
};
ResNeXt50StriderProduceSums2(team14, tensors204);
char* tensors205[] = {
align4+13877440,
align4+3227712
};
ResNeXt50StriderConsumeSums2(team14, tensors205);
}
// One kernel pair 6 (four-pointer apply), net offset 2895168.
{
char* tensors206[] = {
align4+3227712,
align4+9683136
};
ResNeXt50OneArrangeDats6(team14, tensors206);
char* tensors207[] = {
netAlign1+2895168,
align4+9683136,
align4+6455424,
align4+0
};
ResNeXt50OneApply6(team14, tensors207);
}
// One kernel pair 7, net offset 3421504.
{
char* tensors208[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats7(team14, tensors208);
char* tensors209[] = {
netAlign1+3421504,
align4+9683136,
align4+1605696
};
ResNeXt50OneApply7(team14, tensors209);
}
// Three kernel group 3, net offset 3946816.
{
char* tensors210[] = {
align4+1605696,
align4+9683136
};
ResNeXt50ThreeArrangeDats3(team14, tensors210);
char* tensors211[] = {
netAlign1+3946816,
align4+9683136,
align4+11321536
};
ResNeXt50ThreeProduceSums3(team14, tensors211);
char* tensors212[] = {
align4+11321536,
align4+3211392
};
ResNeXt50ThreeConsumeSums3(team14, tensors212);
}
// One kernel pair 6 (four-pointer apply), net offset 4209984.
{
char* tensors213[] = {
align4+3211392,
align4+9683136
};
ResNeXt50OneArrangeDats6(team14, tensors213);
char* tensors214[] = {
netAlign1+4209984,
align4+9683136,
align4+0,
align4+1605696
};
ResNeXt50OneApply6(team14, tensors214);
}
// One kernel pair 7, net offset 4736320.
{
char* tensors215[] = {
align4+1605696,
align4+9683136
};
ResNeXt50OneArrangeDats7(team14, tensors215);
char* tensors216[] = {
netAlign1+4736320,
align4+9683136,
align4+0
};
ResNeXt50OneApply7(team14, tensors216);
}
// Three kernel group 3, net offset 5261632.
{
char* tensors217[] = {
align4+0,
align4+9683136
};
ResNeXt50ThreeArrangeDats3(team14, tensors217);
char* tensors218[] = {
netAlign1+5261632,
align4+9683136,
align4+11321536
};
ResNeXt50ThreeProduceSums3(team14, tensors218);
char* tensors219[] = {
align4+11321536,
align4+3211392
};
ResNeXt50ThreeConsumeSums3(team14, tensors219);
}
// One kernel pair 6 (four-pointer apply), net offset 5524800.
{
char* tensors220[] = {
align4+3211392,
align4+9683136
};
ResNeXt50OneArrangeDats6(team14, tensors220);
char* tensors221[] = {
netAlign1+5524800,
align4+9683136,
align4+1605696,
align4+0
};
ResNeXt50OneApply6(team14, tensors221);
}
// One kernel pair 7, net offset 6051136.
{
char* tensors222[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats7(team14, tensors222);
char* tensors223[] = {
netAlign1+6051136,
align4+9683136,
align4+1605696
};
ResNeXt50OneApply7(team14, tensors223);
}
// Three kernel group 3, net offset 6576448.
{
char* tensors224[] = {
align4+1605696,
align4+9683136
};
ResNeXt50ThreeArrangeDats3(team14, tensors224);
char* tensors225[] = {
netAlign1+6576448,
align4+9683136,
align4+11321536
};
ResNeXt50ThreeProduceSums3(team14, tensors225);
char* tensors226[] = {
align4+11321536,
align4+3211392
};
ResNeXt50ThreeConsumeSums3(team14, tensors226);
}
// One kernel pair 6 (four-pointer apply), net offset 6839616.
{
char* tensors227[] = {
align4+3211392,
align4+9683136
};
ResNeXt50OneArrangeDats6(team14, tensors227);
char* tensors228[] = {
netAlign1+6839616,
align4+9683136,
align4+0,
align4+1605696
};
ResNeXt50OneApply6(team14, tensors228);
}
// One kernel pair 8, net offset 7365952. This stage and the next both read
// scratch +1605696.
{
char* tensors229[] = {
align4+1605696,
align4+9683136
};
ResNeXt50OneArrangeDats8(team14, tensors229);
char* tensors230[] = {
netAlign1+7365952,
align4+9683136,
align4+3211392
};
ResNeXt50OneApply8(team14, tensors230);
}
// One kernel pair 9, net offset 9467200.
{
char* tensors231[] = {
align4+1605696,
align4+9683136
};
ResNeXt50OneArrangeDats9(team14, tensors231);
char* tensors232[] = {
netAlign1+9467200,
align4+9683136,
align4+0
};
ResNeXt50OneApply9(team14, tensors232);
}
// Strider kernel group 3, net offset 10517824.
{
char* tensors233[] = {
align4+0,
align4+9683136
};
ResNeXt50StriderArrangeDats3(team14, tensors233);
char* tensors234[] = {
netAlign1+10517824,
align4+9683136,
align4+11780288
};
ResNeXt50StriderProduceSums3(team14, tensors234);
char* tensors235[] = {
align4+11780288,
align4+1605696
};
ResNeXt50StriderConsumeSums3(team14, tensors235);
}
// One kernel pair 10 (four-pointer apply), net offset 14714176.
{
char* tensors236[] = {
align4+1605696,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors236);
char* tensors237[] = {
netAlign1+14714176,
align4+9683136,
align4+3211392,
align4+0
};
ResNeXt50OneApply10(team14, tensors237);
}
// Layer one18 (net offset 16815424).
{
char* tensors238[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats11(team14, tensors238);
char* tensors239[] = {
netAlign1+16815424,
align4+9683136,
align4+852032
};
ResNeXt50OneApply11(team14, tensors239);
}
// Layer three7 (net offset 18914624).
{
char* tensors240[] = {
align4+852032,
align4+9683136
};
ResNeXt50ThreeArrangeDats4(team14, tensors240);
char* tensors241[] = {
netAlign1+18914624,
align4+9683136,
align4+10862784
};
ResNeXt50ThreeProduceSums4(team14, tensors241);
char* tensors242[] = {
align4+10862784,
align4+1704064
};
ResNeXt50ThreeConsumeSums4(team14, tensors242);
}
// Layer one19 (net offset 19965248), four-pointer apply.
{
char* tensors243[] = {
align4+1704064,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors243);
char* tensors244[] = {
netAlign1+19965248,
align4+9683136,
align4+0,
align4+852032
};
ResNeXt50OneApply10(team14, tensors244);
}
// Layer one20 (net offset 22066496).
{
char* tensors245[] = {
align4+852032,
align4+9683136
};
ResNeXt50OneArrangeDats11(team14, tensors245);
char* tensors246[] = {
netAlign1+22066496,
align4+9683136,
align4+0
};
ResNeXt50OneApply11(team14, tensors246);
}
// Layer three8 (net offset 24165696).
{
char* tensors247[] = {
align4+0,
align4+9683136
};
ResNeXt50ThreeArrangeDats4(team14, tensors247);
char* tensors248[] = {
netAlign1+24165696,
align4+9683136,
align4+10862784
};
ResNeXt50ThreeProduceSums4(team14, tensors248);
char* tensors249[] = {
align4+10862784,
align4+1704064
};
ResNeXt50ThreeConsumeSums4(team14, tensors249);
}
// Layer one21 (net offset 25216320), four-pointer apply.
{
char* tensors250[] = {
align4+1704064,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors250);
char* tensors251[] = {
netAlign1+25216320,
align4+9683136,
align4+852032,
align4+0
};
ResNeXt50OneApply10(team14, tensors251);
}
// Layer one22 (net offset 27317568).
{
char* tensors252[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats11(team14, tensors252);
char* tensors253[] = {
netAlign1+27317568,
align4+9683136,
align4+852032
};
ResNeXt50OneApply11(team14, tensors253);
}
// Layer three9 (net offset 29416768).
{
char* tensors254[] = {
align4+852032,
align4+9683136
};
ResNeXt50ThreeArrangeDats4(team14, tensors254);
char* tensors255[] = {
netAlign1+29416768,
align4+9683136,
align4+10862784
};
ResNeXt50ThreeProduceSums4(team14, tensors255);
char* tensors256[] = {
align4+10862784,
align4+1704064
};
ResNeXt50ThreeConsumeSums4(team14, tensors256);
}
// Layer one23 (net offset 30467392), four-pointer apply.
{
char* tensors257[] = {
align4+1704064,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors257);
char* tensors258[] = {
netAlign1+30467392,
align4+9683136,
align4+0,
align4+852032
};
ResNeXt50OneApply10(team14, tensors258);
}
// Layer one24 (net offset 32568640).
{
char* tensors259[] = {
align4+852032,
align4+9683136
};
ResNeXt50OneArrangeDats11(team14, tensors259);
char* tensors260[] = {
netAlign1+32568640,
align4+9683136,
align4+0
};
ResNeXt50OneApply11(team14, tensors260);
}
// Layer three10 (net offset 34667840).
{
char* tensors261[] = {
align4+0,
align4+9683136
};
ResNeXt50ThreeArrangeDats4(team14, tensors261);
char* tensors262[] = {
netAlign1+34667840,
align4+9683136,
align4+10862784
};
ResNeXt50ThreeProduceSums4(team14, tensors262);
char* tensors263[] = {
align4+10862784,
align4+1704064
};
ResNeXt50ThreeConsumeSums4(team14, tensors263);
}
// Layer one25 (net offset 35718464), four-pointer apply.
{
char* tensors264[] = {
align4+1704064,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors264);
char* tensors265[] = {
netAlign1+35718464,
align4+9683136,
align4+852032,
align4+0
};
ResNeXt50OneApply10(team14, tensors265);
}
// Layer one26 (net offset 37819712).
{
char* tensors266[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats11(team14, tensors266);
char* tensors267[] = {
netAlign1+37819712,
align4+9683136,
align4+852032
};
ResNeXt50OneApply11(team14, tensors267);
}
// Layer three11 (net offset 39918912).
{
char* tensors268[] = {
align4+852032,
align4+9683136
};
ResNeXt50ThreeArrangeDats4(team14, tensors268);
char* tensors269[] = {
netAlign1+39918912,
align4+9683136,
align4+10862784
};
ResNeXt50ThreeProduceSums4(team14, tensors269);
char* tensors270[] = {
align4+10862784,
align4+1704064
};
ResNeXt50ThreeConsumeSums4(team14, tensors270);
}
// Layer one27 (net offset 40969536), four-pointer apply.
{
char* tensors271[] = {
align4+1704064,
align4+9683136
};
ResNeXt50OneArrangeDats10(team14, tensors271);
char* tensors272[] = {
netAlign1+40969536,
align4+9683136,
align4+0,
align4+852032
};
ResNeXt50OneApply10(team14, tensors272);
}
// Layer oneDS3 (net offset 43070784). This stage and the next both read
// scratch +852032.
{
char* tensors273[] = {
align4+852032,
align4+9683136
};
ResNeXt50OneArrangeDats12(team14, tensors273);
char* tensors274[] = {
netAlign1+43070784,
align4+9683136,
align4+1704064
};
ResNeXt50OneApply12(team14, tensors274);
}
// Layer one28 (net offset 51467584).
{
char* tensors275[] = {
align4+852032,
align4+9683136
};
ResNeXt50OneArrangeDats13(team14, tensors275);
char* tensors276[] = {
netAlign1+51467584,
align4+9683136,
align4+0
};
ResNeXt50OneApply13(team14, tensors276);
}
// Layer threeDS3 (net offset 55665984), Strider kernel group 4.
{
char* tensors277[] = {
align4+0,
align4+9683136
};
ResNeXt50StriderArrangeDats4(team14, tensors277);
char* tensors278[] = {
netAlign1+55665984,
align4+9683136,
align4+11780288
};
ResNeXt50StriderProduceSums4(team14, tensors278);
char* tensors279[] = {
align4+11780288,
align4+852032
};
ResNeXt50StriderConsumeSums4(team14, tensors279);
}
// Layer one29 (net offset 72447296), four-pointer apply.
{
char* tensors280[] = {
align4+852032,
align4+9683136
};
ResNeXt50OneArrangeDats14(team14, tensors280);
char* tensors281[] = {
netAlign1+72447296,
align4+9683136,
align4+1704064,
align4+0
};
ResNeXt50OneApply14(team14, tensors281);
}
// Layer one30 (net offset 80844096).
{
char* tensors282[] = {
align4+0,
align4+9683136
};
ResNeXt50OneArrangeDats15(team14, tensors282);
char* tensors283[] = {
netAlign1+80844096,
align4+9683136,
align4+655424
};
ResNeXt50OneApply15(team14, tensors283);
}
// Layer three12 (net offset 89244992).
{
char* tensors284[] = {
align4+655424,
align4+9683136
};
ResNeXt50ThreeArrangeDats5(team14, tensors284);
char* tensors285[] = {
netAlign1+89244992,
align4+9683136,
align4+10731712
};
ResNeXt50ThreeProduceSums5(team14, tensors285);
char* tensors286[] = {
align4+10731712,
align4+1310848
};
ResNeXt50ThreeConsumeSums5(team14, tensors286);
}
// Layer one31 (net offset 93443392), four-pointer apply.
{
char* tensors287[] = {
align4+1310848,
align4+9683136
};
ResNeXt50OneArrangeDats14(team14, tensors287);
char* tensors288[] = {
netAlign1+93443392,
align4+9683136,
align4+0,
align4+655424
};
ResNeXt50OneApply14(team14, tensors288);
}
// Layer one32 (net offset 101840192).
{
char* tensors289[] = {
align4+655424,
align4+9683136
};
ResNeXt50OneArrangeDats15(team14, tensors289);
char* tensors290[] = {
netAlign1+101840192,
align4+9683136,
align4+0
};
ResNeXt50OneApply15(team14, tensors290);
}
// Layer three13 (net offset 110241088).
{
char* tensors291[] = {
align4+0,
align4+9683136
};
ResNeXt50ThreeArrangeDats5(team14, tensors291);
char* tensors292[] = {
netAlign1+110241088,
align4+9683136,
align4+10731712
};
ResNeXt50ThreeProduceSums5(team14, tensors292);
char* tensors293[] = {
align4+10731712,
align4+1310848
};
ResNeXt50ThreeConsumeSums5(team14, tensors293);
}
// Layer one33 (net offset 114439488), four-pointer apply.
{
char* tensors294[] = {
align4+1310848,
align4+9683136
};
ResNeXt50OneArrangeDats14(team14, tensors294);
char* tensors295[] = {
netAlign1+114439488,
align4+9683136,
align4+655424,
align4+0
};
ResNeXt50OneApply14(team14, tensors295);
}
// Glopl1; presumably a global pooling step -- confirm. Uses no net parameters.
{
char* tensors296[] = {
align4+0,
align4+655424
};
ResNeXt50Glopl1(team14, tensors296);
}
// Layer fc (net offset 122836288).
{
char* tensors297[] = {
netAlign1+122836288,
align4+655424,
align4+0
};
ResNeXt50FcApply1(team14, tensors297);
}
// Softmax1: the only call that is handed the caller's probData buffer.
{
char* tensors298[] = {
align4+0,
(char*)probData
};
ResNeXt50Softmax1(team14, tensors298);
}
}

// End of file.

Top